10a694ea43
Unfortunately, we assign the jobid during the plm.spawn procedure - which means it happens -after- control of the job has passed out of the range of mpirun (or whatever program is spawning the job), so it is too late for that main program to register a callback function. If the main program registers the callback -after- we return from plm.spawn, then it (a) cannot get a callback for failed-to-start, and (b) will miss the callback if a proc aborts in the time between job launch and the call to errmgr.register_callback. This commit fixes the problem by adding callback-related fields to the orte_job_t object. Thus, the main program can specify what job states should initiate a callback, what function is to be called, and what data is to be passed back by simply filling in the orte_job_t fields prior to calling plm.spawn. Also, fully implement the "copy" function for the orte_job_t object. NOTE: as a result of this change, the errmgr.register_callback API may no longer be of any value. This commit was SVN r21200.
58 строки
1.7 KiB
C
58 строки
1.7 KiB
C
/* -*- C -*-
|
|
*
|
|
* Copyright (c) 2004-2008 The Trustees of Indiana University and Indiana
|
|
* University Research and Technology
|
|
* Corporation. All rights reserved.
|
|
* Copyright (c) 2004-2005 The University of Tennessee and The University
|
|
* of Tennessee Research Foundation. All rights
|
|
* reserved.
|
|
* Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
|
|
* University of Stuttgart. All rights reserved.
|
|
* Copyright (c) 2004-2005 The Regents of the University of California.
|
|
* All rights reserved.
|
|
* $COPYRIGHT$
|
|
*
|
|
* Additional copyrights may follow
|
|
*
|
|
* $HEADER$
|
|
*
|
|
*/
|
|
#ifndef ORTE_ERRMGR_HNP_H
|
|
#define ORTE_ERRMGR_HNP_H
|
|
|
|
|
|
#include "orte_config.h"
|
|
#include "orte/types.h"
|
|
|
|
#include "orte/mca/plm/plm_types.h"
|
|
#include "orte/runtime/orte_globals.h"
|
|
#include "orte/mca/errmgr/errmgr.h"
|
|
|
|
BEGIN_C_DECLS
|
|
|
|
/*
|
|
* Component open / close / query
|
|
*/
|
|
/* Open entry point of the default errmgr component. Takes no arguments and
 * returns an int; presumably an ORTE status code - confirm against the
 * implementation, which is not visible in this header. */
int orte_errmgr_default_component_open(void);
|
|
/* Close entry point of the default errmgr component. Takes no arguments and
 * returns an int; presumably an ORTE status code - confirm against the
 * implementation, which is not visible in this header. */
int orte_errmgr_default_component_close(void);
|
|
/* Query entry point of the default errmgr component.
 *   module   - pointer to a module pointer; presumably filled in by the call
 *              (TODO confirm against the implementation)
 *   priority - pointer to an int; presumably receives the component's
 *              selection priority (TODO confirm)
 * Returns an int; presumably an ORTE status code. */
int orte_errmgr_default_component_query(mca_base_module_t **module, int *priority);
|
|
|
|
|
|
/*
|
|
* Component API functions
|
|
*/
|
|
/* Component API function taking a process name and an exit code.
 * Judging by its name it is the handler for an aborted process; the actual
 * behavior lives in the implementation, which is not visible here. */
void orte_errmgr_default_proc_aborted(orte_process_name_t *name, int exit_code);
|
|
|
|
/* Component API function taking a job id and an exit code.
 * Judging by its name it is the handler for a job that did not start
 * completely; the actual behavior lives in the implementation, which is not
 * visible here. */
void orte_errmgr_default_incomplete_start(orte_jobid_t job, int exit_code);
|
|
|
|
/* Component API function that accepts a job id, a job state, a callback
 * function and an opaque data pointer, and returns an int (presumably an ORTE
 * status code - TODO confirm).
 * Presumably cbfunc is invoked with cbdata when the given job reaches the
 * given state; verify against the implementation.
 * NOTE(review): callback fields reportedly also exist on orte_job_t, which may
 * make this registration API redundant - confirm before relying on it. */
int orte_errmgr_default_register_callback(orte_jobid_t job,
|
|
orte_job_state_t state,
|
|
orte_err_cb_fn_t cbfunc,
|
|
void *cbdata);
|
|
|
|
/* Declaration of the default errmgr component object; it is only declared
 * (extern) here and defined in another translation unit. */
ORTE_MODULE_DECLSPEC extern mca_errmgr_base_component_t mca_errmgr_default_component;
|
|
|
|
END_C_DECLS
|
|
|
|
#endif
|