10a694ea43
Unfortunately, we assign the jobid during the plm.spawn procedure - which means it happens -after- control of the job has passed out of the range of mpirun (or whatever program is spawning the job), so it is too late for that main program to register a callback function. If the main program registers the callback -after- we return from plm.spawn, then it (a) cannot get a callback for failed-to-start, and (b) will miss the callback if a proc aborts in the time between job launch and the call to errmgr.register_callback. This commit fixes the problem by adding callback-related fields to the orte_job_t object. Thus, the main program can specify what job states should initiate a callback, what function is to be called, and what data is to be passed back by simply filling in the orte_job_t fields prior to calling plm.spawn. Also, fully implement the "copy" function for the orte_job_t object. NOTE: as a result of this change, the errmgr.register_callback API may no longer be of any value. This commit was SVN r21200.
92 строки
2.5 KiB
C
92 строки
2.5 KiB
C
/*
|
|
* Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
|
|
* University Research and Technology
|
|
* Corporation. All rights reserved.
|
|
* Copyright (c) 2004-2005 The University of Tennessee and The University
|
|
* of Tennessee Research Foundation. All rights
|
|
* reserved.
|
|
* Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
|
|
* University of Stuttgart. All rights reserved.
|
|
* Copyright (c) 2004-2005 The Regents of the University of California.
|
|
* All rights reserved.
|
|
* $COPYRIGHT$
|
|
*
|
|
* Additional copyrights may follow
|
|
*
|
|
* $HEADER$
|
|
*/
|
|
|
|
|
|
#include "orte_config.h"
|
|
#include "orte/constants.h"
|
|
|
|
#ifdef HAVE_UNISTD_H
|
|
#include <unistd.h>
|
|
#endif
|
|
#include <stdlib.h>
|
|
|
|
#include "opal/util/trace.h"
|
|
#include "opal/util/output.h"
|
|
|
|
#include "orte/runtime/orte_globals.h"
|
|
#include "orte/util/name_fns.h"
|
|
#include "orte/util/session_dir.h"
|
|
#include "orte/mca/ess/ess.h"
|
|
|
|
#include "orte/mca/errmgr/errmgr.h"
|
|
#include "orte/mca/errmgr/base/errmgr_private.h"
|
|
|
|
|
|
void orte_errmgr_base_log(int error_code, char *filename, int line)
|
|
{
|
|
OPAL_TRACE(1);
|
|
|
|
if (ORTE_ERR_SILENT == error_code) {
|
|
/* if the error is silent, say nothing */
|
|
return;
|
|
}
|
|
|
|
opal_output(0, "%s ORTE_ERROR_LOG: %s in file %s at line %d",
|
|
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
|
|
ORTE_ERROR_NAME(error_code), filename, line);
|
|
}
|
|
|
|
/*
 * "Not available" stand-in for the proc-aborted handler.
 *
 * Does nothing: both arguments are accepted and ignored.
 *
 * name       process the notification refers to (unused).
 * exit_code  exit code accompanying the notification (unused).
 */
void orte_errmgr_base_proc_aborted_not_avail(orte_process_name_t *name, int exit_code)
{
    /* intentionally empty - parameters are unused */
    (void)name;
    (void)exit_code;
}
|
|
|
|
/*
 * "Not available" stand-in for the incomplete-start handler.
 *
 * Does nothing: both arguments are accepted and ignored.
 *
 * job        job id the notification refers to (unused).
 * exit_code  exit code accompanying the notification (unused).
 */
void orte_errmgr_base_incomplete_start_not_avail(orte_jobid_t job, int exit_code)
{
    /* intentionally empty - parameters are unused */
    (void)job;
    (void)exit_code;
}
|
|
|
|
void orte_errmgr_base_error_abort(int error_code, char *fmt, ...)
|
|
{
|
|
va_list arglist;
|
|
|
|
/* If there was a message, output it */
|
|
va_start(arglist, fmt);
|
|
if( NULL != fmt ) {
|
|
char* buffer = NULL;
|
|
vasprintf( &buffer, fmt, arglist );
|
|
opal_output( 0, buffer );
|
|
free( buffer );
|
|
}
|
|
va_end(arglist);
|
|
|
|
/* cleanup my session directory */
|
|
orte_session_dir_finalize(ORTE_PROC_MY_NAME);
|
|
|
|
/* abnormal exit */
|
|
orte_ess.abort(error_code, false);
|
|
}
|
|
|
|
int orte_errmgr_base_register_cb_not_avail(orte_jobid_t job,
|
|
orte_job_state_t state,
|
|
orte_err_cb_fn_t cbfunc,
|
|
void *cbdata)
|
|
{
|
|
return ORTE_ERR_NOT_AVAILABLE;
|
|
}
|