9613b3176c
After much work by Jeff and myself, and quite a lot of discussion, it has become clear that we simply cannot resolve the infinite loops caused by RML-involved subsystems calling orte_output. The original rationale for the change to orte_output has also been reduced by shifting the output of XML-formatted vs human readable messages to an alternative approach. I have globally replaced the orte_output/ORTE_OUTPUT calls in the code base, as well as the corresponding .h file name. I have test compiled and run this on the various environments within my reach, so hopefully this will prove minimally disruptive. This commit was SVN r18619.
162 строки
5.5 KiB
C
162 строки
5.5 KiB
C
/*
|
|
* Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
|
|
* University Research and Technology
|
|
* Corporation. All rights reserved.
|
|
* Copyright (c) 2004-2005 The University of Tennessee and The University
|
|
* of Tennessee Research Foundation. All rights
|
|
* reserved.
|
|
* Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
|
|
* University of Stuttgart. All rights reserved.
|
|
* Copyright (c) 2004-2005 The Regents of the University of California.
|
|
* All rights reserved.
|
|
* $COPYRIGHT$
|
|
*
|
|
* Additional copyrights may follow
|
|
*
|
|
* $HEADER$
|
|
*/
|
|
|
|
|
|
#include "orte_config.h"
|
|
#include "orte/constants.h"
|
|
|
|
#include <stdlib.h>
|
|
#include <stdarg.h>
|
|
|
|
#include "opal/class/opal_list.h"
|
|
#include "opal/util/trace.h"
|
|
#include "orte/util/show_help.h"
|
|
|
|
#include "orte/runtime/runtime.h"
|
|
#include "orte/runtime/orte_globals.h"
|
|
#include "orte/runtime/orte_wakeup.h"
|
|
#include "orte/runtime/orte_locks.h"
|
|
#include "orte/mca/plm/plm.h"
|
|
#include "orte/util/session_dir.h"
|
|
#include "orte/util/name_fns.h"
|
|
|
|
#include "orte/mca/errmgr/base/errmgr_private.h"
|
|
#include "errmgr_default.h"
|
|
|
|
/*
|
|
* This function gets called by the PLM when an orted notifies us
|
|
* that a process has aborted
|
|
* Various components will follow their own strategy for dealing with
|
|
* this situation. For this component, we simply kill the job.
|
|
*/
|
|
void orte_errmgr_default_proc_aborted(orte_process_name_t *name, int exit_code)
|
|
{
|
|
int rc;
|
|
orte_job_t **jobs;
|
|
orte_std_cntr_t i;
|
|
|
|
OPAL_TRACE(1);
|
|
|
|
/* if we are already in progress, then ignore this call */
|
|
if (!opal_atomic_trylock(&orte_abort_inprogress_lock)) { /* returns 1 if already locked */
|
|
OPAL_OUTPUT_VERBOSE((1, orte_errmgr_base_output,
|
|
"%s errmgr:default: abort in progress, ignoring proc %s aborted with status %d",
|
|
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
|
|
ORTE_NAME_PRINT(name), exit_code));
|
|
|
|
return;
|
|
}
|
|
|
|
OPAL_OUTPUT_VERBOSE((1, orte_errmgr_base_output,
|
|
"%s errmgr:default: proc %s aborting with status %d",
|
|
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
|
|
ORTE_NAME_PRINT(name), exit_code));
|
|
|
|
/* indicate that all jobs other than the one containing this
|
|
* proc have been orted to abort - this is necessary to avoid
|
|
* duplicate ordering of "abort".
|
|
*
|
|
* NOTE: be sure to not include the 0 job data location as this
|
|
* contains the daemons!
|
|
*/
|
|
jobs = (orte_job_t**)orte_job_data->addr;
|
|
for (i=1; i < orte_job_data->size; i++) {
|
|
/* the array is left justfied, so we can quit once
|
|
* we see a NULL
|
|
*/
|
|
if (NULL == jobs[i]) {
|
|
break;
|
|
}
|
|
if (ORTE_JOB_STATE_ABORTED != jobs[i]->state &&
|
|
ORTE_JOB_STATE_ABORTED_BY_SIG != jobs[i]->state &&
|
|
ORTE_JOB_STATE_ABORTED_WO_SYNC != jobs[i]->state) {
|
|
jobs[i]->state = ORTE_JOB_STATE_ABORT_ORDERED;
|
|
}
|
|
}
|
|
|
|
/* tell the plm to terminate all jobs */
|
|
if (ORTE_SUCCESS != (rc = orte_plm.terminate_job(ORTE_JOBID_WILDCARD))) {
|
|
ORTE_ERROR_LOG(rc);
|
|
}
|
|
|
|
/* set the exit status, just in case whomever called us failed
|
|
* to do so - it can only be done once, so we are protected
|
|
* from overwriting it
|
|
*/
|
|
ORTE_UPDATE_EXIT_STATUS(exit_code);
|
|
|
|
/* wakeup orterun so we can exit */
|
|
if (ORTE_SUCCESS != (rc = orte_wakeup())) {
|
|
ORTE_ERROR_LOG(rc);
|
|
}
|
|
}
|
|
|
|
/*
|
|
* This function gets called by the PLM when an orted notifies us that
|
|
* a job failed to start.
|
|
* Various components will follow their own strategy for dealing with
|
|
* this situation. For this component, we simply kill the job.
|
|
*/
|
|
void orte_errmgr_default_incomplete_start(orte_jobid_t job, int exit_code)
|
|
{
|
|
int rc;
|
|
|
|
OPAL_TRACE(1);
|
|
|
|
/* if we are already in progress, then ignore this call */
|
|
if (!opal_atomic_trylock(&orte_abort_inprogress_lock)) { /* returns 1 if already locked */
|
|
OPAL_OUTPUT_VERBOSE((1, orte_errmgr_base_output,
|
|
"%s errmgr:default: abort in progress, ignoring incomplete start on job %s with status %d",
|
|
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
|
|
ORTE_JOBID_PRINT(job), exit_code));
|
|
return;
|
|
}
|
|
|
|
OPAL_OUTPUT_VERBOSE((1, orte_errmgr_base_output,
|
|
"%s errmgr:default: job %s reported incomplete start with status %d",
|
|
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
|
|
ORTE_JOBID_PRINT(job), exit_code));
|
|
|
|
/* tell the plm to terminate all jobs */
|
|
if (ORTE_SUCCESS != (rc = orte_plm.terminate_job(ORTE_JOBID_WILDCARD))) {
|
|
ORTE_ERROR_LOG(rc);
|
|
}
|
|
|
|
/* set the exit status, just in case whomever called us failed
|
|
* to do so - it can only be done once, so we are protected
|
|
* from overwriting it
|
|
*/
|
|
ORTE_UPDATE_EXIT_STATUS(exit_code);
|
|
|
|
/* wakeup orterun so we can exit */
|
|
if (ORTE_SUCCESS != (rc = orte_wakeup())) {
|
|
ORTE_ERROR_LOG(rc);
|
|
}
|
|
}
|
|
|
|
/*
|
|
* Register a callback function upon a change to a specified job state.
|
|
*/
|
|
int orte_errmgr_default_register_callback(orte_jobid_t job,
|
|
orte_job_state_t state,
|
|
orte_errmgr_cb_fn_t cbfunc,
|
|
void *cbdata)
|
|
{
|
|
return ORTE_ERR_NOT_IMPLEMENTED;
|
|
}
|