1
1
openmpi/orte/mca/errmgr/hnp/errmgr_hnp.c
Ralph Castain 19767802de Let the errmgr know how to deal with incomplete starts
This commit was SVN r14495.
2007-04-24 19:04:29 +00:00

253 строки
8.4 KiB
C

/*
* Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
* University Research and Technology
* Corporation. All rights reserved.
* Copyright (c) 2004-2005 The University of Tennessee and The University
* of Tennessee Research Foundation. All rights
* reserved.
* Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
* University of Stuttgart. All rights reserved.
* Copyright (c) 2004-2005 The Regents of the University of California.
* All rights reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
*
* $HEADER$
*/
#include "orte_config.h"
#include "orte/orte_constants.h"
#include <stdlib.h>
#include <stdarg.h>
#include "opal/class/opal_list.h"
#include "opal/util/trace.h"
#include "opal/util/output.h"
#include "orte/runtime/runtime.h"
#include "orte/runtime/params.h"
#include "orte/runtime/orte_wakeup.h"
#include "orte/mca/ns/ns_types.h"
#include "orte/mca/gpr/gpr.h"
#include "orte/mca/pls/pls.h"
#include "orte/mca/smr/smr.h"
#include "orte/mca/schema/schema.h"
#include "orte/dss/dss.h"
#include "orte/mca/rmgr/rmgr.h"
#include "orte/mca/errmgr/base/base.h"
#include "orte/mca/errmgr/hnp/errmgr_hnp.h"
/*
 * This function gets called when someone updates a process
 * state to indicate it has aborted. That action results in
 * the firing of a registry trigger that passes a minimal
 * data message here. The only part of that message we use
 * is msg->target (the trigger name), from which the jobid is extracted.
 *
 * Various components will follow their own strategy for dealing with
 * this situation. For this component, we simply kill the job.
 *
 * @param msg  Notify message delivered by the trigger; only msg->target
 *             is read.
 *
 * @return The error code from extracting the jobid or from setting the
 *         job state if either of those fails (nothing else is attempted
 *         in that case); otherwise the return code of orte_wakeup().
 *         A failure of terminate_job is logged but is not returned.
 */
int orte_errmgr_hnp_proc_aborted(orte_gpr_notify_message_t *msg)
{
    orte_jobid_t jobid;
    opal_list_t attributes;
    opal_list_item_t *entry;
    int rc;

    OPAL_TRACE(1);

    opal_output(orte_errmgr_base_output, "errmgr:hnp: proc abort has been detected");

    /* The trigger is a named one, so the jobid can be pulled
     * straight out of the trigger name
     */
    rc = orte_schema.extract_jobid_from_std_trigger_name(&jobid, msg->target);
    if (ORTE_SUCCESS != rc) {
        ORTE_ERROR_LOG(rc);
        return rc;
    }

    /* mark the job as aborted */
    rc = orte_smr.set_job_state(jobid, ORTE_JOB_STATE_ABORTED);
    if (ORTE_SUCCESS != rc) {
        ORTE_ERROR_LOG(rc);
        return rc;
    }

    /* tell the pls to terminate the ENTIRE FAMILY of this job - this is necessary to avoid
     * "hanging" portions of the application if the aborted job was dynamically spawned
     * from another job
     */
    OBJ_CONSTRUCT(&attributes, opal_list_t);
    /* NOTE(review): the result of add_attribute is not checked here */
    orte_rmgr.add_attribute(&attributes, ORTE_NS_USE_JOB_FAMILY, ORTE_UNDEF, NULL, ORTE_RMGR_ATTR_OVERRIDE);
    rc = orte_pls.terminate_job(jobid, &orte_abort_timeout, &attributes);
    if (ORTE_SUCCESS != rc) {
        /* log it, but keep going so the wakeup below still happens */
        ORTE_ERROR_LOG(rc);
    }

    /* drain the attribute list, releasing every item, before destructing it */
    for (entry = opal_list_remove_first(&attributes);
         NULL != entry;
         entry = opal_list_remove_first(&attributes)) {
        OBJ_RELEASE(entry);
    }
    OBJ_DESTRUCT(&attributes);

    /* orterun will only wakeup when all procs IN THE ROOT JOB report terminated. The terminate_job
     * function *should* have done that - however, it is possible during abnormal
     * startup that it will fail to happen. If we get here, we force the issue by
     * deliberately causing the TERMINATE trigger to fire
     */
    rc = orte_wakeup(jobid);
    if (ORTE_SUCCESS != rc) {
        ORTE_ERROR_LOG(rc);
    }

    return rc;
}
/*
 * This function gets called when someone updates a process
 * state to indicate it failed to start. That action results in
 * the firing of a registry trigger that passes a minimal
 * data message here. The only part of that message we use
 * is msg->target (the trigger name), from which the jobid is extracted.
 *
 * Various components will follow their own strategy for dealing with
 * this situation. For this component, we simply kill the job.
 *
 * @param msg  Notify message delivered by the trigger; only msg->target
 *             is read.
 *
 * @return The error code from extracting the jobid or from setting the
 *         job state if either of those fails (nothing else is attempted
 *         in that case); otherwise the return code of orte_wakeup().
 *         A failure of terminate_job is logged but is not returned.
 */
int orte_errmgr_hnp_incomplete_start(orte_gpr_notify_message_t *msg)
{
    orte_jobid_t jobid;
    opal_list_t attributes;
    opal_list_item_t *entry;
    int rc;

    OPAL_TRACE(1);

    /* The trigger is a named one, so the jobid can be pulled
     * straight out of the trigger name
     */
    rc = orte_schema.extract_jobid_from_std_trigger_name(&jobid, msg->target);
    if (ORTE_SUCCESS != rc) {
        ORTE_ERROR_LOG(rc);
        return rc;
    }

    opal_output(orte_errmgr_base_output, "errmgr_hnp: incomplete start reported - job %lu", (unsigned long)jobid);

    /* mark the job as having failed to start */
    rc = orte_smr.set_job_state(jobid, ORTE_JOB_STATE_FAILED_TO_START);
    if (ORTE_SUCCESS != rc) {
        ORTE_ERROR_LOG(rc);
        return rc;
    }

    /* tell the pls to terminate the job - kill this job and all members of its family
     * as we have no way to handle it otherwise at this time
     */
    OBJ_CONSTRUCT(&attributes, opal_list_t);
    /* NOTE(review): the result of add_attribute is not checked here */
    orte_rmgr.add_attribute(&attributes, ORTE_NS_USE_JOB_FAMILY, ORTE_UNDEF, NULL, ORTE_RMGR_ATTR_OVERRIDE);
    rc = orte_pls.terminate_job(jobid, &orte_abort_timeout, &attributes);
    if (ORTE_SUCCESS != rc) {
        /* log it, but keep going so the wakeup below still happens */
        ORTE_ERROR_LOG(rc);
    }

    /* drain the attribute list, releasing every item, before destructing it */
    for (entry = opal_list_remove_first(&attributes);
         NULL != entry;
         entry = opal_list_remove_first(&attributes)) {
        OBJ_RELEASE(entry);
    }
    OBJ_DESTRUCT(&attributes);

    /* orterun will only wakeup when all procs IN THE ROOT JOB report terminated. The terminate_job
     * function *should* have done that - however, it is possible during abnormal
     * startup that it will fail to happen. If we get here, we force the issue by
     * deliberately causing the TERMINATE trigger to fire
     */
    rc = orte_wakeup(jobid);
    if (ORTE_SUCCESS != rc) {
        ORTE_ERROR_LOG(rc);
    }

    return rc;
}
/*
 * This function gets called when the HNP itself detects an internal error!
 * Ideally, we would find some way to tell all the active jobs to die before
 * we depart ourselves. Unfortunately, at this time, we aren't sure we can do
 * this - later, we'll add some more intelligence by, for example, checking
 * the error code to see if it's something that would allow us to alert
 * the remote orteds.
 *
 * For now, we'll just depart!
 *
 * @param error_code  Value handed to orte_abort().
 * @param fmt         printf-style format for an optional message; may be
 *                    NULL, in which case no message is output.
 * @param ...         Arguments consumed by fmt.
 */
void orte_errmgr_hnp_error_detected(int error_code, char *fmt, ...)
{
    /* If there was a message, output it */
    if (NULL != fmt) {
        va_list arglist;
        char *buffer = NULL;

        va_start(arglist, fmt);
        if (0 <= vasprintf(&buffer, fmt, arglist)) {
            /* The text is already fully expanded, so it must be passed as
             * data rather than as a format string - otherwise any '%' in
             * the expanded message would be interpreted a second time.
             */
            opal_output(0, "%s", buffer);
            free(buffer);
        } else {
            /* vasprintf failed, so the contents of buffer are unspecified
             * and must be neither printed nor freed. Emit the raw format
             * text so the message is not lost entirely.
             */
            opal_output(0, "%s", fmt);
        }
        va_end(arglist);
    }

    /* abnormal exit */
    orte_abort(error_code, false);
}
/*
 * Last-resort exit path for the HNP: called when the HNP desperately
 * needs to just die. By definition nothing can be done here, so no
 * cleanup or notification is attempted by this function itself - it
 * simply hands off to orte_abort().
 */
void orte_errmgr_hnp_abort(void)
{
    OPAL_TRACE(1);

    /* abnormal exit: -1 and false are passed straight to orte_abort */
    orte_abort(-1, false);
}
/*
 * This function gets called when a process wants to request that the HNP
 * abort some set of processes for it. Since this component IS for the HNP,
 * the intent is that the request actually gets executed here, calling upon
 * the PLS as needed.
 *
 * @param procs   Names of the processes to be aborted (currently unused).
 * @param nprocs  Number of entries in procs (currently unused).
 *
 * @return ORTE_SUCCESS, unconditionally.
 */
int orte_errmgr_hnp_abort_procs_request(orte_process_name_t *procs, orte_std_cntr_t nprocs)
{
    OPAL_TRACE(1);

    /* NOTE(review): placeholder - neither procs nor nprocs is examined
     * and no abort is carried out; success is reported regardless
     */
    return ORTE_SUCCESS;
}
/*
 * Register the HNP's errmgr functions to be called when the job encounters
 * certain pre-identified problem states.
 *
 * Two alert monitors are defined for the job: one that invokes
 * orte_errmgr_hnp_proc_aborted for the ABORT trigger, and one that invokes
 * orte_errmgr_hnp_incomplete_start for the FAILED_TO_START trigger.
 *
 * NOTE: It is imperative that ONLY the HNP perform this registration!
 *
 * @param job  The jobid the monitors are defined for.
 *
 * @return ORTE_SUCCESS if both monitors were defined; otherwise the error
 *         code from the first define_alert_monitor call that failed (the
 *         second monitor is not attempted if the first one fails).
 */
int orte_errmgr_hnp_register_job(orte_jobid_t job)
{
    /* we need to setup two counters and their corresponding triggers - one
     * to alert us when something fails to launch, and another for when
     * someone aborts
     */
    int rc;

    OPAL_TRACE(1);

    /* ABORT trigger: fires when any process aborts */
    rc = orte_smr.define_alert_monitor(job, ORTE_NUM_ABORTED_TRIGGER,
                                       ORTE_PROC_NUM_ABORTED, 0, 1, true,
                                       orte_errmgr_hnp_proc_aborted, NULL);
    if (ORTE_SUCCESS != rc) {
        ORTE_ERROR_LOG(rc);
        return rc;
    }

    /* FAILED_LAUNCH trigger: fires when the launch fails */
    rc = orte_smr.define_alert_monitor(job, ORTE_FAILED_TO_START_TRIGGER,
                                       ORTE_PROC_NUM_FAILED_START, 0, 1, true,
                                       orte_errmgr_hnp_incomplete_start, NULL);
    if (ORTE_SUCCESS != rc) {
        ORTE_ERROR_LOG(rc);
        return rc;
    }

    return ORTE_SUCCESS;
}