1
1
openmpi/orte/mca/errmgr/bproc/errmgr_bproc.c
Ralph Castain ae79894bad Bring the map fixes into the main trunk. This should fix several problems, including the multiple app_context issue.
I have tested on rsh, slurm, bproc, and tm. Bproc continues to have a problem (will be asking for help there).

Gridengine compiles but I cannot test (believe it likely will run).

Poe and xgrid compile to the extent they can without the proper include files.

This commit was SVN r12059.
2006-10-07 15:45:24 +00:00

301 строка
8.8 KiB
C

/*
* Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
* University Research and Technology
* Corporation. All rights reserved.
* Copyright (c) 2004-2005 The University of Tennessee and The University
* of Tennessee Research Foundation. All rights
* reserved.
* Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
* University of Stuttgart. All rights reserved.
* Copyright (c) 2004-2005 The Regents of the University of California.
* All rights reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
*
* $HEADER$
*/
#include "orte_config.h"
#include "orte/orte_constants.h"
#include "opal/util/output.h"
#include "opal/util/trace.h"
#include "orte/runtime/runtime.h"
#include "orte/mca/ns/ns_types.h"
#include "orte/mca/gpr/gpr_types.h"
#include "orte/mca/rml/rml.h"
#include "orte/mca/errmgr/base/errmgr_private.h"
#include "orte/mca/errmgr/bproc/errmgr_bproc.h"
/*
 * SMR callback for a process whose state was updated to "aborted".
 *
 * The bproc component is only active on non-HNP processes, so this
 * entry point is never expected to be invoked. It exists only to fill
 * the module interface and reports that the service is not available.
 *
 * @param msg  notification message (ignored)
 * @return     ORTE_ERR_NOT_AVAILABLE, always
 */
int orte_errmgr_bproc_proc_aborted(orte_gpr_notify_message_t *msg)
{
    OPAL_TRACE(1);

    /* the notification is deliberately not examined */
    (void) msg;

    return ORTE_ERR_NOT_AVAILABLE;
}
/*
 * SMR callback for a process whose state was updated to "failed to
 * start".
 *
 * The bproc component is only active on non-HNP processes, so this
 * entry point is never expected to be invoked. It exists only to fill
 * the module interface and reports that the service is not available.
 *
 * @param msg  notification message (ignored)
 * @return     ORTE_ERR_NOT_AVAILABLE, always
 */
int orte_errmgr_bproc_incomplete_start(orte_gpr_notify_message_t *msg)
{
    OPAL_TRACE(1);

    /* the notification is deliberately not examined */
    (void) msg;

    return ORTE_ERR_NOT_AVAILABLE;
}
/*
 * Called when a process detects an internal error.
 *
 * Bproc is unusually bad about letting us pass information that we
 * aborted as opposed to normally terminated. There is no way to locally
 * monitor the process state on a remote node, so the only thing we can
 * do is pass the info back to the Bproc PLS on the HNP and let it
 * figure out what to do.
 *
 * Sequence: print the optional message, send a one-byte alert to the
 * replica on ORTE_RML_TAG_BPROC_ABORT, then call orte_abort().
 *
 * @param error_code  status handed to orte_abort()
 * @param fmt         printf-style format for an optional message; may be
 *                    NULL, in which case nothing is printed
 * @param ...         arguments consumed by fmt
 *
 * NOTE: if the alert buffer cannot be allocated, packed or sent, the
 * failure is logged and this function RETURNS to the caller without
 * calling orte_abort().
 */
void orte_errmgr_bproc_error_detected(int error_code, char *fmt, ...)
{
    va_list arglist;
    orte_buffer_t *cmd;
    uint8_t command;
    int rc;

    OPAL_TRACE(1);

    /* If there was a message, output it */
    va_start(arglist, fmt);
    if (NULL != fmt) {
        char *buffer = NULL;

        /* vasprintf leaves the result pointer undefined on failure, so
         * the buffer may only be used (and freed) when it reports success
         */
        if (0 <= vasprintf(&buffer, fmt, arglist)) {
            /* hand the expanded text over as an argument, never as the
             * format itself - it may legitimately contain '%' characters
             */
            opal_output(0, "%s", buffer);
            free(buffer);
        } else {
            /* could not expand the message - show the raw format rather
             * than losing the report entirely
             */
            opal_output(0, "%s", fmt);
        }
    }
    va_end(arglist);

    /* Now prepare and send a message to the BProc PLS so it knows that
     * we abnormally terminated. It doesn't matter what is in the
     * message - the fact that it gets received is adequate
     */
    command = 0x01;
    cmd = OBJ_NEW(orte_buffer_t);
    if (NULL == cmd) {
        ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE);
        return;
    }

    /* just pack something */
    if (ORTE_SUCCESS != (rc = orte_dss.pack(cmd, &command, 1, ORTE_UINT8))) {
        ORTE_ERROR_LOG(rc);
        OBJ_RELEASE(cmd);
        return;
    }

    /* send the alert */
    if (0 > orte_rml.send_buffer(orte_errmgr_bproc_globals.replica, cmd, ORTE_RML_TAG_BPROC_ABORT, 0)) {
        ORTE_ERROR_LOG(ORTE_ERR_COMM_FAILURE);
        OBJ_RELEASE(cmd);
        return;
    }
    OBJ_RELEASE(cmd);

    /* okay, now we can truly abort. Tell the abort function not to bother writing out
     * an abort file - we can't do anything with it anyway!
     */
    orte_abort(error_code, false);
}
/*
 * Called when a process desperately needs to just die.
 *
 * Nothing can be done by definition here - this function ONLY gets
 * called as an absolute last resort. It takes no arguments and simply
 * hands control to orte_abort() with a status of -1.
 */
void orte_errmgr_bproc_abort(void)
{
    /* abnormal exit - no point in writing out an abort file as bproc doesn't
     * know what to do with it anyway
     */
    orte_abort(-1, false);
}
/*
 * Ask the HNP to abort a set of processes on our behalf.
 *
 * Some systems (e.g., OpenMPI) need to tell us to kill some other
 * subset of processes along with us. The request (command flag, number
 * of procs, array of proc names) is sent to the replica on
 * ORTE_RML_TAG_ERRMGR, after which we block until a reply arrives and
 * verify that it carries the same command flag back.
 *
 * NOTE: this function assumes that the underlying ORTE infrastructure is
 * still operational. Use of this function should therefore be restricted
 * to cases where the problem is in a higher layer (e.g., MPI) as the
 * process is likely to "hang" if an ORTE problem has been encountered.
 *
 * @param procs   array of process names to be aborted; must not be NULL
 * @param nprocs  number of entries in procs
 * @return ORTE_SUCCESS when the request was acknowledged,
 *         ORTE_ERR_BAD_PARAM when procs is NULL,
 *         ORTE_ERR_OUT_OF_RESOURCE when a buffer cannot be created,
 *         ORTE_ERR_COMM_FAILURE when the send/receive fails or the reply
 *         carries an unexpected command,
 *         otherwise the error returned by the pack/unpack call that failed
 */
int orte_errmgr_bproc_abort_procs_request(orte_process_name_t *procs, orte_std_cntr_t nprocs)
{
    orte_buffer_t *request;
    orte_buffer_t *reply;
    orte_errmgr_cmd_flag_t flag = ORTE_ERRMGR_ABORT_PROCS_REQUEST_CMD;
    orte_std_cntr_t num_vals;
    int status;

    OPAL_TRACE(1);

    /* nothing sensible can be sent without a list of names */
    if (NULL == procs) {
        return ORTE_ERR_BAD_PARAM;
    }

    request = OBJ_NEW(orte_buffer_t);
    if (NULL == request) {
        ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE);
        return ORTE_ERR_OUT_OF_RESOURCE;
    }

    /* request layout: command flag, proc count, proc names */
    status = orte_dss.pack(request, &flag, 1, ORTE_ERRMGR_CMD);
    if (ORTE_SUCCESS != status) {
        ORTE_ERROR_LOG(status);
        goto drop_request;
    }

    status = orte_dss.pack(request, &nprocs, 1, ORTE_STD_CNTR);
    if (ORTE_SUCCESS != status) {
        ORTE_ERROR_LOG(status);
        goto drop_request;
    }

    status = orte_dss.pack(request, procs, nprocs, ORTE_NAME);
    if (ORTE_SUCCESS != status) {
        ORTE_ERROR_LOG(status);
        goto drop_request;
    }

    /* ship it to the replica */
    if (0 > orte_rml.send_buffer(orte_errmgr_bproc_globals.replica, request, ORTE_RML_TAG_ERRMGR, 0)) {
        status = ORTE_ERR_COMM_FAILURE;
        ORTE_ERROR_LOG(status);
        goto drop_request;
    }
    OBJ_RELEASE(request);

    /* the acknowledgement lands in its own buffer */
    reply = OBJ_NEW(orte_buffer_t);
    if (NULL == reply) {
        ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE);
        return ORTE_ERR_OUT_OF_RESOURCE;
    }

    /* block here until the replica answers */
    if (0 > orte_rml.recv_buffer(orte_errmgr_bproc_globals.replica, reply, ORTE_RML_TAG_ERRMGR)) {
        status = ORTE_ERR_COMM_FAILURE;
        ORTE_ERROR_LOG(status);
        goto drop_reply;
    }

    num_vals = 1;
    status = orte_dss.unpack(reply, &flag, &num_vals, ORTE_ERRMGR_CMD);
    if (ORTE_SUCCESS != status) {
        ORTE_ERROR_LOG(status);
        goto drop_reply;
    }

    /* the reply has to echo the command we issued */
    if (ORTE_ERRMGR_ABORT_PROCS_REQUEST_CMD != flag) {
        status = ORTE_ERR_COMM_FAILURE;
        ORTE_ERROR_LOG(status);
    }

drop_reply:
    OBJ_RELEASE(reply);
    return status;

drop_request:
    OBJ_RELEASE(request);
    return status;
}
/*
 * Ask the replica to register a job for error monitoring.
 *
 * It is imperative that ONLY an HNP perform this registration! This
 * function therefore only forwards the request: the command flag and
 * jobid are sent to the replica on ORTE_RML_TAG_ERRMGR, after which we
 * block until a reply arrives and verify that it carries the same
 * command flag back.
 *
 * @param job  jobid to be monitored
 * @return ORTE_SUCCESS when the request was acknowledged,
 *         ORTE_ERR_OUT_OF_RESOURCE when a buffer cannot be created,
 *         ORTE_ERR_COMM_FAILURE when the send/receive fails or the reply
 *         carries an unexpected command,
 *         otherwise the error returned by the pack/unpack call that failed
 */
int orte_errmgr_bproc_register_job(orte_jobid_t job)
{
    orte_buffer_t *request;
    orte_buffer_t *reply;
    orte_errmgr_cmd_flag_t flag = ORTE_ERRMGR_REGISTER_JOB_CMD;
    orte_std_cntr_t num_vals;
    int status;

    OPAL_TRACE(1);

    request = OBJ_NEW(orte_buffer_t);
    if (NULL == request) {
        ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE);
        return ORTE_ERR_OUT_OF_RESOURCE;
    }

    /* request layout: command flag, jobid */
    status = orte_dss.pack(request, &flag, 1, ORTE_ERRMGR_CMD);
    if (ORTE_SUCCESS != status) {
        ORTE_ERROR_LOG(status);
        goto drop_request;
    }

    status = orte_dss.pack(request, &job, 1, ORTE_JOBID);
    if (ORTE_SUCCESS != status) {
        ORTE_ERROR_LOG(status);
        goto drop_request;
    }

    /* ship it to the replica */
    if (0 > orte_rml.send_buffer(orte_errmgr_bproc_globals.replica, request, ORTE_RML_TAG_ERRMGR, 0)) {
        status = ORTE_ERR_COMM_FAILURE;
        ORTE_ERROR_LOG(status);
        goto drop_request;
    }
    OBJ_RELEASE(request);

    /* the acknowledgement lands in its own buffer */
    reply = OBJ_NEW(orte_buffer_t);
    if (NULL == reply) {
        ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE);
        return ORTE_ERR_OUT_OF_RESOURCE;
    }

    /* block here until the replica answers */
    if (0 > orte_rml.recv_buffer(orte_errmgr_bproc_globals.replica, reply, ORTE_RML_TAG_ERRMGR)) {
        status = ORTE_ERR_COMM_FAILURE;
        ORTE_ERROR_LOG(status);
        goto drop_reply;
    }

    num_vals = 1;
    status = orte_dss.unpack(reply, &flag, &num_vals, ORTE_ERRMGR_CMD);
    if (ORTE_SUCCESS != status) {
        ORTE_ERROR_LOG(status);
        goto drop_reply;
    }

    /* the reply has to echo the command we issued */
    if (ORTE_ERRMGR_REGISTER_JOB_CMD != flag) {
        status = ORTE_ERR_COMM_FAILURE;
        ORTE_ERROR_LOG(status);
    }

drop_reply:
    OBJ_RELEASE(reply);
    return status;

drop_request:
    OBJ_RELEASE(request);
    return status;
}