1
1

Revamp the errmgr framework to provide a greater range of optional behaviors, including different behaviors for daemons, and remove several looping messages across the code base:

* add hnp and orted modules to the errmgr framework. The HNP module contains much of the code that was in the errmgr base since that code could only be executed by the HNP anyway.

* update the odls to report process states directly into the active errmgr module, thus removing the need to send messages looped back into the odls cmd processor. Let the active errmgr module decide what to do at various states.

* remove the code to track application state progress from the plm_base_launch_support.c code. Update the plm modules to call the errmgr directly when a launch fails.

* update the plm_base_receive.c code to call the errmgr with state updates from remote daemons

* update the routed modules to reflect that process state is updated in the errmgr

* ensure that the orteds open the errmgr and select their appropriate module

* add new pretty-print utilities to print process and job state. Move the pretty-print of time info to a globally-accessible place

* define a global orte_comm function to send messages from orteds to the HNP so that others can overlay the standard RML methods, if desired.

* update the orterun help output to reflect that the "term w/o sync" error message can result from three, not two, scenarios

This commit was SVN r23023.
Этот коммит содержится в:
Ralph Castain 2010-04-23 04:44:41 +00:00
родитель f711c4713f
Коммит efbb5c9b7c
70 изменённых файлов: 2626 добавлений и 2315 удалений

Просмотреть файл

@ -105,9 +105,6 @@ enum {
#define ORTE_ERR_MAX (ORTE_ERR_BASE - 100)
/* include the prototype for the error-to-string converter */
ORTE_DECLSPEC const char* orte_err2str(int errnum);
END_C_DECLS
#endif /* ORTE_CONSTANTS_H */

Просмотреть файл

@ -46,11 +46,11 @@ ORTE_DECLSPEC int orte_errmgr_base_close(void);
* Composite Stack states
*/
#define ORTE_ERRMGR_STACK_STATE_NONE 0x00 /* No actions have been performed */
#define ORTE_ERRMGR_STACK_STATE_STABLIZED 0x01 /* Stabalized the runtime */
#define ORTE_ERRMGR_STACK_STATE_UPDATED 0x01 /* Updated the runtime */
#define ORTE_ERRMGR_STACK_STATE_CONTINUE 0x02 /* Continue running without this process */
#define ORTE_ERRMGR_STACK_STATE_RECOVERED 0x04 /* Process has been recovered */
#define ORTE_ERRMGR_STACK_STATE_JOB_ABORT 0x08 /* Abort this job, cannot recover */
#define ORTE_ERRMGR_STACK_STATE_COMPLETE 0x10 /* done processing this command */
/**
* Output and component variables
*/

Просмотреть файл

@ -9,6 +9,7 @@
* University of Stuttgart. All rights reserved.
* Copyright (c) 2004-2005 The Regents of the University of California.
* All rights reserved.
* Copyright (c) 2010 Cisco Systems, Inc. All rights reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
@ -26,32 +27,20 @@
#include <stdlib.h>
#include <stdarg.h>
#include "orte/runtime/orte_globals.h"
#include "orte/runtime/orte_wait.h"
#include "orte/runtime/orte_locks.h"
#include "opal/util/trace.h"
#include "opal/util/output.h"
#include "orte/util/name_fns.h"
#include "orte/util/session_dir.h"
#include "orte/mca/plm/plm.h"
#include "orte/mca/routed/routed.h"
#include "orte/mca/ess/ess.h"
#include "orte/mca/odls/odls.h"
#include "orte/mca/routed/routed.h"
#include "orte/runtime/orte_globals.h"
#include "orte/mca/errmgr/errmgr.h"
#include "orte/mca/errmgr/base/base.h"
#include "orte/mca/errmgr/base/errmgr_private.h"
/*
* Local Function Declaration
*/
static int orte_errmgr_base_stabalize_runtime(orte_job_t *jdata,
orte_process_name_t *proc,
orte_proc_state_t state);
/*
* Public interfaces
*/
@ -69,359 +58,49 @@ void orte_errmgr_base_log(int error_code, char *filename, int line)
ORTE_ERROR_NAME(error_code), filename, line);
}
int orte_errmgr_base_proc_aborted(orte_process_name_t *name, int exit_code)
int orte_errmgr_base_update_state(orte_jobid_t job,
orte_job_state_t jobstate,
orte_process_name_t *name,
orte_proc_state_t state,
orte_exit_code_t exit_code)
{
int rc;
orte_job_t *jdata;
orte_proc_t *proc;
int i;
orte_proc_state_t state = ORTE_PROC_STATE_ABORTED;
int stack_state = ORTE_ERRMGR_STACK_STATE_NONE;
orte_errmgr_base_module_t *module = NULL;
orte_errmgr_stack_state_t stack_state;
orte_errmgr_base_module_t *module;
if( ORTE_PROC_IS_APP ) {
return ORTE_SUCCESS;
}
stack_state = ORTE_ERRMGR_STACK_STATE_NONE;
stack_state |= ORTE_ERRMGR_STACK_STATE_JOB_ABORT;
/********************************
* Stabalize the runtime
********************************/
if( !orte_errmgr_base_shutting_down ) {
OPAL_OUTPUT_VERBOSE((10, orte_errmgr_base_output,
"errmgr:base:proc_aborted() %s) "
"------- %s fault reported! Process %s",
"errmgr:base:update_state() %s) "
"------- %s state updated for process %s",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
(name->jobid == ORTE_PROC_MY_HNP->jobid ? "Daemon" : "App. Process"),
ORTE_NAME_PRINT(name)));
}
/* get the job data object for this process */
if (NULL == (jdata = orte_get_job_data_object(name->jobid))) {
/* nothing we can do - abort things */
goto PROCESS;
(NULL == name) ? "App. Process" : (name->jobid == ORTE_PROC_MY_HNP->jobid ? "Daemon" : "App. Process"),
(NULL == name) ? "NULL" : ORTE_NAME_PRINT(name)));
}
/* if the proc was terminated by cmd, ignore it */
if (NULL == (proc = (orte_proc_t*)opal_pointer_array_get_item(jdata->procs, name->vpid))) {
/* nothing we can do */
goto PROCESS;
}
if( !orte_errmgr_base_shutting_down ) {
OPAL_OUTPUT_VERBOSE((10, orte_errmgr_base_output,
"errmgr:base:proc_aborted() %s) "
"------- %s fault reported! Process %s, state (0x%x)",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
(name->jobid == ORTE_PROC_MY_HNP->jobid ? "Daemon" : "App. Process"),
ORTE_NAME_PRINT(name),
proc->state ));
}
if (ORTE_PROC_STATE_KILLED_BY_CMD == proc->state) {
/* don't do anything or else we can enter an infinite loop */
return ORTE_SUCCESS;
}
if( ORTE_SUCCESS != orte_errmgr_base_stabalize_runtime(jdata, name, state)) {
goto PROCESS;
}
stack_state = ORTE_ERRMGR_STACK_STATE_NONE;
stack_state |= ORTE_ERRMGR_STACK_STATE_JOB_ABORT;
/********************************
* Call the active modules
********************************/
if( orte_errmgr_base_enable_recovery && !orte_errmgr_base_shutting_down) {
OPAL_OUTPUT_VERBOSE((10, orte_errmgr_base_output,
"errmgr:base:proc_aborted() %s) "
"------- Attempting recovery... (%3d active components)",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
orte_errmgr_base_modules.size));
stack_state |= ORTE_ERRMGR_STACK_STATE_STABLIZED;
for(i = 0; i < orte_errmgr_base_modules.size; ++i) {
module = (orte_errmgr_base_module_t*)opal_pointer_array_get_item(&orte_errmgr_base_modules, i);
if( NULL == module ) {
continue;
}
if( NULL != module->process_fault ) {
module->process_fault(jdata, name, state, &stack_state);
}
}
}
/********************************
* If the active modules still need us to abort, then do so
********************************/
if( !(ORTE_ERRMGR_STACK_STATE_JOB_ABORT & (stack_state)) ) {
OPAL_OUTPUT_VERBOSE((10, orte_errmgr_base_output,
"errmgr:base:proc_aborted() %s) "
"------- Successfully recovered from process %s fault! Continuing...",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
ORTE_NAME_PRINT(name)));
return ORTE_SUCCESS;
}
PROCESS:
if( !orte_errmgr_base_shutting_down ) {
OPAL_OUTPUT_VERBOSE((10, orte_errmgr_base_output,
"errmgr:base:proc_aborted() %s) "
"------- Not able to recover from process %s fault! Aborting...",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
ORTE_NAME_PRINT(name)));
}
/* if we are already in progress, then ignore this call */
if (!opal_atomic_trylock(&orte_abort_inprogress_lock)) { /* returns 1 if already locked */
OPAL_OUTPUT_VERBOSE((1, orte_errmgr_base_output,
"%s errmgr:base: abort in progress, ignoring proc %s aborted with status %d",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
ORTE_NAME_PRINT(name), exit_code));
return ORTE_SUCCESS;
}
OPAL_OUTPUT_VERBOSE((1, orte_errmgr_base_output,
"%s errmgr:base: proc %s aborted with status %d",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
ORTE_NAME_PRINT(name), exit_code));
orte_job_term_ordered = true;
/* if the proc is a daemon, then we are abnormally terminating */
if (ORTE_PROC_MY_NAME->jobid == name->jobid) {
orte_abnormal_term_ordered = true;
}
/* indicate that all jobs other than the one containing this
* proc have been ordered to abort - this is necessary to avoid
* duplicate ordering of "abort".
*
* NOTE: be sure to not include the 0 job data location as this
* contains the daemons!
*/
for (i=1; i < orte_job_data->size; i++) {
/* the array may have holes in it as we are recovering
* jobids as they complete, so check everything
*/
if (NULL == (jdata = orte_get_job_data_object(name->jobid))) {
for (i = 0; i < orte_errmgr_base_modules.size; ++i) {
module = (orte_errmgr_base_module_t*)opal_pointer_array_get_item(&orte_errmgr_base_modules, i);
if( NULL == module ) {
continue;
}
if (ORTE_JOB_STATE_ABORTED != jdata->state &&
ORTE_JOB_STATE_ABORTED_BY_SIG != jdata->state &&
ORTE_JOB_STATE_ABORTED_WO_SYNC != jdata->state) {
jdata->state = ORTE_JOB_STATE_ABORT_ORDERED;
}
}
/* tell the plm to terminate all jobs */
if (ORTE_SUCCESS != (rc = orte_plm.terminate_job(ORTE_JOBID_WILDCARD))) {
ORTE_ERROR_LOG(rc);
}
/* set the exit status, just in case whomever called us failed
* to do so - it can only be done once, so we are protected
* from overwriting it
*/
ORTE_UPDATE_EXIT_STATUS(exit_code);
/* just return - let the daemons report back so we can properly
* know when to actually exit
*/
return ORTE_SUCCESS;
}
int orte_errmgr_base_incomplete_start(orte_jobid_t job, int exit_code)
{
int rc;
orte_job_t *jdata;
orte_proc_state_t state = ORTE_PROC_STATE_FAILED_TO_START;
int stack_state = ORTE_ERRMGR_STACK_STATE_NONE;
if( ORTE_PROC_IS_APP ) {
return ORTE_SUCCESS;
}
stack_state = ORTE_ERRMGR_STACK_STATE_NONE;
stack_state |= ORTE_ERRMGR_STACK_STATE_JOB_ABORT;
/********************************
* Stabalize the runtime
********************************/
OPAL_OUTPUT_VERBOSE((10, orte_errmgr_base_output,
"errmgr:base:incomplete_start() %s) "
"------- Incomplete start of job %s!",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
ORTE_JOBID_PRINT(job)));
/* get the job data object for this process */
if (NULL == (jdata = orte_get_job_data_object(job))) {
/* nothing we can do - abort things */
goto PROCESS;
}
if( ORTE_SUCCESS != orte_errmgr_base_stabalize_runtime(jdata, NULL, state)) {
goto PROCESS;
}
/********************************
* Call the active modules
* JJH: Currently, if we cannot launch the job, then we should just abort.
* JJH: Add job launch recovery logic...
********************************/
#if 0
if( orte_errmgr_base_enable_recovery ) {
OPAL_OUTPUT_VERBOSE((10, orte_errmgr_base_output,
"errmgr:base:incomplete_start() %s) "
"------- Attempting recovery... (%3d active components)",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
orte_errmgr_base_modules.size));
stack_state |= ORTE_ERRMGR_STACK_STATE_STABLIZED;
for(i = 0; i < orte_errmgr_base_modules.size; ++i) {
module = (orte_errmgr_base_module_t*)opal_pointer_array_get_item(&orte_errmgr_base_modules, i);
if( NULL == module ) {
continue;
}
if( NULL != module->process_fault ) {
module->process_fault(jdata, NULL, state, &stack_state);
if( NULL != module->update_state ) {
rc = module->update_state(job, jobstate, name, state, exit_code, &stack_state);
if (ORTE_SUCCESS != rc || ORTE_ERRMGR_STACK_STATE_COMPLETE & stack_state) {
break;
}
}
}
#endif
/********************************
* If the active modules still need us to abort, then do so
********************************/
if( !(ORTE_ERRMGR_STACK_STATE_JOB_ABORT & (stack_state)) ) {
OPAL_OUTPUT_VERBOSE((10, orte_errmgr_base_output,
"errmgr:base:incomplete_start() %s) "
"------- Successfully recovered from incomplete start of job %s! Continuing...",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
ORTE_JOBID_PRINT(job) ));
return ORTE_SUCCESS;
}
PROCESS:
OPAL_OUTPUT_VERBOSE((10, orte_errmgr_base_output,
"errmgr:base:incomplete_start() %s) "
"------- Not able to recover from incomplete start of job %s! Aborting...",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
ORTE_JOBID_PRINT(job) ));
/* if we are already in progress, then ignore this call */
if (!opal_atomic_trylock(&orte_abort_inprogress_lock)) { /* returns 1 if already locked */
OPAL_OUTPUT_VERBOSE((1, orte_errmgr_base_output,
"%s errmgr:base: abort in progress, ignoring incomplete start on job %s with status %d",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
ORTE_JOBID_PRINT(job), exit_code));
return ORTE_SUCCESS;
}
OPAL_OUTPUT_VERBOSE((1, orte_errmgr_base_output,
"%s errmgr:base: job %s reported incomplete start with status %d",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
ORTE_JOBID_PRINT(job), exit_code));
orte_job_term_ordered = true;
/* tell the plm to terminate all jobs */
if (ORTE_SUCCESS != (rc = orte_plm.terminate_job(ORTE_JOBID_WILDCARD))) {
ORTE_ERROR_LOG(rc);
}
/* set the exit status, just in case whomever called us failed
* to do so - it can only be done once, so we are protected
* from overwriting it
*/
ORTE_UPDATE_EXIT_STATUS(exit_code);
/* just return - let the daemons report back so we can properly
* know when to actually exit
*/
return ORTE_SUCCESS;
}
int orte_errmgr_base_comm_failed(orte_process_name_t *name, int exit_code)
{
orte_job_t *jdata = NULL;
orte_proc_state_t state = ORTE_PROC_STATE_COMM_FAILED;
int stack_state = ORTE_ERRMGR_STACK_STATE_NONE;
orte_errmgr_base_module_t *module = NULL;
int i;
stack_state = ORTE_ERRMGR_STACK_STATE_NONE;
stack_state |= ORTE_ERRMGR_STACK_STATE_JOB_ABORT;
/********************************
* Stabalize the runtime
********************************/
OPAL_OUTPUT_VERBOSE((10, orte_errmgr_base_output,
"errmgr:base:comm_failed() %s) "
"------- Communication to Process %s failed!",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
ORTE_NAME_PRINT(name) ));
/* get the job data object for this process */
if (NULL == (jdata = orte_get_job_data_object(name->jobid))) {
/* nothing we can do - abort things */
goto PROCESS;
}
if( ORTE_SUCCESS != orte_errmgr_base_stabalize_runtime(jdata, name, state)) {
goto PROCESS;
}
/********************************
* Call the active modules
********************************/
if( orte_errmgr_base_enable_recovery ) {
OPAL_OUTPUT_VERBOSE((10, orte_errmgr_base_output,
"errmgr:base:comm_failed() %s) "
"------- Attempting recovery... (%3d active components)",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
orte_errmgr_base_modules.size));
stack_state |= ORTE_ERRMGR_STACK_STATE_STABLIZED;
for(i = 0; i < orte_errmgr_base_modules.size; ++i) {
module = (orte_errmgr_base_module_t*)opal_pointer_array_get_item(&orte_errmgr_base_modules, i);
if( NULL == module ) {
continue;
}
if( NULL != module->process_fault ) {
module->process_fault(jdata, name, state, &stack_state);
}
}
}
/********************************
* If the active modules still need us to abort, then do so
********************************/
if( !(ORTE_ERRMGR_STACK_STATE_JOB_ABORT & (stack_state)) ) {
OPAL_OUTPUT_VERBOSE((10, orte_errmgr_base_output,
"errmgr:base:comm_failed() %s) "
"------- Successfully recovered from communication fault with process %s! Continuing...",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
ORTE_NAME_PRINT(name) ));
return ORTE_SUCCESS;
}
PROCESS:
OPAL_OUTPUT_VERBOSE((10, orte_errmgr_base_output,
"errmgr:base:comm_failed() %s) "
"------- Not able to recover from communication fault with process %s! Aborting...",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
ORTE_NAME_PRINT(name) ));
/*
* Default action is to abort
*/
ORTE_UPDATE_EXIT_STATUS(exit_code);
orte_abnormal_term_ordered = true;
orte_trigger_event(&orte_exit);
return ORTE_SUCCESS;
}
@ -461,19 +140,8 @@ int orte_errmgr_base_predicted_fault(char ***proc_list,
char ***suggested_nodes)
{
orte_errmgr_base_module_t *module = NULL;
int i;
int stack_state = ORTE_ERRMGR_STACK_STATE_NONE;
/*
* If the user did not ask for recovery, then do not process recovery events
*/
if( !orte_errmgr_base_enable_recovery ) {
OPAL_OUTPUT_VERBOSE((10, orte_errmgr_base_output,
"errmgr:base:predicted_fault() %s) "
"------- Recovery currently disabled! Skipping...",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME) ));
return ORTE_SUCCESS;
}
int i, rc;
orte_errmgr_stack_state_t stack_state = ORTE_ERRMGR_STACK_STATE_NONE;
OPAL_OUTPUT_VERBOSE((10, orte_errmgr_base_output,
"errmgr:base:predicted_fault() %s) "
@ -487,7 +155,10 @@ int orte_errmgr_base_predicted_fault(char ***proc_list,
continue;
}
if( NULL != module->predicted_fault ) {
module->predicted_fault(proc_list, node_list, suggested_nodes, &stack_state);
rc = module->predicted_fault(proc_list, node_list, suggested_nodes, &stack_state);
if (ORTE_SUCCESS != rc || ORTE_ERRMGR_STACK_STATE_COMPLETE & stack_state) {
break;
}
}
}
@ -499,8 +170,8 @@ int orte_errmgr_base_suggest_map_targets(orte_proc_t *proc,
opal_list_t *node_list)
{
orte_errmgr_base_module_t *module = NULL;
int i;
int stack_state = ORTE_ERRMGR_STACK_STATE_NONE;
int i, rc;
orte_errmgr_stack_state_t stack_state = ORTE_ERRMGR_STACK_STATE_NONE;
/*
* If the user did not ask for recovery, then do not process recovery events
@ -525,7 +196,10 @@ int orte_errmgr_base_suggest_map_targets(orte_proc_t *proc,
continue;
}
if( NULL != module->suggest_map_targets ) {
module->suggest_map_targets(proc, oldnode, node_list, &stack_state);
rc = module->suggest_map_targets(proc, oldnode, node_list, &stack_state);
if (ORTE_SUCCESS != rc || ORTE_ERRMGR_STACK_STATE_COMPLETE & stack_state) {
break;
}
}
}
@ -556,98 +230,59 @@ int orte_errmgr_base_ft_event(int state)
return ORTE_SUCCESS;
}
/*
* Local functions
*/
static int orte_errmgr_base_stabalize_runtime(orte_job_t *jdata,
orte_process_name_t *proc,
orte_proc_state_t state)
void orte_errmgr_base_update_runtime(orte_job_t *jdata,
orte_process_name_t *proc,
orte_proc_state_t state,
orte_errmgr_stack_state_t *stack_state)
{
orte_proc_t *loc_proc=NULL, *child_proc;
orte_std_cntr_t i_proc;
orte_proc_t *loc_proc;
int32_t i;
/* has this already been done */
if (ORTE_ERRMGR_STACK_STATE_UPDATED & *stack_state) {
return;
}
*stack_state |= ORTE_ERRMGR_STACK_STATE_UPDATED;
/*
* orterun is trying to shutdown, so just let it
*/
if( orte_errmgr_base_shutting_down ) {
return ORTE_SUCCESS;
if (orte_errmgr_base_shutting_down) {
return;
}
/*
* orte_errmgr_base_incomplete_start() will pass a NULL since all processes
* are effected by this fault.
* JJH: Since we do not handle the recovery from such errors yet, just
* skip processing, and go to the abort sequence.
*/
if( NULL == proc ) {
return ORTE_SUCCESS;
if (NULL == proc) {
return;
}
/*
* Remove the route to this process
*/
orte_routed.delete_route(proc);
/*
* Set the process state in the job data structure
*/
for(i = 0; i < jdata->procs->size; ++i) {
loc_proc = NULL;
for (i = 0; i < jdata->procs->size; ++i) {
if (NULL == (loc_proc = (orte_proc_t*)opal_pointer_array_get_item(jdata->procs, i))) {
continue;
}
if( loc_proc->name.vpid != proc->vpid) {
if (loc_proc->name.vpid != proc->vpid) {
continue;
}
loc_proc->state = state;
if (ORTE_PROC_STATE_UNTERMINATED < state) {
jdata->num_terminated++;
}
break;
}
/*
* RHC: Since we do not handle the recovery from such errors yet, just
* skip processing, and go to the abort sequence.
*/
if (NULL == loc_proc) {
return ORTE_SUCCESS;
}
/*
* If this is a part of the control plane (HNP/orted)
*/
if( proc->jobid == ORTE_PROC_MY_NAME->jobid ) {
/*
* Remove the route to this process
*/
orte_routed.delete_route(proc);
/*
* If the aborted daemon had active processes on its node, then we should
* make sure to signal that all the children are gone.
*/
if( loc_proc->node->num_procs > 0 ) {
OPAL_OUTPUT_VERBOSE((10, orte_errmgr_base_output,
"errmgr:base:stabalize_runtime() %s) "
"------- Daemon lost with the following processes",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
for(i_proc = 0; i_proc < opal_pointer_array_get_size(loc_proc->node->procs); ++i_proc) {
child_proc = (orte_proc_t*)opal_pointer_array_get_item(loc_proc->node->procs, i_proc);
if( NULL == child_proc ) {
continue;
}
OPAL_OUTPUT_VERBOSE((10, orte_errmgr_base_output,
"errmgr:base:stabalize_runtime() %s) "
"\t %s [0x%x]",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
ORTE_NAME_PRINT(&child_proc->name),
child_proc->state));
if( child_proc->last_errmgr_state < child_proc->state ) {
child_proc->last_errmgr_state = child_proc->state;
orte_errmgr_base_proc_aborted(&child_proc->name, -1);
}
}
}
}
return ORTE_SUCCESS;
}

Просмотреть файл

@ -56,9 +56,7 @@ opal_list_t orte_errmgr_base_components_available;
/* Public module provides a wrapper around previous functions */
orte_errmgr_API_t orte_errmgr = {
orte_errmgr_base_log,
orte_errmgr_base_proc_aborted,
orte_errmgr_base_incomplete_start,
orte_errmgr_base_comm_failed,
orte_errmgr_base_update_state,
orte_errmgr_base_predicted_fault,
orte_errmgr_base_suggest_map_targets,
orte_errmgr_base_abort

Просмотреть файл

@ -9,6 +9,7 @@
* University of Stuttgart. All rights reserved.
* Copyright (c) 2004-2005 The Regents of the University of California.
* All rights reserved.
* Copyright (c) 2010 Cisco Systems, Inc. All rights reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
@ -57,13 +58,6 @@ int orte_errmgr_base_select(void)
opal_pointer_array_t tmp_array;
orte_errmgr_base_module_t *i_module = NULL;
/*
* If the user does not want the recovery features, then do not select any.
*/
if( !orte_errmgr_base_enable_recovery ) {
goto INIT;
}
OBJ_CONSTRUCT(&tmp_array, opal_pointer_array_t);
opal_output_verbose(10, orte_errmgr_base_output,
@ -162,7 +156,6 @@ int orte_errmgr_base_select(void)
}
OBJ_DESTRUCT(&tmp_array);
INIT:
/*
* Initialize each of the Errmgr Modules
*/

Просмотреть файл

@ -52,9 +52,11 @@ typedef uint8_t orte_errmgr_cmd_flag_t;
*/
ORTE_DECLSPEC void orte_errmgr_base_log(int error_code, char *filename, int line);
ORTE_DECLSPEC int orte_errmgr_base_proc_aborted(orte_process_name_t *name, int exit_code);
ORTE_DECLSPEC int orte_errmgr_base_incomplete_start(orte_jobid_t job, int exit_code);
ORTE_DECLSPEC int orte_errmgr_base_comm_failed(orte_process_name_t *name, int exit_code);
ORTE_DECLSPEC int orte_errmgr_base_update_state(orte_jobid_t job,
orte_job_state_t jobstate,
orte_process_name_t *proc_name,
orte_proc_state_t state,
orte_exit_code_t exit_code);
ORTE_DECLSPEC int orte_errmgr_base_abort(int error_code, char *fmt, ...)
# if OPAL_HAVE_ATTRIBUTE_FORMAT_FUNCPTR
__opal_attribute_format__(__printf__, 2, 3)
@ -68,6 +70,11 @@ ORTE_DECLSPEC int orte_errmgr_base_suggest_map_targets(orte_proc_t *proc,
opal_list_t *node_list);
ORTE_DECLSPEC int orte_errmgr_base_ft_event(int state);
ORTE_DECLSPEC void orte_errmgr_base_update_runtime(orte_job_t *jdata,
orte_process_name_t *proc,
orte_proc_state_t state,
orte_errmgr_stack_state_t *stack_state);
/*
* Additional External API function declared in errmgr.h
*/

Просмотреть файл

@ -75,6 +75,8 @@
#include "orte/mca/plm/plm_types.h"
BEGIN_C_DECLS
/* type definition */
typedef uint8_t orte_errmgr_stack_state_t;
/*
* Macro definitions
@ -109,36 +111,12 @@ typedef void (*orte_errmgr_base_API_log_fn_t)(int error_code, char *filename, in
* @retval ORTE_SUCCESS Whatever action that was taken was successful
* @retval ORTE_ERROR Appropriate error code
*/
typedef int (*orte_errmgr_base_API_proc_aborted_fn_t)(orte_process_name_t *name, int exit_code);
typedef int (*orte_errmgr_base_API_update_state_fn_t)(orte_jobid_t job,
orte_job_state_t jobstate,
orte_process_name_t *proc_name,
orte_proc_state_t state,
orte_exit_code_t exit_code);
/**
* Alert - incomplete start of a job
* This function is called by the PLM when an attempted launch of a job encounters failure of
* one or more processes to start. The strategy for dealing
* with this "incomplete start" situation varies across the various errmgr components.
*
* This function is only called by the respective process launcher, which is responsible
* for detecting incomplete starts. If on a daemon, the function simply updates the
* process state to indicate failure to launch - this initiates a trigger that goes to
* the respective HNP for response.
*
* NOTE: Errmgr components on non-HNP and non-daemon processes are expressly forbidden
* from taking any action to this function call. Instead, they are restricted to simply
* returning.
*
* @param job Job that failed to start
*
* @retval ORTE_SUCCESS Whatever action that was taken was successful
* @retval ORTE_ERROR Appropriate error code
*/
typedef int (*orte_errmgr_base_API_incomplete_start_fn_t)(orte_jobid_t job, int exit_code);
/**
* If the communication link failed to a peer.
* This gives us a chance to recover from this error, or abort.
*/
typedef int (*orte_errmgr_base_API_comm_failed_fn_t)(orte_process_name_t *name,
int exit_code);
/**
* Predicted process/node failure notification
* Composite interface. Called in priority order.
@ -153,7 +131,6 @@ typedef int (*orte_errmgr_base_API_comm_failed_fn_t)(orte_process_name_t *name,
typedef int (*orte_errmgr_base_API_predicted_fault_fn_t)(char ***proc_list,
char ***node_list,
char ***suggested_nodes);
/**
* Suggest a node to map a restarting process onto
*
@ -185,9 +162,7 @@ __opal_attribute_format__(__printf__, 2, 3)
/* global structure for accessing ERRMGR FRAMEWORK API's */
typedef struct {
orte_errmgr_base_API_log_fn_t log;
orte_errmgr_base_API_proc_aborted_fn_t proc_aborted;
orte_errmgr_base_API_incomplete_start_fn_t incomplete_start;
orte_errmgr_base_API_comm_failed_fn_t comm_failed;
orte_errmgr_base_API_update_state_fn_t update_state;
orte_errmgr_base_API_predicted_fault_fn_t predicted_fault;
orte_errmgr_base_API_suggest_map_targets_fn_t suggest_map_targets;
orte_errmgr_base_API_abort_fn_t abort;
@ -224,18 +199,20 @@ typedef int (*orte_errmgr_base_module_finalize_fn_t)
/*
* Internal Composite Interfaces corresponding to API interfaces
*/
typedef int (*orte_errmgr_base_module_process_fault_fn_t)(orte_job_t *jdata,
orte_process_name_t *proc_name,
orte_proc_state_t state,
int *stack_state);
typedef int (*orte_errmgr_base_module_update_state_fn_t)(orte_jobid_t job,
orte_job_state_t jobstate,
orte_process_name_t *proc_name,
orte_proc_state_t state,
orte_exit_code_t exit_code,
orte_errmgr_stack_state_t *stack_state);
typedef int (*orte_errmgr_base_module_predicted_fault_fn_t)(char ***proc_list,
char ***node_list,
char ***suggested_nodes,
int *stack_state);
orte_errmgr_stack_state_t *stack_state);
typedef int (*orte_errmgr_base_module_suggest_map_targets_fn_t)(orte_proc_t *proc,
orte_node_t *oldnode,
opal_list_t *node_list,
int *stack_state);
orte_errmgr_stack_state_t *stack_state);
/**
* Handle fault tolerance updates
@ -258,10 +235,10 @@ struct orte_errmgr_base_module_2_3_0_t {
orte_errmgr_base_module_finalize_fn_t finalize;
/* -------------- Internal Composite Interfaces -- */
/** Actual process failure notification */
orte_errmgr_base_module_update_state_fn_t update_state;
/** Predicted process/node failure notification */
orte_errmgr_base_module_predicted_fault_fn_t predicted_fault;
/** Actual process failure notification */
orte_errmgr_base_module_process_fault_fn_t process_fault;
/** Suggest a node to map a restarting process onto */
orte_errmgr_base_module_suggest_map_targets_fn_t suggest_map_targets;

36
orte/mca/errmgr/hnp/Makefile.am Обычный файл
Просмотреть файл

@ -0,0 +1,36 @@
#
# Copyright (c) 2010 Cisco Systems, Inc. All rights reserved.
# $COPYRIGHT$
#
# Additional copyrights may follow
#
# $HEADER$
#
# Help text shipped in the distribution tarball and installed with the
# package data files.
dist_pkgdata_DATA = help-orte-errmgr-hnp.txt
# Source files of the hnp errmgr component; the same list feeds both the
# DSO and the static library targets below.
sources = \
errmgr_hnp.h \
errmgr_hnp_component.c \
errmgr_hnp.c
# Make the output library in this directory, and name it either
# mca_<type>_<name>.la (for DSO builds) or libmca_<type>_<name>.la
# (for static builds).
# Exactly one of component_install / component_noinst is non-empty,
# depending on the OMPI_BUILD_errmgr_hnp_DSO conditional.
if OMPI_BUILD_errmgr_hnp_DSO
component_noinst =
component_install = mca_errmgr_hnp.la
else
component_noinst = libmca_errmgr_hnp.la
component_install =
endif
# DSO build: the component library is installed into $(pkglibdir).
mcacomponentdir = $(pkglibdir)
mcacomponent_LTLIBRARIES = $(component_install)
mca_errmgr_hnp_la_SOURCES = $(sources)
mca_errmgr_hnp_la_LDFLAGS = -module -avoid-version
# Static build: the library is built here but not installed (noinst).
noinst_LTLIBRARIES = $(component_noinst)
libmca_errmgr_hnp_la_SOURCES =$(sources)
libmca_errmgr_hnp_la_LDFLAGS = -module -avoid-version

22
orte/mca/errmgr/hnp/configure.params Обычный файл
Просмотреть файл

@ -0,0 +1,22 @@
# -*- shell-script -*-
#
# Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
# University Research and Technology
# Corporation. All rights reserved.
# Copyright (c) 2004-2005 The University of Tennessee and The University
# of Tennessee Research Foundation. All rights
# reserved.
# Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
# University of Stuttgart. All rights reserved.
# Copyright (c) 2004-2005 The Regents of the University of California.
# All rights reserved.
# Copyright (c) 2007 Los Alamos National Security, LLC. All rights
# reserved.
# $COPYRIGHT$
#
# Additional copyrights may follow
#
# $HEADER$
#
PARAM_CONFIG_FILES="Makefile"

782
orte/mca/errmgr/hnp/errmgr_hnp.c Обычный файл
Просмотреть файл

@ -0,0 +1,782 @@
/*
* Copyright (c) 2009-2010 The Trustees of Indiana University.
* All rights reserved.
*
* Copyright (c) 2010 Cisco Systems, Inc. All rights reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
*
* $HEADER$
*/
#include "orte_config.h"
#include <sys/types.h>
#ifdef HAVE_UNISTD_H
#include <unistd.h>
#endif /* HAVE_UNISTD_H */
#ifdef HAVE_STRING_H
#include <string.h>
#endif
#include "opal/util/output.h"
#include "opal/dss/dss.h"
#include "orte/util/error_strings.h"
#include "orte/util/name_fns.h"
#include "orte/util/proc_info.h"
#include "orte/util/show_help.h"
#include "orte/runtime/orte_globals.h"
#include "orte/runtime/orte_locks.h"
#include "orte/mca/rml/rml.h"
#include "orte/mca/odls/odls_types.h"
#include "orte/mca/odls/base/base.h"
#include "orte/mca/plm/base/base.h"
#include "orte/mca/rmaps/rmaps_types.h"
#include "orte/mca/errmgr/errmgr.h"
#include "orte/mca/errmgr/base/base.h"
#include "orte/mca/errmgr/base/errmgr_private.h"
#include "errmgr_hnp.h"
/*
 * Local functions
 *
 * Forward declarations of file-private helpers. The notes below describe
 * only how update_state() in this file uses them.
 */
/* Invoked by update_state() with a job id and an exit code once a failure
 * has been recorded and the remaining processes must be told to terminate. */
static void hnp_abort(orte_jobid_t job, orte_exit_code_t exit_code);
/* Invoked by update_state() when a job reports ORTE_JOB_STATE_FAILED_TO_START. */
static void failed_start(orte_job_t *jdata, orte_exit_code_t exit_code);
/* Invoked by update_state() when a job reports ORTE_JOB_STATE_RUNNING, with
 * ORTE_PROC_STATE_RUNNING as the process state. */
static void update_local_procs_in_job(orte_job_t *jdata, orte_job_state_t jobstate, orte_proc_state_t state);
/* Invoked by update_state() when the update names a specific process. */
static void update_proc(orte_job_t *jdata, orte_process_name_t *proc,
orte_proc_state_t state,
orte_exit_code_t exit_code);
/* Invoked by update_state() after a state change. Per the comments at the
 * call sites, the job object may have been removed from the job array when
 * this returns, so callers look the job up again before using it. */
static void check_job_complete(orte_job_t *jdata);
/*
 * Module functions: Global
 *
 * Forward declarations of the functions stored in the
 * orte_errmgr_hnp_module function table defined in this file.
 */
/* Module setup and teardown hooks. */
static int init(void);
static int finalize(void);
/* Process a job-level or process-level state update. A NULL proc_name
 * means the update applies to the job as a whole. stack_state is an
 * in/out bitmask of ORTE_ERRMGR_STACK_STATE_* flags. */
static int update_state(orte_jobid_t job,
orte_job_state_t jobstate,
orte_process_name_t *proc_name,
orte_proc_state_t state,
orte_exit_code_t exit_code,
orte_errmgr_stack_state_t *stack_state);
/* Predicted process/node failure notification. */
static int predicted_fault(char ***proc_list,
char ***node_list,
char ***suggested_nodes,
orte_errmgr_stack_state_t *stack_state);
/* Suggest a node to map a restarting process onto. */
static int suggest_map_targets(orte_proc_t *proc,
orte_node_t *oldnode,
opal_list_t *node_list,
orte_errmgr_stack_state_t *stack_state);
/* Fault tolerance event hook. */
static int ft_event(int state);
/******************
* HNP module
*
* Function table for the HNP errmgr module. The initializer is positional,
* so the entries must stay in the same order as the fields of
* orte_errmgr_base_module_t.
******************/
orte_errmgr_base_module_t orte_errmgr_hnp_module = {
init, /* module initialization */
finalize, /* module finalization */
update_state, /* job/process state update */
predicted_fault, /* predicted process/node failure notification */
suggest_map_targets, /* suggest a node for a restarting process */
ft_event /* fault tolerance event */
};
/************************
* API Definitions
************************/
/**
 * Module initialization hook.
 *
 * This module performs no work at initialization time; the function only
 * reports success.
 *
 * @retval ORTE_SUCCESS Always
 */
static int init(void)
{
    int rc = ORTE_SUCCESS;

    return rc;
}
/**
 * Module finalization hook.
 *
 * This module performs no work at finalization time; the function only
 * reports success.
 *
 * @retval ORTE_SUCCESS Always
 */
static int finalize(void)
{
    int rc = ORTE_SUCCESS;

    return rc;
}
/*
 * Process a state update on the HNP.
 *
 * When proc is non-NULL the update concerns that single process and
 * "state" is applied to it; otherwise the update concerns the whole
 * job identified by "job" and "jobstate" is applied.
 *
 * The stack state is always flagged as complete.  Returns ORTE_SUCCESS
 * unless the job object cannot be found, in which case the error is
 * logged and ORTE_ERR_NOT_FOUND is returned.
 */
static int update_state(orte_jobid_t job,
                        orte_job_state_t jobstate,
                        orte_process_name_t *proc,
                        orte_proc_state_t state,
                        orte_exit_code_t exit_code,
                        orte_errmgr_stack_state_t *stack_state)
{
    orte_job_t *jdata;
    orte_exit_code_t sts;
    const char *sig_topic;

    /* we are the last stop in the errmgr stack */
    *stack_state |= ORTE_ERRMGR_STACK_STATE_COMPLETE;

    /* once orterun has begun shutting down, stay out of its way */
    if (orte_errmgr_base_shutting_down) {
        return ORTE_SUCCESS;
    }

    if (NULL != proc) {
        /*** the update targets one specific process ***/
        jdata = orte_get_job_data_object(proc->jobid);
        if (NULL == jdata) {
            ORTE_ERROR_LOG(ORTE_ERR_NOT_FOUND);
            return ORTE_ERR_NOT_FOUND;
        }

        switch (state) {
        case ORTE_PROC_STATE_ABORTED:
        case ORTE_PROC_STATE_ABORTED_BY_SIG:
        case ORTE_PROC_STATE_TERM_WO_SYNC:
        case ORTE_PROC_STATE_COMM_FAILED:
        case ORTE_PROC_STATE_FAILED_TO_START:
            update_proc(jdata, proc, state, exit_code);
            /* lets the job state follow the proc state */
            check_job_complete(jdata);
            /* check_job_complete drops the job object from the array
             * when the job was solely local; if it is still there,
             * everyone else has to be told to die
             */
            jdata = orte_get_job_data_object(proc->jobid);
            if (NULL != jdata) {
                hnp_abort(jdata->jobid, exit_code);
            }
            break;

        case ORTE_PROC_STATE_REGISTERED:
        case ORTE_PROC_STATE_RUNNING:
            update_proc(jdata, proc, state, exit_code);
            break;

        case ORTE_PROC_STATE_TERMINATED:
        case ORTE_PROC_STATE_KILLED_BY_CMD:
            update_proc(jdata, proc, state, exit_code);
            check_job_complete(jdata);
            break;

        default:
            break;
        }
        return ORTE_SUCCESS;
    }

    /*** the update targets an entire job ***/
    if (ORTE_JOBID_INVALID == job) {
        /* something happened, but we cannot tell which job it hit */
        if (ORTE_JOB_STATE_NEVER_LAUNCHED == jobstate) {
            orte_never_launched = true;
        }
        orte_show_help("help-orte-errmgr-hnp.txt", "errmgr-hnp:unknown-job-error",
                       true, orte_job_state_to_str(jobstate));
        hnp_abort(job, exit_code);
        return ORTE_SUCCESS;
    }

    jdata = orte_get_job_data_object(job);
    if (NULL == jdata) {
        ORTE_ERROR_LOG(ORTE_ERR_NOT_FOUND);
        return ORTE_ERR_NOT_FOUND;
    }

    /* record the new job state */
    jdata->state = jobstate;

    OPAL_OUTPUT_VERBOSE((1, orte_errmgr_base_output,
                         "%s errmgr:hnp: job %s reported state %s",
                         ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                         ORTE_JOBID_PRINT(jdata->jobid),
                         orte_job_state_to_str(jobstate)));

    switch (jobstate) {
    case ORTE_JOB_STATE_FAILED_TO_START:
        failed_start(jdata, exit_code);
        check_job_complete(jdata);  /* set the local proc states */
        /* a solely-local job has been removed from the array by now;
         * anything still present means others must be told to die
         */
        jdata = orte_get_job_data_object(job);
        if (NULL == jdata) {
            break;
        }
        sts = exit_code;
        if (ORTE_PROC_MY_NAME->jobid == job && !orte_abnormal_term_ordered) {
            /* a daemon failed: remember it so the proper methods are
             * used when shutting down the rest of the system
             */
            orte_abnormal_term_ordered = true;
            if (WIFSIGNALED(exit_code)) {
                /* died on signal - pick the help topic, noting a core
                 * dump where the platform can report one
                 */
                sig_topic = "daemon-died-signal";
#ifdef WCOREDUMP
                if (WCOREDUMP(exit_code)) {
                    sig_topic = "daemon-died-signal-core";
                }
#endif /* WCOREDUMP */
                orte_show_help("help-plm-base.txt", sig_topic, true,
                               WTERMSIG(exit_code));
                sts = WTERMSIG(exit_code);
            } else {
                orte_show_help("help-plm-base.txt", "daemon-died-no-signal", true,
                               WEXITSTATUS(exit_code));
                sts = WEXITSTATUS(exit_code);
            }
        }
        hnp_abort(jdata->jobid, sts);
        break;

    case ORTE_JOB_STATE_RUNNING:
        /* propagate the running state to every proc in the job */
        update_local_procs_in_job(jdata, jobstate, ORTE_PROC_STATE_RUNNING);
        break;

    case ORTE_JOB_STATE_NEVER_LAUNCHED:
        orte_never_launched = true;
        jdata->num_terminated = jdata->num_procs;
        check_job_complete(jdata);  /* set the local proc states */
        /* same rule as above: a surviving job object means the rest
         * of the system has to be told to die
         */
        jdata = orte_get_job_data_object(job);
        if (NULL != jdata) {
            hnp_abort(jdata->jobid, exit_code);
        }
        break;

    default:
        break;
    }

    return ORTE_SUCCESS;
}
/*
 * Predicted-fault hook.  The HNP module does not implement it: none of
 * the arguments are examined and ORTE_ERR_NOT_IMPLEMENTED is returned.
 */
static int predicted_fault(char ***proc_list,
                           char ***node_list,
                           char ***suggested_nodes,
                           orte_errmgr_stack_state_t *stack_state)
{
    (void)proc_list;
    (void)node_list;
    (void)suggested_nodes;
    (void)stack_state;

    return ORTE_ERR_NOT_IMPLEMENTED;
}
/*
 * Map-target suggestion hook.  The HNP module does not implement it:
 * none of the arguments are examined and ORTE_ERR_NOT_IMPLEMENTED is
 * returned.
 */
static int suggest_map_targets(orte_proc_t *proc,
                               orte_node_t *oldnode,
                               opal_list_t *node_list,
                               orte_errmgr_stack_state_t *stack_state)
{
    (void)proc;
    (void)oldnode;
    (void)node_list;
    (void)stack_state;

    return ORTE_ERR_NOT_IMPLEMENTED;
}
/*
 * Fault-tolerance event hook.  The state argument is ignored and
 * ORTE_SUCCESS is always returned.
 */
int ft_event(int state)
{
    (void)state;

    return ORTE_SUCCESS;
}
/*****************
* Local Functions
*****************/
/*
 * Order termination of every job and record the exit status.
 *
 * Only the first caller does any work: the call is ignored whenever
 * opal_atomic_trylock() on orte_abort_inprogress_lock returns zero,
 * which this code treats as "an abort is already in progress".
 * NOTE(review): confirm the return convention of opal_atomic_trylock
 * against its implementation.
 */
static void hnp_abort(orte_jobid_t job, orte_exit_code_t exit_code)
{
    int ret;

    if (opal_atomic_trylock(&orte_abort_inprogress_lock)) {
        OPAL_OUTPUT_VERBOSE((1, orte_errmgr_base_output,
                             "%s errmgr:hnp: abort called on job %s with status %d",
                             ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                             ORTE_JOBID_PRINT(job), exit_code));

        orte_job_term_ordered = true;

        /* have the plm take down all jobs, not just the one named */
        ret = orte_plm.terminate_job(ORTE_JOBID_WILDCARD);
        if (ORTE_SUCCESS != ret) {
            ORTE_ERROR_LOG(ret);
        }

        /* make sure an exit status is recorded in case the caller did
         * not do so - it can only be set once, so an earlier value is
         * not overwritten
         */
        ORTE_UPDATE_EXIT_STATUS(exit_code);
        return;
    }

    /* somebody else got here first - nothing more to do */
    OPAL_OUTPUT_VERBOSE((1, orte_errmgr_base_output,
                         "%s errmgr:hnp: abort in progress, ignoring abort on job %s with status %d",
                         ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                         ORTE_JOBID_PRINT(job), exit_code));
}
/*
 * Handle a job that reported a failure to start.
 *
 * Marks the local job record as ORTE_JOB_STATE_FAILED_TO_START, then
 * walks the local children of that job.  Every child whose state is
 * below ORTE_PROC_STATE_LAUNCHED or above ORTE_PROC_STATE_UNTERMINATED
 * has its state and the given exit code copied into the master proc
 * object, is counted as terminated, and is removed from
 * orte_local_children.
 *
 * Does nothing when no local job record exists for jdata's jobid.
 */
static void failed_start(orte_job_t *jdata, orte_exit_code_t exit_code)
{
    opal_list_item_t *item, *next;
    orte_odls_job_t *jobdat, *candidate;
    orte_odls_child_t *child;
    orte_proc_t *proc;

    /* lookup the local jobdat for this job - jobdat must stay NULL
     * unless an entry really matches, otherwise we would end up
     * operating on whatever job happens to be last in the list
     */
    jobdat = NULL;
    for (item = opal_list_get_first(&orte_local_jobdata);
         item != opal_list_get_end(&orte_local_jobdata);
         item = opal_list_get_next(item)) {
        candidate = (orte_odls_job_t*)item;
        /* is this the specified job? */
        if (candidate->jobid == jdata->jobid) {
            jobdat = candidate;
            break;
        }
    }
    if (NULL == jobdat) {
        /* race condition - may not have been formed yet */
        return;
    }
    jobdat->state = ORTE_JOB_STATE_FAILED_TO_START;

    for (item = opal_list_get_first(&orte_local_children);
         item != opal_list_get_end(&orte_local_children);
         item = next) {
        /* grab the successor first - the item may be removed below */
        next = opal_list_get_next(item);
        child = (orte_odls_child_t*)item;
        if (child->name->jobid != jobdat->jobid) {
            continue;
        }
        if (ORTE_PROC_STATE_LAUNCHED > child->state ||
            ORTE_PROC_STATE_UNTERMINATED < child->state) {
            /* get the master proc object - the array slot may be empty */
            proc = (orte_proc_t*)opal_pointer_array_get_item(jdata->procs, child->name->vpid);
            if (NULL != proc) {
                proc->state = child->state;
                proc->exit_code = exit_code;
            }
            /* update the counter so we can terminate */
            jdata->num_terminated++;
            /* remove the child from our list */
            opal_list_remove_item(&orte_local_children, &child->super);
            OBJ_RELEASE(child);
        }
    }

    OPAL_OUTPUT_VERBOSE((1, orte_errmgr_base_output,
                         "%s errmgr:hnp: job %s reported incomplete start",
                         ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                         ORTE_JOBID_PRINT(jdata->jobid)));
}
/*
 * Apply one process state to every local child of a job.
 *
 * Sets both the local job record and jdata to jobstate, then gives
 * each local child of the job (and its master proc object) the given
 * proc state.  Depending on that state the job counters are advanced:
 *   - above ORTE_PROC_STATE_UNTERMINATED: the child is removed from
 *     orte_local_children and counted as terminated
 *   - ORTE_PROC_STATE_RUNNING: counted as launched
 *   - ORTE_PROC_STATE_REGISTERED: counted as reported, releasing the
 *     job's "reported" condition once all procs have reported
 *
 * Does nothing when no local job record exists for jdata's jobid.
 */
static void update_local_procs_in_job(orte_job_t *jdata, orte_job_state_t jobstate, orte_proc_state_t state)
{
    opal_list_item_t *item, *next;
    orte_odls_job_t *jobdat, *candidate;
    orte_odls_child_t *child;
    orte_proc_t *proc;

    /* lookup the local jobdat for this job - jobdat must stay NULL
     * unless an entry really matches, otherwise the state of an
     * unrelated job (the last one in the list) would be overwritten
     */
    jobdat = NULL;
    for (item = opal_list_get_first(&orte_local_jobdata);
         item != opal_list_get_end(&orte_local_jobdata);
         item = opal_list_get_next(item)) {
        candidate = (orte_odls_job_t*)item;
        /* is this the specified job? */
        if (candidate->jobid == jdata->jobid) {
            jobdat = candidate;
            break;
        }
    }
    if (NULL == jobdat) {
        /* race condition - may not have been formed yet */
        return;
    }
    jobdat->state = jobstate;
    jdata->state = jobstate;

    for (item = opal_list_get_first(&orte_local_children);
         item != opal_list_get_end(&orte_local_children);
         item = next) {
        /* grab the successor first - the item may be removed below */
        next = opal_list_get_next(item);
        child = (orte_odls_child_t*)item;
        if (jdata->jobid != child->name->jobid) {
            continue;
        }
        child->state = state;
        /* mirror the state into the master proc object, if present */
        proc = (orte_proc_t*)opal_pointer_array_get_item(jdata->procs, child->name->vpid);
        if (NULL != proc) {
            proc->state = state;
        }
        if (ORTE_PROC_STATE_UNTERMINATED < state) {
            opal_list_remove_item(&orte_local_children, &child->super);
            OBJ_RELEASE(child);
            jdata->num_terminated++;
        } else if (ORTE_PROC_STATE_RUNNING == state) {
            /* must compare against state: testing the bare constant is
             * always true and hides the REGISTERED branch below
             */
            jdata->num_launched++;
        } else if (ORTE_PROC_STATE_REGISTERED == state) {
            jdata->num_reported++;
            if (jdata->num_reported == jdata->num_procs) {
                OPAL_RELEASE_THREAD(&jdata->reported_lock,
                                    &jdata->reported_cond,
                                    &jdata->not_reported);
            }
        }
    }
}
/*
 * Record a new state and exit code for one process of a job.
 *
 * The local child list is searched first.  If the process is a local
 * child, the child record and the master proc object are both updated;
 * a child whose new state is above ORTE_PROC_STATE_UNTERMINATED is
 * removed from orte_local_children.  Otherwise the job's proc array is
 * searched and the matching proc object is updated.
 *
 * In both cases the job counters follow the new state:
 *   - above ORTE_PROC_STATE_UNTERMINATED: num_terminated is advanced
 *   - ORTE_PROC_STATE_RUNNING: num_launched is advanced and the job is
 *     marked ORTE_JOB_STATE_RUNNING once all procs have launched
 *   - ORTE_PROC_STATE_REGISTERED: num_reported is advanced and the
 *     job's "reported" condition is released once all have reported
 *
 * Nothing happens if the process is found in neither place.
 */
static void update_proc(orte_job_t *jdata, orte_process_name_t *proc,
                        orte_proc_state_t state,
                        orte_exit_code_t exit_code)
{
    opal_list_item_t *item, *next;
    orte_odls_child_t *child;
    orte_proc_t *proct;
    int i;

    /*** UPDATE LOCAL CHILD ***/
    for (item = opal_list_get_first(&orte_local_children);
         item != opal_list_get_end(&orte_local_children);
         item = next) {
        next = opal_list_get_next(item);
        child = (orte_odls_child_t*)item;
        if (child->name->jobid != proc->jobid ||
            child->name->vpid != proc->vpid) {
            continue;
        }
        child->state = state;
        child->exit_code = exit_code;
        /* the proc array can contain empty slots (the remote search
         * below already allows for that), so check before use
         */
        proct = (orte_proc_t*)opal_pointer_array_get_item(jdata->procs, child->name->vpid);
        if (NULL != proct) {
            proct->state = state;
            proct->exit_code = exit_code;
        }
        if (ORTE_PROC_STATE_UNTERMINATED < state) {
            opal_list_remove_item(&orte_local_children, &child->super);
            OBJ_RELEASE(child);
            jdata->num_terminated++;
        } else if (ORTE_PROC_STATE_RUNNING == state) {
            jdata->num_launched++;
            if (jdata->num_launched == jdata->num_procs) {
                jdata->state = ORTE_JOB_STATE_RUNNING;
            }
        } else if (ORTE_PROC_STATE_REGISTERED == state) {
            jdata->num_reported++;
            if (jdata->num_reported == jdata->num_procs) {
                OPAL_RELEASE_THREAD(&jdata->reported_lock,
                                    &jdata->reported_cond,
                                    &jdata->not_reported);
            }
        }
        return;
    }

    /*** UPDATE REMOTE CHILD ***/
    for (i=0; i < jdata->procs->size; i++) {
        if (NULL == (proct = (orte_proc_t*)opal_pointer_array_get_item(jdata->procs, i))) {
            continue;
        }
        if (proct->name.jobid != proc->jobid ||
            proct->name.vpid != proc->vpid) {
            continue;
        }
        proct->state = state;
        proct->exit_code = exit_code;
        if (ORTE_PROC_STATE_REGISTERED == state) {
            jdata->num_reported++;
            if (jdata->num_reported == jdata->num_procs) {
                OPAL_RELEASE_THREAD(&jdata->reported_lock,
                                    &jdata->reported_cond,
                                    &jdata->not_reported);
            }
        } else if (ORTE_PROC_STATE_UNTERMINATED < state) {
            /* update the counter so we can terminate */
            jdata->num_terminated++;
        } else if (ORTE_PROC_STATE_RUNNING == state) {
            jdata->num_launched++;
            if (jdata->num_launched == jdata->num_procs) {
                jdata->state = ORTE_JOB_STATE_RUNNING;
            }
        }
        return;
    }
}
static void check_job_complete(orte_job_t *jdata)
{
orte_proc_t *proc;
int i;
orte_std_cntr_t j;
orte_job_t *job;
orte_node_t *node;
orte_job_map_t *map;
orte_std_cntr_t index;
bool one_still_alive;
#if 0
/* Check if FileM is active. If so then keep processing. */
OPAL_ACQUIRE_THREAD(&orte_filem_base_lock, &orte_filem_base_cond, &orte_filem_base_is_active);
#endif
for (i=0; i < jdata->procs->size; i++) {
if (NULL == (proc = (orte_proc_t*)opal_pointer_array_get_item(jdata->procs, i))) {
/* the proc array may no longer be left justified, so
* we need to check everything
*/
continue;
}
/*
* Determine how the process state affects the job state
*/
if (ORTE_PROC_STATE_FAILED_TO_START == proc->state) {
OPAL_OUTPUT_VERBOSE((5, orte_errmgr_base_output,
"%s errmgr_hnp:check_job_completed proc %s failed to start",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
ORTE_NAME_PRINT(&proc->name)));
if (!jdata->abort) {
jdata->state = ORTE_JOB_STATE_FAILED_TO_START;
/* point to the lowest rank to cause the problem */
jdata->aborted_proc = proc;
/* retain the object so it doesn't get free'd */
OBJ_RETAIN(proc);
jdata->abort = true;
ORTE_UPDATE_EXIT_STATUS(proc->exit_code);
}
} else if (ORTE_PROC_STATE_ABORTED == proc->state) {
OPAL_OUTPUT_VERBOSE((5, orte_errmgr_base_output,
"%s errmgr:hnp:check_job_completed proc %s aborted",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
ORTE_NAME_PRINT(&proc->name)));
if (!jdata->abort) {
jdata->state = ORTE_JOB_STATE_ABORTED;
/* point to the lowest rank to cause the problem */
jdata->aborted_proc = proc;
/* retain the object so it doesn't get free'd */
OBJ_RETAIN(proc);
jdata->abort = true;
ORTE_UPDATE_EXIT_STATUS(proc->exit_code);
}
} else if (ORTE_PROC_STATE_ABORTED_BY_SIG == proc->state) {
OPAL_OUTPUT_VERBOSE((5, orte_errmgr_base_output,
"%s errmgr:hnp:check_job_completed proc %s aborted by signal",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
ORTE_NAME_PRINT(&proc->name)));
if (!jdata->abort) {
jdata->state = ORTE_JOB_STATE_ABORTED_BY_SIG;
/* point to the lowest rank to cause the problem */
jdata->aborted_proc = proc;
/* retain the object so it doesn't get free'd */
OBJ_RETAIN(proc);
jdata->abort = true;
ORTE_UPDATE_EXIT_STATUS(proc->exit_code);
}
} else if (ORTE_PROC_STATE_TERM_WO_SYNC == proc->state) {
OPAL_OUTPUT_VERBOSE((5, orte_errmgr_base_output,
"%s errmgr:hnp:check_job_completed proc %s terminated without sync",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
ORTE_NAME_PRINT(&proc->name)));
if (!jdata->abort) {
jdata->state = ORTE_JOB_STATE_ABORTED_WO_SYNC;
/* point to the lowest rank to cause the problem */
jdata->aborted_proc = proc;
/* retain the object so it doesn't get free'd */
OBJ_RETAIN(proc);
jdata->abort = true;
ORTE_UPDATE_EXIT_STATUS(proc->exit_code);
/* now treat a special case - if the proc exit'd without a required
* sync, it may have done so with a zero exit code. We want to ensure
* that the user realizes there was an error, so in this -one- case,
* we overwrite the process' exit code with the default error code
*/
ORTE_UPDATE_EXIT_STATUS(ORTE_ERROR_DEFAULT_EXIT_CODE);
}
} else if (ORTE_PROC_STATE_KILLED_BY_CMD == proc->state) {
OPAL_OUTPUT_VERBOSE((5, orte_errmgr_base_output,
"%s errmgr:hnp:check_job_completed proc %s killed by cmd",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
ORTE_NAME_PRINT(&proc->name)));
/* we ordered this proc to die, so it isn't an abnormal termination
* and we don't flag it as such - just check the remaining jobs to
* see if anyone is still alive
*/
if (jdata->num_terminated >= jdata->num_procs) {
/* this job has terminated - now we need to check to see if ALL
* the other jobs have also completed and wakeup if that is true
*/
if (!jdata->abort) {
jdata->state = ORTE_JOB_STATE_KILLED_BY_CMD;
}
}
goto CHECK_ALIVE;
} else if (ORTE_PROC_STATE_UNTERMINATED < proc->state &&
jdata->controls & ORTE_JOB_CONTROL_CONTINUOUS_OP) {
OPAL_OUTPUT_VERBOSE((5, orte_errmgr_base_output,
"%s errmgr:hnp:check_job_completed proc %s terminated and continuous",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
ORTE_NAME_PRINT(&proc->name)));
if (!jdata->abort) {
proc->state = ORTE_PROC_STATE_ABORTED;
jdata->state = ORTE_JOB_STATE_ABORTED;
/* point to the lowest rank to cause the problem */
jdata->aborted_proc = proc;
/* retain the object so it doesn't get free'd */
OBJ_RETAIN(proc);
jdata->abort = true;
ORTE_UPDATE_EXIT_STATUS(proc->exit_code);
}
}
}
if (ORTE_JOB_STATE_UNTERMINATED > jdata->state &&
jdata->num_terminated >= jdata->num_procs) {
/* this job has terminated */
jdata->state = ORTE_JOB_STATE_TERMINATED;
OPAL_OUTPUT_VERBOSE((5, orte_errmgr_base_output,
"%s errmgr:hnp:check_job_completed declared job %s normally terminated - checking all jobs",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
ORTE_JOBID_PRINT(jdata->jobid)));
}
/* if this job is a continuously operating one, then don't do
* anything further - just return here
*/
if (NULL != jdata &&
(ORTE_JOB_CONTROL_CONTINUOUS_OP & jdata->controls ||
ORTE_JOB_CONTROL_RECOVERABLE & jdata->controls)) {
goto CHECK_ALIVE;
}
/* if the job that is being checked is the HNP, then we are
* trying to terminate the orteds. In that situation, we
* do -not- check all jobs - we simply notify the HNP
* that the orteds are complete. Also check special case
* if jdata is NULL - we want
* to definitely declare the job done if the orteds
* have completed, no matter what else may be happening.
* This can happen if a ctrl-c hits in the "wrong" place
* while launching
*/
if (jdata == NULL || jdata->jobid == ORTE_PROC_MY_NAME->jobid) {
jdata = orte_get_job_data_object(ORTE_PROC_MY_NAME->jobid);
if (jdata->num_terminated >= jdata->num_procs) {
/* orteds are done! */
jdata->state = ORTE_JOB_STATE_TERMINATED;
orte_trigger_event(&orteds_exit);
return;
}
}
/* Release the resources used by this job. Since some errmgrs may want
* to continue using resources allocated to the job as part of their
* fault recovery procedure, we only do this once the job is "complete".
* Note that an aborted/killed job -is- flagged as complete and will
* therefore have its resources released. We need to do this after
* we call the errmgr so that any attempt to restart the job will
* avoid doing so in the exact same place as the current job
*/
if (NULL != jdata->map && jdata->state == ORTE_JOB_STATE_TERMINATED) {
map = jdata->map;
for (index = 0; index < map->nodes->size; index++) {
if (NULL == (node = (orte_node_t*)opal_pointer_array_get_item(map->nodes, index))) {
continue;
}
OPAL_OUTPUT_VERBOSE((5, orte_errmgr_base_output,
"%s releasing procs from node %s",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
node->name));
for (i = 0; i < node->procs->size; i++) {
if (NULL == (proc = (orte_proc_t*)opal_pointer_array_get_item(node->procs, i))) {
continue;
}
if (proc->name.jobid != jdata->jobid) {
/* skip procs from another job */
continue;
}
node->slots_inuse--;
node->num_procs--;
OPAL_OUTPUT_VERBOSE((5, orte_errmgr_base_output,
"%s releasing proc %s from node %s",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
ORTE_NAME_PRINT(&proc->name), node->name));
/* set the entry in the node array to NULL */
opal_pointer_array_set_item(node->procs, i, NULL);
/* release the proc once for the map entry */
OBJ_RELEASE(proc);
}
}
OBJ_RELEASE(map);
jdata->map = NULL;
}
CHECK_ALIVE:
/* now check to see if all jobs are done - release this jdata
* object when we find it
*/
one_still_alive = false;
for (j=1; j < orte_job_data->size; j++) {
if (NULL == (job = (orte_job_t*)opal_pointer_array_get_item(orte_job_data, j))) {
/* since we are releasing jdata objects as we
* go, we can no longer assume that the job_data
* array is left justified
*/
continue;
}
/* if this is the job we are checking AND it normally terminated,
* then go ahead and release it. We cannot release it if it
* abnormally terminated as mpirun needs the info so it can
* report appropriately to the user
*/
if (NULL != jdata && job->jobid == jdata->jobid &&
(jdata->state == ORTE_JOB_STATE_TERMINATED ||
jdata->state == ORTE_JOB_STATE_KILLED_BY_CMD)) {
/* release this object, ensuring that the
* pointer array internal accounting
* is maintained!
*/
opal_pointer_array_set_item(orte_job_data, j, NULL); /* ensure the array has a NULL */
OBJ_RELEASE(jdata);
continue;
}
/* if the job is flagged to not be monitored, skip it */
if (ORTE_JOB_CONTROL_DO_NOT_MONITOR & job->controls) {
continue;
}
/* when checking for job termination, we must be sure to NOT check
* our own job as it - rather obviously - has NOT terminated!
*/
if (job->num_terminated < job->num_procs) {
/* we have at least one job that is not done yet - we cannot
* just return, though, as we need to ensure we cleanout the
* job data for the job that just completed
*/
OPAL_OUTPUT_VERBOSE((5, orte_errmgr_base_output,
"%s errmgr:hnp:check_job_completed job %s is not terminated (%d:%d)",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
ORTE_JOBID_PRINT(job->jobid),
job->num_terminated, job->num_procs));
one_still_alive = true;
}
else {
OPAL_OUTPUT_VERBOSE((5, orte_errmgr_base_output,
"%s errmgr:hnp:check_job_completed job %s is terminated (%d vs %d [0x%x])",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
ORTE_JOBID_PRINT(job->jobid),
job->num_terminated, job->num_procs, jdata->state ));
}
}
/* if a job is still alive, we just return */
if (one_still_alive) {
OPAL_OUTPUT_VERBOSE((5, orte_errmgr_base_output,
"%s errmgr:hnp:check_job_completed at least one job is not terminated",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
return;
}
/* if we get here, then all jobs are done, so wakeup */
OPAL_OUTPUT_VERBOSE((5, orte_errmgr_base_output,
"%s errmgr:hnp:check_job_completed all jobs terminated - waking up",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
/* set the exit status to 0 - this will only happen if it
* wasn't already set by an error condition
*/
ORTE_UPDATE_EXIT_STATUS(0);
orte_trigger_event(&orte_exit);
}

35
orte/mca/errmgr/hnp/errmgr_hnp.h Обычный файл
Просмотреть файл

@ -0,0 +1,35 @@
/*
* Copyright (c) 2010 Cisco Systems, Inc. All rights reserved.
*
* $COPYRIGHT$
*
* Additional copyrights may follow
*
* $HEADER$
*/
/**
* @file
*
* Declarations exported by the "hnp" error manager (errmgr) component:
* the component descriptor and the module structure that are defined
* in the component's source files.
*/
#ifndef MCA_ERRMGR_HNP_EXPORT_H
#define MCA_ERRMGR_HNP_EXPORT_H
#include "orte_config.h"
#include "orte/mca/errmgr/errmgr.h"
BEGIN_C_DECLS
/*
* Local Component structures
*/
/* component descriptor for the hnp errmgr component */
ORTE_MODULE_DECLSPEC extern orte_errmgr_base_component_t mca_errmgr_hnp_component;
/* module (function table) handed out by the component's query function */
ORTE_DECLSPEC extern orte_errmgr_base_module_t orte_errmgr_hnp_module;
END_C_DECLS
#endif /* MCA_ERRMGR_HNP_EXPORT_H */

91
orte/mca/errmgr/hnp/errmgr_hnp_component.c Обычный файл
Просмотреть файл

@ -0,0 +1,91 @@
/*
* Copyright (c) 2010 Cisco Systems, Inc. All rights reserved.
*
* $COPYRIGHT$
*
* Additional copyrights may follow
*
* $HEADER$
*/
#include "orte_config.h"
#include "opal/util/output.h"
#include "orte/mca/errmgr/errmgr.h"
#include "orte/mca/errmgr/base/base.h"
#include "errmgr_hnp.h"
/*
* Public string for version number: a fixed description of the
* component followed by the ORTE_VERSION string
*/
const char *orte_errmgr_hnp_component_version_string =
"ORTE ERRMGR hnp MCA component version " ORTE_VERSION;
/*
* Local functionality
*/
static int errmgr_hnp_open(void);
static int errmgr_hnp_close(void);
static int errmgr_hnp_component_query(mca_base_module_t **module, int *priority);
/*
* Instantiate the public struct with all of our public information
* and pointer to our public functions in it
*/
orte_errmgr_base_component_t mca_errmgr_hnp_component =
{
/* Handle the general mca_component_t struct containing
* meta information about the component itself
*/
{
ORTE_ERRMGR_BASE_VERSION_3_0_0,
/* Component name and version */
"hnp",
ORTE_MAJOR_VERSION,
ORTE_MINOR_VERSION,
ORTE_RELEASE_VERSION,
/* Component open and close functions */
errmgr_hnp_open,
errmgr_hnp_close,
/* Component query function: offers the module and its priority */
errmgr_hnp_component_query
},
{
/* The component is checkpoint ready */
MCA_BASE_METADATA_PARAM_CHECKPOINT
},
/* Verbosity level */
0,
/* opal_output handler */
-1,
/* Default priority
* NOTE(review): errmgr_hnp_component_query reports priority 10 on the
* HNP rather than this value - confirm which one is meant to apply
*/
1
};
/*
 * Component open function: nothing is set up here, so it simply
 * reports ORTE_SUCCESS.
 */
static int errmgr_hnp_open(void)
{
    return ORTE_SUCCESS;
}
/*
 * Component close function: nothing is torn down here, so it simply
 * reports ORTE_SUCCESS.
 */
static int errmgr_hnp_close(void)
{
    return ORTE_SUCCESS;
}
/*
 * Component query function.
 *
 * On the HNP, hands back the hnp errmgr module with priority 10 and
 * returns ORTE_SUCCESS.  In any other kind of process no module is
 * offered: *module is set to NULL, *priority to -1, and ORTE_ERROR is
 * returned.
 */
static int errmgr_hnp_component_query(mca_base_module_t **module, int *priority)
{
    if (!ORTE_PROC_IS_HNP) {
        /* only the HNP may use this module */
        *priority = -1;
        *module = NULL;
        return ORTE_ERROR;
    }

    /* keep our priority low so that other modules are higher
     * and will run before us
     */
    *priority = 10;
    *module = (mca_base_module_t *)&orte_errmgr_hnp_module;
    return ORTE_SUCCESS;
}

Просмотреть файл

@ -0,0 +1,19 @@
-*- text -*-
#
# Copyright (c) 2010 Cisco Systems, Inc. All rights reserved.
#
# $COPYRIGHT$
#
# Additional copyrights may follow
#
# $HEADER$
#
# This is the US/English general help file for ORTE Errmgr HNP module.
#
[errmgr-hnp:unknown-job-error]
An error has occurred in an unknown job. This generally should not happen
except due to an internal ORTE error.
Job state: %s
This information should probably be reported to the OMPI developers.

36
orte/mca/errmgr/orted/Makefile.am Обычный файл
Просмотреть файл

@ -0,0 +1,36 @@
#
# Copyright (c) 2010 Cisco Systems, Inc. All rights reserved.
# $COPYRIGHT$
#
# Additional copyrights may follow
#
# $HEADER$
#
# Help text shipped with, and installed for, the orted errmgr component
dist_pkgdata_DATA = help-orte-errmgr-orted.txt
# Source files that make up the component
sources = \
errmgr_orted.h \
errmgr_orted_component.c \
errmgr_orted.c
# Make the output library in this directory, and name it either
# mca_<type>_<name>.la (for DSO builds) or libmca_<type>_<name>.la
# (for static builds).
if OMPI_BUILD_errmgr_orted_DSO
component_noinst =
component_install = mca_errmgr_orted.la
else
component_noinst = libmca_errmgr_orted.la
component_install =
endif
# DSO build: the component is installed into the package library directory
mcacomponentdir = $(pkglibdir)
mcacomponent_LTLIBRARIES = $(component_install)
mca_errmgr_orted_la_SOURCES = $(sources)
mca_errmgr_orted_la_LDFLAGS = -module -avoid-version
# Static build: the component is built as a non-installed libtool library
noinst_LTLIBRARIES = $(component_noinst)
libmca_errmgr_orted_la_SOURCES =$(sources)
libmca_errmgr_orted_la_LDFLAGS = -module -avoid-version

22
orte/mca/errmgr/orted/configure.params Обычный файл
Просмотреть файл

@ -0,0 +1,22 @@
# -*- shell-script -*-
#
# Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
# University Research and Technology
# Corporation. All rights reserved.
# Copyright (c) 2004-2005 The University of Tennessee and The University
# of Tennessee Research Foundation. All rights
# reserved.
# Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
# University of Stuttgart. All rights reserved.
# Copyright (c) 2004-2005 The Regents of the University of California.
# All rights reserved.
# Copyright (c) 2007 Los Alamos National Security, LLC. All rights
# reserved.
# $COPYRIGHT$
#
# Additional copyrights may follow
#
# $HEADER$
#
# Files for this component - presumably the list of files that configure
# should generate; confirm against the build system's use of this variable
PARAM_CONFIG_FILES="Makefile"

665
orte/mca/errmgr/orted/errmgr_orted.c Обычный файл
Просмотреть файл

@ -0,0 +1,665 @@
/*
* Copyright (c) 2009-2010 The Trustees of Indiana University.
* All rights reserved.
*
* Copyright (c) 2010 Cisco Systems, Inc. All rights reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
*
* $HEADER$
*/
#include "orte_config.h"
#include <sys/types.h>
#ifdef HAVE_UNISTD_H
#include <unistd.h>
#endif /* HAVE_UNISTD_H */
#ifdef HAVE_STRING_H
#include <string.h>
#endif
#include "opal/util/output.h"
#include "orte/util/error_strings.h"
#include "orte/util/name_fns.h"
#include "orte/util/proc_info.h"
#include "orte/util/session_dir.h"
#include "orte/util/show_help.h"
#include "orte/runtime/orte_globals.h"
#include "opal/dss/dss.h"
#include "orte/mca/rml/rml.h"
#include "orte/mca/odls/odls_types.h"
#include "orte/mca/odls/base/base.h"
#include "orte/mca/plm/base/base.h"
#include "orte/mca/errmgr/errmgr.h"
#include "orte/mca/errmgr/base/base.h"
#include "errmgr_orted.h"
/* Local functions */
static bool any_live_children(orte_jobid_t job);
static int pack_state_update(opal_buffer_t *alert, orte_odls_job_t *jobdat);
static int pack_state_for_proc(opal_buffer_t *alert, orte_odls_child_t *child);
static bool all_children_registered(orte_jobid_t job);
static int pack_child_contact_info(orte_jobid_t job, opal_buffer_t *buf);
static void failed_start(orte_odls_job_t *jobdat, orte_exit_code_t exit_code);
static void update_local_children(orte_odls_job_t *jobdat, orte_job_state_t jobstate, orte_proc_state_t state);
/*
* Module functions: Global
*/
static int init(void);
static int finalize(void);
static int predicted_fault(char ***proc_list,
char ***node_list,
char ***suggested_nodes,
orte_errmgr_stack_state_t *stack_state);
static int update_state(orte_jobid_t job,
orte_job_state_t jobstate,
orte_process_name_t *proc,
orte_proc_state_t state,
orte_exit_code_t exit_code,
orte_errmgr_stack_state_t *stack_state);
static int suggest_map_targets(orte_proc_t *proc,
orte_node_t *oldnode,
opal_list_t *node_list,
orte_errmgr_stack_state_t *stack_state);
static int ft_event(int state);
/******************
* ORTED module
*
* Function table for the orted errmgr module; every entry refers to
* a function in this file.
******************/
orte_errmgr_base_module_t orte_errmgr_orted_module = {
init,
finalize,
update_state,
predicted_fault,
suggest_map_targets,
ft_event
};
/************************
* API Definitions
************************/
/*
 * Module init hook for the orted errmgr.
 * Performs no setup and always reports ORTE_SUCCESS.
 */
static int init(void)
{
    return ORTE_SUCCESS;
}
/*
 * Module finalize hook for the orted errmgr.
 * Performs no teardown and always reports ORTE_SUCCESS.
 */
static int finalize(void)
{
    return ORTE_SUCCESS;
}
static int update_state(orte_jobid_t job,
orte_job_state_t jobstate,
orte_process_name_t *proc,
orte_proc_state_t state,
orte_exit_code_t exit_code,
orte_errmgr_stack_state_t *stack_state)
{
opal_list_item_t *item, *next;
orte_odls_job_t *jobdat;
orte_odls_child_t *child;
opal_buffer_t alert;
orte_plm_cmd_flag_t cmd;
int rc;
orte_vpid_t null=ORTE_VPID_INVALID;
/* indicate that this is the end of the line */
*stack_state |= ORTE_ERRMGR_STACK_STATE_COMPLETE;
/*** UPDATE COMMAND FOR A JOB ***/
if (NULL == proc) {
/* this is an update for an entire job */
if (ORTE_JOBID_INVALID == job) {
/* whatever happened, we don't know what job
* it happened to
*/
orte_show_help("help-orte-errmgr-orted.txt", "errmgr-orted:unknown-job-error",
true, orte_job_state_to_str(jobstate));
OBJ_CONSTRUCT(&alert, opal_buffer_t);
/* pack update state command */
cmd = ORTE_PLM_UPDATE_PROC_STATE;
if (ORTE_SUCCESS != (rc = opal_dss.pack(&alert, &cmd, 1, ORTE_PLM_CMD))) {
ORTE_ERROR_LOG(rc);
return rc;
}
/* pack the "invalid" jobid */
if (ORTE_SUCCESS != (rc = opal_dss.pack(&alert, &job, 1, ORTE_JOBID))) {
ORTE_ERROR_LOG(rc);
return rc;
}
if (0 > (rc = orte_rml.send_buffer(ORTE_PROC_MY_HNP, &alert, ORTE_RML_TAG_PLM, 0))) {
ORTE_ERROR_LOG(rc);
} else {
rc = ORTE_SUCCESS;
}
OBJ_DESTRUCT(&alert);
return rc;
}
/* lookup the local jobdat for this job */
jobdat = NULL;
for (item = opal_list_get_first(&orte_local_jobdata);
item != opal_list_get_end(&orte_local_jobdata);
item = opal_list_get_next(item)) {
jobdat = (orte_odls_job_t*)item;
/* is this the specified job? */
if (jobdat->jobid == job) {
break;
}
}
if (NULL == jobdat) {
return ORTE_ERR_NOT_FOUND;
}
switch (jobstate) {
case ORTE_JOB_STATE_FAILED_TO_START:
failed_start(jobdat, exit_code);
break;
case ORTE_JOB_STATE_RUNNING:
/* update all local child states */
update_local_children(jobdat, jobstate, ORTE_PROC_STATE_RUNNING);
break;
default:
break;
}
OBJ_CONSTRUCT(&alert, opal_buffer_t);
/* pack update state command */
cmd = ORTE_PLM_UPDATE_PROC_STATE;
if (ORTE_SUCCESS != (rc = opal_dss.pack(&alert, &cmd, 1, ORTE_PLM_CMD))) {
ORTE_ERROR_LOG(rc);
goto FINAL_CLEANUP;
}
/* pack the job info */
if (ORTE_SUCCESS != (rc = pack_state_update(&alert, jobdat))) {
ORTE_ERROR_LOG(rc);
}
/* send it */
if (0 > (rc = orte_rml.send_buffer(ORTE_PROC_MY_HNP, &alert, ORTE_RML_TAG_PLM, 0))) {
ORTE_ERROR_LOG(rc);
} else {
rc = ORTE_SUCCESS;
}
OBJ_DESTRUCT(&alert);
return rc;
}
/*** UPDATE COMMAND FOR A SPECIFIC PROCESS ***/
if (ORTE_PROC_STATE_TERMINATED < state) {
/* if the job hasn't completed and the state is abnormally
* terminated, then we need to alert the HNP right away
*/
OBJ_CONSTRUCT(&alert, opal_buffer_t);
/* pack update state command */
cmd = ORTE_PLM_UPDATE_PROC_STATE;
if (ORTE_SUCCESS != (rc = opal_dss.pack(&alert, &cmd, 1, ORTE_PLM_CMD))) {
ORTE_ERROR_LOG(rc);
goto FINAL_CLEANUP;
}
/* pack only the data for this proc - have to start with the jobid
* so the receiver can unpack it correctly
*/
if (ORTE_SUCCESS != (rc = opal_dss.pack(&alert, &proc->jobid, 1, ORTE_JOBID))) {
ORTE_ERROR_LOG(rc);
return rc;
}
/* find this proc in the local children */
for (item = opal_list_get_first(&orte_local_children);
item != opal_list_get_end(&orte_local_children);
item = opal_list_get_next(item)) {
child = (orte_odls_child_t*)item;
if (child->name->jobid == proc->jobid &&
child->name->vpid == proc->vpid) {
if (ORTE_PROC_STATE_UNTERMINATED > child->state) {
child->state = state;
}
/* now pack the child's info */
if (ORTE_SUCCESS != (rc = pack_state_for_proc(&alert, child))) {
ORTE_ERROR_LOG(rc);
return rc;
}
/* remove the child from our local list as it is no longer alive */
opal_list_remove_item(&orte_local_children, &child->super);
OPAL_OUTPUT_VERBOSE((5, orte_errmgr_base_output,
"%s errmgr:orted reporting proc %s aborted to HNP",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
ORTE_NAME_PRINT(child->name)));
/* release the child object */
OBJ_RELEASE(child);
/* done with loop */
break;
}
}
/* send it */
if (0 > (rc = orte_rml.send_buffer(ORTE_PROC_MY_HNP, &alert, ORTE_RML_TAG_PLM, 0))) {
ORTE_ERROR_LOG(rc);
} else {
rc = ORTE_SUCCESS;
}
OBJ_DESTRUCT(&alert);
return rc;
}
/* find this proc in the local children so we can update its state */
for (item = opal_list_get_first(&orte_local_children);
item != opal_list_get_end(&orte_local_children);
item = opal_list_get_next(item)) {
child = (orte_odls_child_t*)item;
if (child->name->jobid == proc->jobid &&
child->name->vpid == proc->vpid) {
if (ORTE_PROC_STATE_UNTERMINATED > child->state) {
child->state = state;
}
/* done with loop */
break;
}
}
if (ORTE_PROC_STATE_REGISTERED == state) {
/* see if everyone in this job has registered */
if (all_children_registered(proc->jobid)) {
/* once everyone registers, send their contact info to
* the HNP so it is available to debuggers and anyone
* else that needs it
*/
OPAL_OUTPUT_VERBOSE((5, orte_errmgr_base_output,
"%s errmgr:orted: sending contact info to HNP",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
OBJ_CONSTRUCT(&alert, opal_buffer_t);
/* pack init routes command */
cmd = ORTE_PLM_INIT_ROUTES_CMD;
if (ORTE_SUCCESS != (rc = opal_dss.pack(&alert, &cmd, 1, ORTE_PLM_CMD))) {
ORTE_ERROR_LOG(rc);
goto FINAL_CLEANUP;
}
/* pack the jobid */
if (ORTE_SUCCESS != (rc = opal_dss.pack(&alert, &proc->jobid, 1, ORTE_JOBID))) {
ORTE_ERROR_LOG(rc);
goto FINAL_CLEANUP;
}
/* pack all the local child vpids */
for (item = opal_list_get_first(&orte_local_children);
item != opal_list_get_end(&orte_local_children);
item = opal_list_get_next(item)) {
child = (orte_odls_child_t*)item;
if (child->name->jobid == proc->jobid) {
if (ORTE_SUCCESS != (rc = opal_dss.pack(&alert, &child->name->vpid, 1, ORTE_VPID))) {
ORTE_ERROR_LOG(rc);
goto FINAL_CLEANUP;
}
}
}
/* pack an invalid marker */
if (ORTE_SUCCESS != (rc = opal_dss.pack(&alert, &null, 1, ORTE_VPID))) {
ORTE_ERROR_LOG(rc);
goto FINAL_CLEANUP;
}
/* add in contact info for all procs in the job */
if (ORTE_SUCCESS != (rc = pack_child_contact_info(proc->jobid, &alert))) {
ORTE_ERROR_LOG(rc);
OBJ_DESTRUCT(&alert);
return rc;
}
/* send it */
if (0 > (rc = orte_rml.send_buffer(ORTE_PROC_MY_HNP, &alert, ORTE_RML_TAG_PLM, 0))) {
ORTE_ERROR_LOG(rc);
} else {
rc = ORTE_SUCCESS;
}
OBJ_DESTRUCT(&alert);
}
return rc;
}
/* only other state is terminated - see if anyone is left alive */
if (!any_live_children(proc->jobid)) {
/* lookup the local jobdat for this job */
jobdat = NULL;
for (item = opal_list_get_first(&orte_local_jobdata);
item != opal_list_get_end(&orte_local_jobdata);
item = opal_list_get_next(item)) {
jobdat = (orte_odls_job_t*)item;
/* is this the specified job? */
if (jobdat->jobid == proc->jobid) {
break;
}
}
if (NULL == jobdat) {
/* race condition - may not have been formed yet */
return ORTE_SUCCESS;
}
OBJ_CONSTRUCT(&alert, opal_buffer_t);
/* pack update state command */
cmd = ORTE_PLM_UPDATE_PROC_STATE;
if (ORTE_SUCCESS != (rc = opal_dss.pack(&alert, &cmd, 1, ORTE_PLM_CMD))) {
ORTE_ERROR_LOG(rc);
goto FINAL_CLEANUP;
}
/* pack the data for the job */
if (ORTE_SUCCESS != (rc = pack_state_update(&alert, jobdat))) {
ORTE_ERROR_LOG(rc);
}
FINAL_CLEANUP:
OPAL_OUTPUT_VERBOSE((5, orte_errmgr_base_output,
"%s errmgr:orted reporting all procs in %s terminated",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
ORTE_JOBID_PRINT(jobdat->jobid)));
/* remove all of this job's children from the global list - do not lock
* the thread as we are already locked
*/
for (item = opal_list_get_first(&orte_local_children);
item != opal_list_get_end(&orte_local_children);
item = next) {
child = (orte_odls_child_t*)item;
next = opal_list_get_next(item);
if (jobdat->jobid == child->name->jobid) {
opal_list_remove_item(&orte_local_children, &child->super);
OBJ_RELEASE(child);
}
}
/* ensure the job's local session directory tree is removed */
orte_session_dir_cleanup(jobdat->jobid);
/* remove this job from our local job data since it is complete */
opal_list_remove_item(&orte_local_jobdata, &jobdat->super);
OBJ_RELEASE(jobdat);
/* send it */
if (0 > (rc = orte_rml.send_buffer(ORTE_PROC_MY_HNP, &alert, ORTE_RML_TAG_PLM, 0))) {
ORTE_ERROR_LOG(rc);
} else {
rc = ORTE_SUCCESS;
}
OBJ_DESTRUCT(&alert);
/* indicate that the job is complete */
return ORTE_ERR_SILENT;
}
return ORTE_SUCCESS;
}
/*
 * Predicted-fault entry point for this module.
 *
 * No prediction handling exists here: none of the arguments are read or
 * written, and the call always reports ORTE_ERR_NOT_IMPLEMENTED.
 */
static int predicted_fault(char ***proc_list,
                           char ***node_list,
                           char ***suggested_nodes,
                           orte_errmgr_stack_state_t *stack_state)
{
    /* nothing is consulted - mark the arguments as intentionally unused */
    (void)proc_list;
    (void)node_list;
    (void)suggested_nodes;
    (void)stack_state;

    return ORTE_ERR_NOT_IMPLEMENTED;
}
/*
 * Map-target suggestion entry point for this module.
 *
 * No suggestions are produced: node_list is left untouched, the other
 * arguments are ignored, and ORTE_ERR_NOT_IMPLEMENTED is always returned.
 */
static int suggest_map_targets(orte_proc_t *proc,
                               orte_node_t *oldnode,
                               opal_list_t *node_list,
                               orte_errmgr_stack_state_t *stack_state)
{
    /* nothing is consulted - mark the arguments as intentionally unused */
    (void)proc;
    (void)oldnode;
    (void)node_list;
    (void)stack_state;

    return ORTE_ERR_NOT_IMPLEMENTED;
}
/*
 * Fault-tolerance event notification.
 *
 * There is no per-state work to do in this module, so every value of
 * state is acknowledged with ORTE_SUCCESS.
 */
int ft_event(int state)
{
    /* the state value is deliberately ignored */
    (void)state;

    return ORTE_SUCCESS;
}
/*****************
* Local Functions
*****************/
/*
 * Report whether at least one entry on orte_local_children that belongs
 * to the given job still has its "alive" flag set.
 *
 * Passing ORTE_JOBID_WILDCARD as the job matches children of every job.
 *
 * No locking is done here; per the original note, the caller already
 * holds the lock that protects the list.
 */
static bool any_live_children(orte_jobid_t job)
{
    opal_list_item_t *cur;
    orte_odls_child_t *kid;
    bool in_job;

    for (cur = opal_list_get_first(&orte_local_children);
         cur != opal_list_get_end(&orte_local_children);
         cur = opal_list_get_next(cur)) {
        kid = (orte_odls_child_t*)cur;

        /* does this child belong to the job we were asked about? */
        in_job = (job == kid->name->jobid) || (ORTE_JOBID_WILDCARD == job);
        if (!in_job) {
            continue;
        }

        /* one survivor is enough to answer the question */
        if (kid->alive) {
            return true;
        }
    }

    /* walked the whole list without finding a live member of the job */
    return false;
}
/*
 * Append one child's status record to the alert buffer.
 *
 * Fields are packed in this order:
 *   1. vpid
 *   2. pid
 *   3. launch time as two int64 values (seconds, then microseconds) -
 *      only when orte_timing is set
 *   4. process state
 *   5. exit code
 *
 * Returns ORTE_SUCCESS, or the first pack error (which is also logged).
 * On error the buffer may already contain the fields packed so far.
 */
static int pack_state_for_proc(opal_buffer_t *alert, orte_odls_child_t *child)
{
    int rc;
    int64_t stamp;

    /* identify the process by its vpid */
    rc = opal_dss.pack(alert, &(child->name->vpid), 1, ORTE_VPID);
    if (ORTE_SUCCESS != rc) {
        ORTE_ERROR_LOG(rc);
        return rc;
    }

    /* its local pid */
    rc = opal_dss.pack(alert, &child->pid, 1, OPAL_PID);
    if (ORTE_SUCCESS != rc) {
        ORTE_ERROR_LOG(rc);
        return rc;
    }

    /* when timing is enabled, include the time the proc was launched */
    if (orte_timing) {
        stamp = child->starttime.tv_sec;
        rc = opal_dss.pack(alert, &stamp, 1, OPAL_INT64);
        if (ORTE_SUCCESS != rc) {
            ORTE_ERROR_LOG(rc);
            return rc;
        }

        stamp = child->starttime.tv_usec;
        rc = opal_dss.pack(alert, &stamp, 1, OPAL_INT64);
        if (ORTE_SUCCESS != rc) {
            ORTE_ERROR_LOG(rc);
            return rc;
        }
    }

    /* current state of the process */
    rc = opal_dss.pack(alert, &child->state, 1, ORTE_PROC_STATE);
    if (ORTE_SUCCESS != rc) {
        ORTE_ERROR_LOG(rc);
        return rc;
    }

    /* and finally its exit code */
    rc = opal_dss.pack(alert, &child->exit_code, 1, ORTE_EXIT_CODE);
    if (ORTE_SUCCESS != rc) {
        ORTE_ERROR_LOG(rc);
        return rc;
    }

    return ORTE_SUCCESS;
}
/*
 * Pack a status report for every local child of the given job.
 *
 * Layout written to the alert buffer:
 *   - the jobid
 *   - when orte_timing is set, the time the launch message for this job
 *     was received (seconds, then microseconds, each as int64)
 *   - one pack_state_for_proc() record per child on orte_local_children
 *     whose jobid matches
 *   - a trailing ORTE_VPID_INVALID marking the end of the job's records
 *
 * Returns ORTE_SUCCESS, or the first error encountered (logged here).
 */
static int pack_state_update(opal_buffer_t *alert, orte_odls_job_t *jobdat)
{
    int rc;
    int64_t stamp;
    opal_list_item_t *cur;
    orte_odls_child_t *kid;
    orte_vpid_t end_marker = ORTE_VPID_INVALID;

    /* lead with the jobid so the receiver knows what follows */
    rc = opal_dss.pack(alert, &jobdat->jobid, 1, ORTE_JOBID);
    if (ORTE_SUCCESS != rc) {
        ORTE_ERROR_LOG(rc);
        return rc;
    }

    /* when timing is enabled, report when the launch msg arrived */
    if (orte_timing) {
        stamp = jobdat->launch_msg_recvd.tv_sec;
        rc = opal_dss.pack(alert, &stamp, 1, OPAL_INT64);
        if (ORTE_SUCCESS != rc) {
            ORTE_ERROR_LOG(rc);
            return rc;
        }

        stamp = jobdat->launch_msg_recvd.tv_usec;
        rc = opal_dss.pack(alert, &stamp, 1, OPAL_INT64);
        if (ORTE_SUCCESS != rc) {
            ORTE_ERROR_LOG(rc);
            return rc;
        }
    }

    /* one record per child that belongs to this job */
    for (cur = opal_list_get_first(&orte_local_children);
         cur != opal_list_get_end(&orte_local_children);
         cur = opal_list_get_next(cur)) {
        kid = (orte_odls_child_t*)cur;
        if (kid->name->jobid != jobdat->jobid) {
            continue;
        }

        rc = pack_state_for_proc(alert, kid);
        if (ORTE_SUCCESS != rc) {
            ORTE_ERROR_LOG(rc);
            return rc;
        }
    }

    /* terminate the list of records so the receiver knows the job is complete */
    rc = opal_dss.pack(alert, &end_marker, 1, ORTE_VPID);
    if (ORTE_SUCCESS != rc) {
        ORTE_ERROR_LOG(rc);
        return rc;
    }

    return ORTE_SUCCESS;
}
/*
 * Decide whether every local child of the given job is currently
 * registered.
 *
 * For each child of the job on orte_local_children:
 *   - a child whose state is beyond ORTE_PROC_STATE_UNTERMINATED has
 *     terminated and is treated as registered for this check (per the
 *     original note, a NULL rml_uri sent later lets the HNP detect that
 *     it never actually registered)
 *   - otherwise the child must have init_recvd set and fini_recvd clear
 *
 * Returns false at the first child failing that test, true otherwise
 * (including when the job has no local children on the list).
 *
 * No locking is done here; the caller is expected to hold the lock.
 */
static bool all_children_registered(orte_jobid_t job)
{
    opal_list_item_t *cur;
    orte_odls_child_t *kid;

    for (cur = opal_list_get_first(&orte_local_children);
         cur != opal_list_get_end(&orte_local_children);
         cur = opal_list_get_next(cur)) {
        kid = (orte_odls_child_t*)cur;

        /* only children of the requested job are of interest */
        if (OPAL_EQUAL != opal_dss.compare(&kid->name->jobid, &job, ORTE_JOBID)) {
            continue;
        }

        /* already terminated somehow - count it as registered for now */
        if (ORTE_PROC_STATE_UNTERMINATED < kid->state) {
            continue;
        }

        /* not yet registered, or already registered a finalize */
        if (!kid->init_recvd || kid->fini_recvd) {
            return false;
        }
    }

    /* nobody objected - everyone in the job is currently registered */
    return true;
}
/*
 * Pack the contact info of every local child of the given job into buf.
 *
 * For each matching child on orte_local_children two items are packed:
 * its vpid, then its rml_uri string. The vpid is always written so the
 * entry can be identified even when rml_uri is NULL.
 *
 * Returns ORTE_SUCCESS, or the first pack error (which is also logged).
 *
 * No locking is done here; the caller is expected to hold the lock.
 */
static int pack_child_contact_info(orte_jobid_t job, opal_buffer_t *buf)
{
    opal_list_item_t *cur;
    orte_odls_child_t *kid;
    int rc;

    for (cur = opal_list_get_first(&orte_local_children);
         cur != opal_list_get_end(&orte_local_children);
         cur = opal_list_get_next(cur)) {
        kid = (orte_odls_child_t*)cur;

        /* skip children that belong to some other job */
        if (OPAL_EQUAL != opal_dss.compare(&kid->name->jobid, &job, ORTE_JOBID)) {
            continue;
        }

        /* vpid goes first - needed in case rml_uri is NULL */
        rc = opal_dss.pack(buf, &(kid->name->vpid), 1, ORTE_VPID);
        if (ORTE_SUCCESS != rc) {
            ORTE_ERROR_LOG(rc);
            return rc;
        }

        /* followed by the contact string itself */
        rc = opal_dss.pack(buf, &kid->rml_uri, 1, OPAL_STRING);
        if (ORTE_SUCCESS != rc) {
            ORTE_ERROR_LOG(rc);
            return rc;
        }
    }

    return ORTE_SUCCESS;
}
/*
 * Record that a job failed to start on this daemon.
 *
 * Sets the job's state to ORTE_JOB_STATE_FAILED_TO_START, then walks
 * orte_local_children. Every child of the job that never launched
 * (state below ORTE_PROC_STATE_LAUNCHED, or equal to
 * ORTE_PROC_STATE_FAILED_TO_START) has its iof_complete and
 * waitpid_recvd flags set.
 *
 * @param jobdat     local job data for the job that failed to start
 * @param exit_code  currently unused by this function
 */
static void failed_start(orte_odls_job_t *jobdat, orte_exit_code_t exit_code)
{
    opal_list_item_t *item;
    orte_odls_child_t *child;

    /* accepted for interface symmetry - nothing here consumes it */
    (void)exit_code;

    /* set the state */
    jobdat->state = ORTE_JOB_STATE_FAILED_TO_START;

    for (item = opal_list_get_first(&orte_local_children);
         item != opal_list_get_end(&orte_local_children);
         item = opal_list_get_next(item)) {
        child = (orte_odls_child_t*)item;
        if (child->name->jobid != jobdat->jobid) {
            continue;
        }
        if (ORTE_PROC_STATE_LAUNCHED > child->state ||
            ORTE_PROC_STATE_FAILED_TO_START == child->state) {
            /* this proc never launched - flag that the iof
             * is complete or else we will hang waiting for
             * pipes to close that were never opened
             */
            child->iof_complete = true;
            /* ditto for waitpid */
            child->waitpid_recvd = true;
        }
    }

    /* tag the message with this module's name ("orted") so the output
     * is not mistaken for a report from the HNP module
     */
    OPAL_OUTPUT_VERBOSE((1, orte_errmgr_base_output,
                         "%s errmgr:orted: job %s reported incomplete start",
                         ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                         ORTE_JOBID_PRINT(jobdat->jobid)));
}
/*
 * Stamp a job and all of its local children with new states.
 *
 * jobdat->state is set to jobstate, and every entry on
 * orte_local_children whose jobid matches jobdat->jobid has its state
 * overwritten with the given proc state. Children of other jobs are
 * left alone.
 */
static void update_local_children(orte_odls_job_t *jobdat, orte_job_state_t jobstate, orte_proc_state_t state)
{
    opal_list_item_t *cur;
    orte_odls_child_t *kid;

    /* the job itself first */
    jobdat->state = jobstate;

    /* then each of its children */
    for (cur = opal_list_get_first(&orte_local_children);
         cur != opal_list_get_end(&orte_local_children);
         cur = opal_list_get_next(cur)) {
        kid = (orte_odls_child_t*)cur;
        if (jobdat->jobid != kid->name->jobid) {
            continue;
        }
        kid->state = state;
    }
}

35
orte/mca/errmgr/orted/errmgr_orted.h Обычный файл
Просмотреть файл

@ -0,0 +1,35 @@
/*
* Copyright (c) 2010 Cisco Systems, Inc. All rights reserved.
*
* $COPYRIGHT$
*
* Additional copyrights may follow
*
* $HEADER$
*/
/**
 * @file
 *
 * Declarations exported by the "orted" errmgr component: the component
 * descriptor and the errmgr module object that the component's query
 * function hands back.
 */
#ifndef MCA_ERRMGR_ORTED_EXPORT_H
#define MCA_ERRMGR_ORTED_EXPORT_H
#include "orte_config.h"
#include "orte/mca/errmgr/errmgr.h"
BEGIN_C_DECLS
/*
 * Local Component structures
 */
/* component descriptor - defined in the component source file */
ORTE_MODULE_DECLSPEC extern orte_errmgr_base_component_t mca_errmgr_orted_component;
/* module object returned by the component query - defined elsewhere */
ORTE_DECLSPEC extern orte_errmgr_base_module_t orte_errmgr_orted_module;
END_C_DECLS
#endif /* MCA_ERRMGR_ORTED_EXPORT_H */

Просмотреть файл

@ -0,0 +1,91 @@
/*
* Copyright (c) 2010 Cisco Systems, Inc. All rights reserved.
*
* $COPYRIGHT$
*
* Additional copyrights may follow
*
* $HEADER$
*/
#include "orte_config.h"
#include "opal/util/output.h"
#include "orte/mca/errmgr/errmgr.h"
#include "orte/mca/errmgr/base/base.h"
#include "errmgr_orted.h"
/*
* Public string for version number
*/
const char *orte_errmgr_orted_component_version_string =
"ORTE ERRMGR orted MCA component version " ORTE_VERSION;
/*
* Local functionality
*/
static int errmgr_orted_open(void);
static int errmgr_orted_close(void);
static int errmgr_orted_component_query(mca_base_module_t **module, int *priority);
/*
* Instantiate the public struct with all of our public information
* and pointer to our public functions in it
*/
orte_errmgr_base_component_t mca_errmgr_orted_component =
{
/* Handle the general mca_component_t struct containing
 * meta information about the component itself
 */
{
ORTE_ERRMGR_BASE_VERSION_3_0_0,
/* Component name and version */
"orted",
ORTE_MAJOR_VERSION,
ORTE_MINOR_VERSION,
ORTE_RELEASE_VERSION,
/* Component open and close functions */
errmgr_orted_open,
errmgr_orted_close,
errmgr_orted_component_query
},
{
/* The component is checkpoint ready */
MCA_BASE_METADATA_PARAM_CHECKPOINT
},
/* Verbosity level */
0,
/* opal_output handler */
-1,
/* Default priority
 * NOTE(review): errmgr_orted_component_query reports priority 10 for
 * daemons rather than this value - confirm which one is intended.
 */
1
};
/*
 * Component open function, listed in the component descriptor.
 * There is nothing to set up, so it always returns ORTE_SUCCESS.
 */
static int errmgr_orted_open(void)
{
return ORTE_SUCCESS;
}
/*
 * Component close function, listed in the component descriptor.
 * There is nothing to tear down, so it always returns ORTE_SUCCESS.
 */
static int errmgr_orted_close(void)
{
return ORTE_SUCCESS;
}
/*
 * Component query: offer the orted errmgr module to daemons only.
 *
 * When ORTE_PROC_IS_DAEMON is true, *priority is set to 10, *module is
 * pointed at orte_errmgr_orted_module, and ORTE_SUCCESS is returned.
 * For any other process type, *priority is set to -1, *module to NULL,
 * and ORTE_ERROR is returned.
 */
static int errmgr_orted_component_query(mca_base_module_t **module, int *priority)
{
    /* anything that is not a daemon has no use for this module */
    if (!ORTE_PROC_IS_DAEMON) {
        *priority = -1;
        *module = NULL;
        return ORTE_ERROR;
    }

    /* keep our priority low so that other modules are higher
     * and will run before us
     */
    *priority = 10;
    *module = (mca_base_module_t *)&orte_errmgr_orted_module;
    return ORTE_SUCCESS;
}

Просмотреть файл

@ -0,0 +1,14 @@
-*- text -*-
#
# Copyright (c) 2009-2010 The Trustees of Indiana University and Indiana
# University Research and Technology
# Corporation. All rights reserved.
#
# $COPYRIGHT$
#
# Additional copyrights may follow
#
# $HEADER$
#
# This is the US/English general help file for the ORTE ERRMGR orted component.
#

Просмотреть файл

@ -57,6 +57,7 @@
#include "orte/util/nidmap.h"
#include "orte/util/regex.h"
#include "orte/util/show_help.h"
#include "orte/mca/errmgr/base/base.h"
#include "orte/mca/notifier/base/base.h"
#include "orte/mca/rmcast/base/base.h"
#include "orte/mca/state/base/base.h"
@ -225,6 +226,21 @@ int orte_ess_base_orted_setup(char **hosts)
goto error;
}
/* set the communication function */
orte_comm = orte_global_comm;
/* open/select the errmgr */
if (ORTE_SUCCESS != (ret = orte_errmgr_base_open())) {
ORTE_ERROR_LOG(ret);
error = "orte_errmgr_base_open";
goto error;
}
if (ORTE_SUCCESS != (ret = orte_errmgr_base_select())) {
ORTE_ERROR_LOG(ret);
error = "orte_errmgr_base_select";
goto error;
}
/* initialize the nidmaps */
if (ORTE_SUCCESS != (ret = orte_util_nidmap_init(NULL))) {
ORTE_ERROR_LOG(ret);
@ -408,17 +424,11 @@ error:
"orte_init:startup:internal-failure",
true, error, ORTE_ERROR_NAME(ret), ret);
/* cleanup the global list of local children and job data */
OBJ_DESTRUCT(&orte_local_children);
OBJ_DESTRUCT(&orte_local_jobdata);
return ret;
}
int orte_ess_base_orted_finalize(void)
{
opal_list_item_t *item;
/* ensure all the orteds depart together */
if (!orte_abnormal_term_ordered) {
/* if we are abnormally terminating, don't attempt
@ -455,16 +465,6 @@ int orte_ess_base_orted_finalize(void)
orte_routed_base_close();
orte_rml_base_close();
/* cleanup the global list of local children and job data */
while (NULL != (item = opal_list_remove_first(&orte_local_children))) {
OBJ_RELEASE(item);
}
OBJ_DESTRUCT(&orte_local_children);
while (NULL != (item = opal_list_remove_first(&orte_local_jobdata))) {
OBJ_RELEASE(item);
}
OBJ_DESTRUCT(&orte_local_jobdata);
/* cleanup any lingering session directories */
orte_session_dir_cleanup(ORTE_JOBID_WILDCARD);

Просмотреть файл

@ -115,10 +115,6 @@ static int rte_init(void)
orte_proc_t *proc;
int value;
/* initialize the global list of local children and job data */
OBJ_CONSTRUCT(&orte_local_children, opal_list_t);
OBJ_CONSTRUCT(&orte_local_jobdata, opal_list_t);
/* run the prolog */
if (ORTE_SUCCESS != (ret = orte_ess_base_std_prolog())) {
error = "orte_ess_base_std_prolog";
@ -325,6 +321,9 @@ static int rte_init(void)
goto error;
}
/* set the communication function */
orte_comm = orte_global_comm;
/* we are an hnp, so update the contact info field for later use */
orte_process_info.my_hnp_uri = orte_rml.get_contact_info();
@ -577,17 +576,12 @@ error:
true, error, ORTE_ERROR_NAME(ret), ret);
}
/* cleanup the global list of local children and job data */
OBJ_DESTRUCT(&orte_local_children);
OBJ_DESTRUCT(&orte_local_jobdata);
return ret;
}
static int rte_finalize(void)
{
char *contact_path;
opal_list_item_t *item;
orte_node_t *node;
orte_job_t *job;
int i;
@ -637,16 +631,6 @@ static int rte_finalize(void)
}
}
/* cleanup the global list of local children and job data */
while (NULL != (item = opal_list_remove_first(&orte_local_children))) {
OBJ_RELEASE(item);
}
OBJ_DESTRUCT(&orte_local_children);
while (NULL != (item = opal_list_remove_first(&orte_local_jobdata))) {
OBJ_RELEASE(item);
}
OBJ_DESTRUCT(&orte_local_jobdata);
/* cleanup the job and node info arrays */
if (NULL != orte_node_pool) {
for (i=0; i < orte_node_pool->size; i++) {

Просмотреть файл

@ -46,6 +46,7 @@
#include "orte/constants.h"
#include "orte/mca/ess/ess.h"
#include "orte/util/error_strings.h"
#include "orte/util/name_fns.h"
#include "orte/util/show_help.h"
#include "orte/runtime/orte_globals.h"

Просмотреть файл

@ -30,6 +30,7 @@
#include "opal/util/show_help.h"
#include "orte/util/error_strings.h"
#include "orte/mca/ess/ess.h"
#include "orte/mca/notifier/base/base.h"
#include "notifier_ftb.h"

Просмотреть файл

@ -41,6 +41,7 @@
#include "orte/constants.h"
#include "orte/mca/ess/ess.h"
#include "orte/util/error_strings.h"
#include "orte/util/name_fns.h"
#include "orte/util/show_help.h"
#include "orte/runtime/orte_globals.h"

Просмотреть файл

@ -34,6 +34,7 @@
#include "opal/util/show_help.h"
#include "orte/mca/ess/ess.h"
#include "orte/util/error_strings.h"
#include "orte/util/name_fns.h"
#include "orte/runtime/orte_globals.h"
#include "orte/mca/notifier/base/base.h"

Просмотреть файл

@ -46,6 +46,7 @@
#include "orte/version.h"
#include "orte/constants.h"
#include "orte/mca/ess/ess.h"
#include "orte/util/error_strings.h"
#include "orte/util/name_fns.h"
#include "orte/util/show_help.h"
#include "orte/runtime/orte_globals.h"

Просмотреть файл

@ -41,13 +41,6 @@ ORTE_DECLSPEC int orte_odls_base_open(void);
#if !ORTE_DISABLE_FULL_SUPPORT
typedef void (*orte_odls_base_cbfunc_t)(int fd, short event, void *data);
typedef int (*orte_odls_base_comm_fn_t)(orte_process_name_t *recipient,
opal_buffer_t *buf,
orte_rml_tag_t tag,
orte_odls_base_cbfunc_t cbfunc);
/**
* Struct to hold globals for the odls framework
*/
@ -60,8 +53,6 @@ typedef struct orte_odls_base_t {
opal_list_t available_components;
/** selected component */
orte_odls_base_component_t selected_component;
/* comm fn for updating state */
orte_odls_base_comm_fn_t comm;
/* warn if binding no-op */
bool warn_if_not_bound;
} orte_odls_base_t;
@ -93,10 +84,6 @@ ORTE_DECLSPEC void orte_base_default_waitpid_fired(orte_process_name_t *proc, in
/* setup singleton job data */
ORTE_DECLSPEC void orte_odls_base_setup_singleton_jobdat(orte_jobid_t jobid);
ORTE_DECLSPEC int orte_odls_base_comm(orte_process_name_t *recipient,
opal_buffer_t *buf, orte_rml_tag_t tag,
orte_odls_base_cbfunc_t cbfunc);
#endif /* ORTE_DISABLE_FULL_SUPPORT */
END_C_DECLS

Просмотреть файл

@ -24,6 +24,7 @@
#include "opal/mca/mca.h"
#include "opal/mca/base/base.h"
#include "opal/class/opal_list.h"
#include "opal/threads/threads.h"
#include "orte/mca/odls/odls.h"
#include "orte/mca/odls/base/base.h"
@ -46,6 +47,20 @@ int orte_odls_base_close(void)
free(orte_odls_globals.dmap);
}
/* cleanup the global list of local children and job data */
while (NULL != (item = opal_list_remove_first(&orte_local_children))) {
OBJ_RELEASE(item);
}
OBJ_DESTRUCT(&orte_local_children);
OBJ_DESTRUCT(&orte_local_children_lock);
OBJ_DESTRUCT(&orte_local_children_cond);
while (NULL != (item = opal_list_remove_first(&orte_local_jobdata))) {
OBJ_RELEASE(item);
}
OBJ_DESTRUCT(&orte_local_jobdata);
OBJ_DESTRUCT(&orte_local_jobdata_lock);
OBJ_DESTRUCT(&orte_local_jobdata_cond);
/* cleanup the sysinfo data */
while (NULL != (item = opal_list_remove_first(&orte_odls_globals.sysinfo))) {
OBJ_RELEASE(item);

Просмотреть файл

@ -77,8 +77,6 @@
#include "orte/mca/odls/base/base.h"
#include "orte/mca/odls/base/odls_private.h"
static void check_proc_complete(orte_odls_child_t *child);
/* IT IS CRITICAL THAT ANY CHANGE IN THE ORDER OF THE INFO PACKED IN
* THIS FUNCTION BE REFLECTED IN THE CONSTRUCT_CHILD_LIST PARSER BELOW
*/
@ -588,7 +586,6 @@ int orte_odls_base_default_construct_child_list(opal_buffer_t *data,
orte_process_name_t proc;
orte_odls_job_t *jobdat=NULL;
opal_byte_object_t *bo;
opal_buffer_t alert;
opal_list_item_t *item;
int8_t flag;
orte_app_idx_t *app_idx=NULL;
@ -601,6 +598,8 @@ int orte_odls_base_default_construct_child_list(opal_buffer_t *data,
"%s odls:constructing child list",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
*job = ORTE_JOBID_INVALID;
/* unpack the flag for regexp */
cnt=1;
if (ORTE_SUCCESS != (rc = opal_dss.unpack(data, &flag, &cnt, OPAL_INT8))) {
@ -1014,19 +1013,9 @@ REPORT_ERROR:
* for it to happen - especially so developers don't have to
* deal with the hang!
*/
OBJ_CONSTRUCT(&alert, opal_buffer_t);
*job = ORTE_JOBID_INVALID;
opal_dss.pack(&alert, job, 1, ORTE_JOBID);
/* send it */
if (ORTE_SUCCESS != (rc = orte_odls_base.comm(ORTE_PROC_MY_HNP,
&alert, ORTE_RML_TAG_APP_LAUNCH_CALLBACK,
orte_plm_base_app_report_launch))) {
ORTE_ERROR_LOG(rc);
}
/* cleanup */
OBJ_DESTRUCT(&alert);
orte_errmgr.update_state(*job, ORTE_JOB_STATE_NEVER_LAUNCHED,
NULL, ORTE_PROC_STATE_UNDEF, rc);
if (NULL != app_idx) {
free(app_idx);
app_idx = NULL;
@ -1165,98 +1154,6 @@ static int odls_base_default_setup_fork(orte_app_context_t *context,
return ORTE_SUCCESS;
}
static int pack_state_for_proc(opal_buffer_t *alert, bool include_startup_info, orte_odls_child_t *child)
{
int rc;
/* pack the child's vpid */
if (ORTE_SUCCESS != (rc = opal_dss.pack(alert, &(child->name->vpid), 1, ORTE_VPID))) {
ORTE_ERROR_LOG(rc);
return rc;
}
/* pack startup info if we need to report it */
if (include_startup_info) {
if (ORTE_SUCCESS != (rc = opal_dss.pack(alert, &child->pid, 1, OPAL_PID))) {
ORTE_ERROR_LOG(rc);
return rc;
}
/* if we are timing things, pack the time the proc was launched */
if (orte_timing) {
int64_t tmp;
tmp = child->starttime.tv_sec;
if (ORTE_SUCCESS != (rc = opal_dss.pack(alert, &tmp, 1, OPAL_INT64))) {
ORTE_ERROR_LOG(rc);
return rc;
}
tmp = child->starttime.tv_usec;
if (ORTE_SUCCESS != (rc = opal_dss.pack(alert, &tmp, 1, OPAL_INT64))) {
ORTE_ERROR_LOG(rc);
return rc;
}
}
}
/* pack its state */
if (ORTE_SUCCESS != (rc = opal_dss.pack(alert, &child->state, 1, ORTE_PROC_STATE))) {
ORTE_ERROR_LOG(rc);
return rc;
}
/* pack its exit code */
if (ORTE_SUCCESS != (rc = opal_dss.pack(alert, &child->exit_code, 1, ORTE_EXIT_CODE))) {
ORTE_ERROR_LOG(rc);
return rc;
}
return ORTE_SUCCESS;
}
static int pack_state_update(opal_buffer_t *alert, bool include_startup_info, orte_odls_job_t *jobdat)
{
int rc;
opal_list_item_t *item, *next;
orte_odls_child_t *child;
orte_vpid_t null=ORTE_VPID_INVALID;
/* pack the jobid */
if (ORTE_SUCCESS != (rc = opal_dss.pack(alert, &jobdat->jobid, 1, ORTE_JOBID))) {
ORTE_ERROR_LOG(rc);
return rc;
}
/* if we are timing things, pack the time the launch msg for this job was recvd */
if (include_startup_info && orte_timing) {
int64_t tmp;
tmp = jobdat->launch_msg_recvd.tv_sec;
if (ORTE_SUCCESS != (rc = opal_dss.pack(alert, &tmp, 1, OPAL_INT64))) {
ORTE_ERROR_LOG(rc);
return rc;
}
tmp = jobdat->launch_msg_recvd.tv_usec;
if (ORTE_SUCCESS != (rc = opal_dss.pack(alert, &tmp, 1, OPAL_INT64))) {
ORTE_ERROR_LOG(rc);
return rc;
}
}
for (item = opal_list_get_first(&orte_local_children);
item != opal_list_get_end(&orte_local_children);
item = next) {
child = (orte_odls_child_t*)item;
next = opal_list_get_next(item);
/* if this child is part of the job... */
if (child->name->jobid == jobdat->jobid) {
if (ORTE_SUCCESS != (rc = pack_state_for_proc(alert, include_startup_info, child))) {
ORTE_ERROR_LOG(rc);
return rc;
}
}
}
/* flag that this job is complete so the receiver can know */
if (ORTE_SUCCESS != (rc = opal_dss.pack(alert, &null, 1, ORTE_VPID))) {
ORTE_ERROR_LOG(rc);
return rc;
}
return ORTE_SUCCESS;
}
/* define a timer release point so that we can wait for
* file descriptors to come available, if necessary
*/
@ -1284,7 +1181,7 @@ int orte_odls_base_default_launch_local(orte_jobid_t job,
orte_odls_child_t *child=NULL;
int num_processors;
bool oversubscribed;
int rc=ORTE_SUCCESS, ret;
int rc=ORTE_SUCCESS;
bool launch_failed=true;
opal_buffer_t alert;
orte_std_cntr_t proc_rank;
@ -1887,6 +1784,7 @@ int orte_odls_base_default_launch_local(orte_jobid_t job,
* across the entire cluster. Instead, we let orterun
* output a consolidated error message for us
*/
child->exit_code = rc;
goto CLEANUP;
} else {
child->alive = true;
@ -1929,23 +1827,10 @@ CLEANUP:
* that didn't launch as having failed, or else we will hang
*/
if (launch_failed) {
for (item = opal_list_get_first(&orte_local_children);
item != opal_list_get_end(&orte_local_children);
item = opal_list_get_next(item)) {
child = (orte_odls_child_t*)item;
if (child->name->jobid == jobdat->jobid) {
if (ORTE_PROC_STATE_LAUNCHED >= child->state) {
child->state = ORTE_PROC_STATE_FAILED_TO_START;
} else if (ORTE_PROC_STATE_FAILED_TO_START == child->state) {
/* this proc never started - flag that the iof
* is complete or else we will hang waiting for
* pipes to close that were never opened
*/
child->iof_complete = true;
/* ditto for waitpid */
child->waitpid_recvd = true;
}
}
if (ORTE_SUCCESS != (rc = orte_errmgr.update_state(jobdat->jobid, ORTE_JOB_STATE_FAILED_TO_START,
NULL, ORTE_PROC_STATE_UNDEF,
child->exit_code))) {
ORTE_ERROR_LOG(rc);
}
} else {
/* if the launch succeeded, check to see if we need to
@ -1967,22 +1852,13 @@ CLEANUP:
fork_local(orte_odls_globals.debugger->apps[0], NULL, NULL, orte_odls_globals.debugger);
orte_odls_globals.debugger_launched = true;
}
}
/* pack the launch results */
if (ORTE_SUCCESS != (ret = pack_state_update(&alert, true, jobdat))) {
ORTE_ERROR_LOG(ret);
}
/* send it */
if (ORTE_SUCCESS != (rc = orte_odls_base.comm(ORTE_PROC_MY_HNP,
&alert, ORTE_RML_TAG_APP_LAUNCH_CALLBACK,
orte_plm_base_app_report_launch))) {
ORTE_ERROR_LOG(rc);
}
OBJ_DESTRUCT(&alert);
if (!launch_failed) {
if (ORTE_SUCCESS != (rc = orte_errmgr.update_state(jobdat->jobid, ORTE_JOB_STATE_RUNNING,
NULL, ORTE_PROC_STATE_UNDEF,
ORTE_ERROR_DEFAULT_EXIT_CODE))) {
ORTE_ERROR_LOG(rc);
}
OPAL_OUTPUT_VERBOSE((5, orte_odls_globals.output,
"%s odls:launch setting waitpids",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
@ -1993,14 +1869,12 @@ CLEANUP:
item = opal_list_get_next(item)) {
child = (orte_odls_child_t*)item;
if (ORTE_PROC_STATE_LAUNCHED == child->state) {
child->state = ORTE_PROC_STATE_RUNNING;
if (child->name->jobid == jobdat->jobid) {
OPAL_THREAD_UNLOCK(&orte_odls_globals.mutex);
orte_wait_cb(child->pid, odls_base_default_wait_local_proc, NULL);
OPAL_THREAD_LOCK(&orte_odls_globals.mutex);
}
}
}
opal_condition_signal(&orte_odls_globals.cond);
@ -2115,81 +1989,6 @@ int orte_odls_base_default_signal_local_procs(const orte_process_name_t *proc, i
return ORTE_ERR_NOT_FOUND;
}
static bool all_children_registered(orte_jobid_t job)
{
opal_list_item_t *item;
orte_odls_child_t *child;
/* the thread is locked elsewhere - don't try to do it again here */
for (item = opal_list_get_first(&orte_local_children);
item != opal_list_get_end(&orte_local_children);
item = opal_list_get_next(item)) {
child = (orte_odls_child_t*)item;
/* is this child part of the specified job? */
if (OPAL_EQUAL == opal_dss.compare(&child->name->jobid, &job, ORTE_JOBID)) {
/* if this child has terminated, we consider it as having
* registered for the purposes of this function. If it never
* did register, then we will send a NULL rml_uri back to
* the HNP, which will then know that the proc did not register.
* If other procs did register, then the HNP can declare an
* abnormal termination
*/
if (ORTE_PROC_STATE_UNTERMINATED < child->state) {
/* this proc has terminated somehow - consider it
* as registered for now
*/
continue;
}
/* if this child is *not* registered yet, return false */
if (!child->init_recvd) {
return false;
}
/* if this child has registered a finalize, return false */
if (child->fini_recvd) {
return false;
}
}
}
/* if we get here, then everyone in the job is currently registered */
return true;
}
static int pack_child_contact_info(orte_jobid_t job, opal_buffer_t *buf)
{
opal_list_item_t *item;
orte_odls_child_t *child;
int rc;
/* the thread is locked elsewhere - don't try to do it again here */
for (item = opal_list_get_first(&orte_local_children);
item != opal_list_get_end(&orte_local_children);
item = opal_list_get_next(item)) {
child = (orte_odls_child_t*)item;
/* is this child part of the specified job? */
if (OPAL_EQUAL == opal_dss.compare(&child->name->jobid, &job, ORTE_JOBID)) {
/* pack the child's vpid - must be done in case rml_uri is NULL */
if (ORTE_SUCCESS != (rc = opal_dss.pack(buf, &(child->name->vpid), 1, ORTE_VPID))) {
ORTE_ERROR_LOG(rc);
return rc;
}
/* pack the contact info */
if (ORTE_SUCCESS != (rc = opal_dss.pack(buf, &child->rml_uri, 1, OPAL_STRING))) {
ORTE_ERROR_LOG(rc);
return rc;
}
}
}
return ORTE_SUCCESS;
}
void orte_odls_base_setup_singleton_jobdat(orte_jobid_t jobid)
{
orte_odls_job_t *jobdat;
@ -2395,38 +2194,9 @@ int orte_odls_base_default_require_sync(orte_process_name_t *proc,
goto CLEANUP;
}
/* now check to see if everyone in this job has registered */
if (all_children_registered(proc->jobid)) {
/* once everyone registers, send their contact info to
* the HNP so it is available to debuggers and anyone
* else that needs it
*/
OPAL_OUTPUT_VERBOSE((5, orte_odls_globals.output,
"%s odls: sending contact info to HNP",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
OBJ_CONSTRUCT(&buffer, opal_buffer_t);
/* store jobid */
if (ORTE_SUCCESS != (rc = opal_dss.pack(&buffer, &proc->jobid, 1, ORTE_JOBID))) {
ORTE_ERROR_LOG(rc);
OBJ_DESTRUCT(&buffer);
goto CLEANUP;
}
/* add in contact info for all procs in the job */
if (ORTE_SUCCESS != (rc = pack_child_contact_info(proc->jobid, &buffer))) {
ORTE_ERROR_LOG(rc);
OBJ_DESTRUCT(&buffer);
goto CLEANUP;
}
/* send it */
if (ORTE_SUCCESS != (rc = orte_odls_base.comm(ORTE_PROC_MY_HNP,
&buffer, ORTE_RML_TAG_INIT_ROUTES,
orte_routed_base_process_msg))) {
ORTE_ERROR_LOG(rc);
}
OBJ_DESTRUCT(&buffer);
}
/* update the proc state */
orte_errmgr.update_state(ORTE_JOBID_INVALID, ORTE_JOB_STATE_UNDEF,
proc, ORTE_PROC_STATE_REGISTERED, 0);
CLEANUP:
opal_condition_signal(&orte_odls_globals.cond);
@ -2434,174 +2204,6 @@ CLEANUP:
return rc;
}
static bool any_live_children(orte_jobid_t job)
{
opal_list_item_t *item;
orte_odls_child_t *child;
/* the thread is locked elsewhere - don't try to do it again here */
for (item = opal_list_get_first(&orte_local_children);
item != opal_list_get_end(&orte_local_children);
item = opal_list_get_next(item)) {
child = (orte_odls_child_t*)item;
/* is this child part of the specified job? */
if ((job == child->name->jobid || ORTE_JOBID_WILDCARD == job) &&
child->alive) {
return true;
}
}
/* if we get here, then nobody is left alive from that job */
return false;
}
static void check_proc_complete(orte_odls_child_t *child)
{
int rc;
opal_buffer_t alert;
orte_plm_cmd_flag_t cmd=ORTE_PLM_UPDATE_PROC_STATE;
opal_list_item_t *item, *next;
orte_odls_job_t *jdat;
/* is this proc fully complete? */
if (!child->waitpid_recvd || !child->iof_complete) {
/* apparently not - just return */
return;
}
/* CHILD IS COMPLETE */
child->alive = false;
/* Release only the stdin IOF file descriptor for this child, if one
* was defined. File descriptors for the other IOF channels - stdout,
* stderr, and stddiag - were released when their associated pipes
* were cleared and closed due to termination of the process
*/
orte_iof.close(child->name, ORTE_IOF_STDIN);
/* Clean up the session directory as if we were the process
* itself. This covers the case where the process died abnormally
* and didn't cleanup its own session directory.
*/
orte_session_dir_finalize(child->name);
/* setup the alert buffer */
OBJ_CONSTRUCT(&alert, opal_buffer_t);
/* find the jobdat */
jdat = NULL;
for (item = opal_list_get_first(&orte_local_jobdata);
item != opal_list_get_end(&orte_local_jobdata);
item = opal_list_get_next(item)) {
jdat = (orte_odls_job_t*)item;
/* is this the specified job? */
if (jdat->jobid == child->name->jobid) {
break;
}
}
if (NULL == jdat) {
ORTE_ERROR_LOG(ORTE_ERR_NOT_FOUND);
goto unlock;
}
/* decrement the num_local_procs as this one is complete */
jdat->num_local_procs--;
/* if the proc aborted, tell the HNP right away */
if (ORTE_PROC_STATE_TERMINATED != child->state) {
/* pack update state command */
if (ORTE_SUCCESS != (rc = opal_dss.pack(&alert, &cmd, 1, ORTE_PLM_CMD))) {
ORTE_ERROR_LOG(rc);
goto unlock;
}
/* pack only the data for this proc - have to start with the jobid
* so the receiver can unpack it correctly
*/
if (ORTE_SUCCESS != (rc = opal_dss.pack(&alert, &child->name->jobid, 1, ORTE_JOBID))) {
ORTE_ERROR_LOG(rc);
goto unlock;
}
/* now pack the child's info */
if (ORTE_SUCCESS != (rc = pack_state_for_proc(&alert, false, child))) {
ORTE_ERROR_LOG(rc);
goto unlock;
}
/* remove the child from our local list as it is no longer alive */
opal_list_remove_item(&orte_local_children, &child->super);
OPAL_OUTPUT_VERBOSE((5, orte_odls_globals.output,
"%s odls:proc_complete reporting proc %s aborted to HNP",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
ORTE_NAME_PRINT(child->name)));
/* release the child object */
OBJ_RELEASE(child);
/* send it */
if (ORTE_SUCCESS != (rc = orte_odls_base.comm(ORTE_PROC_MY_HNP,
&alert, ORTE_RML_TAG_PLM,
orte_plm_base_receive_process_msg))) {
ORTE_ERROR_LOG(rc);
}
} else {
/* since it didn't abort, let's see if all of that job's procs are done */
if (!any_live_children(child->name->jobid)) {
/* all those children are dead - alert the HNP */
/* pack update state command */
if (ORTE_SUCCESS != (rc = opal_dss.pack(&alert, &cmd, 1, ORTE_PLM_CMD))) {
ORTE_ERROR_LOG(rc);
goto unlock;
}
/* pack the data for the job */
if (ORTE_SUCCESS != (rc = pack_state_update(&alert, false, jdat))) {
ORTE_ERROR_LOG(rc);
goto unlock;
}
OPAL_OUTPUT_VERBOSE((5, orte_odls_globals.output,
"%s odls:proc_complete reporting all procs in %s terminated",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
ORTE_JOBID_PRINT(jdat->jobid)));
/* remove all of this job's children from the global list - do not lock
* the thread as we are already locked
*/
for (item = opal_list_get_first(&orte_local_children);
item != opal_list_get_end(&orte_local_children);
item = next) {
child = (orte_odls_child_t*)item;
next = opal_list_get_next(item);
if (jdat->jobid == child->name->jobid) {
opal_list_remove_item(&orte_local_children, &child->super);
OBJ_RELEASE(child);
}
}
/* ensure the job's local session directory tree is removed */
orte_session_dir_cleanup(jdat->jobid);
/* remove this job from our local job data since it is complete */
opal_list_remove_item(&orte_local_jobdata, &jdat->super);
OBJ_RELEASE(jdat);
/* send it */
if (ORTE_SUCCESS != (rc = orte_odls_base.comm(ORTE_PROC_MY_HNP,
&alert, ORTE_RML_TAG_PLM,
orte_plm_base_receive_process_msg))) {
ORTE_ERROR_LOG(rc);
}
}
}
unlock:
OBJ_DESTRUCT(&alert);
}
/* receive external-to-odls notification that a proc has met some completion
* requirements
*/
@ -2609,6 +2211,7 @@ void orte_odls_base_notify_iof_complete(orte_process_name_t *proc)
{
orte_odls_child_t *child;
opal_list_item_t *item;
int rc;
OPAL_OUTPUT_VERBOSE((5, orte_odls_globals.output,
"%s odls:notify_iof_complete for child %s",
@ -2652,7 +2255,29 @@ GOTCHILD:
/* flag the iof as complete */
child->iof_complete = true;
/* now check to see if the proc is truly done */
check_proc_complete(child);
if (child->waitpid_recvd) {
/* CHILD IS COMPLETE */
child->alive = false;
/* Release only the stdin IOF file descriptor for this child, if one
* was defined. File descriptors for the other IOF channels - stdout,
* stderr, and stddiag - were released when their associated pipes
* were cleared and closed due to termination of the process
*/
orte_iof.close(proc, ORTE_IOF_STDIN);
/* Clean up the session directory as if we were the process
* itself. This covers the case where the process died abnormally
* and didn't cleanup its own session directory.
*/
orte_session_dir_finalize(proc);
/* alert the errmgr */
if (ORTE_SUCCESS != (rc = orte_errmgr.update_state(ORTE_JOBID_INVALID, ORTE_JOB_STATE_UNDEF,
proc, child->state, child->exit_code))) {
ORTE_ERROR_LOG(rc);
}
}
opal_condition_signal(&orte_odls_globals.cond);
OPAL_THREAD_UNLOCK(&orte_odls_globals.mutex);
}
@ -2665,6 +2290,11 @@ void orte_base_default_waitpid_fired(orte_process_name_t *proc, int32_t status)
struct stat buf;
int rc;
OPAL_OUTPUT_VERBOSE((5, orte_odls_globals.output,
"%s odls:waitpid_fired on child %s",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
ORTE_NAME_PRINT(proc)));
/* since we are going to be working with the global list of
* children, we need to protect that list from modification
* by other threads. This will also be used to protect us
@ -2849,8 +2479,29 @@ MOVEON:
/* indicate the waitpid fired */
child->waitpid_recvd = true;
/* check for everything complete */
check_proc_complete(child);
/* now check to see if the proc is truly done */
if (child->iof_complete) {
/* CHILD IS COMPLETE */
child->alive = false;
/* Release only the stdin IOF file descriptor for this child, if one
* was defined. File descriptors for the other IOF channels - stdout,
* stderr, and stddiag - were released when their associated pipes
* were cleared and closed due to termination of the process
*/
orte_iof.close(proc, ORTE_IOF_STDIN);
/* Clean up the session directory as if we were the process
* itself. This covers the case where the process died abnormally
* and didn't cleanup its own session directory.
*/
orte_session_dir_finalize(proc);
/* alert the errmgr */
if (ORTE_SUCCESS != (rc = orte_errmgr.update_state(ORTE_JOBID_INVALID, ORTE_JOB_STATE_UNDEF,
proc, child->state, child->exit_code))) {
ORTE_ERROR_LOG(rc);
}
}
/* done */
opal_condition_signal(&orte_odls_globals.cond);
@ -3061,7 +2712,7 @@ int orte_odls_base_default_kill_local_procs(opal_pointer_array_t *procs,
* channels will automatically close when the proc is killed
*/
orte_iof.close(child->name, ORTE_IOF_STDIN);
/* cancel the waitpid callback as this induces unmanageable race
* conditions when we are deliberately killing the process
*/
@ -3075,7 +2726,7 @@ int orte_odls_base_default_kill_local_procs(opal_pointer_array_t *procs,
/* Send a sigterm to the process before sigkill to be nice */
kill_local(child->pid, SIGTERM);
/* check to see if it died - the child_died function will continue
* to check every microsecond until we reach the timeout
*/
@ -3093,7 +2744,7 @@ int orte_odls_base_default_kill_local_procs(opal_pointer_array_t *procs,
"%s odls:kill_local_proc child %s killed",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
ORTE_NAME_PRINT(child->name)));
/* indicate the waitpid fired as this is effectively what
* has happened
*/
@ -3107,7 +2758,14 @@ int orte_odls_base_default_kill_local_procs(opal_pointer_array_t *procs,
/* check for everything complete - this will remove
* the child object from our local list
*/
check_proc_complete(child);
if (child->iof_complete && child->waitpid_recvd) {
if (ORTE_ERR_SILENT == orte_errmgr.update_state(ORTE_JOBID_INVALID, ORTE_JOB_STATE_UNDEF,
child->name, child->state,
child->exit_code)) {
/* all procs are complete - we are done */
break;
}
}
}
}
@ -3122,7 +2780,7 @@ int orte_odls_base_default_kill_local_procs(opal_pointer_array_t *procs,
*/
opal_condition_signal(&orte_odls_globals.cond);
OPAL_THREAD_UNLOCK(&orte_odls_globals.mutex);
return rc;
}
@ -3183,26 +2841,3 @@ int orte_odls_base_get_proc_stats(opal_buffer_t *answer,
return ORTE_SUCCESS;
}
/*
 * Deliver a buffer to the given recipient.
 *
 * When the recipient is this process and a callback is supplied, the
 * buffer is handed to that callback through a message event instead of
 * going through the RML. Otherwise the buffer is sent with
 * orte_rml.send_buffer.
 *
 * Returns ORTE_SUCCESS on success; a negative send_buffer return value
 * is logged and passed back to the caller unchanged.
 */
int orte_odls_base_comm(orte_process_name_t *recipient,
                        opal_buffer_t *buf, orte_rml_tag_t tag,
                        orte_odls_base_cbfunc_t cbfunc)
{
    int rc;

    /* local delivery: addressed to me and a direct function was provided */
    if (recipient->jobid == ORTE_PROC_MY_NAME->jobid &&
        recipient->vpid == ORTE_PROC_MY_NAME->vpid &&
        NULL != cbfunc) {
        ORTE_MESSAGE_EVENT(ORTE_PROC_MY_NAME, buf, tag, cbfunc);
        return ORTE_SUCCESS;
    }

    /* everything else goes out over the RML */
    rc = orte_rml.send_buffer(recipient, buf, tag, 0);
    if (0 > rc) {
        ORTE_ERROR_LOG(rc);
        return rc;
    }

    /* non-negative return values are all reported as plain success */
    return ORTE_SUCCESS;
}

Просмотреть файл

@ -33,8 +33,7 @@
#include "opal/util/output.h"
#include "opal/util/path.h"
#include "opal/util/argv.h"
#include "opal/threads/mutex.h"
#include "opal/threads/condition.h"
#include "opal/threads/threads.h"
#include "orte/mca/plm/plm_types.h"
#include "orte/util/name_fns.h"
@ -207,6 +206,14 @@ int orte_odls_base_open(void)
false, false, 1, &i);
orte_odls_base.warn_if_not_bound = OPAL_INT_TO_BOOL(i);
/* initialize the global list of local children and job data */
OBJ_CONSTRUCT(&orte_local_children, opal_list_t);
OBJ_CONSTRUCT(&orte_local_children_lock, opal_mutex_t);
OBJ_CONSTRUCT(&orte_local_children_cond, opal_condition_t);
OBJ_CONSTRUCT(&orte_local_jobdata, opal_list_t);
OBJ_CONSTRUCT(&orte_local_jobdata_lock, opal_mutex_t);
OBJ_CONSTRUCT(&orte_local_jobdata_cond, opal_condition_t);
/* initialize ODLS globals */
OBJ_CONSTRUCT(&orte_odls_globals.mutex, opal_mutex_t);
OBJ_CONSTRUCT(&orte_odls_globals.cond, opal_condition_t);
@ -216,7 +223,6 @@ int orte_odls_base_open(void)
orte_odls_globals.debugger = NULL;
orte_odls_globals.debugger_launched = false;
OBJ_CONSTRUCT(&orte_odls_globals.sysinfo, opal_list_t);
orte_odls_base.comm = orte_odls_base_comm;
/* get any external processor bindings */
OPAL_PAFFINITY_CPU_ZERO(orte_odls_globals.my_cores);

Просмотреть файл

@ -386,8 +386,6 @@ launch_apps:
goto cleanup;
}
/* JMS: should we stash the alps pid in the gpr somewhere for cleanup? */
cleanup:
if (NULL != argv) {
opal_argv_free(argv);
@ -402,7 +400,9 @@ cleanup:
/* check for failed launch - if so, force terminate */
if (failed_launch) {
orte_plm_base_launch_failed(failed_job, -1, ORTE_ERROR_DEFAULT_EXIT_CODE, job_state);
orte_errmgr.update_state(failed_job, job_state,
NULL, ORTE_PROC_STATE_UNDEF,
ORTE_ERROR_DEFAULT_EXIT_CODE);
}
return rc;

Просмотреть файл

@ -81,7 +81,6 @@ ORTE_DECLSPEC int orte_plm_base_close(void);
*/
ORTE_DECLSPEC void orte_plm_base_app_report_launch(int fd, short event, void *data);
ORTE_DECLSPEC void orte_plm_base_receive_process_msg(int fd, short event, void *data);
ORTE_DECLSPEC void orte_plm_base_check_job_completed(orte_job_t *jdata);
#endif /* ORTE_DISABLE_FULL_SUPPORT */

Просмотреть файл

@ -24,7 +24,7 @@ any mechanism to launch processes, and therefore is unable to start the
process(es) required by your application.
#
[daemon-died-no-signal]
A daemon (pid %s) died unexpectedly with status %d while attempting
A daemon died unexpectedly with status %d while attempting
to launch so we are aborting.
There may be more information reported by the environment (see above).
@ -35,7 +35,7 @@ location of the shared libraries on the remote nodes and this will
automatically be forwarded to the remote nodes.
#
[daemon-died-signal-core]
A daemon (pid %s) died unexpectedly on signal %d (with core) while
A daemon died unexpectedly on signal %d (with core) while
attempting to launch so we are aborting.
There may be more information reported by the environment (see above).
@ -46,7 +46,7 @@ location of the shared libraries on the remote nodes and this will
automatically be forwarded to the remote nodes.
#
[daemon-died-signal]
A daemon (pid %s) died unexpectedly on signal %d while attempting to
A daemon died unexpectedly on signal %d while attempting to
launch so we are aborting.
There may be more information reported by the environment (see above).

Просмотреть файл

@ -125,8 +125,8 @@ static void check_heartbeat(int fd, short dummy, void *arg)
/* if any daemon died, abort */
if (died) {
orte_plm_base_launch_failed(ORTE_PROC_MY_NAME->jobid, -1,
ORTE_ERROR_DEFAULT_EXIT_CODE, ORTE_JOB_STATE_ABORTED);
orte_errmgr.update_state(ORTE_PROC_MY_NAME->jobid, ORTE_JOB_STATE_ABORTED,
NULL, ORTE_PROC_STATE_UNDEF, ORTE_ERROR_DEFAULT_EXIT_CODE);
return;
}

Просмотреть файл

@ -66,10 +66,6 @@
#include "orte/mca/plm/base/plm_private.h"
#include "orte/mca/plm/base/base.h"
static bool active_job_completed_callback = false;
static char *pretty_print_timing(int64_t secs, int64_t usecs);
int orte_plm_base_setup_job(orte_job_t *jdata)
{
orte_job_t *jdatorted;
@ -232,7 +228,21 @@ int orte_plm_base_setup_job(orte_job_t *jdata)
return ORTE_SUCCESS;
}
static struct timeval app_launch_start, app_launch_stop, launch_msg_sent;
static struct timeval app_launch_start, app_launch_stop;
static opal_event_t *dmn_report_ev=NULL;
bool app_launch_failed;
/* Timeout callback for the launch timer: marks the launch as failed so
 * that anything polling app_launch_failed stops waiting. The fd, event
 * and cbdata arguments are not used.
 */
static void timer_cb(int fd, short event, void *cbdata)
{
    /* time is up */
    app_launch_failed = true;

    /* release the event storage - free(NULL) is a no-op, so no guard
     * is required
     */
    free(dmn_report_ev);
    dmn_report_ev = NULL;
}
int orte_plm_base_launch_apps(orte_jobid_t job)
{
@ -287,7 +297,7 @@ int orte_plm_base_launch_apps(orte_jobid_t job)
/* if we are timing, record the time we send this message */
if (orte_timing) {
gettimeofday(&launch_msg_sent, NULL);
gettimeofday(&jdata->launch_msg_sent, NULL);
}
/* send the command to the daemons */
@ -299,8 +309,18 @@ int orte_plm_base_launch_apps(orte_jobid_t job)
}
OBJ_RELEASE(buffer);
/* setup a timer - if we don't launch within the
* defined time, then we know things have failed
*/
if (0 < orte_startup_timeout) {
ORTE_DETECT_TIMEOUT(&dmn_report_ev, orte_startup_timeout, 1000, 10000000, timer_cb);
}
/* wait for all the daemons to report apps launched */
if (ORTE_SUCCESS != (rc = orte_plm_base_report_launched(job))) {
app_launch_failed = false;
ORTE_PROGRESSED_WAIT(app_launch_failed, jdata->num_launched, jdata->num_procs);
if (ORTE_JOB_STATE_RUNNING != jdata->state) {
OPAL_OUTPUT_VERBOSE((5, orte_plm_globals.output,
"%s plm:base:launch failed for job %s on error %s",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
@ -310,12 +330,14 @@ int orte_plm_base_launch_apps(orte_jobid_t job)
if (orte_timing) {
int64_t maxsec, maxusec;
char *tmpstr;
gettimeofday(&app_launch_stop, NULL);
/* subtract starting time to get time in microsecs for test */
maxsec = app_launch_stop.tv_sec - app_launch_start.tv_sec;
maxusec = app_launch_stop.tv_usec - app_launch_start.tv_usec;
fprintf(orte_timing_output, "Time to launch apps: %s\n", pretty_print_timing(maxsec, maxusec));
tmpstr = orte_pretty_print_timing(maxsec, maxusec);
fprintf(orte_timing_output, "Time to launch apps: %s\n", tmpstr);
free(tmpstr);
}
/* complete wiring up the iof */
@ -352,102 +374,6 @@ static bool orted_failed_launch;
static orte_job_t *jdatorted;
static struct timeval daemonlaunchtime = {0,0}, daemonsetuptime = {0,0}, daemoncbtime = {0,0};
/*
 * Record that a launch failed and wake up orterun so it can exit.
 *
 * job    - jobid of the job whose launch failed
 * pid    - pid of the failed daemon; a value <= 0 means no pid is
 *          available and "unknown" is reported instead
 * status - status value associated with the failure; for the daemon job
 *          it is decoded with WIFSIGNALED/WTERMSIG/WEXITSTATUS
 * state  - job state to store in the job's data record
 *
 * If an abort is already in progress the report is ignored. The exit
 * status handed to ORTE_UPDATE_EXIT_STATUS is always non-zero.
 */
void orte_plm_base_launch_failed(orte_jobid_t job, pid_t pid,
                                 int status, orte_job_state_t state)
{
    orte_job_t *jdata;
    char *pidstr = NULL;
    const char *pidname;
    int sts;

    /* NOTE(review): this branch is taken when opal_atomic_trylock
     * returns 0, while the original inline comment reads "returns 1 if
     * already locked" - confirm the return convention of
     * opal_atomic_trylock
     */
    if (!opal_atomic_trylock(&orte_abort_inprogress_lock)) { /* returns 1 if already locked */
        OPAL_OUTPUT_VERBOSE((1, orte_plm_globals.output,
                             "%s plm:base:launch_failed abort in progress, ignoring report",
                             ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
        return;
    }

    OPAL_OUTPUT_VERBOSE((5, orte_plm_globals.output,
                         "%s plm:base:launch_failed for job %s, status %d",
                         ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                         ORTE_JOBID_PRINT(job), status));

    /* no matter what, we must exit with a non-zero status */
    if (0 == status) {
        sts = ORTE_ERROR_DEFAULT_EXIT_CODE;
    } else {
        sts = status;
    }

    /* if we didn't even attempt to launch, then just quietly update
     * the job record and leave
     */
    if (ORTE_JOB_NEVER_LAUNCHED == state) {
        orte_never_launched = true;
        goto PROCESS;
    }

    /* if this is the daemon job that failed, set the flag indicating
     * that a daemon failed so we use the proper
     * methods for attempting to shutdown the rest of the system
     */
    if (ORTE_PROC_MY_NAME->jobid == job) {
        orte_abnormal_term_ordered = true;

        /* build a printable pid; if the pid is not positive, or the
         * string could not be allocated, report "unknown" rather than
         * using an indeterminate pointer
         */
        if (0 < pid) {
            if (0 > asprintf(&pidstr, "%d", (int)pid)) {
                /* contents of pidstr are undefined after a failure */
                pidstr = NULL;
            }
        }
        pidname = (NULL != pidstr) ? pidstr : "unknown";

        if (WIFSIGNALED(status)) { /* died on signal */
            const char *topic = "daemon-died-signal";
#ifdef WCOREDUMP
            if (WCOREDUMP(status)) {
                topic = "daemon-died-signal-core";
            }
#endif /* WCOREDUMP */
            orte_show_help("help-plm-base.txt", topic, true,
                           pidname, WTERMSIG(status));
            sts = WTERMSIG(status);
        } else {
            orte_show_help("help-plm-base.txt", "daemon-died-no-signal", true,
                           pidname, WEXITSTATUS(status));
            sts = WEXITSTATUS(status);
            /* a decoded exit status of zero must not turn the failure
             * into a successful exit code
             */
            if (0 == sts) {
                sts = ORTE_ERROR_DEFAULT_EXIT_CODE;
            }
        }
        orted_failed_launch = true;
        free(pidstr);
    }

PROCESS:
    /* Set the job state as indicated so orterun's exit status
     * will be non-zero
     */

    /* find the job's data record */
    if (NULL == (jdata = orte_get_job_data_object(job))) {
        /* bad jobid */
        ORTE_ERROR_LOG(ORTE_ERR_BAD_PARAM);
        goto WAKEUP;
    }

    /* set the state */
    jdata->state = state;

WAKEUP:
    /* set orterun's exit code and wakeup so it can exit */
    ORTE_UPDATE_EXIT_STATUS(sts);
    orte_trigger_event(&orte_exit);
}
static void process_orted_launch_report(int fd, short event, void *data)
{
orte_message_event_t *mev = (orte_message_event_t*)data;
@ -522,9 +448,6 @@ static void process_orted_launch_report(int fd, short event, void *data)
ORTE_ERROR_LOG(rc);
orted_failed_launch = true;
goto CLEANUP;
}
if (orte_timing_details) {
}
/* save the latest daemon to start */
if (startsec > daemonlaunchtime.tv_sec) {
@ -654,7 +577,9 @@ CLEANUP:
if (orted_failed_launch) {
if( NULL != rml_uri ) free(rml_uri);
orte_errmgr.incomplete_start(ORTE_PROC_MY_NAME->jobid, ORTE_ERROR_DEFAULT_EXIT_CODE);
orte_errmgr.update_state(ORTE_PROC_MY_NAME->jobid, ORTE_JOB_STATE_FAILED_TO_START,
NULL, ORTE_PROC_STATE_FAILED_TO_START,
ORTE_ERROR_DEFAULT_EXIT_CODE);
} else {
orted_num_callback++;
}
@ -722,15 +647,19 @@ int orte_plm_base_daemon_callback(orte_std_cntr_t num_daemons)
/* if we are timing, output the results */
if (orte_timing) {
int64_t sec, usec;
char *tmpstr;
ORTE_COMPUTE_TIME_DIFF(sec, usec, orte_plm_globals.daemonlaunchstart.tv_sec,
orte_plm_globals.daemonlaunchstart.tv_usec,
daemonlaunchtime.tv_sec, daemonlaunchtime.tv_usec);
fprintf(orte_timing_output, "Daemon launch was completed in %s\n",
pretty_print_timing(sec, usec));
fprintf(orte_timing_output, "Daemon setup (from first exec statement to ready-for-commands) was completed in a maximum of %s\n",
pretty_print_timing(daemonsetuptime.tv_sec, daemonsetuptime.tv_usec));
fprintf(orte_timing_output, "Daemon callback message to HNP took a maximum time of %s to reach the HNP\n",
pretty_print_timing(daemoncbtime.tv_sec, daemoncbtime.tv_usec));
tmpstr = orte_pretty_print_timing(sec, usec);
fprintf(orte_timing_output, "Daemon launch was completed in %s\n", tmpstr);
free(tmpstr);
tmpstr = orte_pretty_print_timing(daemonsetuptime.tv_sec, daemonsetuptime.tv_usec);
fprintf(orte_timing_output, "Daemon setup (from first exec statement to ready-for-commands) was completed in a maximum of %s\n", tmpstr);
free(tmpstr);
tmpstr = orte_pretty_print_timing(daemoncbtime.tv_sec, daemoncbtime.tv_usec);
fprintf(orte_timing_output, "Daemon callback message to HNP took a maximum time of %s to reach the HNP\n", tmpstr);
free(tmpstr);
}
OPAL_OUTPUT_VERBOSE((5, orte_plm_globals.output,
@ -745,345 +674,6 @@ int orte_plm_base_daemon_callback(orte_std_cntr_t num_daemons)
return ORTE_SUCCESS;
}
/* the daemons actually report back that their procs have launched. Each
* daemon will only send one message that contains the launch result
* for their local procs.
*/
static bool app_launch_failed;
static struct timeval max_daemon_launch_msg_recvd = {0,0};
static orte_vpid_t num_daemons_reported=0;
static opal_event_t *dmn_report_ev=NULL;
/* Timeout callback for the daemon-report timer: declares the launch
 * failed so that waiting code can move on. None of the arguments are
 * used.
 */
static void timer_cb(int fd, short event, void *cbdata)
{
    /* declare time is up */
    app_launch_failed = true;

    /* discard the event storage; free() accepts NULL, so the pointer
     * does not need to be tested first
     */
    free(dmn_report_ev);
    dmn_report_ev = NULL;
}
/* Process a daemon's report on the launch of its local procs.
 *
 * Since the HNP also reports launch of procs, the processing of the
 * message is kept separate from its receipt so that the HNP can call
 * the processing part directly.
 *
 * fd, event - not used
 * data      - pointer to the orte_message_event_t carrying the report
 *
 * The buffer is unpacked in this order: jobid, then (when orte_timing is
 * set) the seconds/microseconds at which the daemon received the launch
 * message, then one record per proc - vpid, pid, optional start time,
 * state, exit code - until ORTE_VPID_INVALID is read or unpacking stops.
 *
 * On any failure app_launch_failed is set and the errmgr's
 * incomplete_start entry point is called. Otherwise, if procs are still
 * outstanding and a startup timeout is configured, the report timer is
 * restarted.
 *
 * NOTE(review): mev is not released anywhere in this function - confirm
 * which side owns the message event.
 */
void orte_plm_base_app_report_launch(int fd, short event, void *data)
{
    orte_message_event_t *mev = (orte_message_event_t*)data;
    opal_buffer_t *buffer = mev->buffer;
    orte_std_cntr_t cnt;
    orte_jobid_t jobid;
    orte_vpid_t vpid;
    orte_proc_state_t state;
    orte_exit_code_t exit_code;
    pid_t pid;
    orte_job_t *jdata;
    orte_proc_t *proc;
    int rc;

    OPAL_OUTPUT_VERBOSE((5, orte_plm_globals.output,
                         "%s plm:base:app_report_launch from daemon %s",
                         ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                         ORTE_NAME_PRINT(&mev->sender)));

    /* got a response - cancel the timer */
    if (NULL != dmn_report_ev) {
        opal_event_del(dmn_report_ev);
        free(dmn_report_ev);
        dmn_report_ev = NULL;
    }

    /* unpack the jobid being reported */
    cnt = 1;
    if (ORTE_SUCCESS != (rc = opal_dss.unpack(buffer, &jobid, &cnt, ORTE_JOBID))) {
        ORTE_ERROR_LOG(rc);
        app_launch_failed = true;
        orte_errmgr.incomplete_start(-1, -1); /* no way to know the jobid or exit code */
        return;
    }

    /* if the jobid is invalid, then we know that this is a failed
     * launch report from before we could even attempt to launch the
     * procs - most likely, while we were attempting to unpack the
     * launch cmd itself. In this case, just abort
     */
    if (ORTE_JOBID_INVALID == jobid) {
        jdata = NULL;
        app_launch_failed = true;
        goto CLEANUP;
    }

    num_daemons_reported++;

    /* get the job data object */
    if (NULL == (jdata = orte_get_job_data_object(jobid))) {
        ORTE_ERROR_LOG(ORTE_ERR_NOT_FOUND);
        app_launch_failed = true;
        goto CLEANUP;
    }

    /* if we are timing, the daemon will have included the time it
     * recvd the launch msg - the maximum time between when we sent
     * that message and a daemon recvd it tells us the time reqd
     * to wireup the daemon comm network
     */
    if (orte_timing) {
        int64_t tmpsec, tmpusec;
        cnt = 1;
        if (ORTE_SUCCESS != (rc = opal_dss.unpack(buffer, &tmpsec, &cnt, OPAL_INT64))) {
            ORTE_ERROR_LOG(rc);
            app_launch_failed = true;
            goto CLEANUP;
        }
        cnt = 1;
        if (ORTE_SUCCESS != (rc = opal_dss.unpack(buffer, &tmpusec, &cnt, OPAL_INT64))) {
            ORTE_ERROR_LOG(rc);
            app_launch_failed = true;
            goto CLEANUP;
        }
        /* keep the maximum time */
        if (tmpsec > max_daemon_launch_msg_recvd.tv_sec) {
            max_daemon_launch_msg_recvd.tv_sec = tmpsec;
            max_daemon_launch_msg_recvd.tv_usec = tmpusec;
        } else if (tmpsec == max_daemon_launch_msg_recvd.tv_sec &&
                   tmpusec > max_daemon_launch_msg_recvd.tv_usec) {
            max_daemon_launch_msg_recvd.tv_usec = tmpusec;
        }
        if (orte_timing_details) {
            int64_t sec, usec;
            ORTE_COMPUTE_TIME_DIFF(sec, usec, launch_msg_sent.tv_sec, launch_msg_sent.tv_usec,
                                   tmpsec, tmpusec);
            fprintf(orte_timing_output, "Time for launch msg to reach daemon %s: %s\n",
                    ORTE_VPID_PRINT(mev->sender.vpid), pretty_print_timing(sec, usec));
        }
    }

    /* the daemon will report the vpid, state, and pid of each
     * process it launches - we need the pid in particular so
     * that any debuggers can attach to the process
     */
    cnt = 1;
    while (ORTE_SUCCESS == (rc = opal_dss.unpack(buffer, &vpid, &cnt, ORTE_VPID))) {
        if (ORTE_VPID_INVALID == vpid) {
            /* flag indicating we are done */
            break;
        }
        /* unpack the pid */
        cnt = 1;
        if (ORTE_SUCCESS != (rc = opal_dss.unpack(buffer, &pid, &cnt, OPAL_PID))) {
            ORTE_ERROR_LOG(rc);
            app_launch_failed = true;
            goto CLEANUP;
        }
        /* if we are timing things, unpack the time this proc was started */
        if (orte_timing) {
            int64_t tmpsec, tmpusec;
            cnt = 1;
            if (ORTE_SUCCESS != (rc = opal_dss.unpack(buffer, &tmpsec, &cnt, OPAL_INT64))) {
                ORTE_ERROR_LOG(rc);
                app_launch_failed = true;
                goto CLEANUP;
            }
            cnt = 1;
            if (ORTE_SUCCESS != (rc = opal_dss.unpack(buffer, &tmpusec, &cnt, OPAL_INT64))) {
                ORTE_ERROR_LOG(rc);
                app_launch_failed = true;
                goto CLEANUP;
            }
            if (orte_timing_details) {
                time_t tmptime;
                char *tmpstr;
                const char *stamp = "unknown";
                tmptime = tmpsec;
                /* ctime() may return NULL for a time it cannot convert */
                tmpstr = ctime(&tmptime);
                if (NULL != tmpstr) {
                    size_t len = strlen(tmpstr);
                    /* remove the newline and the year at the end */
                    if (6 <= len) {
                        tmpstr[len-6] = '\0';
                    }
                    stamp = tmpstr;
                }
                /* milliseconds follow a decimal point, so they must be
                 * zero-padded to three digits
                 */
                fprintf(orte_timing_output, "Time rank %s was launched: %s.%03lu\n",
                        ORTE_VPID_PRINT(vpid), stamp, (unsigned long)(tmpusec/1000));
            }
        }
        /* unpack the state */
        cnt = 1;
        if (ORTE_SUCCESS != (rc = opal_dss.unpack(buffer, &state, &cnt, ORTE_PROC_STATE))) {
            ORTE_ERROR_LOG(rc);
            app_launch_failed = true;
            goto CLEANUP;
        }
        /* unpack the exit code */
        cnt = 1;
        if (ORTE_SUCCESS != (rc = opal_dss.unpack(buffer, &exit_code, &cnt, ORTE_EXIT_CODE))) {
            ORTE_ERROR_LOG(rc);
            app_launch_failed = true;
            goto CLEANUP;
        }
        /* lookup the proc and update values */
        if (NULL == (proc = (orte_proc_t*)opal_pointer_array_get_item(jdata->procs, vpid))) {
            ORTE_ERROR_LOG(ORTE_ERR_NOT_FOUND);
            app_launch_failed = true;
            goto CLEANUP;
        }

        OPAL_OUTPUT_VERBOSE((5, orte_plm_globals.output,
                             "%s plm:base:app_report_launched for proc %s from daemon %s: pid %lu state %0x exit %d",
                             ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                             ORTE_NAME_PRINT(&(proc->name)),
                             ORTE_NAME_PRINT(&mev->sender), (unsigned long)pid,
                             (int)state, (int)exit_code));

        proc->pid = pid;
        proc->state = state;
        proc->exit_code = exit_code;
        if (ORTE_PROC_STATE_FAILED_TO_START == state) {
            OPAL_OUTPUT_VERBOSE((5, orte_plm_globals.output,
                                 "%s plm:base:app_report_launched daemon %s reports proc %s failed to start",
                                 ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                                 ORTE_NAME_PRINT(&mev->sender),
                                 ORTE_NAME_PRINT(&(proc->name))));
            if (NULL == jdata->aborted_proc) {
                jdata->aborted_proc = proc; /* only store this once */
                jdata->state = ORTE_JOB_STATE_FAILED_TO_START; /* update the job state */
            }
            /* increment the terminated counter */
            jdata->num_terminated++;
            /* ensure we have a non-zero exit code */
            if (0 == jdata->aborted_proc->exit_code) {
                jdata->aborted_proc->exit_code = ORTE_ERROR_DEFAULT_EXIT_CODE;
            }
            app_launch_failed = true;
            goto CLEANUP;
        }

        /* record that a proc reported */
        jdata->num_launched++;
    }
    if (ORTE_SUCCESS != rc) {
        ORTE_ERROR_LOG(rc);
    }

    if (orte_report_launch_progress) {
        if (0 == num_daemons_reported % 100 || num_daemons_reported == orte_process_info.num_procs) {
            opal_output(orte_clean_output, "Reported: %d (out of %d) daemons - %d (out of %d) procs",
                        (int)num_daemons_reported, (int)orte_process_info.num_procs,
                        (int)jdata->num_launched, (int)jdata->num_procs);
        }
    }

    OPAL_OUTPUT_VERBOSE((5, orte_plm_globals.output,
                         "%s plm:base:app_report_launch completed processing",
                         ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));

CLEANUP:
    if (app_launch_failed) {
        if (NULL == jdata) {
            orte_errmgr.incomplete_start(ORTE_JOBID_INVALID, ORTE_ERROR_DEFAULT_EXIT_CODE);
        } else if (NULL == jdata->aborted_proc) {
            orte_errmgr.incomplete_start(jdata->jobid, ORTE_ERROR_DEFAULT_EXIT_CODE);
        } else {
            orte_errmgr.incomplete_start(jdata->jobid, jdata->aborted_proc->exit_code);
        }
    } else {
        /* restart the timer, if necessary */
        if (NULL != jdata && jdata->num_launched < jdata->num_procs && 0 < orte_startup_timeout) {
            ORTE_DETECT_TIMEOUT(&dmn_report_ev, orte_startup_timeout, 1000, 10000000, timer_cb);
        }
    }
}
/* RML receive callback for ORTE_RML_TAG_APP_LAUNCH_CALLBACK messages.
 *
 * The message is not processed inside the recv, since processing may
 * itself require more messaging. It is queued as a message event for
 * orte_plm_base_app_report_launch, and the non-blocking receive is then
 * posted again. The status and cbdata arguments are not used.
 */
static void app_report_launch(int status, orte_process_name_t* sender,
                              opal_buffer_t *buffer,
                              orte_rml_tag_t tag, void *cbdata)
{
    int ret;

    /* The macro makes a copy of the buffer, which we release when
     * processed - the incoming buffer, however, is NOT released here,
     * although its payload IS transferred to the message buffer for
     * later processing
     */
    ORTE_MESSAGE_EVENT(sender, buffer, tag, orte_plm_base_app_report_launch);

    OPAL_OUTPUT_VERBOSE((5, orte_plm_globals.output,
                         "%s plm:base:app_report_launch reissuing non-blocking recv",
                         ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));

    /* post the receive again so the next report can be caught */
    ret = orte_rml.recv_buffer_nb(ORTE_NAME_WILDCARD, ORTE_RML_TAG_APP_LAUNCH_CALLBACK,
                                  ORTE_RML_NON_PERSISTENT, app_report_launch, NULL);
    if (ORTE_SUCCESS == ret || ORTE_ERR_NOT_IMPLEMENTED == ret) {
        /* both values are accepted as "nothing went wrong" */
        return;
    }

    ORTE_ERROR_LOG(ret);
    app_launch_failed = true;
}
/*
 * Wait until the procs of the given job have been reported as launched,
 * or until the launch is flagged as failed.
 *
 * Returns ORTE_ERR_NOT_FOUND when the job has no data object, the error
 * code from posting or cancelling the receive when either fails,
 * ORTE_ERR_FATAL when the job state ends up beyond
 * ORTE_JOB_STATE_TERMINATED, and ORTE_SUCCESS otherwise.
 */
int orte_plm_base_report_launched(orte_jobid_t job)
{
    orte_job_t *jobdat;
    int ret;

    OPAL_OUTPUT_VERBOSE((5, orte_plm_globals.output,
                         "%s plm:base:report_launched for job %s",
                         ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                         ORTE_JOBID_PRINT(job)));

    /* locate the job's data object */
    jobdat = orte_get_job_data_object(job);
    if (NULL == jobdat) {
        ORTE_ERROR_LOG(ORTE_ERR_NOT_FOUND);
        return ORTE_ERR_NOT_FOUND;
    }

    /* arm a timer - hearing nothing from a daemon within the defined
     * time means things have failed
     */
    if (0 < orte_startup_timeout) {
        ORTE_DETECT_TIMEOUT(&dmn_report_ev, orte_startup_timeout, 1000, 10000000, timer_cb);
    }

    /* every daemon involved in the launch should call back; the mapper
     * tracks that number since num_nodes = num_participating_daemons
     */
    app_launch_failed = false;
    ret = orte_rml.recv_buffer_nb(ORTE_NAME_WILDCARD, ORTE_RML_TAG_APP_LAUNCH_CALLBACK,
                                  ORTE_RML_NON_PERSISTENT, app_report_launch, NULL);
    if (ORTE_SUCCESS != ret && ORTE_ERR_NOT_IMPLEMENTED != ret) {
        ORTE_ERROR_LOG(ret);
        return ret;
    }

    ORTE_PROGRESSED_WAIT(app_launch_failed, jobdat->num_launched, jobdat->num_procs);

    OPAL_OUTPUT_VERBOSE((5, orte_plm_globals.output,
                         "%s plm:base:report_launched all apps reported",
                         ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));

    /* the receive is no longer needed */
    ret = orte_rml.recv_cancel(ORTE_NAME_WILDCARD, ORTE_RML_TAG_APP_LAUNCH_CALLBACK);
    if (ORTE_SUCCESS != ret) {
        ORTE_ERROR_LOG(ret);
        return ret;
    }

    /* mark the job launched, but never move a job that has already
     * progressed further back to an earlier state
     */
    if (jobdat->state < ORTE_JOB_STATE_LAUNCHED) {
        jobdat->state = ORTE_JOB_STATE_LAUNCHED;
        return ORTE_SUCCESS;
    }
    if (ORTE_JOB_STATE_TERMINATED < jobdat->state) {
        /* job failed - indicate so */
        return ORTE_ERR_FATAL;
    }
    return ORTE_SUCCESS;
}
int orte_plm_base_setup_orted_cmd(int *argc, char ***argv)
{
int i, loc;
@ -1322,413 +912,3 @@ int orte_plm_base_orted_append_basic_args(int *argc, char ***argv,
return ORTE_SUCCESS;
}
/* Timer callback that re-runs the job-completion check for the job
 * passed in as the callback data. The fd and event arguments are not
 * used.
 */
static void process_check_job_completed(int fd, short event, void *data)
{
    /* the deferred callback has now fired, so clear the pending flag
     * before checking again
     */
    active_job_completed_callback = false;
    orte_plm_base_check_job_completed((orte_job_t*)data);
}
/**
 * Determine whether a job has completed and, if it has, whether every
 * job has completed so that the HNP can be woken up.
 *
 * The function works in stages:
 *   1. If FileM is still active, arm a 5 second timer that re-runs this
 *      check and return.
 *   2. Scan the job's procs to derive the job state from the proc states,
 *      remember the first proc that caused an abort, update the exit
 *      status, and report aborted procs to the errmgr.
 *   3. Report the resulting job state to the errmgr where appropriate.
 *   4. For a normally terminated job, release the procs held by the
 *      nodes in its map and release the map itself.
 *   5. Walk the global job array; release the job object for the job
 *      being checked if it terminated normally or was killed by command,
 *      and trigger the orte_exit event once no monitored job is alive.
 *
 * @param jdata  Job to check. May be NULL, in which case only the
 *               "all jobs" portion of the check is performed. The
 *               reference held through this pointer may be released
 *               by this function.
 */
void orte_plm_base_check_job_completed(orte_job_t *jdata)
{
    orte_proc_t *proc;
    int i;
    orte_std_cntr_t j;
    orte_job_t *job;
    orte_node_t *node;
    orte_job_map_t *map;
    orte_std_cntr_t index;
    bool one_still_alive;

    /* if the incoming job data pointer is NULL, then all we can do
     * is check all jobs for complete
     */
    if (NULL == jdata) {
        OPAL_OUTPUT_VERBOSE((5, orte_plm_globals.output,
                             "%s plm:base:check_job_completed called with NULL pointer",
                             ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
        goto CHECK_ALL_JOBS;
    }

    /* if this job is not to be monitored, then ignore it */
    if (ORTE_JOB_CONTROL_DO_NOT_MONITOR & jdata->controls) {
        return;
    }

    /* Check if FileM is active. If so then keep processing. */
    if( orte_filem_base_is_active ) {
        opal_event_t *ev = NULL;
        struct timeval delay;

        /* a re-check is already scheduled - nothing more to do */
        if( active_job_completed_callback ) {
            return;
        }
        /* NOTE(review): nothing in this function releases ev after the
         * timer fires - confirm who owns the event once it has run
         */
        ev = (opal_event_t*)malloc(sizeof(opal_event_t));
        if (NULL == ev) {
            /* cannot schedule the re-check; leave the pending flag clear
             * so that a later call is able to try again
             */
            ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE);
            return;
        }
        active_job_completed_callback = true;
        OPAL_OUTPUT_VERBOSE((5, orte_plm_globals.output,
                             "WARNING: FileM Still Active! Waiting for it to finish..."));
        opal_evtimer_set(ev, process_check_job_completed, jdata);
        delay.tv_sec = 5;
        delay.tv_usec = 0;
        opal_evtimer_add(ev, &delay);
        return;
    }

    OPAL_OUTPUT_VERBOSE((5, orte_plm_globals.output,
                         "%s plm:base:check_job_completed for job %s - num_terminated %lu  num_procs %lu",
                         ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                         ORTE_JOBID_PRINT(jdata->jobid),
                         (unsigned long)jdata->num_terminated,
                         (unsigned long)jdata->num_procs));

    /* if this job was ordered to abort, or if its state was already recorded
     * as abnormally terminated, then do not update its state
     *
     * Treat termination of any process in a continuously operating job as
     * an error unless it was specifically commanded
     */
    if (jdata->state < ORTE_JOB_STATE_TERMINATED ||
        jdata->controls & ORTE_JOB_CONTROL_CONTINUOUS_OP ||
        jdata->controls & ORTE_JOB_CONTROL_RECOVERABLE) {
        for (i=0; i < jdata->procs->size; i++) {
            if (NULL == (proc = (orte_proc_t*)opal_pointer_array_get_item(jdata->procs, i))) {
                /* the proc array may no longer be left justified, so
                 * we need to check everything
                 */
                continue;
            }

            /*
             * Determine how the process state affects the job state
             */
            if (ORTE_PROC_STATE_FAILED_TO_START == proc->state) {
                OPAL_OUTPUT_VERBOSE((5, orte_plm_globals.output,
                                     "%s plm:base:check_job_completed proc %s failed to start",
                                     ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                                     ORTE_NAME_PRINT(&proc->name)));
                jdata->state = ORTE_JOB_STATE_FAILED_TO_START;
                if (!jdata->abort) {
                    /* point to the lowest rank to cause the problem */
                    jdata->aborted_proc = proc;
                    /* retain the object so it doesn't get free'd */
                    OBJ_RETAIN(proc);
                    jdata->abort = true;
                    ORTE_UPDATE_EXIT_STATUS(proc->exit_code);
                }
            } else if (ORTE_PROC_STATE_ABORTED == proc->state) {
                OPAL_OUTPUT_VERBOSE((5, orte_plm_globals.output,
                                     "%s plm:base:check_job_completed proc %s aborted",
                                     ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                                     ORTE_NAME_PRINT(&proc->name)));
                jdata->state = ORTE_JOB_STATE_ABORTED;
                if (!jdata->abort) {
                    /* point to the lowest rank to cause the problem */
                    jdata->aborted_proc = proc;
                    /* retain the object so it doesn't get free'd */
                    OBJ_RETAIN(proc);
                    jdata->abort = true;
                    ORTE_UPDATE_EXIT_STATUS(proc->exit_code);
                }
            } else if (ORTE_PROC_STATE_ABORTED_BY_SIG == proc->state) {
                OPAL_OUTPUT_VERBOSE((5, orte_plm_globals.output,
                                     "%s plm:base:check_job_completed proc %s aborted by signal",
                                     ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                                     ORTE_NAME_PRINT(&proc->name)));
                jdata->state = ORTE_JOB_STATE_ABORTED_BY_SIG;
                if (!jdata->abort) {
                    /* point to the lowest rank to cause the problem */
                    jdata->aborted_proc = proc;
                    /* retain the object so it doesn't get free'd */
                    OBJ_RETAIN(proc);
                    jdata->abort = true;
                    ORTE_UPDATE_EXIT_STATUS(proc->exit_code);
                }
            } else if (ORTE_PROC_STATE_TERM_WO_SYNC == proc->state) {
                OPAL_OUTPUT_VERBOSE((5, orte_plm_globals.output,
                                     "%s plm:base:check_job_completed proc %s terminated without sync",
                                     ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                                     ORTE_NAME_PRINT(&proc->name)));
                jdata->state = ORTE_JOB_STATE_ABORTED_WO_SYNC;
                if (!jdata->abort) {
                    /* point to the lowest rank to cause the problem */
                    jdata->aborted_proc = proc;
                    /* retain the object so it doesn't get free'd */
                    OBJ_RETAIN(proc);
                    jdata->abort = true;
                    ORTE_UPDATE_EXIT_STATUS(proc->exit_code);
                    /* now treat a special case - if the proc exit'd without a required
                     * sync, it may have done so with a zero exit code. We want to ensure
                     * that the user realizes there was an error, so in this -one- case,
                     * we overwrite the process' exit code with the default error code
                     */
                    ORTE_UPDATE_EXIT_STATUS(ORTE_ERROR_DEFAULT_EXIT_CODE);
                }
            } else if (ORTE_PROC_STATE_KILLED_BY_CMD == proc->state) {
                OPAL_OUTPUT_VERBOSE((5, orte_plm_globals.output,
                                     "%s plm:base:check_job_completed proc %s killed by cmd",
                                     ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                                     ORTE_NAME_PRINT(&proc->name)));
                /* we ordered this proc to die, so it isn't an abnormal termination
                 * and we don't flag it as such - just check the remaining jobs to
                 * see if anyone is still alive
                 */
                if (jdata->num_terminated >= jdata->num_procs) {
                    /* this job has terminated - now we need to check to see if ALL
                     * the other jobs have also completed and wakeup if that is true
                     */
                    jdata->state = ORTE_JOB_STATE_KILLED_BY_CMD;
                }
                goto CHECK_ALL_JOBS;
            } else if (ORTE_PROC_STATE_UNTERMINATED < proc->state &&
                       jdata->controls & ORTE_JOB_CONTROL_CONTINUOUS_OP) {
                OPAL_OUTPUT_VERBOSE((5, orte_plm_globals.output,
                                     "%s plm:base:check_job_completed proc %s terminated and continuous",
                                     ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                                     ORTE_NAME_PRINT(&proc->name)));
                proc->state = ORTE_PROC_STATE_ABORTED;
                jdata->state = ORTE_JOB_STATE_ABORTED;
                if (!jdata->abort) {
                    /* point to the lowest rank to cause the problem */
                    jdata->aborted_proc = proc;
                    /* retain the object so it doesn't get free'd */
                    OBJ_RETAIN(proc);
                    jdata->abort = true;
                    ORTE_UPDATE_EXIT_STATUS(proc->exit_code);
                }
            }

            /*
             * Call the errmgr for this process, if necessary
             */
            if (ORTE_PROC_STATE_ABORTED        == proc->state ||
                ORTE_PROC_STATE_ABORTED_BY_SIG == proc->state ||
                ORTE_PROC_STATE_TERM_WO_SYNC   == proc->state ||
                ORTE_PROC_STATE_KILLED_BY_CMD  == proc->state ) {
                OPAL_OUTPUT_VERBOSE((5, orte_plm_globals.output,
                                     "%s plm:base:check_job_completed "
                                     "Declared job %s %s by proc %s with code %d (0x%x vs 0x%x)",
                                     ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                                     ORTE_JOBID_PRINT(jdata->jobid),
                                     (jdata->state == ORTE_JOB_STATE_KILLED_BY_CMD ?
                                      "killed by cmd" : "aborted"),
                                     ORTE_NAME_PRINT(&(proc->name)),
                                     proc->exit_code,
                                     proc->last_errmgr_state, proc->state));
                /* Only report escalations in the fault state */
                if( proc->last_errmgr_state < proc->state ) {
                    proc->last_errmgr_state = proc->state;
                    orte_errmgr.proc_aborted(&(proc->name), proc->exit_code);
                }
            }
        }
    }

    /* check the resulting job state and notify the appropriate places */
    if (ORTE_JOB_STATE_FAILED_TO_START == jdata->state) {
        OPAL_OUTPUT_VERBOSE((5, orte_plm_globals.output,
                             "%s plm:base:check_job_completed declared job %s failed to start by proc %s",
                             ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                             ORTE_JOBID_PRINT(jdata->jobid),
                             (NULL == jdata->aborted_proc) ? "unknown" : ORTE_NAME_PRINT(&(jdata->aborted_proc->name))));
        /* report this to the errmgr - it will protect us from multiple calls */
        if (NULL == jdata->aborted_proc) {
            /* we don't know who caused us to abort */
            orte_errmgr.incomplete_start(jdata->jobid, ORTE_ERROR_DEFAULT_EXIT_CODE);
        } else {
            orte_errmgr.incomplete_start(jdata->jobid, jdata->aborted_proc->exit_code);
        }
        goto CHECK_ALL_JOBS;
    } else if (ORTE_JOB_STATE_ABORTED         == jdata->state ||
               ORTE_JOB_STATE_ABORTED_BY_SIG  == jdata->state ||
               ORTE_JOB_STATE_ABORTED_WO_SYNC == jdata->state ||
               ORTE_JOB_STATE_KILLED_BY_CMD   == jdata->state ) {
        /* report this to the errmgr
         * (if we know which process caused this, then it was reported above)
         */
        if (NULL == jdata->aborted_proc) {
            /* we don't know who caused us to abort */
            orte_errmgr.proc_aborted(ORTE_NAME_INVALID, ORTE_ERROR_DEFAULT_EXIT_CODE);
        }
        goto CHECK_ALL_JOBS;
    } else if (jdata->num_terminated >= jdata->num_procs) {
        /* this job has terminated - now we need to check to see if ALL
         * the other jobs have also completed and wakeup if that is true
         */
        jdata->state = ORTE_JOB_STATE_TERMINATED;
        OPAL_OUTPUT_VERBOSE((5, orte_plm_globals.output,
                             "%s plm:base:check_job_completed declared job %s normally terminated - checking all jobs",
                             ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                             ORTE_JOBID_PRINT(jdata->jobid)));

    CHECK_ALL_JOBS:
        /* if this job is a continuously operating one, then don't do
         * anything further - just return here
         */
        if (NULL != jdata &&
            (ORTE_JOB_CONTROL_CONTINUOUS_OP & jdata->controls ||
             ORTE_JOB_CONTROL_RECOVERABLE & jdata->controls) ) {
            goto CHECK_ALIVE;
        }

        /* if the job that is being checked is the HNP, then we are
         * trying to terminate the orteds. In that situation, we
         * do -not- check all jobs - we simply notify the HNP
         * that the orteds are complete. Also check special case
         * if jdata is NULL - we want
         * to definitely declare the job done if the orteds
         * have completed, no matter what else may be happening.
         * This can happen if a ctrl-c hits in the "wrong" place
         * while launching
         */
        if (jdata == NULL || jdata->jobid == ORTE_PROC_MY_NAME->jobid) {
            jdata = orte_get_job_data_object(ORTE_PROC_MY_NAME->jobid);
            if (jdata->num_terminated >= jdata->num_procs) {
                /* orteds are done! */
                jdata->state = ORTE_JOB_STATE_TERMINATED;
                orte_trigger_event(&orteds_exit);
                return;
            }
        }

        /* Release the resources used by this job. Since some errmgrs may want
         * to continue using resources allocated to the job as part of their
         * fault recovery procedure, we only do this once the job is "complete".
         * Note that an aborted/killed job -is- flagged as complete and will
         * therefore have its resources released. We need to do this after
         * we call the errmgr so that any attempt to restart the job will
         * avoid doing so in the exact same place as the current job
         */
        if( NULL != jdata->map && jdata->state == ORTE_JOB_STATE_TERMINATED) {
            map = jdata->map;
            for( index = 0; index < map->nodes->size; index++ ) {
                if (NULL == (node = (orte_node_t*)opal_pointer_array_get_item(map->nodes, index))) {
                    continue;
                }
                OPAL_OUTPUT_VERBOSE((5, orte_plm_globals.output,
                                     "%s releasing procs from node %s",
                                     ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                                     node->name));
                for( i = 0; i < node->procs->size; i++ ) {
                    if (NULL == (proc = (orte_proc_t*)opal_pointer_array_get_item(node->procs, i))) {
                        continue;
                    }
                    if (proc->name.jobid != jdata->jobid) {
                        /* skip procs from another job */
                        continue;
                    }
                    node->slots_inuse--;
                    node->num_procs--;
                    OPAL_OUTPUT_VERBOSE((5, orte_plm_globals.output,
                                         "%s releasing proc %s from node %s",
                                         ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                                         ORTE_NAME_PRINT(&proc->name), node->name));
                    /* set the entry in the node array to NULL */
                    opal_pointer_array_set_item(node->procs, i, NULL);
                    /* release the proc once for the map entry */
                    OBJ_RELEASE(proc);
                }
            }
            OBJ_RELEASE(map);
            jdata->map = NULL;
        }

    CHECK_ALIVE:
        /* now check to see if all jobs are done - release this jdata
         * object when we find it
         */
        one_still_alive = false;
        for (j=1; j < orte_job_data->size; j++) {
            if (NULL == (job = (orte_job_t*)opal_pointer_array_get_item(orte_job_data, j))) {
                /* since we are releasing jdata objects as we
                 * go, we can no longer assume that the job_data
                 * array is left justified
                 */
                continue;
            }
            /* if this is the job we are checking AND it normally terminated,
             * then go ahead and release it. We cannot release it if it
             * abnormally terminated as mpirun needs the info so it can
             * report appropriately to the user
             */
            if (NULL != jdata && job->jobid == jdata->jobid &&
                (jdata->state == ORTE_JOB_STATE_TERMINATED ||
                 jdata->state == ORTE_JOB_STATE_KILLED_BY_CMD)) {
                /* release this object, ensuring that the
                 * pointer array internal accounting
                 * is maintained!
                 */
                opal_pointer_array_set_item(orte_job_data, j, NULL);  /* ensure the array has a NULL */
                OBJ_RELEASE(jdata);
                /* our reference is gone - make sure the remaining loop
                 * iterations never touch the object through this pointer
                 */
                jdata = NULL;
                continue;
            }
            /* if the job is flagged to not be monitored, skip it */
            if (ORTE_JOB_CONTROL_DO_NOT_MONITOR & job->controls) {
                continue;
            }
            /* when checking for job termination, we must be sure to NOT check
             * our own job as it - rather obviously - has NOT terminated!
             */
            if (job->num_terminated < job->num_procs) {
                /* we have at least one job that is not done yet - we cannot
                 * just return, though, as we need to ensure we cleanout the
                 * job data for the job that just completed
                 */
                OPAL_OUTPUT_VERBOSE((5, orte_plm_globals.output,
                                     "%s plm:base:check_job_completed job %s is not terminated",
                                     ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                                     ORTE_JOBID_PRINT(job->jobid)));
                one_still_alive = true;
            }
            else {
                /* jdata may have been released above, so guard the
                 * dereference used for the trace output
                 */
                OPAL_OUTPUT_VERBOSE((5, orte_plm_globals.output,
                                     "%s plm:base:check_job_completed job %s is terminated (%d vs %d [0x%x])",
                                     ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                                     ORTE_JOBID_PRINT(job->jobid),
                                     job->num_terminated, job->num_procs,
                                     (NULL == jdata) ? ORTE_JOB_STATE_UNDEF : jdata->state ));
            }
        }
        /* if a job is still alive, we just return */
        if (one_still_alive) {
            OPAL_OUTPUT_VERBOSE((5, orte_plm_globals.output,
                                 "%s plm:base:check_job_completed at least one job is not terminated",
                                 ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
            return;
        }
        /* if we get here, then all jobs are done, so wakeup */
        OPAL_OUTPUT_VERBOSE((5, orte_plm_globals.output,
                             "%s plm:base:check_job_completed all jobs terminated - waking up",
                             ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
        /* set the exit status to 0 - this will only happen if it
         * wasn't already set by an error condition
         */
        ORTE_UPDATE_EXIT_STATUS(0);
        orte_trigger_event(&orte_exit);
    }
}
/* scratch buffer reused by every call to pretty_print_timing() */
static char timestring[128];

/*
 * Format an elapsed time given as seconds plus microseconds.
 *
 * When the elapsed time amounts to at least one whole second the result is
 * rendered as "minutes:seconds"; otherwise it is rendered in milliseconds.
 *
 * Returns a pointer to the file-scope buffer above, so the text is
 * overwritten by the next call and must not be freed by the caller.
 */
static char *pretty_print_timing(int64_t secs, int64_t usecs)
{
    unsigned long whole_secs = secs + (usecs / 1000000l);
    float millis;

    if (0 != whole_secs) {
        /* one second or more: report minutes and leftover seconds */
        snprintf(timestring, sizeof(timestring), "%3lu:%02lu min:sec",
                 whole_secs / 60l, whole_secs % 60l);
        return timestring;
    }

    /* less than a second: convert everything to milliseconds */
    millis = ((float)(secs)*1000000.0 + (float)usecs) / 1000.0;
    snprintf(timestring, sizeof(timestring), "%8.2f millisecs", millis);
    return timestring;
}

Просмотреть файл

@ -35,6 +35,7 @@
#include "opal/mca/mca.h"
#include "opal/mca/base/mca_base_param.h"
#include "opal/dss/dss.h"
#include "opal/threads/threads.h"
#include "orte/constants.h"
#include "orte/types.h"
@ -55,6 +56,7 @@
static bool recv_issued=false;
static opal_mutex_t lock;
static opal_condition_t cond;
static opal_list_t recvs;
static opal_event_t ready;
static int ready_fd[2];
@ -76,6 +78,7 @@ int orte_plm_base_comm_start(void)
processing = false;
OBJ_CONSTRUCT(&lock, opal_mutex_t);
OBJ_CONSTRUCT(&cond, opal_condition_t);
OBJ_CONSTRUCT(&recvs, opal_list_t);
#ifndef __WINDOWS__
pipe(ready_fd);
@ -146,16 +149,16 @@ static void process_msg(int fd, short event, void *data)
orte_app_context_t *app, *child_app;
opal_list_item_t *item;
int dump[128];
orte_process_name_t name;
pid_t pid;
bool running;
OPAL_THREAD_LOCK(&lock);
OPAL_ACQUIRE_THREAD(&lock, &cond, &processing);
OPAL_OUTPUT_VERBOSE((5, orte_plm_globals.output,
"%s plm:base:receive processing msg",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
/* tag that we are processing the list */
processing = true;
/* clear the file descriptor to stop the event from refiring */
#ifndef __WINDOWS__
read(fd, &dump, sizeof(dump));
@ -163,6 +166,9 @@ static void process_msg(int fd, short event, void *data)
recv(fd, (char *) &dump, sizeof(dump), 0);
#endif
/* reset the event for the next message */
opal_event_add(&ready, 0);
while (NULL != (item = opal_list_remove_first(&recvs))) {
msgpkt = (orte_msg_packet_t*)item;
@ -191,6 +197,9 @@ static void process_msg(int fd, short event, void *data)
/* if is a LOCAL slave cmd */
if (jdata->controls & ORTE_JOB_CONTROL_LOCAL_SLAVE) {
OPAL_OUTPUT_VERBOSE((5, orte_plm_globals.output,
"%s plm:base:receive local launch",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
/* In this case, I cannot lookup job info. All I do is pass
* this along to the local launcher, IF it is available
*/
@ -226,6 +235,10 @@ static void process_msg(int fd, short event, void *data)
child_app->prefix_dir = strdup(app->prefix_dir);
}
OPAL_OUTPUT_VERBOSE((5, orte_plm_globals.output,
"%s plm:base:receive adding hosts",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
/* process any add-hostfile and add-host options that were provided */
if (ORTE_SUCCESS != (rc = orte_ras_base_add_hosts(jdata))) {
ORTE_ERROR_LOG(rc);
@ -247,10 +260,16 @@ static void process_msg(int fd, short event, void *data)
}
/* launch it */
OPAL_OUTPUT_VERBOSE((5, orte_plm_globals.output,
"%s plm:base:receive calling spawn",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
OPAL_RELEASE_THREAD(&lock, &cond, &processing);
if (ORTE_SUCCESS != (rc = orte_plm.spawn(jdata))) {
ORTE_ERROR_LOG(rc);
goto ANSWER_LAUNCH;
}
OPAL_ACQUIRE_THREAD(&lock, &cond, &processing);
job = jdata->jobid;
/* output debugger proctable, if requested */
@ -272,7 +291,18 @@ static void process_msg(int fd, short event, void *data)
/* if the child is an ORTE job, wait for the procs to report they are alive */
if (!(jdata->controls & ORTE_JOB_CONTROL_NON_ORTE_JOB)) {
ORTE_PROGRESSED_WAIT(false, jdata->num_reported, jdata->num_procs);
OPAL_OUTPUT_VERBOSE((5, orte_plm_globals.output,
"%s plm:base:receive waiting for procs to report",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
OPAL_RELEASE_THREAD(&lock, &cond, &processing);
/* we will wait here until the thread is released,
* indicating that all procs have reported
*/
OPAL_ACQUIRE_THREAD(&jdata->reported_lock,
&jdata->reported_cond,
&jdata->not_reported);
OPAL_THREAD_UNLOCK(&jdata->reported_lock);
OPAL_ACQUIRE_THREAD(&lock, &cond, &processing);
}
ANSWER_LAUNCH:
@ -298,7 +328,6 @@ static void process_msg(int fd, short event, void *data)
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
ORTE_NAME_PRINT(&(msgpkt->sender)) ));
count = 1;
jdata = NULL;
while (ORTE_SUCCESS == (rc = opal_dss.unpack(msgpkt->buffer, &job, &count, ORTE_JOBID))) {
OPAL_OUTPUT_VERBOSE((5, orte_plm_globals.output,
@ -306,12 +335,48 @@ static void process_msg(int fd, short event, void *data)
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
ORTE_JOBID_PRINT(job)));
/* lookup the job object */
if (NULL == (jdata = orte_get_job_data_object(job))) {
/* this job may already have been removed from the array, so just cleanly
* ignore this request
*/
goto CLEANUP;
name.jobid = job;
running = true;
/* if we are timing, the daemon will have included the time it
* recvd the launch msg - the maximum time between when we sent
* that message and a daemon recvd it tells us the time reqd
* to wireup the daemon comm network
*/
if (orte_timing) {
int64_t tmpsec, tmpusec;
/* get the job object */
if (NULL == (jdata = orte_get_job_data_object(name.jobid))) {
ORTE_ERROR_LOG(ORTE_ERR_NOT_FOUND);
goto CLEANUP;
}
count = 1;
if (ORTE_SUCCESS != (rc = opal_dss.unpack(msgpkt->buffer, &tmpsec, &count, OPAL_INT64))) {
ORTE_ERROR_LOG(rc);
goto CLEANUP;
}
count = 1;
if (ORTE_SUCCESS != (rc = opal_dss.unpack(msgpkt->buffer, &tmpusec, &count, OPAL_INT64))) {
ORTE_ERROR_LOG(rc);
goto CLEANUP;
}
/* keep the maximum time */
if (tmpsec > jdata->max_launch_msg_recvd.tv_sec) {
jdata->max_launch_msg_recvd.tv_sec = tmpsec;
jdata->max_launch_msg_recvd.tv_usec = tmpusec;
} else if (tmpsec == jdata->max_launch_msg_recvd.tv_sec &&
tmpusec > jdata->max_launch_msg_recvd.tv_usec) {
jdata->max_launch_msg_recvd.tv_usec = tmpusec;
}
if (orte_timing_details) {
int64_t sec, usec;
char *timestr;
ORTE_COMPUTE_TIME_DIFF(sec, usec, jdata->launch_msg_sent.tv_sec, jdata->launch_msg_sent.tv_usec,
tmpsec, tmpusec);
timestr = orte_pretty_print_timing(sec, usec);
fprintf(orte_timing_output, "Time for launch msg to reach daemon %s: %s\n",
ORTE_VPID_PRINT(msgpkt->sender.vpid), timestr);
free(timestr);
}
}
count = 1;
while (ORTE_SUCCESS == (rc = opal_dss.unpack(msgpkt->buffer, &vpid, &count, ORTE_VPID))) {
@ -319,12 +384,46 @@ static void process_msg(int fd, short event, void *data)
/* flag indicates that this job is complete - move on */
break;
}
name.vpid = vpid;
/* unpack the pid */
count = 1;
if (ORTE_SUCCESS != (rc = opal_dss.unpack(msgpkt->buffer, &pid, &count, OPAL_PID))) {
ORTE_ERROR_LOG(rc);
goto CLEANUP;
}
/* if we are timing things, unpack the time this proc was started */
if (orte_timing) {
int64_t tmpsec, tmpusec;
count = 1;
if (ORTE_SUCCESS != (rc = opal_dss.unpack(msgpkt->buffer, &tmpsec, &count, OPAL_INT64))) {
ORTE_ERROR_LOG(rc);
goto CLEANUP;
}
count = 1;
if (ORTE_SUCCESS != (rc = opal_dss.unpack(msgpkt->buffer, &tmpusec, &count, OPAL_INT64))) {
ORTE_ERROR_LOG(rc);
goto CLEANUP;
}
if (orte_timing_details) {
time_t tmptime;
char *tmpstr;
tmptime = tmpsec;
tmpstr = ctime(&tmptime);
/* remove the newline and the year at the end */
tmpstr[strlen(tmpstr)-6] = '\0';
fprintf(orte_timing_output, "Time rank %s was launched: %s.%3lu\n",
ORTE_VPID_PRINT(vpid), tmpstr, (unsigned long)(tmpusec/1000));
}
}
/* unpack the state */
count = 1;
if (ORTE_SUCCESS != (rc = opal_dss.unpack(msgpkt->buffer, &state, &count, ORTE_PROC_STATE))) {
ORTE_ERROR_LOG(rc);
goto CLEANUP;
}
if (ORTE_PROC_STATE_RUNNING != state) {
running = false;
}
/* unpack the exit code */
count = 1;
if (ORTE_SUCCESS != (rc = opal_dss.unpack(msgpkt->buffer, &exit_code, &count, ORTE_EXIT_CODE))) {
@ -337,36 +436,9 @@ static void process_msg(int fd, short event, void *data)
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
(unsigned long)vpid, (unsigned int)state, (int)exit_code));
/* retrieve the proc object */
if (NULL == (proc = (orte_proc_t*)opal_pointer_array_get_item(jdata->procs, vpid))) {
/* this proc is no longer in table - skip it */
OPAL_OUTPUT_VERBOSE((5, orte_plm_globals.output,
"%s plm:base:receive proc %s is not in proc table",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
ORTE_VPID_PRINT(vpid)));
continue;
}
OPAL_OUTPUT_VERBOSE((5, orte_plm_globals.output,
"%s plm:base:receive updating state for proc %s current state %x new state %x",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
ORTE_NAME_PRINT(&proc->name),
(unsigned int)proc->state, (unsigned int)state));
/* update the termination counter IFF the state is changing to something
* indicating terminated
*/
if (ORTE_PROC_STATE_UNTERMINATED < state &&
ORTE_PROC_STATE_UNTERMINATED > proc->state) {
++jdata->num_terminated;
}
/* update the data */
proc->state = state;
proc->exit_code = exit_code;
/* update orte's exit status if it is non-zero */
ORTE_UPDATE_EXIT_STATUS(exit_code);
/* update the state */
orte_errmgr.update_state(job, ORTE_JOB_STATE_UNDEF,
&name, state, exit_code);
}
count = 1;
}
@ -375,14 +447,14 @@ static void process_msg(int fd, short event, void *data)
} else {
rc = ORTE_SUCCESS;
}
/* NOTE: jdata CAN BE NULL. This is caused by an orted
* being ordered to kill all its procs, but there are no
* procs left alive on that node. This can happen, for example,
* when a proc aborts somewhere, but the procs on this node
* have completed.
* So check job has to know how to handle a NULL pointer
*/
orte_plm_base_check_job_completed(jdata);
if (orte_report_launch_progress && running) {
jdata->num_daemons_reported++;
if (0 == jdata->num_daemons_reported % 100 || jdata->num_daemons_reported == orte_process_info.num_procs) {
opal_output(orte_clean_output, "Reported: %d (out of %d) daemons - %d (out of %d) procs",
(int)jdata->num_daemons_reported, (int)orte_process_info.num_procs,
(int)jdata->num_launched, (int)jdata->num_procs);
}
}
break;
case ORTE_PLM_HEARTBEAT_CMD:
@ -408,6 +480,33 @@ static void process_msg(int fd, short event, void *data)
proc->beat = beat.tv_sec;
break;
case ORTE_PLM_INIT_ROUTES_CMD:
count=1;
if (ORTE_SUCCESS != (rc = opal_dss.unpack(msgpkt->buffer, &job, &count, ORTE_JOBID))) {
ORTE_ERROR_LOG(rc);
goto CLEANUP;
}
name.jobid = job;
count=1;
while (ORTE_SUCCESS == opal_dss.unpack(msgpkt->buffer, &vpid, &count, ORTE_VPID)) {
if (ORTE_VPID_INVALID == vpid) {
break;
}
name.vpid = vpid;
/* update the errmgr state */
orte_errmgr.update_state(job, ORTE_JOB_STATE_REGISTERED,
&name, ORTE_PROC_STATE_REGISTERED,
ORTE_ERROR_DEFAULT_EXIT_CODE);
count=1;
}
/* pass the remainder of the buffer to the active module's
* init_routes API
*/
if (ORTE_SUCCESS != (rc = orte_routed.init_routes(job, msgpkt->buffer))) {
ORTE_ERROR_LOG(rc);
}
break;
default:
ORTE_ERROR_LOG(ORTE_ERR_VALUE_OUT_OF_BOUNDS);
rc = ORTE_ERR_VALUE_OUT_OF_BOUNDS;
@ -423,13 +522,9 @@ static void process_msg(int fd, short event, void *data)
}
}
/* reset the event */
processing = false;
opal_event_add(&ready, 0);
DEPART:
/* release the thread */
OPAL_THREAD_UNLOCK(&lock);
OPAL_RELEASE_THREAD(&lock, &cond, &processing);
/* see if an error occurred - if so, wakeup the HNP so we can exit */
if (ORTE_PROC_IS_HNP && ORTE_SUCCESS != rc) {

Просмотреть файл

@ -106,8 +106,6 @@ ORTE_DECLSPEC int orte_plm_base_set_progress_sched(int sched);
*/
ORTE_DECLSPEC int orte_plm_base_setup_job(orte_job_t *jdata);
ORTE_DECLSPEC int orte_plm_base_launch_apps(orte_jobid_t job);
ORTE_DECLSPEC void orte_plm_base_launch_failed(orte_jobid_t job, pid_t pid, int status, orte_job_state_t state);
ORTE_DECLSPEC int orte_plm_base_report_launched(orte_jobid_t job);
ORTE_DECLSPEC int orte_plm_base_daemon_callback(orte_std_cntr_t num_daemons);

Просмотреть файл

@ -576,7 +576,9 @@ launch_apps:
/* check for failed launch - if so, force terminate */
if (failed_launch) {
orte_plm_base_launch_failed(failed_job, -1, ORTE_ERROR_DEFAULT_EXIT_CODE, job_state);
orte_errmgr.update_state(failed_job, job_state,
NULL, ORTE_PROC_STATE_UNDEF,
ORTE_ERROR_DEFAULT_EXIT_CODE);
}
/* check for timing request - get stop time and process if so */

Просмотреть файл

@ -362,7 +362,9 @@ cleanup:
/* check for failed launch - if so, force terminate */
if (failed_launch) {
orte_plm_base_launch_failed(failed_job, -1, ORTE_ERROR_DEFAULT_EXIT_CODE, job_state);
orte_errmgr.update_state(failed_job, job_state,
NULL, ORTE_PROC_STATE_UNDEF,
ORTE_ERROR_DEFAULT_EXIT_CODE);
}
return rc;

Просмотреть файл

@ -45,22 +45,22 @@ typedef uint16_t orte_proc_state_t;
#define ORTE_PROC_STATE_RESTART 0x0002 /* the proc is ready for restart */
#define ORTE_PROC_STATE_LAUNCHED 0x0004 /* process has been launched */
#define ORTE_PROC_STATE_RUNNING 0x0010 /* daemon has locally fork'd process */
#define ORTE_PROC_STATE_REGISTERED 0x0020 /* process has registered for sync */
/*
* Define a "boundary" so we can easily and quickly determine
* if a proc is still running or not - any value less than
* this one means that we are not terminated
*/
#define ORTE_PROC_STATE_UNTERMINATED 0x0020
#define ORTE_PROC_STATE_UNTERMINATED 0x0040
#define ORTE_PROC_STATE_TERMINATED 0x0080 /* process has terminated and is no longer running */
#define ORTE_PROC_STATE_ABORTED 0x0100 /* process aborted */
#define ORTE_PROC_STATE_FAILED_TO_START 0x0200 /* process failed to start */
#define ORTE_PROC_STATE_ABORTED_BY_SIG 0x0400 /* process aborted by signal */
#define ORTE_PROC_STATE_TERM_WO_SYNC 0x0800 /* process exit'd w/o required sync */
#define ORTE_PROC_STATE_KILLED_BY_CMD 0x1000 /* process was killed by ORTE cmd */
#define ORTE_PROC_STATE_KILLED_BY_CMD 0x0100 /* process was killed by ORTE cmd */
#define ORTE_PROC_STATE_ABORTED 0x0200 /* process aborted */
#define ORTE_PROC_STATE_FAILED_TO_START 0x0400 /* process failed to start */
#define ORTE_PROC_STATE_ABORTED_BY_SIG 0x0800 /* process aborted by signal */
#define ORTE_PROC_STATE_TERM_WO_SYNC 0x1000 /* process exit'd w/o required sync */
#define ORTE_PROC_STATE_COMM_FAILED 0x2000 /* process communication has failed */
/*
* Job state codes
*/
@ -72,8 +72,9 @@ typedef uint16_t orte_job_state_t;
#define ORTE_JOB_STATE_INIT 0x0001 /* job entry has been created by rmaps */
#define ORTE_JOB_STATE_RESTART 0x0002 /* the job is ready for restart after one or more procs failed */
#define ORTE_JOB_STATE_LAUNCHED 0x0004 /* job has been launched by plm */
#define ORTE_JOB_STATE_RUNNING 0x0010 /* all process have been fork'd */
#define ORTE_JOB_STATE_SUSPENDED 0x0020 /* job has been suspended */
#define ORTE_JOB_STATE_RUNNING 0x0008 /* all process have been fork'd */
#define ORTE_JOB_STATE_SUSPENDED 0x0010 /* job has been suspended */
#define ORTE_JOB_STATE_REGISTERED 0x0020 /* all procs registered for sync */
/*
* Define a "boundary" so we can easily and quickly determine
* if a job is still running or not - any value less than
@ -87,11 +88,12 @@ typedef uint16_t orte_job_state_t;
#define ORTE_JOB_STATE_ABORTED_BY_SIG 0x0400 /* job was killed by a signal */
#define ORTE_JOB_STATE_ABORTED_WO_SYNC 0x0800 /* job was aborted because proc exit'd w/o required sync */
#define ORTE_JOB_STATE_KILLED_BY_CMD 0x1000 /* job was killed by ORTE cmd */
#define ORTE_JOB_STATE_COMM_FAILED 0x2000 /* communication has failed */
/* the job never even attempted to launch due to an error earlier in the
* launch procedure
*/
#define ORTE_JOB_NEVER_LAUNCHED 0x2000
#define ORTE_JOB_STATE_NEVER_LAUNCHED 0x4000
/* the processes in this job have been ordered to "die", but may not have completed it yet. Don't order it again */
#define ORTE_JOB_STATE_ABORT_ORDERED 0x8000
@ -126,7 +128,7 @@ typedef uint8_t orte_plm_cmd_flag_t;
#define ORTE_PLM_LAUNCH_JOB_CMD 1
#define ORTE_PLM_UPDATE_PROC_STATE 2
#define ORTE_PLM_HEARTBEAT_CMD 3
#define ORTE_PLM_INIT_ROUTES_CMD 4
END_C_DECLS

Просмотреть файл

@ -924,7 +924,9 @@ static void orte_plm_process_wait_daemon(pid_t pid, int status, void* cbdata)
/* report that the daemon has failed so we break out of the daemon
* callback receive and can exit
*/
orte_plm_base_launch_failed(active_job, pid, status, ORTE_JOB_STATE_FAILED_TO_START);
orte_errmgr.update_state(active_job, ORTE_JOB_STATE_FAILED_TO_START,
NULL, ORTE_PROC_STATE_UNDEF,
status);
} /* if abnormal exit */
/* release any waiting threads */
@ -1364,7 +1366,9 @@ launch_apps:
/* check for failed launch - if so, force terminate */
if( failed_launch ) {
orte_plm_base_launch_failed(jdata->jobid, -1, ORTE_ERROR_DEFAULT_EXIT_CODE, job_state);
orte_errmgr.update_state(jdata->jobid, job_state,
NULL, ORTE_PROC_STATE_UNDEF,
ORTE_ERROR_DEFAULT_EXIT_CODE);
}
return rc;

Просмотреть файл

@ -325,7 +325,8 @@ static void orte_plm_rsh_wait_daemon(pid_t pid, int status, void* cbdata)
jdata->num_terminated++;
#if 0
/* report that the daemon has failed so we can exit */
orte_plm_base_launch_failed(ORTE_PROC_MY_NAME->jobid, pid, status, ORTE_JOB_STATE_FAILED_TO_START);
orte_errmgr.update_state(ORTE_PROC_MY_NAME->jobid, ORTE_JOB_STATE_FAILED_TO_START,
NULL, ORTE_PROC_STATE_UNDEF, status);
#else
/* JJH: Look into a better way of doing this. If we let the daemon
* know, then it kills the job when we are trying to restart.. */
@ -1029,7 +1030,9 @@ CLEANUP:
if (orted_failed_launch) {
if( NULL != rml_uri ) free(rml_uri);
orte_errmgr.incomplete_start(peer.jobid, ORTE_ERROR_DEFAULT_EXIT_CODE);
orte_errmgr.update_state(peer.jobid, ORTE_JOB_STATE_FAILED_TO_START,
NULL, ORTE_PROC_STATE_FAILED_TO_START,
ORTE_ERROR_DEFAULT_EXIT_CODE);
} else {
orted_num_callback++;
}
@ -1058,7 +1061,7 @@ int orte_plm_rsh_launch(orte_job_t *jdata)
orte_node_t *node;
orte_std_cntr_t nnode;
orte_jobid_t failed_job;
orte_job_state_t job_state = ORTE_JOB_NEVER_LAUNCHED;
orte_job_state_t job_state = ORTE_JOB_STATE_NEVER_LAUNCHED;
bool recv_issued = false;
/* wait for the launch to complete */
@ -1414,7 +1417,9 @@ launch_apps:
/* check for failed launch - if so, force terminate */
if (failed_launch) {
orte_plm_base_launch_failed(failed_job, -1, ORTE_ERROR_DEFAULT_EXIT_CODE, job_state);
orte_errmgr.update_state(failed_job, job_state,
NULL, ORTE_PROC_STATE_UNDEF,
ORTE_ERROR_DEFAULT_EXIT_CODE);
}
/* cancel the lingering recv */

Просмотреть файл

@ -147,15 +147,11 @@ static void wait_cb(pid_t pid, int status, void* cbdata)
"%s proc %d failed with status %d",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
(int)proc->name.vpid, WEXITSTATUS(status)));
/* note that this daemon failed */
proc->state = ORTE_PROC_STATE_ABORTED;
}
/* increment the #procs terminated so we will exit properly */
jdata->num_terminated++;
/* note that this daemon failed */
orte_errmgr.update_state(proc->name.jobid, ORTE_JOB_STATE_FAILED_TO_START,
NULL, ORTE_PROC_STATE_FAILED_TO_START, status);
/* check for job completion */
orte_plm_base_check_job_completed(jdata);
/* release any waiting threads */
OPAL_THREAD_LOCK(&mca_plm_rshd_component.lock);
@ -252,7 +248,7 @@ int orte_plm_rshd_launch(orte_job_t *jdata)
orte_node_t *node;
orte_proc_t *proc;
orte_jobid_t failed_job = ORTE_JOBID_INVALID;
orte_job_state_t job_state = ORTE_JOB_NEVER_LAUNCHED;
orte_job_state_t job_state = ORTE_JOB_STATE_NEVER_LAUNCHED;
pid_t pid;
if (jdata->controls & ORTE_JOB_CONTROL_LOCAL_SLAVE) {
@ -378,7 +374,9 @@ cleanup:
/* check for failed launch - if so, force terminate */
if (failed_launch) {
orte_plm_base_launch_failed(failed_job, -1, ORTE_ERROR_DEFAULT_EXIT_CODE, job_state);
orte_errmgr.update_state(failed_job, job_state,
NULL, ORTE_PROC_STATE_UNDEF,
ORTE_ERROR_DEFAULT_EXIT_CODE);
}
/* setup a "heartbeat" timer to periodically check on

Просмотреть файл

@ -427,6 +427,7 @@ launch_apps:
ORTE_MESSAGE_EVENT(ORTE_PROC_MY_NAME, &launch, ORTE_RML_TAG_DAEMON, orte_daemon_cmd_processor);
OBJ_DESTRUCT(&launch);
#if 0
if (ORTE_SUCCESS != (rc = orte_plm_base_report_launched(jdata->jobid))) {
OPAL_OUTPUT_VERBOSE((5, orte_plm_globals.output,
"%s plm:slurm:launch failed for job %s on error %s",
@ -434,6 +435,7 @@ launch_apps:
ORTE_JOBID_PRINT(jdata->jobid), ORTE_ERROR_NAME(rc)));
goto cleanup;
}
#endif
} else {
if (ORTE_SUCCESS != (rc = orte_plm_base_launch_apps(active_job))) {
OPAL_OUTPUT_VERBOSE((1, orte_plm_globals.output,
@ -476,7 +478,9 @@ cleanup:
/* check for failed launch - if so, force terminate */
if (failed_launch) {
orte_plm_base_launch_failed(failed_job, -1, ORTE_ERROR_DEFAULT_EXIT_CODE, ORTE_JOB_STATE_FAILED_TO_START);
orte_errmgr.update_state(failed_job, ORTE_JOB_STATE_FAILED_TO_START,
NULL, ORTE_PROC_STATE_UNDEF,
ORTE_ERROR_DEFAULT_EXIT_CODE);
}
return rc;
@ -576,7 +580,8 @@ static void srun_wait_cb(pid_t pid, int status, void* cbdata){
OPAL_OUTPUT_VERBOSE((1, orte_plm_globals.output,
"%s plm:slurm: daemon failed during launch",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
orte_plm_base_launch_failed(ORTE_PROC_MY_NAME->jobid, -1, status, ORTE_JOB_STATE_FAILED_TO_START);
orte_errmgr.update_state(ORTE_PROC_MY_NAME->jobid, ORTE_JOB_STATE_FAILED_TO_START,
NULL, ORTE_PROC_STATE_UNDEF, status);
} else {
/* if this is after launch, then we need to abort only if the status
* returned is non-zero - i.e., if the orteds exited with an error
@ -588,7 +593,8 @@ static void srun_wait_cb(pid_t pid, int status, void* cbdata){
OPAL_OUTPUT_VERBOSE((1, orte_plm_globals.output,
"%s plm:slurm: daemon failed while running",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
orte_plm_base_launch_failed(ORTE_PROC_MY_NAME->jobid, -1, status, ORTE_JOB_STATE_ABORTED);
orte_errmgr.update_state(ORTE_PROC_MY_NAME->jobid, ORTE_JOB_STATE_ABORTED,
NULL, ORTE_PROC_STATE_UNDEF, status);
}
/* otherwise, check to see if this is the primary pid */
if (primary_srun_pid == pid) {

Просмотреть файл

@ -299,7 +299,8 @@ static void orte_plm_submit_wait_daemon(pid_t pid, int status, void* cbdata)
Set the job state to indicate we failed to launch so orterun's exit status
will be non-zero and forcibly terminate the job so orterun can exit
*/
orte_plm_base_launch_failed(active_job, true, pid, status, ORTE_JOB_STATE_FAILED_TO_START);
orte_errmgr.update_state(active_job, ORTE_JOB_STATE_FAILED_TO_START,
NULL, ORTE_PROC_STATE_UNDEF, status);
} /* if abnormal exit */
@ -928,7 +929,9 @@ launch_apps:
/* check for failed launch - if so, force terminate */
if (failed_launch) {
orte_plm_base_launch_failed(jdata->jobid, false, -1, 0, ORTE_JOB_STATE_FAILED_TO_START);
orte_errmgr.update_state(jdata->jobid, ORTE_JOB_STATE_FAILED_TO_START,
NULL, ORTE_PROC_STATE_UNDEF,
ORTE_ERROR_DEFAULT_EXIT_CODE);
}
return rc;

Просмотреть файл

@ -460,7 +460,9 @@ launch_apps:
/* check for failed launch - if so, force terminate */
if (failed_launch) {
orte_plm_base_launch_failed(failed_job, -1, ORTE_ERROR_DEFAULT_EXIT_CODE, ORTE_JOB_STATE_FAILED_TO_START);
orte_errmgr.update_state(failed_job, ORTE_JOB_STATE_FAILED_TO_START,
NULL, ORTE_PROC_STATE_UNDEF,
ORTE_ERROR_DEFAULT_EXIT_CODE);
}
/* setup a "heartbeat" timer to periodically check on

Просмотреть файл

@ -547,7 +547,9 @@ launch_apps:
/* check for failed launch - if so, force terminate */
if (failed_launch) {
orte_plm_base_launch_failed(failed_job, -1, ORTE_ERROR_DEFAULT_EXIT_CODE, job_state);
orte_errmgr.update_state(failed_job, job_state,
NULL, ORTE_PROC_STATE_UNDEF,
ORTE_ERROR_DEFAULT_EXIT_CODE);
}
/* setup a "heartbeat" timer to periodically check on

Просмотреть файл

@ -187,9 +187,9 @@ orte_plm_xgrid_spawn(orte_job_t *jdata)
cleanup:
/* check for failed launch - if so, force terminate */
if (failed_launch) {
orte_plm_base_launch_failed(jdata->jobid,
-1, ORTE_ERROR_DEFAULT_EXIT_CODE,
ORTE_JOB_STATE_FAILED_TO_START);
orte_errmgr.update_state(jdata->jobid, ORTE_JOB_STATE_FAILED_TO_START,
NULL, ORTE_PROC_STATE_UNDEF,
ORTE_ERROR_DEFAULT_EXIT_CODE);
}
return rc;

Просмотреть файл

@ -117,7 +117,6 @@ ORTE_DECLSPEC OBJ_CLASS_DECLARATION(orte_msg_packet_t);
#define ORTE_RML_TAG_WIREUP 8
#define ORTE_RML_TAG_RML_INFO_UPDATE 9
#define ORTE_RML_TAG_ORTED_CALLBACK 10
#define ORTE_RML_TAG_APP_LAUNCH_CALLBACK 11
#define ORTE_RML_TAG_REPORT_REMOTE_LAUNCH 12
#define ORTE_RML_TAG_CKPT 13
@ -128,7 +127,6 @@ ORTE_DECLSPEC OBJ_CLASS_DECLARATION(orte_msg_packet_t);
#define ORTE_RML_TAG_ALLGATHER_LIST 16
#define ORTE_RML_TAG_BARRIER 17
#define ORTE_RML_TAG_INIT_ROUTES 18
#define ORTE_RML_TAG_UPDATE_ROUTE_ACK 19
#define ORTE_RML_TAG_SYNC 20

Просмотреть файл

@ -17,6 +17,5 @@ libmca_routed_la_SOURCES += \
if !ORTE_DISABLE_FULL_SUPPORT
libmca_routed_la_SOURCES += \
base/routed_base_register_sync.c \
base/routed_base_receive.c
base/routed_base_register_sync.c
endif

Просмотреть файл

@ -16,6 +16,7 @@
#include "opal/mca/mca.h"
#include "opal/dss/dss_types.h"
#include "opal/threads/threads.h"
#include "orte/mca/rml/rml_types.h"
#include "orte/mca/routed/routed.h"
@ -35,18 +36,14 @@ ORTE_DECLSPEC int orte_routed_base_close(void);
ORTE_DECLSPEC extern int orte_routed_base_output;
ORTE_DECLSPEC extern opal_list_t orte_routed_base_components;
ORTE_DECLSPEC extern opal_mutex_t orte_routed_base_lock;
ORTE_DECLSPEC extern opal_condition_t orte_routed_base_cond;
ORTE_DECLSPEC extern bool orte_routed_base_wait_sync;
ORTE_DECLSPEC extern int orte_routed_base_register_sync(bool setup);
ORTE_DECLSPEC extern int orte_routed_base_process_callback(orte_jobid_t job,
opal_buffer_t *buffer);
ORTE_DECLSPEC int orte_routed_base_comm_start(void);
ORTE_DECLSPEC int orte_routed_base_comm_stop(void);
ORTE_DECLSPEC extern void orte_routed_base_process_msg(int fd, short event, void *data);
ORTE_DECLSPEC extern void orte_routed_base_recv(int status, orte_process_name_t* sender,
opal_buffer_t* buffer, orte_rml_tag_t tag,
void* cbdata);
#endif /* ORTE_DISABLE_FULL_SUPPORT */
END_C_DECLS

Просмотреть файл

@ -55,6 +55,9 @@ OBJ_CLASS_INSTANCE(orte_routed_tree_t, opal_list_item_t,
int orte_routed_base_output = -1;
orte_routed_module_t orte_routed = {0};
opal_list_t orte_routed_base_components;
opal_mutex_t orte_routed_base_lock;
opal_condition_t orte_routed_base_cond;
bool orte_routed_base_wait_sync;
static orte_routed_component_t *active_component = NULL;
static bool component_open_called = false;
@ -73,7 +76,10 @@ orte_routed_base_open(void)
/* setup the output stream */
orte_routed_base_output = opal_output_open(NULL);
OBJ_CONSTRUCT(&orte_routed_base_lock, opal_mutex_t);
OBJ_CONSTRUCT(&orte_routed_base_cond, opal_condition_t);
orte_routed_base_wait_sync = false;
/* Initialize globals */
OBJ_CONSTRUCT(&orte_routed_base_components, opal_list_t);
@ -145,7 +151,9 @@ orte_routed_base_close(void)
}
OBJ_DESTRUCT(&orte_routed_base_components);
OBJ_DESTRUCT(&orte_routed_base_lock);
OBJ_DESTRUCT(&orte_routed_base_cond);
opened = false;
selected = false;

Просмотреть файл

@ -1,229 +0,0 @@
/* -*- C -*-
*
* Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
* University Research and Technology
* Corporation. All rights reserved.
* Copyright (c) 2004-2005 The University of Tennessee and The University
* of Tennessee Research Foundation. All rights
* reserved.
* Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
* University of Stuttgart. All rights reserved.
* Copyright (c) 2004-2005 The Regents of the University of California.
* All rights reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
*
* $HEADER$
*/
/** @file:
*
*/
/*
* includes
*/
#include "orte_config.h"
#include "orte/constants.h"
#include "orte/types.h"
#ifdef HAVE_SYS_TYPES_H
#include <sys/types.h>
#endif
#ifdef HAVE_SYS_STAT_H
#include <sys/stat.h>
#endif
#ifdef HAVE_UNISTD_H
#include <unistd.h>
#endif
#include "opal/mca/mca.h"
#include "opal/mca/base/mca_base_param.h"
#include "opal/dss/dss.h"
#include "opal/util/output.h"
#include "orte/util/proc_info.h"
#include "orte/mca/errmgr/errmgr.h"
#include "orte/mca/rml/rml.h"
#include "orte/mca/rml/rml_types.h"
#include "orte/util/name_fns.h"
#include "orte/runtime/orte_globals.h"
#include "orte/runtime/orte_wait.h"
#include "orte/mca/routed/base/base.h"
/* true once the non-blocking recv for ORTE_RML_TAG_INIT_ROUTES has been posted */
static bool recv_issued=false;
/* held by process_msg() while it drains the recvs list */
static opal_mutex_t lock;
/* queue of orte_msg_packet_t items waiting to be handled by process_msg() */
static opal_list_t recvs;
/* read event registered on ready_fd[0]; its callback is process_msg() */
static opal_event_t ready;
/* wakeup channel (pipe, or socketpair on Windows): [0] is watched by the
 * "ready" event, [1] is handed to ORTE_PROCESS_MESSAGE */
static int ready_fd[2];
/* set to true by process_msg() for the duration of a drain pass */
static bool processing;
/* event callback that drains the recvs list */
static void process_msg(int fd, short event, void *data);
int orte_routed_base_comm_start(void)
{
int rc;
if (recv_issued) {
return ORTE_SUCCESS;
}
OPAL_OUTPUT_VERBOSE((5, orte_routed_base_output,
"%s routed:base: Receive: Start command recv",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
processing = false;
OBJ_CONSTRUCT(&lock, opal_mutex_t);
OBJ_CONSTRUCT(&recvs, opal_list_t);
#ifndef __WINDOWS__
pipe(ready_fd);
#else
if (evutil_socketpair(AF_UNIX, SOCK_STREAM, 0, ready_fd) == -1) {
return ORTE_ERROR;
}
#endif
opal_event_set(&ready, ready_fd[0], OPAL_EV_READ, process_msg, NULL);
opal_event_add(&ready, 0);
if (ORTE_SUCCESS != (rc = orte_rml.recv_buffer_nb(ORTE_NAME_WILDCARD,
ORTE_RML_TAG_INIT_ROUTES,
ORTE_RML_NON_PERSISTENT,
orte_routed_base_recv,
NULL))) {
ORTE_ERROR_LOG(rc);
}
recv_issued = true;
return rc;
}
/*
 * Undo orte_routed_base_comm_start().
 *
 * Cancels the init-routes recv, removes the read event, closes BOTH ends of
 * the wakeup channel, and destructs the pending-message list and its lock.
 * A no-op if the recv was never posted.  Always returns ORTE_SUCCESS.
 */
int orte_routed_base_comm_stop(void)
{
    if (!recv_issued) {
        return ORTE_SUCCESS;
    }

    OPAL_OUTPUT_VERBOSE((5, orte_routed_base_output,
                         "%s routed:base:receive stop comm",
                         ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));

    /* shut off the two sources of new work (the RML recv and the read
     * event) before tearing down the list and lock they operate on
     */
    orte_rml.recv_cancel(ORTE_NAME_WILDCARD, ORTE_RML_TAG_INIT_ROUTES);
    opal_event_del(&ready);

    /* close the write end as well as the read end - otherwise one
     * descriptor is leaked on every start/stop cycle
     */
#ifndef __WINDOWS__
    close(ready_fd[0]);
    close(ready_fd[1]);
#else
    closesocket(ready_fd[0]);
    closesocket(ready_fd[1]);
#endif

    processing = false;
    OBJ_DESTRUCT(&recvs);
    OBJ_DESTRUCT(&lock);

    recv_issued = false;

    return ORTE_SUCCESS;
}
/*
 * Event callback registered on the read end of the wakeup channel.
 *
 * Drains the "recvs" list: for each queued packet it unpacks the target
 * jobid from the packet's buffer and hands the rest of that buffer to the
 * active routed module's init_routes().  Every packet is released whether
 * or not it could be processed.  The "event" and "data" arguments are not
 * used.
 */
static void process_msg(int fd, short event, void *data)
{
orte_msg_packet_t *msgpkt;
orte_jobid_t job;
int rc;
orte_std_cntr_t cnt;
opal_list_item_t *item;
/* scratch space for the bytes read off the wakeup channel; the contents
 * are discarded */
int dump[128];
OPAL_OUTPUT_VERBOSE((5, orte_routed_base_output,
"%s routed:base:receive processing msg",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
OPAL_THREAD_LOCK(&lock);
/* tag that we are processing the list */
processing = true;
/* clear the file descriptor to stop the event from refiring */
/* NOTE(review): the return value of read()/recv() is ignored here */
#ifndef __WINDOWS__
read(fd, &dump, sizeof(dump));
#else
recv(fd, (char *) &dump, sizeof(dump), 0);
#endif
while (NULL != (item = opal_list_remove_first(&recvs))) {
msgpkt = (orte_msg_packet_t*)item;
/* unpack the jobid this is for */
cnt=1;
if (ORTE_SUCCESS != (rc = opal_dss.unpack(msgpkt->buffer, &job, &cnt, ORTE_JOBID))) {
/* cannot tell which job this is for - log, drop the packet, move on */
ORTE_ERROR_LOG(rc);
OBJ_RELEASE(msgpkt);
continue;
}
/* pass the remainder of the buffer to the active module's
* init_routes API
*/
if (ORTE_SUCCESS != (rc = orte_routed.init_routes(job, msgpkt->buffer))) {
/* failure is logged only; the remaining packets are still processed */
ORTE_ERROR_LOG(rc);
}
OBJ_RELEASE(msgpkt);
}
/* reset the event */
processing = false;
opal_event_add(&ready, 0);
/* release the thread */
OPAL_THREAD_UNLOCK(&lock);
}
/*
 * RML callback for init-routes requests arriving from non-HNP-local procs.
 *
 * NOTE: the caller OBJ_RELEASEs the incoming "buffer" once this callback
 * returns, so it MUST NOT be released here.
 */
void orte_routed_base_recv(int status, orte_process_name_t* sender,
                           opal_buffer_t* buffer, orte_rml_tag_t tag,
                           void* cbdata)
{
    int rc;

    OPAL_OUTPUT_VERBOSE((5, orte_routed_base_output,
                         "%s routed:base:receive got message from %s",
                         ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                         ORTE_NAME_PRINT(sender)));

    /* Do not act on the message from inside the recv: handling it may
     * require further messaging.  Queue it instead so that it is picked up
     * by the event as soon as we have left this callback.
     *
     * Per the original author's note, the macro moves the payload of the
     * incoming buffer into a separate message buffer for later processing;
     * the incoming buffer object itself is left for the caller to release.
     */
    ORTE_PROCESS_MESSAGE(&recvs, &lock, processing, ready_fd[1], true, sender, &buffer);

    /* the recv is non-persistent, so post it again for the next message */
    rc = orte_rml.recv_buffer_nb(ORTE_NAME_WILDCARD,
                                 ORTE_RML_TAG_INIT_ROUTES,
                                 ORTE_RML_NON_PERSISTENT,
                                 orte_routed_base_recv,
                                 NULL);
    if (ORTE_SUCCESS != rc) {
        ORTE_ERROR_LOG(rc);
    }
}
/*
 * Entry point for messages delivered as a message event (where HNP
 * messages come): queue the event's sender and buffer on the pending list,
 * then release the event object.
 */
void orte_routed_base_process_msg(int fd, short event, void *data)
{
    orte_message_event_t *msg_event = (orte_message_event_t*)data;

    ORTE_PROCESS_MESSAGE(&recvs, &lock, processing, ready_fd[1], false,
                         &msg_event->sender, &msg_event->buffer);

    OBJ_RELEASE(msg_event);
}

Просмотреть файл

@ -22,12 +22,12 @@
#include "orte/types.h"
#include "opal/dss/dss.h"
#include "opal/threads/threads.h"
#include "orte/mca/errmgr/errmgr.h"
#include "orte/mca/odls/odls_types.h"
#include "orte/mca/rml/rml.h"
#include "orte/runtime/orte_globals.h"
#include "orte/runtime/orte_wait.h"
#include "orte/mca/plm/base/base.h"
#include "orte/mca/routed/base/base.h"
@ -138,25 +138,14 @@ int orte_routed_base_process_callback(orte_jobid_t job, opal_buffer_t *buffer)
(NULL == rml_uri) ? "NULL" : rml_uri,
ORTE_JOBID_PRINT(job), ORTE_VPID_PRINT(vpid)));
if (NULL == (proc = (orte_proc_t*)opal_pointer_array_get_item(jdata->procs, vpid))) {
ORTE_ERROR_LOG(ORTE_ERR_NOT_FOUND);
continue;
if (NULL == rml_uri) {
/* should not happen */
ORTE_ERROR_LOG(ORTE_ERR_FATAL);
return ORTE_ERR_FATAL;
}
if (rml_uri == NULL) {
/* if the rml_uri is NULL, then that means this process
* terminated without calling orte_init. However, the only
* reason we would be getting called here is if other
* processes local to that daemon -did- call orte_init.
* This is considered an "abnormal termination" mode per
* community discussion, and must generate a corresponding
* response, so declare the proc abnormally terminated
*/
proc->state = ORTE_PROC_STATE_TERM_WO_SYNC;
/* increment the number of procs that have terminated */
jdata->num_terminated++;
/* let the normal code path declare the job aborted */
orte_plm_base_check_job_completed(jdata);
if (NULL == (proc = (orte_proc_t*)opal_pointer_array_get_item(jdata->procs, vpid))) {
ORTE_ERROR_LOG(ORTE_ERR_NOT_FOUND);
continue;
}
@ -165,25 +154,14 @@ int orte_routed_base_process_callback(orte_jobid_t job, opal_buffer_t *buffer)
free(rml_uri);
/* update the proc state */
if (proc->state < ORTE_PROC_STATE_RUNNING) {
proc->state = ORTE_PROC_STATE_RUNNING;
}
++jdata->num_reported;
orte_errmgr.update_state(job, ORTE_JOB_STATE_UNDEF,
&proc->name, ORTE_PROC_STATE_RUNNING, 0);
cnt = 1;
}
if (ORTE_ERR_UNPACK_READ_PAST_END_OF_BUFFER != rc) {
ORTE_ERROR_LOG(rc);
return rc;
}
/* if all procs have reported, update our job state */
if (jdata->num_reported == jdata->num_procs) {
/* update the job state */
if (jdata->state < ORTE_JOB_STATE_RUNNING) {
jdata->state = ORTE_JOB_STATE_RUNNING;
}
}
return ORTE_SUCCESS;
}

Просмотреть файл

@ -118,11 +118,6 @@ static int finalize(void)
}
}
/* if I am the HNP, I need to stop the comm recv */
if (ORTE_PROC_IS_HNP) {
orte_routed_base_comm_stop();
}
OBJ_DESTRUCT(&jobfam_list);
/* destruct the global condition and lock */
OBJ_DESTRUCT(&cond);
@ -536,13 +531,6 @@ static int init_routes(orte_jobid_t job, opal_buffer_t *ndat)
ORTE_JOBID_PRINT(job)));
if (NULL == ndat) {
/* if ndat is NULL, then this is being called during init, so just
* make myself available to catch any reported contact info
*/
if (ORTE_SUCCESS != (rc = orte_routed_base_comm_start())) {
ORTE_ERROR_LOG(rc);
return rc;
}
/* the HNP has no lifeline */
lifeline = NULL;
} else {

Просмотреть файл

@ -117,11 +117,6 @@ static int finalize(void)
}
}
/* if I am the HNP, I need to stop the comm recv */
if (ORTE_PROC_IS_HNP) {
orte_routed_base_comm_stop();
}
cleanup:
OBJ_DESTRUCT(&jobfam_list);
/* destruct the global condition and lock */
@ -524,13 +519,6 @@ static int init_routes(orte_jobid_t job, opal_buffer_t *ndat)
ORTE_JOBID_PRINT(job)));
if (NULL == ndat) {
/* if ndat is NULL, then this is being called during init, so just
* make myself available to catch any reported contact info
*/
if (ORTE_SUCCESS != (rc = orte_routed_base_comm_start())) {
ORTE_ERROR_LOG(rc);
return rc;
}
/* the HNP has no lifeline */
lifeline = NULL;
} else {
@ -708,7 +696,9 @@ static int route_lost(const orte_process_name_t *route)
opal_output(0, "%s routed:cm: daemon %s has died",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
ORTE_VPID_PRINT(route->vpid));
orte_errmgr.proc_aborted((orte_process_name_t*)route, 1);
orte_errmgr.update_state(route->jobid, ORTE_JOB_STATE_COMM_FAILED,
(orte_process_name_t*)route,
ORTE_PROC_STATE_COMM_FAILED, 1);
}
/* either way, take no further action */
return ORTE_SUCCESS;

Просмотреть файл

@ -82,11 +82,6 @@ static int finalize(void)
{
int rc;
/* if I am the HNP, I need to stop the comm recv */
if (ORTE_PROC_IS_HNP) {
orte_routed_base_comm_stop();
}
if (ORTE_PROC_IS_MPI && NULL != orte_process_info.my_daemon_uri) {
/* if a daemon launched me, register that I am leaving */
if (ORTE_SUCCESS != (rc = orte_routed_base_register_sync(false))) {
@ -222,15 +217,7 @@ static int init_routes(orte_jobid_t job, opal_buffer_t *ndat)
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
ORTE_JOBID_PRINT(job)));
if (NULL == ndat) {
/* if ndat is NULL, then this is being called during init, so just
* make myself available to catch any reported contact info
*/
if (ORTE_SUCCESS != (rc = orte_routed_base_comm_start())) {
ORTE_ERROR_LOG(rc);
return rc;
}
} else {
if (NULL != ndat) {
/* if this is for my own jobid, then I am getting an update of RML info
* for the daemons - so update our contact info and routes
*/

Просмотреть файл

@ -109,11 +109,6 @@ static int finalize(void)
}
}
/* if I am the HNP, I need to stop the comm recv */
if (ORTE_PROC_IS_HNP) {
orte_routed_base_comm_stop();
}
OBJ_DESTRUCT(&jobfam_list);
/* destruct the global condition and lock */
OBJ_DESTRUCT(&cond);
@ -493,13 +488,6 @@ static int init_routes(orte_jobid_t job, opal_buffer_t *ndat)
ORTE_JOBID_PRINT(job)));
if (NULL == ndat) {
/* if ndat is NULL, then this is being called during init, so just
* make myself available to catch any reported contact info
*/
if (ORTE_SUCCESS != (rc = orte_routed_base_comm_start())) {
ORTE_ERROR_LOG(rc);
return rc;
}
/* the HNP has no lifeline */
lifeline = NULL;
} else {

Просмотреть файл

@ -118,11 +118,6 @@ static int finalize(void)
}
}
/* if I am the HNP, I need to stop the comm recv */
if (ORTE_PROC_IS_HNP) {
orte_routed_base_comm_stop();
}
OBJ_DESTRUCT(&jobfam_list);
/* destruct the global condition and lock */
OBJ_DESTRUCT(&cond);
@ -525,13 +520,6 @@ static int init_routes(orte_jobid_t job, opal_buffer_t *ndat)
ORTE_JOBID_PRINT(job)));
if (NULL == ndat) {
/* if ndat is NULL, then this is being called during init, so just
* make myself available to catch any reported contact info
*/
if (ORTE_SUCCESS != (rc = orte_routed_base_comm_start())) {
ORTE_ERROR_LOG(rc);
return rc;
}
/* the HNP has no lifeline */
lifeline = NULL;
} else {

Просмотреть файл

@ -121,7 +121,7 @@ static void send_relay(opal_buffer_t *buf)
ORTE_VPID_PRINT(nm->vpid)));
target.vpid = nm->vpid;
if (ORTE_SUCCESS != (ret = orte_odls_base.comm(&target,
if (ORTE_SUCCESS != (ret = orte_comm(&target,
buf, ORTE_RML_TAG_DAEMON,
orte_daemon_cmd_processor))) {
ORTE_ERROR_LOG(ret);
@ -688,7 +688,7 @@ int orte_daemon_process_commands(orte_process_name_t* sender,
goto CLEANUP;
}
/* return response */
if (ORTE_SUCCESS != (ret = orte_odls_base.comm(sender, answer,
if (ORTE_SUCCESS != (ret = orte_comm(sender, answer,
ORTE_RML_TAG_TOOL, NULL))) {
ORTE_ERROR_LOG(ret);
}
@ -718,7 +718,7 @@ int orte_daemon_process_commands(orte_process_name_t* sender,
goto CLEANUP;
}
if (ORTE_SUCCESS != (ret = orte_odls_base.comm(sender, answer, tag, NULL))) {
if (ORTE_SUCCESS != (ret = orte_comm(sender, answer, tag, NULL))) {
ORTE_ERROR_LOG(ret);
}
OBJ_RELEASE(answer);
@ -742,7 +742,7 @@ int orte_daemon_process_commands(orte_process_name_t* sender,
OBJ_RELEASE(answer);
goto CLEANUP;
}
if (ORTE_SUCCESS != (ret = orte_odls_base.comm(sender, answer, ORTE_RML_TAG_TOOL, NULL))) {
if (ORTE_SUCCESS != (ret = orte_comm(sender, answer, ORTE_RML_TAG_TOOL, NULL))) {
ORTE_ERROR_LOG(ret);
}
OBJ_RELEASE(answer);
@ -813,7 +813,7 @@ int orte_daemon_process_commands(orte_process_name_t* sender,
}
}
}
if (ORTE_SUCCESS != (ret = orte_odls_base.comm(sender, answer, ORTE_RML_TAG_TOOL, NULL))) {
if (ORTE_SUCCESS != (ret = orte_comm(sender, answer, ORTE_RML_TAG_TOOL, NULL))) {
ORTE_ERROR_LOG(ret);
}
OBJ_RELEASE(answer);
@ -838,7 +838,7 @@ int orte_daemon_process_commands(orte_process_name_t* sender,
OBJ_RELEASE(answer);
goto CLEANUP;
}
if (ORTE_SUCCESS != (ret = orte_odls_base.comm(sender, answer, ORTE_RML_TAG_TOOL, NULL))) {
if (ORTE_SUCCESS != (ret = orte_comm(sender, answer, ORTE_RML_TAG_TOOL, NULL))) {
ORTE_ERROR_LOG(ret);
}
OBJ_RELEASE(answer);
@ -906,7 +906,7 @@ int orte_daemon_process_commands(orte_process_name_t* sender,
}
}
/* send the info */
if (ORTE_SUCCESS != (ret = orte_odls_base.comm(sender, answer, ORTE_RML_TAG_TOOL, NULL))) {
if (ORTE_SUCCESS != (ret = orte_comm(sender, answer, ORTE_RML_TAG_TOOL, NULL))) {
ORTE_ERROR_LOG(ret);
}
OBJ_RELEASE(answer);
@ -931,7 +931,7 @@ int orte_daemon_process_commands(orte_process_name_t* sender,
OBJ_RELEASE(answer);
goto CLEANUP;
}
if (ORTE_SUCCESS != (ret = orte_odls_base.comm(sender, answer, ORTE_RML_TAG_TOOL, NULL))) {
if (ORTE_SUCCESS != (ret = orte_comm(sender, answer, ORTE_RML_TAG_TOOL, NULL))) {
ORTE_ERROR_LOG(ret);
}
OBJ_RELEASE(answer);
@ -1012,7 +1012,7 @@ int orte_daemon_process_commands(orte_process_name_t* sender,
}
}
/* send the info */
if (ORTE_SUCCESS != (ret = orte_odls_base.comm(sender, answer, ORTE_RML_TAG_TOOL, NULL))) {
if (ORTE_SUCCESS != (ret = orte_comm(sender, answer, ORTE_RML_TAG_TOOL, NULL))) {
ORTE_ERROR_LOG(ret);
}
OBJ_RELEASE(answer);
@ -1205,7 +1205,7 @@ int orte_daemon_process_commands(orte_process_name_t* sender,
ret = ORTE_ERR_COMM_FAILURE;
break;
}
if (ORTE_SUCCESS != (ret = orte_odls_base.comm(return_addr, answer, ORTE_RML_TAG_TOOL, NULL))) {
if (ORTE_SUCCESS != (ret = orte_comm(return_addr, answer, ORTE_RML_TAG_TOOL, NULL))) {
ORTE_ERROR_LOG(ret);
}
OBJ_RELEASE(answer);

Просмотреть файл

@ -33,8 +33,10 @@
#include "opal/class/opal_pointer_array.h"
#include "opal/class/opal_value_array.h"
#include "opal/dss/dss.h"
#include "opal/threads/threads.h"
#include "orte/mca/errmgr/errmgr.h"
#include "orte/mca/rml/rml.h"
#include "orte/util/proc_info.h"
#include "orte/util/name_fns.h"
@ -118,8 +120,13 @@ char *orted_launch_cmd = NULL;
/* list of local children on a daemon */
opal_list_t orte_local_children;
opal_mutex_t orte_local_children_lock;
opal_condition_t orte_local_children_cond;
/* list of job data for local children on a daemon */
opal_list_t orte_local_jobdata;
opal_mutex_t orte_local_jobdata_lock;
opal_condition_t orte_local_jobdata_cond;
/* IOF controls */
bool orte_tag_output;
@ -167,6 +174,9 @@ bool orte_report_bindings = false;
/* barrier control */
bool orte_do_not_barrier = false;
/* comm fn for updating state */
orte_default_comm_fn_t orte_comm;
#endif /* !ORTE_DISABLE_FULL_RTE */
int orte_debug_output = -1;
@ -474,6 +484,28 @@ orte_job_t* orte_get_job_data_object(orte_jobid_t job)
return (orte_job_t*)opal_pointer_array_get_item(orte_job_data, ljob);
}
/*
 * Default communication function.
 *
 * If the recipient is this process and a callback is supplied, the buffer
 * is delivered locally through a message event that invokes cbfunc.
 * Otherwise the buffer is sent to the recipient with orte_rml.send_buffer.
 *
 * Returns ORTE_SUCCESS, or the negative value returned by send_buffer
 * (which is also logged).
 */
int orte_global_comm(orte_process_name_t *recipient,
                     opal_buffer_t *buf, orte_rml_tag_t tag,
                     orte_default_cbfunc_t cbfunc)
{
    int rc;
    bool to_self = (recipient->jobid == ORTE_PROC_MY_NAME->jobid &&
                    recipient->vpid == ORTE_PROC_MY_NAME->vpid);

    if (to_self && NULL != cbfunc) {
        /* I am the recipient and a direct fn is provided - use a message event */
        ORTE_MESSAGE_EVENT(ORTE_PROC_MY_NAME, buf, tag, cbfunc);
        return ORTE_SUCCESS;
    }

    /* go ahead and send it */
    rc = orte_rml.send_buffer(recipient, buf, tag, 0);
    if (0 > rc) {
        ORTE_ERROR_LOG(rc);
        return rc;
    }

    /* a non-negative return from send_buffer is reported as plain success */
    return ORTE_SUCCESS;
}
/*
* CONSTRUCTORS, DESTRUCTORS, AND CLASS INSTANTIATIONS
@ -600,11 +632,21 @@ static void orte_job_construct(orte_job_t* job)
job->num_launched = 0;
job->num_reported = 0;
job->num_terminated = 0;
job->num_daemons_reported = 0;
job->abort = false;
job->aborted_proc = NULL;
OBJ_CONSTRUCT(&job->reported_lock, opal_mutex_t);
OBJ_CONSTRUCT(&job->reported_cond, opal_condition_t);
job->not_reported = true;
job->max_restarts = INT32_MAX;
job->launch_msg_sent.tv_sec = 0;
job->launch_msg_sent.tv_usec = 0;
job->max_launch_msg_recvd.tv_sec = 0;
job->max_launch_msg_recvd.tv_usec = 0;
#if OPAL_ENABLE_FT_CR == 1
job->ckpt_state = 0;
job->ckpt_snapshot_ref = NULL;
@ -650,6 +692,9 @@ static void orte_job_destruct(orte_job_t* job)
}
OBJ_RELEASE(job->procs);
OBJ_DESTRUCT(&job->reported_lock);
OBJ_DESTRUCT(&job->reported_cond);
#if OPAL_ENABLE_FT_CR == 1
if (NULL != job->ckpt_snapshot_ref) {
free(job->ckpt_snapshot_ref);

Просмотреть файл

@ -36,8 +36,10 @@
#include "opal/class/opal_pointer_array.h"
#include "opal/class/opal_value_array.h"
#include "opal/threads/threads.h"
#include "orte/mca/plm/plm_types.h"
#include "orte/mca/rml/rml_types.h"
#include "orte/util/proc_info.h"
#include "orte/util/name_fns.h"
#include "orte/runtime/runtime.h"
@ -378,12 +380,22 @@ typedef struct {
orte_vpid_t num_reported;
/* number of procs terminated */
orte_vpid_t num_terminated;
/* number of daemons reported launched so we can track progress */
orte_vpid_t num_daemons_reported;
/* lock/cond/flag for tracking when all procs reported */
opal_mutex_t reported_lock;
opal_condition_t reported_cond;
bool not_reported;
/* did this job abort? */
bool abort;
/* proc that caused that to happen */
struct orte_proc_t *aborted_proc;
/* max number of times a process can be restarted */
int32_t max_restarts;
/* time launch message was sent */
struct timeval launch_msg_sent;
/* max time for launch msg to be received */
struct timeval max_launch_msg_recvd;
#if OPAL_ENABLE_FT_CR == 1
/* ckpt state */
size_t ckpt_state;
@ -595,8 +607,14 @@ ORTE_DECLSPEC extern char *orted_launch_cmd;
/* list of local children on a daemon */
ORTE_DECLSPEC extern opal_list_t orte_local_children;
ORTE_DECLSPEC extern opal_mutex_t orte_local_children_lock;
ORTE_DECLSPEC extern opal_condition_t orte_local_children_cond;
/* list of job data for local children on a daemon */
ORTE_DECLSPEC extern opal_list_t orte_local_jobdata;
ORTE_DECLSPEC extern opal_mutex_t orte_local_jobdata_lock;
ORTE_DECLSPEC extern opal_condition_t orte_local_jobdata_cond;
/* whether or not to forward SIGTSTP and SIGCONT signals */
ORTE_DECLSPEC extern bool orte_forward_job_control;
@ -643,6 +661,20 @@ ORTE_DECLSPEC extern bool orte_report_bindings;
/* barrier control */
ORTE_DECLSPEC extern bool orte_do_not_barrier;
/* comm interface */
typedef void (*orte_default_cbfunc_t)(int fd, short event, void *data);
typedef int (*orte_default_comm_fn_t)(orte_process_name_t *recipient,
opal_buffer_t *buf,
orte_rml_tag_t tag,
orte_default_cbfunc_t cbfunc);
/* comm fn for updating state */
ORTE_DECLSPEC extern orte_default_comm_fn_t orte_comm;
ORTE_DECLSPEC int orte_global_comm(orte_process_name_t *recipient,
opal_buffer_t *buf, orte_rml_tag_t tag,
orte_default_cbfunc_t cbfunc);
#endif /* ORTE_DISABLE_FULL_SUPPORT */
END_C_DECLS

Просмотреть файл

@ -40,6 +40,7 @@
#include "orte/mca/ess/ess.h"
#include "orte/mca/errmgr/errmgr.h"
#include "orte/util/proc_info.h"
#include "orte/util/error_strings.h"
#include "orte/runtime/runtime.h"
#include "orte/runtime/orte_globals.h"

Просмотреть файл

@ -95,7 +95,7 @@ in the application to be terminated by signals sent by %s
#
[orterun:proc-exit-no-sync]
%s has exited due to process rank %lu with PID %lu on
node %s exiting improperly. There are two reasons this could occur:
node %s exiting improperly. There are three reasons this could occur:
1. this process did not call "init" before exiting, but others in
the job did. This can cause a job to hang indefinitely while it waits
@ -106,8 +106,16 @@ then ALL processes must call "init" prior to termination.
By rule, all processes that call "init" MUST call "finalize" prior to
exiting or it will be considered an "abnormal termination"
3. this process called "MPI_Abort" or "orte_abort" and the mca parameter
orte_create_session_dirs is set to false. In this case, the run-time cannot
detect that the abort call was an abnormal termination. Hence, the only
error message you will receive is this one.
This may have caused other processes in the application to be
terminated by signals sent by %s (as reported here).
You can avoid this message by specifying -quiet on the %s command line.
#
[orterun:proc-exit-no-sync-unknown]
%s has exited due to a process exiting without calling "finalize",

Просмотреть файл

@ -1094,7 +1094,7 @@ static void dump_aborted_procs(void)
} else {
orte_show_help("help-orterun.txt", "orterun:proc-exit-no-sync", true,
orterun_basename, (unsigned long)proc->name.vpid, (unsigned long)proc->pid,
proc->node->name, orterun_basename);
proc->node->name, orterun_basename, orterun_basename);
}
}
return;

Просмотреть файл

@ -42,7 +42,8 @@ headers += \
util/name_fns.h \
util/proc_info.h \
util/session_dir.h \
util/show_help.h
util/show_help.h \
util/error_strings.h
libopen_rte_la_SOURCES += \
util/error_strings.c \

Просмотреть файл

@ -9,6 +9,7 @@
* University of Stuttgart. All rights reserved.
* Copyright (c) 2004-2005 The Regents of the University of California.
* All rights reserved.
* Copyright (c) 2010 Cisco Systems, Inc. All rights reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
@ -23,8 +24,9 @@
#include <stdio.h>
const char *
orte_err2str(int errnum)
#include "orte/util/error_strings.h"
const char *orte_err2str(int errnum)
{
const char *retval;
switch (errnum) {
@ -130,3 +132,82 @@ orte_err2str(int errnum)
return retval;
}
/**
 * Return a human-readable name for a job state.
 *
 * The result points to a string literal with static storage duration, so
 * the caller must neither modify nor free it.  (Previously each call
 * returned a fresh strdup() copy through a const pointer, which leaked on
 * every call.)
 *
 * @param state  job state to describe
 * @return       the state's name, or "UNKNOWN STATE!" for an unrecognized value
 */
const char *orte_job_state_to_str(orte_job_state_t state)
{
    switch(state) {
    case ORTE_JOB_STATE_UNDEF:
        return "UNDEFINED";
    case ORTE_JOB_STATE_INIT:
        return "INITIALIZED";
    case ORTE_JOB_STATE_RESTART:
        return "RESTARTING";
    case ORTE_JOB_STATE_LAUNCHED:
        return "LAUNCHED";
    case ORTE_JOB_STATE_RUNNING:
        return "RUNNING";
    case ORTE_JOB_STATE_SUSPENDED:
        return "SUSPENDED";
    case ORTE_JOB_STATE_REGISTERED:
        return "SYNC REGISTERED";
    case ORTE_JOB_STATE_UNTERMINATED:
        return "UNTERMINATED";
    case ORTE_JOB_STATE_TERMINATED:
        return "NORMALLY TERMINATED";
    case ORTE_JOB_STATE_ABORTED:
        return "ABORTED";
    case ORTE_JOB_STATE_FAILED_TO_START:
        return "FAILED TO START";
    case ORTE_JOB_STATE_ABORTED_BY_SIG:
        return "ABORTED BY SIGNAL";
    case ORTE_JOB_STATE_ABORTED_WO_SYNC:
        return "TERMINATED WITHOUT SYNC";
    case ORTE_JOB_STATE_KILLED_BY_CMD:
        return "KILLED BY INTERNAL COMMAND";
    case ORTE_JOB_STATE_COMM_FAILED:
        return "COMMUNICATION FAILURE";
    case ORTE_JOB_STATE_NEVER_LAUNCHED:
        return "NEVER LAUNCHED";
    case ORTE_JOB_STATE_ABORT_ORDERED:
        return "ABORT IN PROGRESS";
    default:
        return "UNKNOWN STATE!";
    }
}
/**
 * Return a human-readable name for a process state.
 *
 * The result points to a string literal with static storage duration, so
 * the caller must neither modify nor free it.  (Previously each call
 * returned a fresh strdup() copy through a const pointer, which leaked on
 * every call.)
 *
 * @param state  process state to describe
 * @return       the state's name, or "UNKNOWN STATE!" for an unrecognized value
 */
const char *orte_proc_state_to_str(orte_proc_state_t state)
{
    switch(state) {
    case ORTE_PROC_STATE_UNDEF:
        return "UNDEFINED";
    case ORTE_PROC_STATE_INIT:
        return "INITIALIZED";
    case ORTE_PROC_STATE_RESTART:
        return "RESTARTING";
    case ORTE_PROC_STATE_LAUNCHED:
        return "LAUNCHED";
    case ORTE_PROC_STATE_RUNNING:
        return "RUNNING";
    case ORTE_PROC_STATE_REGISTERED:
        return "SYNC REGISTERED";
    case ORTE_PROC_STATE_UNTERMINATED:
        return "UNTERMINATED";
    case ORTE_PROC_STATE_TERMINATED:
        return "NORMALLY TERMINATED";
    case ORTE_PROC_STATE_ABORTED:
        return "ABORTED";
    case ORTE_PROC_STATE_FAILED_TO_START:
        return "FAILED TO START";
    case ORTE_PROC_STATE_ABORTED_BY_SIG:
        return "ABORTED BY SIGNAL";
    case ORTE_PROC_STATE_TERM_WO_SYNC:
        return "TERMINATED WITHOUT SYNC";
    case ORTE_PROC_STATE_KILLED_BY_CMD:
        return "KILLED BY INTERNAL COMMAND";
    case ORTE_PROC_STATE_COMM_FAILED:
        return "COMMUNICATION FAILURE";
    default:
        return "UNKNOWN STATE!";
    }
}

44
orte/util/error_strings.h Обычный файл
Просмотреть файл

@ -0,0 +1,44 @@
/*
* Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
* University Research and Technology
* Corporation. All rights reserved.
* Copyright (c) 2004-2006 The University of Tennessee and The University
* of Tennessee Research Foundation. All rights
* reserved.
* Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
* University of Stuttgart. All rights reserved.
* Copyright (c) 2004-2005 The Regents of the University of California.
* All rights reserved.
* Copyright (c) 2008 Sun Microsystems, Inc. All rights reserved.
* Copyright (c) 2010 Cisco Systems, Inc. All rights reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
*
* $HEADER$
*/
/** @file:
*
*/
#ifndef _ORTE_ERROR_STRINGS_H_
#define _ORTE_ERROR_STRINGS_H_
#include "orte_config.h"
#include "orte/runtime/orte_globals.h"
#include "orte/mca/plm/plm_types.h"
BEGIN_C_DECLS
/** Map an ORTE error code to a descriptive string. */
ORTE_DECLSPEC const char *orte_err2str(int errnum);
/** Printable name of a job state; unrecognized values yield "UNKNOWN STATE!". */
ORTE_DECLSPEC const char *orte_job_state_to_str(orte_job_state_t state);
/** Printable name of a process state; unrecognized values yield "UNKNOWN STATE!". */
ORTE_DECLSPEC const char *orte_proc_state_to_str(orte_proc_state_t state);
END_C_DECLS
#endif

Просмотреть файл

@ -550,3 +550,24 @@ uint64_t orte_util_hash_name(const orte_process_name_t * name) {
return hash;
}
/**
 * Format an elapsed time for display.
 *
 * The two inputs are combined (whole seconds carried out of usecs).  If the
 * total is under one second the result is rendered as "%8.2f millisecs";
 * otherwise as "%3lu:%02lu min:sec".
 *
 * @param secs   seconds component of the interval
 * @param usecs  microseconds component of the interval
 * @return       a newly allocated string that the caller must free(), or
 *               NULL if the string could not be allocated
 */
char *orte_pretty_print_timing(int64_t secs, int64_t usecs)
{
    unsigned long minutes, seconds;
    float fsecs;
    char *timestring = NULL;
    int rc;

    seconds = secs + (usecs / 1000000l);
    minutes = seconds / 60l;
    seconds = seconds % 60l;

    if (0 == minutes && 0 == seconds) {
        /* sub-second interval: report it in milliseconds */
        fsecs = ((float)(secs)*1000000.0 + (float)usecs) / 1000.0;
        rc = asprintf(&timestring, "%8.2f millisecs", fsecs);
    } else {
        rc = asprintf(&timestring, "%3lu:%02lu min:sec", minutes, seconds);
    }

    /* on failure asprintf leaves the output pointer unspecified on some
     * platforms, so never hand that value back to the caller
     */
    if (0 > rc) {
        return NULL;
    }

    return timestring;
}

Просмотреть файл

@ -66,6 +66,7 @@ ORTE_DECLSPEC char* orte_util_print_local_jobid(const orte_jobid_t job);
#define ORTE_LOCAL_JOBID_PRINT(n) \
orte_util_print_local_jobid(n)
ORTE_DECLSPEC char *orte_pretty_print_timing(int64_t secs, int64_t usecs);
/* a macro for identifying the job family - i.e., for
* extracting the mpirun-specific id field of the jobid