
Start updating the various mappers to the new procedure:

- Remove the stale lama component as it is now very out-of-date.
- Bring round_robin and PPR online, and modify the mindist component (but cannot test/debug it).
- Remove an unneeded test.
- Fix memory corruption by re-initializing a variable to NULL inside the loop.
- Resolve the race condition identified by @ggouaillardet by resetting the mapped flag within the same event where it was set. There is no need to retain the flag beyond that point as it isn't used again.
- Add a new job attribute ORTE_JOB_FULLY_DESCRIBED to indicate that all the job information (including locations and bindings) is included in the launch message, so the backend daemons do not need to do any map computation for the job. Use this for the seq, rankfile, and mindist mappers until someone decides to update them. Note that this maintains functionality, but users of those three mappers will see larger launch messages and poorer launch scaling than those using the other mappers.
- Have the mindist module add procs to the job's proc array since it is a fully described module.
- Protect the hnp-not-in-allocation case: per the patch suggested by Gilles, protect the HNP node when it gets added in the absence of any other allocation or hostfile.

Signed-off-by: Ralph Castain <rhc@open-mpi.org>
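For context, here is a minimal sketch (not part of this change set) of how a mapper might flag a job as fully described. It assumes the standard ORTE attribute helpers (orte_set_attribute/orte_get_attribute with ORTE_ATTR_GLOBAL and OPAL_BOOL); the call sites shown are hypothetical:

    /* Sketch: a fully-described mapper would mark the job so that the
     * backend daemons skip map computation (assumed usage). */
    orte_set_attribute(&jdata->attributes, ORTE_JOB_FULLY_DESCRIBED,
                       ORTE_ATTR_GLOBAL, NULL, OPAL_BOOL);

    /* Sketch: when constructing the launch message, the flag could then be
     * tested to decide whether to pack the full location/binding data. */
    if (orte_get_attribute(&jdata->attributes, ORTE_JOB_FULLY_DESCRIBED, NULL, OPAL_BOOL)) {
        /* pack locations and bindings into the launch message */
    }

This reflects the trade-off described above: fully described jobs carry larger launch messages, but the daemons need no mapping work.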
/*
 * Copyright (c) 2011-2012 Los Alamos National Security, LLC.
 * Copyright (c) 2014-2017 Intel, Inc. All rights reserved.
 * $COPYRIGHT$
 *
 * Additional copyrights may follow
 *
 * $HEADER$
 */

/** @file **/

#include "orte_config.h"
#include "orte/constants.h"

#include "opal/class/opal_list.h"
#include "opal/mca/event/event.h"
#include "opal/mca/pmix/pmix.h"

#include "orte/runtime/orte_globals.h"
#include "orte/runtime/orte_wait.h"
#include "orte/mca/errmgr/errmgr.h"
#include "orte/mca/grpcomm/grpcomm.h"
#include "orte/mca/iof/base/base.h"
#include "orte/mca/rmaps/rmaps_types.h"
#include "orte/mca/plm/plm.h"
#include "orte/mca/rml/rml.h"
#include "orte/mca/routed/routed.h"
#include "orte/util/session_dir.h"

#include "orte/mca/state/base/base.h"
#include "orte/mca/state/base/state_private.h"

void orte_state_base_activate_job_state(orte_job_t *jdata,
                                        orte_job_state_t state)
{
    opal_list_item_t *itm, *any=NULL, *error=NULL;
    orte_state_t *s;
    orte_state_caddy_t *caddy;

    for (itm = opal_list_get_first(&orte_job_states);
         itm != opal_list_get_end(&orte_job_states);
         itm = opal_list_get_next(itm)) {
        s = (orte_state_t*)itm;
        if (s->job_state == ORTE_JOB_STATE_ANY) {
            /* save this place */
            any = itm;
        }
        if (s->job_state == ORTE_JOB_STATE_ERROR) {
            error = itm;
        }
        if (s->job_state == state) {
            OPAL_OUTPUT_VERBOSE((1, orte_state_base_framework.framework_output,
                                 "%s ACTIVATING JOB %s STATE %s PRI %d",
                                 ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                                 (NULL == jdata) ? "NULL" : ORTE_JOBID_PRINT(jdata->jobid),
                                 orte_job_state_to_str(state), s->priority));
            if (NULL == s->cbfunc) {
                OPAL_OUTPUT_VERBOSE((1, orte_state_base_framework.framework_output,
                                     "%s NULL CBFUNC FOR JOB %s STATE %s",
                                     ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                                     (NULL == jdata) ? "ALL" : ORTE_JOBID_PRINT(jdata->jobid),
                                     orte_job_state_to_str(state)));
                return;
            }
            caddy = OBJ_NEW(orte_state_caddy_t);
            if (NULL != jdata) {
                caddy->jdata = jdata;
                caddy->job_state = state;
                OBJ_RETAIN(jdata);
            }
            opal_event_set(orte_event_base, &caddy->ev, -1, OPAL_EV_WRITE, s->cbfunc, caddy);
            opal_event_set_priority(&caddy->ev, s->priority);
            opal_event_active(&caddy->ev, OPAL_EV_WRITE, 1);
            return;
        }
    }
    /* if we get here, then the state wasn't found, so execute
     * the default handler if it is defined
     */
    if (ORTE_JOB_STATE_ERROR < state && NULL != error) {
        s = (orte_state_t*)error;
    } else if (NULL != any) {
        s = (orte_state_t*)any;
    } else {
        OPAL_OUTPUT_VERBOSE((1, orte_state_base_framework.framework_output,
                             "ACTIVATE: ANY STATE NOT FOUND"));
        return;
    }
    if (NULL == s->cbfunc) {
        OPAL_OUTPUT_VERBOSE((1, orte_state_base_framework.framework_output,
                             "ACTIVATE: ANY STATE HANDLER NOT DEFINED"));
        return;
    }
    caddy = OBJ_NEW(orte_state_caddy_t);
    if (NULL != jdata) {
        caddy->jdata = jdata;
        caddy->job_state = state;
        OBJ_RETAIN(jdata);
    }
    OPAL_OUTPUT_VERBOSE((1, orte_state_base_framework.framework_output,
                         "%s ACTIVATING JOB %s STATE %s PRI %d",
                         ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                         (NULL == jdata) ? "NULL" : ORTE_JOBID_PRINT(jdata->jobid),
                         orte_job_state_to_str(state), s->priority));
    opal_event_set(orte_event_base, &caddy->ev, -1, OPAL_EV_WRITE, s->cbfunc, caddy);
    opal_event_set_priority(&caddy->ev, s->priority);
    opal_event_active(&caddy->ev, OPAL_EV_WRITE, 1);
}

int orte_state_base_add_job_state(orte_job_state_t state,
                                  orte_state_cbfunc_t cbfunc,
                                  int priority)
{
    opal_list_item_t *item;
    orte_state_t *st;

    /* check for uniqueness */
    for (item = opal_list_get_first(&orte_job_states);
         item != opal_list_get_end(&orte_job_states);
         item = opal_list_get_next(item)) {
        st = (orte_state_t*)item;
        if (st->job_state == state) {
            OPAL_OUTPUT_VERBOSE((1, orte_state_base_framework.framework_output,
                                 "DUPLICATE STATE DEFINED: %s",
                                 orte_job_state_to_str(state)));
            return ORTE_ERR_BAD_PARAM;
        }
    }

    st = OBJ_NEW(orte_state_t);
    st->job_state = state;
    st->cbfunc = cbfunc;
    st->priority = priority;
    opal_list_append(&orte_job_states, &(st->super));

    return ORTE_SUCCESS;
}

int orte_state_base_set_job_state_callback(orte_job_state_t state,
                                           orte_state_cbfunc_t cbfunc)
{
    opal_list_item_t *item;
    orte_state_t *st;

    for (item = opal_list_get_first(&orte_job_states);
         item != opal_list_get_end(&orte_job_states);
         item = opal_list_get_next(item)) {
        st = (orte_state_t*)item;
        if (st->job_state == state) {
            st->cbfunc = cbfunc;
            return ORTE_SUCCESS;
        }
    }

    /* if not found, assume SYS priority and install it */
    st = OBJ_NEW(orte_state_t);
    st->job_state = state;
    st->cbfunc = cbfunc;
    st->priority = ORTE_SYS_PRI;
    opal_list_append(&orte_job_states, &(st->super));

    return ORTE_SUCCESS;
}

int orte_state_base_set_job_state_priority(orte_job_state_t state,
                                           int priority)
{
    opal_list_item_t *item;
    orte_state_t *st;

    for (item = opal_list_get_first(&orte_job_states);
         item != opal_list_get_end(&orte_job_states);
         item = opal_list_get_next(item)) {
        st = (orte_state_t*)item;
        if (st->job_state == state) {
            st->priority = priority;
            return ORTE_SUCCESS;
        }
    }
    return ORTE_ERR_NOT_FOUND;
}

int orte_state_base_remove_job_state(orte_job_state_t state)
{
    opal_list_item_t *item;
    orte_state_t *st;

    for (item = opal_list_get_first(&orte_job_states);
         item != opal_list_get_end(&orte_job_states);
         item = opal_list_get_next(item)) {
        st = (orte_state_t*)item;
        if (st->job_state == state) {
            opal_list_remove_item(&orte_job_states, item);
            OBJ_RELEASE(item);
            return ORTE_SUCCESS;
        }
    }
    return ORTE_ERR_NOT_FOUND;
}

void orte_state_base_print_job_state_machine(void)
{
    opal_list_item_t *item;
    orte_state_t *st;

    opal_output(0, "ORTE_JOB_STATE_MACHINE:");
    for (item = opal_list_get_first(&orte_job_states);
         item != opal_list_get_end(&orte_job_states);
         item = opal_list_get_next(item)) {
        st = (orte_state_t*)item;
        opal_output(0, "\tState: %s cbfunc: %s",
                    orte_job_state_to_str(st->job_state),
                    (NULL == st->cbfunc) ? "NULL" : "DEFINED");
    }
}

/**** PROC STATE MACHINE ****/
void orte_state_base_activate_proc_state(orte_process_name_t *proc,
                                         orte_proc_state_t state)
{
    opal_list_item_t *itm, *any=NULL, *error=NULL;
    orte_state_t *s;
    orte_state_caddy_t *caddy;

    for (itm = opal_list_get_first(&orte_proc_states);
         itm != opal_list_get_end(&orte_proc_states);
         itm = opal_list_get_next(itm)) {
        s = (orte_state_t*)itm;
        if (s->proc_state == ORTE_PROC_STATE_ANY) {
            /* save this place */
            any = itm;
        }
        if (s->proc_state == ORTE_PROC_STATE_ERROR) {
            error = itm;
        }
        if (s->proc_state == state) {
            OPAL_OUTPUT_VERBOSE((1, orte_state_base_framework.framework_output,
                                 "%s ACTIVATING PROC %s STATE %s PRI %d",
                                 ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                                 ORTE_NAME_PRINT(proc),
                                 orte_proc_state_to_str(state), s->priority));
            if (NULL == s->cbfunc) {
                OPAL_OUTPUT_VERBOSE((1, orte_state_base_framework.framework_output,
                                     "%s NULL CBFUNC FOR PROC %s STATE %s",
                                     ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                                     ORTE_NAME_PRINT(proc),
                                     orte_proc_state_to_str(state)));
                return;
            }
            caddy = OBJ_NEW(orte_state_caddy_t);
            caddy->name = *proc;
            caddy->proc_state = state;
            opal_event_set(orte_event_base, &caddy->ev, -1, OPAL_EV_WRITE, s->cbfunc, caddy);
            opal_event_set_priority(&caddy->ev, s->priority);
            opal_event_active(&caddy->ev, OPAL_EV_WRITE, 1);
            return;
        }
    }
    /* if we get here, then the state wasn't found, so execute
     * the default handler if it is defined
     */
    if (ORTE_PROC_STATE_ERROR < state && NULL != error) {
        s = (orte_state_t*)error;
    } else if (NULL != any) {
        s = (orte_state_t*)any;
    } else {
        OPAL_OUTPUT_VERBOSE((1, orte_state_base_framework.framework_output,
                             "INCREMENT: ANY STATE NOT FOUND"));
        return;
    }
    if (NULL == s->cbfunc) {
        OPAL_OUTPUT_VERBOSE((1, orte_state_base_framework.framework_output,
                             "ACTIVATE: ANY STATE HANDLER NOT DEFINED"));
        return;
    }
    caddy = OBJ_NEW(orte_state_caddy_t);
    caddy->name = *proc;
    caddy->proc_state = state;
    OPAL_OUTPUT_VERBOSE((1, orte_state_base_framework.framework_output,
                         "%s ACTIVATING PROC %s STATE %s PRI %d",
                         ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                         ORTE_NAME_PRINT(proc),
                         orte_proc_state_to_str(state), s->priority));
    opal_event_set(orte_event_base, &caddy->ev, -1, OPAL_EV_WRITE, s->cbfunc, caddy);
    opal_event_set_priority(&caddy->ev, s->priority);
    opal_event_active(&caddy->ev, OPAL_EV_WRITE, 1);
}

int orte_state_base_add_proc_state(orte_proc_state_t state,
                                   orte_state_cbfunc_t cbfunc,
                                   int priority)
{
    opal_list_item_t *item;
    orte_state_t *st;

    /* check for uniqueness */
    for (item = opal_list_get_first(&orte_proc_states);
         item != opal_list_get_end(&orte_proc_states);
         item = opal_list_get_next(item)) {
        st = (orte_state_t*)item;
        if (st->proc_state == state) {
            OPAL_OUTPUT_VERBOSE((1, orte_state_base_framework.framework_output,
                                 "DUPLICATE STATE DEFINED: %s",
                                 orte_proc_state_to_str(state)));
            return ORTE_ERR_BAD_PARAM;
        }
    }

    st = OBJ_NEW(orte_state_t);
    st->proc_state = state;
    st->cbfunc = cbfunc;
    st->priority = priority;
    opal_list_append(&orte_proc_states, &(st->super));

    return ORTE_SUCCESS;
}

int orte_state_base_set_proc_state_callback(orte_proc_state_t state,
                                            orte_state_cbfunc_t cbfunc)
{
    opal_list_item_t *item;
    orte_state_t *st;

    for (item = opal_list_get_first(&orte_proc_states);
         item != opal_list_get_end(&orte_proc_states);
         item = opal_list_get_next(item)) {
        st = (orte_state_t*)item;
        if (st->proc_state == state) {
            st->cbfunc = cbfunc;
            return ORTE_SUCCESS;
        }
    }
    return ORTE_ERR_NOT_FOUND;
}

int orte_state_base_set_proc_state_priority(orte_proc_state_t state,
                                            int priority)
{
    opal_list_item_t *item;
    orte_state_t *st;

    for (item = opal_list_get_first(&orte_proc_states);
         item != opal_list_get_end(&orte_proc_states);
         item = opal_list_get_next(item)) {
        st = (orte_state_t*)item;
        if (st->proc_state == state) {
            st->priority = priority;
            return ORTE_SUCCESS;
        }
    }
    return ORTE_ERR_NOT_FOUND;
}

int orte_state_base_remove_proc_state(orte_proc_state_t state)
{
    opal_list_item_t *item;
    orte_state_t *st;

    for (item = opal_list_get_first(&orte_proc_states);
         item != opal_list_get_end(&orte_proc_states);
         item = opal_list_get_next(item)) {
        st = (orte_state_t*)item;
        if (st->proc_state == state) {
            opal_list_remove_item(&orte_proc_states, item);
            OBJ_RELEASE(item);
            return ORTE_SUCCESS;
        }
    }
    return ORTE_ERR_NOT_FOUND;
}

void orte_state_base_print_proc_state_machine(void)
{
    opal_list_item_t *item;
    orte_state_t *st;

    opal_output(0, "ORTE_PROC_STATE_MACHINE:");
    for (item = opal_list_get_first(&orte_proc_states);
         item != opal_list_get_end(&orte_proc_states);
         item = opal_list_get_next(item)) {
        st = (orte_state_t*)item;
        opal_output(0, "\tState: %s cbfunc: %s",
                    orte_proc_state_to_str(st->proc_state),
                    (NULL == st->cbfunc) ? "NULL" : "DEFINED");
    }
}

static void cleanup_node(orte_proc_t *proc)
{
    orte_node_t *node;
    orte_proc_t *p;
    int i;

    OPAL_OUTPUT_VERBOSE((2, orte_state_base_framework.framework_output,
                         "%s state:base:cleanup_node on proc %s",
                         ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                         ORTE_NAME_PRINT(&proc->name)));

    if (NULL == (node = proc->node)) {
        return;
    }
    node->num_procs--;
    node->slots_inuse--;
    for (i=0; i < node->procs->size; i++) {
        if (NULL == (p = (orte_proc_t*)opal_pointer_array_get_item(node->procs, i))) {
            continue;
        }
        if (p->name.jobid == proc->name.jobid &&
            p->name.vpid == proc->name.vpid) {
            opal_pointer_array_set_item(node->procs, i, NULL);
            OBJ_RELEASE(p);
            break;
        }
    }
}

void orte_state_base_local_launch_complete(int fd, short argc, void *cbdata)
{
    orte_state_caddy_t *state = (orte_state_caddy_t*)cbdata;
    orte_job_t *jdata = state->jdata;

    if (orte_report_launch_progress) {
        if (0 == jdata->num_daemons_reported % 100 ||
            jdata->num_daemons_reported == orte_process_info.num_procs) {
            ORTE_ACTIVATE_JOB_STATE(jdata, ORTE_JOB_STATE_REPORT_PROGRESS);
        }
    }
    OBJ_RELEASE(state);
}

void orte_state_base_cleanup_job(int fd, short argc, void *cbdata)
{
    orte_state_caddy_t *caddy = (orte_state_caddy_t*)cbdata;
    orte_job_t *jdata = caddy->jdata;

    OPAL_OUTPUT_VERBOSE((2, orte_state_base_framework.framework_output,
                         "%s state:base:cleanup on job %s",
                         ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                         (NULL == jdata) ? "NULL" : ORTE_JOBID_PRINT(jdata->jobid)));

    /* flag that we were notified */
    jdata->state = ORTE_JOB_STATE_NOTIFIED;
    /* send us back thru job complete */
    ORTE_ACTIVATE_JOB_STATE(jdata, ORTE_JOB_STATE_TERMINATED);
    OBJ_RELEASE(caddy);
}

void orte_state_base_report_progress(int fd, short argc, void *cbdata)
{
    orte_state_caddy_t *caddy = (orte_state_caddy_t*)cbdata;
    orte_job_t *jdata = caddy->jdata;

    opal_output(orte_clean_output, "App launch reported: %d (out of %d) daemons - %d (out of %d) procs",
                (int)jdata->num_daemons_reported, (int)orte_process_info.num_procs,
                (int)jdata->num_launched, (int)jdata->num_procs);
    OBJ_RELEASE(caddy);
}

static void _send_notification(int status,
                               orte_proc_state_t state,
                               orte_process_name_t *proc,
                               orte_process_name_t *target)
{
    opal_buffer_t *buf;
    orte_grpcomm_signature_t sig;
    int rc;
    opal_value_t kv, *kvptr;
    orte_process_name_t daemon;

    buf = OBJ_NEW(opal_buffer_t);

    opal_output_verbose(5, orte_state_base_framework.framework_output,
                        "%s state:base:sending notification %s proc %s target %s",
                        ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                        ORTE_ERROR_NAME(status),
                        ORTE_NAME_PRINT(proc),
                        ORTE_NAME_PRINT(target));

    /* pack the status */
    if (ORTE_SUCCESS != (rc = opal_dss.pack(buf, &status, 1, OPAL_INT))) {
        ORTE_ERROR_LOG(rc);
        OBJ_RELEASE(buf);
        return;
    }

    /* the source is the proc */
    if (ORTE_SUCCESS != (rc = opal_dss.pack(buf, proc, 1, ORTE_NAME))) {
        ORTE_ERROR_LOG(rc);
        OBJ_RELEASE(buf);
        return;
    }

    if (OPAL_ERR_PROC_ABORTED == status) {
        /* we will pass four opal_value_t's */
        rc = 4;
        if (ORTE_SUCCESS != (rc = opal_dss.pack(buf, &rc, 1, OPAL_INT))) {
            ORTE_ERROR_LOG(rc);
            OBJ_RELEASE(buf);
            return;
        }
        /* pass along the affected proc(s) */
        OBJ_CONSTRUCT(&kv, opal_value_t);
        kv.key = strdup(OPAL_PMIX_EVENT_AFFECTED_PROC);
        kv.type = OPAL_NAME;
        kv.data.name.jobid = proc->jobid;
        kv.data.name.vpid = proc->vpid;
        kvptr = &kv;
        if (ORTE_SUCCESS != (rc = opal_dss.pack(buf, &kvptr, 1, OPAL_VALUE))) {
            ORTE_ERROR_LOG(rc);
            OBJ_DESTRUCT(&kv);
            OBJ_RELEASE(buf);
            return;
        }
        OBJ_DESTRUCT(&kv);
    } else {
        /* we are going to pass three opal_value_t's */
        rc = 3;
        if (ORTE_SUCCESS != (rc = opal_dss.pack(buf, &rc, 1, OPAL_INT))) {
            ORTE_ERROR_LOG(rc);
            OBJ_RELEASE(buf);
            return;
        }
    }

    /* pass along the affected proc(s) */
    OBJ_CONSTRUCT(&kv, opal_value_t);
    kv.key = strdup(OPAL_PMIX_EVENT_AFFECTED_PROC);
    kv.type = OPAL_NAME;
    kv.data.name.jobid = proc->jobid;
    kv.data.name.vpid = proc->vpid;
    kvptr = &kv;
    if (ORTE_SUCCESS != (rc = opal_dss.pack(buf, &kvptr, 1, OPAL_VALUE))) {
        ORTE_ERROR_LOG(rc);
        OBJ_DESTRUCT(&kv);
        OBJ_RELEASE(buf);
        return;
    }
    OBJ_DESTRUCT(&kv);

    /* pass along the proc(s) to be notified */
    OBJ_CONSTRUCT(&kv, opal_value_t);
    kv.key = strdup(OPAL_PMIX_EVENT_CUSTOM_RANGE);
    kv.type = OPAL_NAME;
    kv.data.name.jobid = target->jobid;
    kv.data.name.vpid = target->vpid;
    kvptr = &kv;
    if (ORTE_SUCCESS != (rc = opal_dss.pack(buf, &kvptr, 1, OPAL_VALUE))) {
        ORTE_ERROR_LOG(rc);
        OBJ_DESTRUCT(&kv);
        OBJ_RELEASE(buf);
        return;
    }
    OBJ_DESTRUCT(&kv);

    /* mark this as intended for non-default event handlers */
    OBJ_CONSTRUCT(&kv, opal_value_t);
    kv.key = strdup(OPAL_PMIX_EVENT_NON_DEFAULT);
    kv.type = OPAL_BOOL;
    kv.data.flag = true;
    kvptr = &kv;
    if (ORTE_SUCCESS != (rc = opal_dss.pack(buf, &kvptr, 1, OPAL_VALUE))) {
        ORTE_ERROR_LOG(rc);
        OBJ_DESTRUCT(&kv);
        OBJ_RELEASE(buf);
        return;
    }
    OBJ_DESTRUCT(&kv);

    /* if the targets are a wildcard, then xcast it to everyone */
    if (ORTE_VPID_WILDCARD == target->vpid) {
        OBJ_CONSTRUCT(&sig, orte_grpcomm_signature_t);
        sig.signature = (orte_process_name_t*)malloc(sizeof(orte_process_name_t));
        sig.signature[0].jobid = ORTE_PROC_MY_NAME->jobid;
        sig.signature[0].vpid = ORTE_VPID_WILDCARD;
        sig.sz = 1;

        if (ORTE_SUCCESS != (rc = orte_grpcomm.xcast(&sig, ORTE_RML_TAG_NOTIFICATION, buf))) {
            ORTE_ERROR_LOG(rc);
        }
        OBJ_DESTRUCT(&sig);
        OBJ_RELEASE(buf);
    } else {
        /* get the daemon hosting the proc to be notified */
        daemon.jobid = ORTE_PROC_MY_NAME->jobid;
        daemon.vpid = orte_get_proc_daemon_vpid(target);
        /* send the notification to that daemon */
        opal_output_verbose(5, orte_state_base_framework.framework_output,
                            "%s state:base:sending notification %s to proc %s at daemon %s",
                            ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                            ORTE_ERROR_NAME(status),
                            ORTE_NAME_PRINT(target),
                            ORTE_NAME_PRINT(&daemon));
        if (ORTE_SUCCESS != (rc = orte_rml.send_buffer_nb(orte_mgmt_conduit,
                                                          &daemon, buf,
                                                          ORTE_RML_TAG_NOTIFICATION,
                                                          orte_rml_send_callback, NULL))) {
            ORTE_ERROR_LOG(rc);
            OBJ_RELEASE(buf);
        }
    }
}

void orte_state_base_track_procs(int fd, short argc, void *cbdata)
{
    orte_state_caddy_t *caddy = (orte_state_caddy_t*)cbdata;
    orte_process_name_t *proc = &caddy->name;
    orte_proc_state_t state = caddy->proc_state;
    orte_job_t *jdata;
    orte_proc_t *pdata;
    int i;
    char *rtmod;
    orte_process_name_t parent, target, *npptr;

    opal_output_verbose(5, orte_state_base_framework.framework_output,
                        "%s state:base:track_procs called for proc %s state %s",
                        ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                        ORTE_NAME_PRINT(proc),
                        orte_proc_state_to_str(state));

    /* get our "lifeline" routed module */
    rtmod = orte_rml.get_routed(orte_mgmt_conduit);

    /* get the job object for this proc */
    if (NULL == (jdata = orte_get_job_data_object(proc->jobid))) {
        ORTE_ERROR_LOG(ORTE_ERR_NOT_FOUND);
        goto cleanup;
    }
    pdata = (orte_proc_t*)opal_pointer_array_get_item(jdata->procs, proc->vpid);

    if (ORTE_PROC_STATE_RUNNING == state) {
        /* update the proc state */
        if (pdata->state < ORTE_PROC_STATE_TERMINATED) {
            pdata->state = state;
        }
        jdata->num_launched++;
        if (jdata->num_launched == jdata->num_procs) {
            if (ORTE_FLAG_TEST(jdata, ORTE_JOB_FLAG_DEBUGGER_DAEMON)) {
                ORTE_ACTIVATE_JOB_STATE(jdata, ORTE_JOB_STATE_READY_FOR_DEBUGGERS);
            } else {
                ORTE_ACTIVATE_JOB_STATE(jdata, ORTE_JOB_STATE_RUNNING);
            }
        }
    } else if (ORTE_PROC_STATE_REGISTERED == state) {
        /* update the proc state */
        if (pdata->state < ORTE_PROC_STATE_TERMINATED) {
            pdata->state = state;
        }
        jdata->num_reported++;
        if (jdata->num_reported == jdata->num_procs) {
            ORTE_ACTIVATE_JOB_STATE(jdata, ORTE_JOB_STATE_REGISTERED);
        }
    } else if (ORTE_PROC_STATE_IOF_COMPLETE == state) {
        /* update the proc state */
        if (pdata->state < ORTE_PROC_STATE_TERMINATED) {
            pdata->state = state;
        }
        /* Release the IOF file descriptors */
        if (NULL != orte_iof.close) {
            orte_iof.close(proc, ORTE_IOF_STDALL);
        }
        ORTE_FLAG_SET(pdata, ORTE_PROC_FLAG_IOF_COMPLETE);
        if (ORTE_FLAG_TEST(pdata, ORTE_PROC_FLAG_WAITPID)) {
            ORTE_ACTIVATE_PROC_STATE(proc, ORTE_PROC_STATE_TERMINATED);
        }
    } else if (ORTE_PROC_STATE_WAITPID_FIRED == state) {
        /* update the proc state */
        if (pdata->state < ORTE_PROC_STATE_TERMINATED) {
            pdata->state = state;
        }
        ORTE_FLAG_SET(pdata, ORTE_PROC_FLAG_WAITPID);
        if (ORTE_FLAG_TEST(pdata, ORTE_PROC_FLAG_IOF_COMPLETE)) {
            ORTE_ACTIVATE_PROC_STATE(proc, ORTE_PROC_STATE_TERMINATED);
        }
    } else if (ORTE_PROC_STATE_TERMINATED == state) {
        /* update the proc state */
        ORTE_FLAG_UNSET(pdata, ORTE_PROC_FLAG_ALIVE);
        if (pdata->state < ORTE_PROC_STATE_TERMINATED) {
            pdata->state = state;
        }
        if (ORTE_FLAG_TEST(pdata, ORTE_PROC_FLAG_LOCAL)) {
            /* tell the PMIx subsystem to cleanup this client */
            opal_pmix.server_deregister_client(proc, NULL, NULL);
            /* Clean up the session directory as if we were the process
             * itself. This covers the case where the process died abnormally
             * and didn't cleanup its own session directory.
             */
            orte_session_dir_finalize(proc);
        }
        /* if we are trying to terminate and our routes are
         * gone, then terminate ourselves IF no local procs
         * remain (might be some from another job)
         */
        if (orte_orteds_term_ordered &&
            0 == orte_routed.num_routes(rtmod)) {
            for (i=0; i < orte_local_children->size; i++) {
                if (NULL != (pdata = (orte_proc_t*)opal_pointer_array_get_item(orte_local_children, i)) &&
                    ORTE_FLAG_TEST(pdata, ORTE_PROC_FLAG_ALIVE)) {
                    /* at least one is still alive */
                    goto cleanup;
                }
            }
            /* call our appropriate exit procedure */
            OPAL_OUTPUT_VERBOSE((5, orte_state_base_framework.framework_output,
                                 "%s state:base all routes and children gone - exiting",
                                 ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
            ORTE_ACTIVATE_JOB_STATE(NULL, ORTE_JOB_STATE_DAEMONS_TERMINATED);
            goto cleanup;
        }
        /* return the allocated slot for reuse */
        cleanup_node(pdata);
        /* track job status */
        jdata->num_terminated++;
        if (jdata->num_terminated == jdata->num_procs) {
            ORTE_ACTIVATE_JOB_STATE(jdata, ORTE_JOB_STATE_TERMINATED);
            /* if they requested notification upon completion, provide it */
            if (orte_get_attribute(&jdata->attributes, ORTE_JOB_NOTIFY_COMPLETION, NULL, OPAL_BOOL)) {
                /* notify_completion => notify the parent of the termination
                 * of this child job. So get the parent jobid info */
                npptr = &parent;
                if (!orte_get_attribute(&jdata->attributes, ORTE_JOB_LAUNCH_PROXY, (void**)&npptr, OPAL_NAME)) {
                    /* notify everyone who asked for it */
                    target.jobid = jdata->jobid;
                    target.vpid = ORTE_VPID_WILDCARD;
                    _send_notification(OPAL_ERR_JOB_TERMINATED, pdata->state, &target, ORTE_NAME_WILDCARD);
                } else {
                    target.jobid = jdata->jobid;
                    target.vpid = ORTE_VPID_WILDCARD;
                    _send_notification(OPAL_ERR_JOB_TERMINATED, pdata->state, &target, &parent);
                }
            }
        } else if (ORTE_PROC_STATE_TERMINATED < pdata->state &&
                   !orte_job_term_ordered) {
            /* if this was an abnormal term, notify the other procs of the termination */
            parent.jobid = jdata->jobid;
            parent.vpid = ORTE_VPID_WILDCARD;
            _send_notification(OPAL_ERR_PROC_ABORTED, pdata->state, &pdata->name, &parent);
        }
    }

 cleanup:
    OBJ_RELEASE(caddy);
}

void orte_state_base_check_all_complete(int fd, short args, void *cbdata)
{
    orte_state_caddy_t *caddy = (orte_state_caddy_t*)cbdata;
    orte_job_t *jdata = caddy->jdata;

    orte_proc_t *proc;
    int i;
    orte_std_cntr_t j;
    orte_job_t *job;
    orte_node_t *node;
    orte_job_map_t *map;
    orte_std_cntr_t index;
    bool one_still_alive;
    orte_vpid_t lowest=0;
    int32_t i32, *i32ptr;
    uint32_t u32;
    void *nptr;
    char *rtmod;

    opal_output_verbose(2, orte_state_base_framework.framework_output,
                        "%s state:base:check_job_complete on job %s",
                        ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                        (NULL == jdata) ? "NULL" : ORTE_JOBID_PRINT(jdata->jobid));

    /* get our "lifeline" routed module */
    rtmod = orte_rml.get_routed(orte_mgmt_conduit);

    if (NULL == jdata || jdata->jobid == ORTE_PROC_MY_NAME->jobid) {
        /* just check to see if the daemons are complete */
        OPAL_OUTPUT_VERBOSE((2, orte_state_base_framework.framework_output,
                             "%s state:base:check_job_complete - received NULL job, checking daemons",
                             ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
        goto CHECK_DAEMONS;
    } else {
        /* mark the job as terminated, but don't override any
         * abnormal termination flags
         */
        if (jdata->state < ORTE_JOB_STATE_UNTERMINATED) {
            jdata->state = ORTE_JOB_STATE_TERMINATED;
        }
    }

    /* tell the IOF that the job is complete */
    if (NULL != orte_iof.complete) {
        orte_iof.complete(jdata);
    }

    /* tell the PMIx server to release its data */
    if (NULL != opal_pmix.server_deregister_nspace) {
        opal_pmix.server_deregister_nspace(jdata->jobid, NULL, NULL);
    }

    i32ptr = &i32;
    if (orte_get_attribute(&jdata->attributes, ORTE_JOB_NUM_NONZERO_EXIT, (void**)&i32ptr, OPAL_INT32) && !orte_abort_non_zero_exit) {
        if (!orte_report_child_jobs_separately || 1 == ORTE_LOCAL_JOBID(jdata->jobid)) {
            /* update the exit code */
            ORTE_UPDATE_EXIT_STATUS(lowest);
        }

        /* warn user */
        opal_output(orte_clean_output,
                    "-------------------------------------------------------\n"
                    "While %s job %s terminated normally, %d %s. Further examination may be required.\n"
                    "-------------------------------------------------------",
                    (1 == ORTE_LOCAL_JOBID(jdata->jobid)) ? "the primary" : "child",
                    (1 == ORTE_LOCAL_JOBID(jdata->jobid)) ? "" : ORTE_LOCAL_JOBID_PRINT(jdata->jobid),
                    i32, (1 == i32) ? "process returned\na non-zero exit code." :
                    "processes returned\nnon-zero exit codes.");
    }

    OPAL_OUTPUT_VERBOSE((2, orte_state_base_framework.framework_output,
                         "%s state:base:check_job_completed declared job %s terminated with state %s - checking all jobs",
                         ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                         ORTE_JOBID_PRINT(jdata->jobid),
                         orte_job_state_to_str(jdata->state)));

    /* if this job is a continuously operating one, then don't do
     * anything further - just return here
     */
    if (NULL != jdata &&
        (orte_get_attribute(&jdata->attributes, ORTE_JOB_CONTINUOUS_OP, NULL, OPAL_BOOL) ||
         ORTE_FLAG_TEST(jdata, ORTE_JOB_FLAG_RECOVERABLE))) {
        goto CHECK_ALIVE;
    }

    /* if the job that is being checked is the HNP, then we are
     * trying to terminate the orteds. In that situation, we
     * do -not- check all jobs - we simply notify the HNP
     * that the orteds are complete. Also check special case
     * if jdata is NULL - we want
     * to definitely declare the job done if the orteds
     * have completed, no matter what else may be happening.
     * This can happen if a ctrl-c hits in the "wrong" place
     * while launching
     */
 CHECK_DAEMONS:
    if (jdata == NULL || jdata->jobid == ORTE_PROC_MY_NAME->jobid) {
        if (0 == orte_routed.num_routes(rtmod)) {
            /* orteds are done! */
            OPAL_OUTPUT_VERBOSE((2, orte_state_base_framework.framework_output,
                                 "%s orteds complete - exiting",
                                 ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
            if (NULL == jdata) {
                jdata = orte_get_job_data_object(ORTE_PROC_MY_NAME->jobid);
            }
            ORTE_ACTIVATE_JOB_STATE(jdata, ORTE_JOB_STATE_DAEMONS_TERMINATED);
            OBJ_RELEASE(caddy);
            return;
        }
        OBJ_RELEASE(caddy);
        return;
    }

    /* Release the resources used by this job. Since some errmgrs may want
     * to continue using resources allocated to the job as part of their
     * fault recovery procedure, we only do this once the job is "complete".
     * Note that an aborted/killed job -is- flagged as complete and will
     * therefore have its resources released. We need to do this after
     * we call the errmgr so that any attempt to restart the job will
     * avoid doing so in the exact same place as the current job
     */
    if (NULL != jdata->map && jdata->state == ORTE_JOB_STATE_TERMINATED) {
        map = jdata->map;
        for (index = 0; index < map->nodes->size; index++) {
            if (NULL == (node = (orte_node_t*)opal_pointer_array_get_item(map->nodes, index))) {
                continue;
            }
            OPAL_OUTPUT_VERBOSE((2, orte_state_base_framework.framework_output,
                                 "%s releasing procs for job %s from node %s",
                                 ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                                 ORTE_JOBID_PRINT(jdata->jobid), node->name));
            for (i = 0; i < node->procs->size; i++) {
                if (NULL == (proc = (orte_proc_t*)opal_pointer_array_get_item(node->procs, i))) {
                    continue;
                }
                if (proc->name.jobid != jdata->jobid) {
                    /* skip procs from another job */
                    continue;
                }
                node->slots_inuse--;
                node->num_procs--;
                OPAL_OUTPUT_VERBOSE((2, orte_state_base_framework.framework_output,
                                     "%s releasing proc %s from node %s",
                                     ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                                     ORTE_NAME_PRINT(&proc->name), node->name));
                /* set the entry in the node array to NULL */
                opal_pointer_array_set_item(node->procs, i, NULL);
                /* release the proc once for the map entry */
                OBJ_RELEASE(proc);
            }
            /* set the node location to NULL */
            opal_pointer_array_set_item(map->nodes, index, NULL);
            /* maintain accounting */
            OBJ_RELEASE(node);
        }
        OBJ_RELEASE(map);
        jdata->map = NULL;
    }

 CHECK_ALIVE:
    /* now check to see if all jobs are done - trigger notification of this jdata
     * object when we find it
     */
    one_still_alive = false;
    j = opal_hash_table_get_first_key_uint32(orte_job_data, &u32, (void **)&job, &nptr);
    while (OPAL_SUCCESS == j) {
        /* skip the daemon job */
        if (job->jobid == ORTE_PROC_MY_NAME->jobid) {
            goto next;
        }
        /* if this is the job we are checking AND it normally terminated,
         * then activate the "notify_completed" state - this will release
         * the job state, but is provided so that the HNP main code can
         * take alternative actions if desired. If the state is killed_by_cmd,
         * then go ahead and release it. We cannot release it if it
         * abnormally terminated as mpirun needs the info so it can
         * report appropriately to the user
         *
         * NOTE: do not release the primary job (j=1) so we
         * can pretty-print completion message
         */
        if (NULL != jdata && job->jobid == jdata->jobid) {
            if (jdata->state == ORTE_JOB_STATE_TERMINATED) {
                OPAL_OUTPUT_VERBOSE((2, orte_state_base_framework.framework_output,
                                     "%s state:base:check_job_completed state is terminated - activating notify",
                                     ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
                ORTE_ACTIVATE_JOB_STATE(jdata, ORTE_JOB_STATE_NOTIFY_COMPLETED);
                one_still_alive = true;
            } else if (jdata->state == ORTE_JOB_STATE_KILLED_BY_CMD ||
                       jdata->state == ORTE_JOB_STATE_NOTIFIED) {
                OPAL_OUTPUT_VERBOSE((2, orte_state_base_framework.framework_output,
                                     "%s state:base:check_job_completed state is killed or notified - cleaning up",
                                     ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
                /* release this object, ensuring that the
                 * pointer array internal accounting
                 * is maintained!
                 */
                if (1 < j) {
                    if (ORTE_FLAG_TEST(jdata, ORTE_JOB_FLAG_DEBUGGER_DAEMON)) {
                        /* this was a debugger daemon. notify that a debugger has detached */
                        ORTE_ACTIVATE_JOB_STATE(jdata, ORTE_JOB_STATE_DEBUGGER_DETACH);
                    }
                    opal_hash_table_set_value_uint32(orte_job_data, jdata->jobid, NULL);
                    OBJ_RELEASE(jdata);
                }
            }
            goto next;
        }
        /* if the job is flagged to not be monitored, skip it */
        if (ORTE_FLAG_TEST(job, ORTE_JOB_FLAG_DO_NOT_MONITOR)) {
            goto next;
        }
        /* when checking for job termination, we must be sure to NOT check
         * our own job as it - rather obviously - has NOT terminated!
         */
        if (ORTE_JOB_STATE_NOTIFIED != job->state) {
            /* we have at least one job that is not done yet - we cannot
             * just return, though, as we need to ensure we cleanout the
             * job data for the job that just completed
             */
            OPAL_OUTPUT_VERBOSE((2, orte_state_base_framework.framework_output,
                                 "%s state:base:check_job_completed job %s is not terminated (%d:%d)",
                                 ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                                 ORTE_JOBID_PRINT(job->jobid),
                                 job->num_terminated, job->num_procs));
            one_still_alive = true;
        }
        else {
            OPAL_OUTPUT_VERBOSE((2, orte_state_base_framework.framework_output,
                                 "%s state:base:check_job_completed job %s is terminated (%d vs %d [%s])",
                                 ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                                 ORTE_JOBID_PRINT(job->jobid),
                                 job->num_terminated, job->num_procs,
                                 (NULL == jdata) ? "UNKNOWN" : orte_job_state_to_str(jdata->state) ));
        }
      next:
        j = opal_hash_table_get_next_key_uint32(orte_job_data, &u32, (void **)&job, nptr, &nptr);
    }

    /* if a job is still alive, we just return */
    if (one_still_alive) {
        OPAL_OUTPUT_VERBOSE((2, orte_state_base_framework.framework_output,
                             "%s state:base:check_job_completed at least one job is not terminated",
                             ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
        OBJ_RELEASE(caddy);
        return;
    }
    /* if we get here, then all jobs are done, so terminate */
    OPAL_OUTPUT_VERBOSE((2, orte_state_base_framework.framework_output,
                         "%s state:base:check_job_completed all jobs terminated",
                         ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));

    /* stop the job timeout event, if set */
    if (NULL != orte_mpiexec_timeout) {
        OBJ_RELEASE(orte_mpiexec_timeout);
        orte_mpiexec_timeout = NULL;
    }

    /* set the exit status to 0 - this will only happen if it
     * wasn't already set by an error condition
     */
    ORTE_UPDATE_EXIT_STATUS(0);

    /* order daemon termination - this tells us to cleanup
     * our local procs as well as telling remote daemons
     * to die
     */
    orte_plm.terminate_orteds();

    OBJ_RELEASE(caddy);
}