1
1

Ensure that prun doesn't prematurely exit

Ensure that prun doesn't exit until notified that its own child job
terminated.

Signed-off-by: Ralph Castain <rhc@open-mpi.org>
Этот коммит содержится в:
Ralph Castain 2018-01-11 19:03:32 -08:00
родитель 8f02596777
Коммит ac522a521f

Просмотреть файл

@ -14,7 +14,7 @@
* Copyright (c) 2007-2009 Sun Microsystems, Inc. All rights reserved. * Copyright (c) 2007-2009 Sun Microsystems, Inc. All rights reserved.
* Copyright (c) 2007-2017 Los Alamos National Security, LLC. All rights * Copyright (c) 2007-2017 Los Alamos National Security, LLC. All rights
* reserved. * reserved.
* Copyright (c) 2013-2017 Intel, Inc. All rights reserved. * Copyright (c) 2013-2018 Intel, Inc. All rights reserved.
* Copyright (c) 2015 Research Organization for Information Science * Copyright (c) 2015 Research Organization for Information Science
* and Technology (RIST). All rights reserved. * and Technology (RIST). All rights reserved.
* $COPYRIGHT$ * $COPYRIGHT$
@ -98,6 +98,7 @@ static struct {
static opal_list_t job_info; static opal_list_t job_info;
static volatile bool active = false; static volatile bool active = false;
static orte_jobid_t myjobid = ORTE_JOBID_INVALID;
static int create_app(int argc, char* argv[], static int create_app(int argc, char* argv[],
opal_list_t *jdata, opal_list_t *jdata,
@ -177,7 +178,9 @@ static void evhandler(int status,
int jobstatus=0; int jobstatus=0;
orte_jobid_t jobid = ORTE_JOBID_INVALID; orte_jobid_t jobid = ORTE_JOBID_INVALID;
if (orte_cmd_options.verbose && NULL != info) { /* we should always have info returned to us - if not, there is
* nothing we can do */
if (NULL != info) {
OPAL_LIST_FOREACH(val, info, opal_value_t) { OPAL_LIST_FOREACH(val, info, opal_value_t) {
if (0 == strcmp(val->key, OPAL_PMIX_JOB_TERM_STATUS)) { if (0 == strcmp(val->key, OPAL_PMIX_JOB_TERM_STATUS)) {
jobstatus = val->data.integer; jobstatus = val->data.integer;
@ -185,13 +188,21 @@ static void evhandler(int status,
jobid = val->data.name.jobid; jobid = val->data.name.jobid;
} }
} }
opal_output(0, "JOB %s COMPLETED WITH STATUS %d", if (orte_cmd_options.verbose && (myjobid != ORTE_JOBID_INVALID && jobid == myjobid)) {
ORTE_JOBID_PRINT(jobid), jobstatus); opal_output(0, "JOB %s COMPLETED WITH STATUS %d",
ORTE_JOBID_PRINT(jobid), jobstatus);
}
} }
/* we _always_ have to execute the evhandler callback or
* else the event progress engine will hang */
if (NULL != cbfunc) { if (NULL != cbfunc) {
cbfunc(OPAL_SUCCESS, NULL, NULL, NULL, cbdata); cbfunc(OPAL_SUCCESS, NULL, NULL, NULL, cbdata);
} }
if (!fired) { /* only terminate if this was our job - keep in mind that we
* can get notifications of job termination prior to our spawn
* having completed! */
if (!fired && (myjobid != ORTE_JOBID_INVALID && jobid == myjobid)) {
fired = true; fired = true;
active = false; active = false;
} }
@ -207,7 +218,6 @@ int prun(int argc, char *argv[])
opal_pmix_app_t *app; opal_pmix_app_t *app;
opal_value_t *val; opal_value_t *val;
opal_list_t info; opal_list_t info;
opal_jobid_t jobid;
struct timespec tp = {0, 100000}; struct timespec tp = {0, 100000};
/* init the globals */ /* init the globals */
@ -622,7 +632,7 @@ int prun(int argc, char *argv[])
opal_list_append(&job_info, &val->super); opal_list_append(&job_info, &val->super);
} }
if (OPAL_SUCCESS != (rc = opal_pmix.spawn(&job_info, &apps, &jobid))) { if (OPAL_SUCCESS != (rc = opal_pmix.spawn(&job_info, &apps, &myjobid))) {
opal_output(0, "Job failed to spawn: %s", opal_strerror(rc)); opal_output(0, "Job failed to spawn: %s", opal_strerror(rc));
goto DONE; goto DONE;
} }
@ -630,7 +640,7 @@ int prun(int argc, char *argv[])
OPAL_LIST_DESTRUCT(&apps); OPAL_LIST_DESTRUCT(&apps);
if (orte_cmd_options.verbose) { if (orte_cmd_options.verbose) {
opal_output(0, "JOB %s EXECUTING", OPAL_JOBID_PRINT(jobid)); opal_output(0, "JOB %s EXECUTING", OPAL_JOBID_PRINT(myjobid));
} }
while (active) { while (active) {