From 388034c814cb16eef78d9b853c4d67cb715c6212 Mon Sep 17 00:00:00 2001 From: Ralph Castain Date: Tue, 10 Oct 2017 10:08:30 -0700 Subject: [PATCH] Add support for the -v (verbose) option to prun and silence the "executing" and "completed" output otherwise. Debounce "unreachable" notifications for tools when they disconnect. Enable the -x cmd line option for prun. Signed-off-by: Ralph Castain (cherry picked from commit 0a5b36180a22959654461ac1303cec35313f8b4a) --- .../pmix/src/event/pmix_event_notification.c | 4 +- .../pmix/src/mca/ptl/base/ptl_base_sendrecv.c | 2 +- orte/mca/state/dvm/state_dvm.c | 23 +++-- orte/tools/prun/prun.c | 95 ++++++++++++++++++- 4 files changed, 106 insertions(+), 18 deletions(-) diff --git a/opal/mca/pmix/pmix3x/pmix/src/event/pmix_event_notification.c b/opal/mca/pmix/pmix3x/pmix/src/event/pmix_event_notification.c index b6f0458a3d..f7bbd2ff88 100644 --- a/opal/mca/pmix/pmix3x/pmix/src/event/pmix_event_notification.c +++ b/opal/mca/pmix/pmix3x/pmix/src/event/pmix_event_notification.c @@ -815,7 +815,9 @@ static void _notify_client_event(int sd, short args, void *cbdata) /* check for caching instructions */ for (n=0; n < cd->ninfo; n++) { if (0 == strncmp(cd->info[n].key, PMIX_EVENT_DO_NOT_CACHE, PMIX_MAX_KEYLEN)) { - holdcd = PMIX_INFO_TRUE(&cd->info[n]); + if (PMIX_INFO_TRUE(&cd->info[n])) { + holdcd = false; + } break; } } diff --git a/opal/mca/pmix/pmix3x/pmix/src/mca/ptl/base/ptl_base_sendrecv.c b/opal/mca/pmix/pmix3x/pmix/src/mca/ptl/base/ptl_base_sendrecv.c index 85f0085cc1..97149094c9 100644 --- a/opal/mca/pmix/pmix3x/pmix/src/mca/ptl/base/ptl_base_sendrecv.c +++ b/opal/mca/pmix/pmix3x/pmix/src/mca/ptl/base/ptl_base_sendrecv.c @@ -140,7 +140,7 @@ void pmix_ptl_base_lost_connection(pmix_peer_t *peer, pmix_status_t err) } } } - if (!peer->finalized) { + if (!peer->finalized && !PMIX_PROC_IS_TOOL(peer)) { /* if this peer already called finalize, then * we are just seeing their connection go away * when they terminate - so do not generate 
diff --git a/orte/mca/state/dvm/state_dvm.c b/orte/mca/state/dvm/state_dvm.c index 426ffc813b..06dfe88470 100644 --- a/orte/mca/state/dvm/state_dvm.c +++ b/orte/mca/state/dvm/state_dvm.c @@ -599,19 +599,18 @@ static void dvm_notify(int sd, short args, void *cbdata) val->type = OPAL_STATUS; val->data.status = ret; opal_list_append(info, &val->super); - /* if there was a problem, we need to send the requestor more info about what happened */ - if (ORTE_SUCCESS != ret) { - val = OBJ_NEW(opal_value_t); - val->key = strdup(OPAL_PMIX_PROCID); - val->type = OPAL_NAME; - val->data.name.jobid = jdata->jobid; - if (NULL != pptr) { - val->data.name.vpid = pptr->name.vpid; - } else { - val->data.name.vpid = ORTE_VPID_WILDCARD; - } - opal_list_append(info, &val->super); + /* tell the requestor which job or proc */ + val = OBJ_NEW(opal_value_t); + val->key = strdup(OPAL_PMIX_PROCID); + val->type = OPAL_NAME; + val->data.name.jobid = jdata->jobid; + if (NULL != pptr) { + val->data.name.vpid = pptr->name.vpid; + } else { + val->data.name.vpid = ORTE_VPID_WILDCARD; } + opal_list_append(info, &val->super); + /* setup the caddy */ mycaddy = (mycaddy_t*)malloc(sizeof(mycaddy_t)); mycaddy->info = info; OBJ_RETAIN(jdata); diff --git a/orte/tools/prun/prun.c b/orte/tools/prun/prun.c index 5c8852cbae..155ea98cd1 100644 --- a/orte/tools/prun/prun.c +++ b/orte/tools/prun/prun.c @@ -174,14 +174,19 @@ static void evhandler(int status, void *cbdata) { opal_value_t *val; + int jobstatus=0; + orte_jobid_t jobid = ORTE_JOBID_INVALID; - if (NULL != info) { + if (orte_cmd_options.verbose && NULL != info) { OPAL_LIST_FOREACH(val, info, opal_value_t) { if (0 == strcmp(val->key, OPAL_PMIX_JOB_TERM_STATUS)) { - opal_output(0, "JOB COMPLETED WITH STATUS %d", - val->data.integer); + jobstatus = val->data.integer; + } else if (0 == strcmp(val->key, OPAL_PMIX_PROCID)) { + jobid = val->data.name.jobid; } } + opal_output(0, "JOB %s COMPLETED WITH STATUS %d", + ORTE_JOBID_PRINT(jobid), jobstatus); } if (NULL 
!= cbfunc) { cbfunc(OPAL_SUCCESS, NULL, NULL, NULL, cbdata); @@ -622,7 +627,9 @@ int prun(int argc, char *argv[]) OPAL_LIST_DESTRUCT(&job_info); OPAL_LIST_DESTRUCT(&apps); - opal_output(0, "JOB %s EXECUTING", OPAL_JOBID_PRINT(jobid)); + if (orte_cmd_options.verbose) { + opal_output(0, "JOB %s EXECUTING", OPAL_JOBID_PRINT(jobid)); + } while (active) { nanosleep(&tp, NULL); @@ -788,6 +795,86 @@ static int create_app(int argc, char* argv[], } } + /* set necessary env variables for external usage from tune conf file*/ + int set_from_file = 0; + char **vars = NULL; + if (OPAL_SUCCESS == mca_base_var_process_env_list_from_file(&vars) && + NULL != vars) { + for (i=0; NULL != vars[i]; i++) { + value = strchr(vars[i], '='); + /* terminate the name of the param */ + *value = '\0'; + /* step over the equals */ + value++; + /* overwrite any prior entry */ + opal_setenv(vars[i], value, true, &app->env); + /* save it for any comm_spawn'd apps */ + opal_setenv(vars[i], value, true, &orte_forwarded_envars); + } + set_from_file = 1; + opal_argv_free(vars); + } + /* Did the user request to export any environment variables on the cmd line? 
*/ + char *env_set_flag; + env_set_flag = getenv("OMPI_MCA_mca_base_env_list"); + if (opal_cmd_line_is_taken(orte_cmd_line, "x")) { + if (NULL != env_set_flag) { + opal_show_help("help-orterun.txt", "orterun:conflict-env-set", false); + return ORTE_ERR_FATAL; + } + j = opal_cmd_line_get_ninsts(orte_cmd_line, "x"); + for (i = 0; i < j; ++i) { + param = opal_cmd_line_get_param(orte_cmd_line, "x", i, 0); + + if (NULL != (value = strchr(param, '='))) { + /* terminate the name of the param */ + *value = '\0'; + /* step over the equals */ + value++; + /* overwrite any prior entry */ + opal_setenv(param, value, true, &app->env); + /* save it for any comm_spawn'd apps */ + opal_setenv(param, value, true, &orte_forwarded_envars); + } else { + value = getenv(param); + if (NULL != value) { + /* overwrite any prior entry */ + opal_setenv(param, value, true, &app->env); + /* save it for any comm_spawn'd apps */ + opal_setenv(param, value, true, &orte_forwarded_envars); + } else { + opal_output(0, "Warning: could not find environment variable \"%s\"\n", param); + } + } + } + } else if (NULL != env_set_flag) { + /* if mca_base_env_list was set, check if some of env vars were set via -x from a conf file. + * If this is the case, error out. + */ + if (!set_from_file) { + /* set necessary env variables for external usage */ + vars = NULL; + if (OPAL_SUCCESS == mca_base_var_process_env_list(env_set_flag, &vars) && + NULL != vars) { + for (i=0; NULL != vars[i]; i++) { + value = strchr(vars[i], '='); + /* terminate the name of the param */ + *value = '\0'; + /* step over the equals */ + value++; + /* overwrite any prior entry */ + opal_setenv(vars[i], value, true, &app->env); + /* save it for any comm_spawn'd apps */ + opal_setenv(vars[i], value, true, &orte_forwarded_envars); + } + opal_argv_free(vars); + } + } else { + opal_show_help("help-orterun.txt", "orterun:conflict-env-set", false); + return ORTE_ERR_FATAL; + } + } + /* Did the user request a specific wdir? 
*/ if (NULL != orte_cmd_options.wdir) {