1
1

Enable ORTE to continue running when a node fails; the user then takes responsibility for any zombie processes left behind. Minor cleanup to orte-clean.

Signed-off-by: Ralph Castain <rhc@open-mpi.org>
Этот коммит содержится в:
Ralph Castain 2017-06-27 09:05:26 -07:00
родитель c885ee3f3c
Коммит 8a4565874e
3 изменённых файла: 25 добавлений и 17 удалений

Просмотреть файл

@ -443,12 +443,6 @@ static void proc_errors(int fd, short args, void *cbdata)
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), ORTE_NAME_PRINT(proc)));
/* record the first one to fail */
if (!ORTE_FLAG_TEST(jdata, ORTE_JOB_FLAG_ABORTED)) {
/* output an error message so the user knows what happened */
orte_show_help("help-errmgr-base.txt", "node-died", true,
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
orte_process_info.nodename,
ORTE_NAME_PRINT(proc),
pptr->node->name);
/* mark the daemon job as failed */
jdata->state = ORTE_JOB_STATE_COMM_FAILED;
/* point to the lowest rank to cause the problem */
@ -456,14 +450,25 @@ static void proc_errors(int fd, short args, void *cbdata)
/* retain the object so it doesn't get free'd */
OBJ_RETAIN(pptr);
ORTE_FLAG_SET(jdata, ORTE_JOB_FLAG_ABORTED);
/* update our exit code */
ORTE_UPDATE_EXIT_STATUS(pptr->exit_code);
/* just in case the exit code hadn't been set, do it here - this
* won't override any reported exit code */
ORTE_UPDATE_EXIT_STATUS(ORTE_ERR_COMM_FAILURE);
if (!orte_enable_recovery) {
/* output an error message so the user knows what happened */
orte_show_help("help-errmgr-base.txt", "node-died", true,
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
orte_process_info.nodename,
ORTE_NAME_PRINT(proc),
pptr->node->name);
/* update our exit code */
ORTE_UPDATE_EXIT_STATUS(pptr->exit_code);
/* just in case the exit code hadn't been set, do it here - this
* won't override any reported exit code */
ORTE_UPDATE_EXIT_STATUS(ORTE_ERR_COMM_FAILURE);
}
}
/* if recovery is enabled, then we are done - otherwise,
* abort the system */
if (!orte_enable_recovery) {
default_hnp_abort(jdata);
}
/* abort the system */
default_hnp_abort(jdata);
goto cleanup;
}
@ -498,7 +503,8 @@ static void proc_errors(int fd, short args, void *cbdata)
keep_going:
/* if this is a continuously operating job, then there is nothing more
* to do - we let the job continue to run */
if (orte_get_attribute(&jdata->attributes, ORTE_JOB_CONTINUOUS_OP, NULL, OPAL_BOOL)) {
if (orte_get_attribute(&jdata->attributes, ORTE_JOB_CONTINUOUS_OP, NULL, OPAL_BOOL) ||
ORTE_FLAG_TEST(jdata, ORTE_JOB_FLAG_RECOVERABLE)) {
/* always mark the waitpid as having fired */
ORTE_ACTIVATE_PROC_STATE(&pptr->name, ORTE_PROC_STATE_WAITPID_FIRED);
/* if this is a remote proc, we won't hear anything more about it

Просмотреть файл

@ -267,8 +267,10 @@ static void launch_daemons(int fd, short args, void *cbdata)
/* start one orted on each node */
opal_argv_append(&argc, &argv, "--ntasks-per-node=1");
/* alert us if any orteds die during startup */
opal_argv_append(&argc, &argv, "--kill-on-bad-exit");
if (!orte_enable_recovery) {
/* kill the job if any orteds die */
opal_argv_append(&argc, &argv, "--kill-on-bad-exit");
}
/* ensure the orteds are not bound to a single processor,
* just in case the TaskAffinity option is set by default.

Просмотреть файл

@ -183,7 +183,7 @@ main(int argc, char *argv[])
free(legacy);
/* and finally get rid of any lingering pmix-related artifacts */
asprintf(&legacy, "rm -f %s/pmix*", orte_process_info.tmpdir_base);
asprintf(&legacy, "rm -rf %s/pmix*", orte_process_info.tmpdir_base);
system(legacy);
free(legacy);