plm/alps: fix orted abort hang problem
It turns out the alps plm component wasn't changing the state of the job upon terminating the orteds in the case of an abnormal termination. This caused mpirun to hang with a zombie'd aprun process if an orted on a node in the job was killed via a signal.
Этот коммит содержится в:
родитель
81dc3a5db9
Коммит
c454d11b01
@ -460,6 +460,11 @@ static void launch_daemons(int fd, short args, void *cbdata)
|
||||
static int plm_alps_terminate_orteds(void)
|
||||
{
|
||||
int rc;
|
||||
orte_job_t *jdata;
|
||||
|
||||
OPAL_OUTPUT_VERBOSE((10, orte_plm_base_framework.framework_output,
|
||||
"%s plm:alps: terminating orteds",
|
||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
|
||||
|
||||
/* deregister the waitpid callback to ensure we don't make it look like
|
||||
* alps failed when it didn't. Since the alps may have already completed,
|
||||
@ -475,6 +480,12 @@ static int plm_alps_terminate_orteds(void)
|
||||
ORTE_ERROR_LOG(rc);
|
||||
}
|
||||
|
||||
jdata = orte_get_job_data_object(ORTE_PROC_MY_NAME->jobid);
|
||||
ORTE_ACTIVATE_JOB_STATE(jdata, ORTE_JOB_STATE_DAEMONS_TERMINATED);
|
||||
|
||||
OPAL_OUTPUT_VERBOSE((10, orte_plm_base_framework.framework_output,
|
||||
"%s plm:alps: terminated orteds",
|
||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
|
||||
return rc;
|
||||
}
|
||||
|
||||
|
Загрузка…
x
Ссылка в новой задаче
Block a user