1
1

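Complete the fix for the orted vs. mpirun race condition during finalize. The darned mpirun is just too fast! Rather than try to slow it down, we set the orte_finalizing flag -prior- to telling mpirun that the orted is leaving; to that end, the "we are gone" state-update message is now sent from orte_ess_base_orted_finalize instead of from the orted's shutdown_callback. This ensures the orted doesn't mistakenly declare its lifeline lost when mpirun leaves in a hurry.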

This commit was SVN r17897.
Этот коммит содержится в:
Ralph Castain 2008-03-20 16:55:24 +00:00
родитель 6bb139e4f2
Коммит f8a10dfb93
2 изменённых файлов: 17 добавлений и 20 удалений

Просмотреть файл

@ -236,6 +236,23 @@ error:
int orte_ess_base_orted_finalize(void)
{
opal_buffer_t ack;
orte_proc_state_t state=ORTE_PROC_STATE_TERMINATED;
orte_exit_code_t exit_code=0;
orte_plm_cmd_flag_t cmd = ORTE_PLM_UPDATE_PROC_STATE;
/* send a state update so the HNP knows we are "gone" */
OBJ_CONSTRUCT(&ack, opal_buffer_t);
opal_dss.pack(&ack, &cmd, 1, ORTE_PLM_CMD);
opal_dss.pack(&ack, &(ORTE_PROC_MY_NAME->jobid), 1, ORTE_JOBID);
opal_dss.pack(&ack, &(ORTE_PROC_MY_NAME->vpid), 1, ORTE_VPID);
opal_dss.pack(&ack, &state, 1, ORTE_PROC_STATE);
opal_dss.pack(&ack, &exit_code, 1, ORTE_EXIT_CODE);
orte_rml.send_buffer(ORTE_PROC_MY_HNP, &ack, ORTE_RML_TAG_PLM, 0);
OBJ_DESTRUCT(&ack);
/* progress the OOB to ensure the message gets out */
opal_progress();
orte_cr_finalize();
#if OPAL_ENABLE_FT == 1

Просмотреть файл

@ -561,10 +561,6 @@ int orte_daemon(int argc, char *argv[])
static void shutdown_callback(int fd, short flags, void *arg)
{
opal_buffer_t ack;
orte_proc_state_t state=ORTE_PROC_STATE_TERMINATED;
orte_exit_code_t exit_code=0;
orte_plm_cmd_flag_t cmd = ORTE_PLM_UPDATE_PROC_STATE;
int ret;
/* protect against multiple calls */
@ -591,22 +587,6 @@ static void shutdown_callback(int fd, short flags, void *arg)
*/
orte_odls.kill_local_procs(ORTE_JOBID_WILDCARD, false);
/* if we are not the HNP, send a state update so
* the HNP knows we are "gone"
*/
if (!orte_process_info.hnp) {
OBJ_CONSTRUCT(&ack, opal_buffer_t);
opal_dss.pack(&ack, &cmd, 1, ORTE_PLM_CMD);
opal_dss.pack(&ack, &(ORTE_PROC_MY_NAME->jobid), 1, ORTE_JOBID);
opal_dss.pack(&ack, &(ORTE_PROC_MY_NAME->vpid), 1, ORTE_VPID);
opal_dss.pack(&ack, &state, 1, ORTE_PROC_STATE);
opal_dss.pack(&ack, &exit_code, 1, ORTE_EXIT_CODE);
orte_rml.send_buffer(ORTE_PROC_MY_HNP, &ack, ORTE_RML_TAG_PLM, 0);
OBJ_DESTRUCT(&ack);
/* progress the OOB to ensure the message gets out */
opal_progress();
}
/* Finalize and clean up ourselves */
if (ORTE_SUCCESS != (ret = orte_finalize())) {
ORTE_ERROR_LOG(ret);