1
1

Resolve a race condition that could cause a hang during abnormal termination because num_terminated was incremented more than once for the same process

This commit was SVN r32660.
Этот коммит содержится в:
Ralph Castain 2014-09-02 00:32:52 +00:00
родитель edfbeba7bf
Коммит f2b26bde4c
2 изменённых файлов: 10 добавлений и 13 удалений

Просмотреть файл

@ -379,7 +379,6 @@ static void proc_errors(int fd, short args, void *cbdata)
*/
if (pptr->state < ORTE_PROC_STATE_TERMINATED) {
pptr->state = state;
jdata->num_terminated++;
}
/* if we were ordered to terminate, mark this proc as dead and see if
@ -621,6 +620,10 @@ static void proc_errors(int fd, short args, void *cbdata)
}
break;
}
/* if the waitpid fired, be sure to let the state machine know */
if (ORTE_FLAG_TEST(pptr, ORTE_PROC_FLAG_WAITPID)) {
ORTE_ACTIVATE_PROC_STATE(&pptr->name, ORTE_PROC_STATE_WAITPID_FIRED);
}
cleanup:
OBJ_RELEASE(caddy);

Просмотреть файл

@ -1509,9 +1509,6 @@ void odls_base_default_wait_local_proc(orte_proc_t *proc, void* cbdata)
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
ORTE_NAME_PRINT(&proc->name), (long)proc->pid);
/* regardless of our eventual code path, we need to
* flag that this proc has had its waitpid fired */
ORTE_FLAG_SET(proc, ORTE_PROC_FLAG_WAITPID);
/* if the child was previously flagged as dead, then just
* update its exit status and
* ensure that its exit state gets reported to avoid hanging
@ -1529,15 +1526,6 @@ void odls_base_default_wait_local_proc(orte_proc_t *proc, void* cbdata)
goto MOVEON;
}
/* if IOF_COMPLETE has already been received, then we need
* to mark this proc as no longer alive - we do this
* here because some code paths go through the errmgr
* instead of the state machine. There is no harm
* if this gets done again later */
if (ORTE_FLAG_TEST(proc, ORTE_PROC_FLAG_IOF_COMPLETE)) {
ORTE_FLAG_UNSET(proc, ORTE_PROC_FLAG_ALIVE);
}
/* if the proc called "abort", then we just need to flag that it
* came through here */
if (ORTE_FLAG_TEST(proc, ORTE_PROC_FLAG_ABORT)) {
@ -1549,6 +1537,9 @@ void odls_base_default_wait_local_proc(orte_proc_t *proc, void* cbdata)
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
ORTE_NAME_PRINT(&proc->name)));
state = ORTE_PROC_STATE_CALLED_ABORT;
/* regardless of our eventual code path, we need to
* flag that this proc has had its waitpid fired */
ORTE_FLAG_SET(proc, ORTE_PROC_FLAG_WAITPID);
goto MOVEON;
}
@ -1573,6 +1564,9 @@ void odls_base_default_wait_local_proc(orte_proc_t *proc, void* cbdata)
"%s odls:waitpid_fired child %s was ordered to die",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
ORTE_NAME_PRINT(&proc->name)));
/* regardless of our eventual code path, we need to
* flag that this proc has had its waitpid fired */
ORTE_FLAG_SET(proc, ORTE_PROC_FLAG_WAITPID);
goto MOVEON;
}