Resolve a race condition that could cause us to hang during abnormal termination because num_terminated was counted more than once.
This commit was SVN r32660.
Этот коммит содержится в:
родитель
edfbeba7bf
Коммит
f2b26bde4c
@@ -379,7 +379,6 @@ static void proc_errors(int fd, short args, void *cbdata)
|
||||
*/
|
||||
if (pptr->state < ORTE_PROC_STATE_TERMINATED) {
|
||||
pptr->state = state;
|
||||
jdata->num_terminated++;
|
||||
}
|
||||
|
||||
/* if we were ordered to terminate, mark this proc as dead and see if
|
||||
@@ -621,6 +620,10 @@ static void proc_errors(int fd, short args, void *cbdata)
|
||||
}
|
||||
break;
|
||||
}
|
||||
/* if the waitpid fired, be sure to let the state machine know */
|
||||
if (ORTE_FLAG_TEST(pptr, ORTE_PROC_FLAG_WAITPID)) {
|
||||
ORTE_ACTIVATE_PROC_STATE(&pptr->name, ORTE_PROC_STATE_WAITPID_FIRED);
|
||||
}
|
||||
|
||||
cleanup:
|
||||
OBJ_RELEASE(caddy);
|
||||
|
@@ -1509,9 +1509,6 @@ void odls_base_default_wait_local_proc(orte_proc_t *proc, void* cbdata)
|
||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
|
||||
ORTE_NAME_PRINT(&proc->name), (long)proc->pid);
|
||||
|
||||
/* regardless of our eventual code path, we need to
|
||||
* flag that this proc has had its waitpid fired */
|
||||
ORTE_FLAG_SET(proc, ORTE_PROC_FLAG_WAITPID);
|
||||
/* if the child was previously flagged as dead, then just
|
||||
* update its exit status and
|
||||
* ensure that its exit state gets reported to avoid hanging
|
||||
@@ -1529,15 +1526,6 @@ void odls_base_default_wait_local_proc(orte_proc_t *proc, void* cbdata)
|
||||
goto MOVEON;
|
||||
}
|
||||
|
||||
/* if IOF_COMPLETE has already been recvd, then we need
|
||||
* to mark this proc as no longer alive - we do this
|
||||
* here because some code paths go thru the errmgr
|
||||
* instead of the state machine. There is no harm
|
||||
* if this gets done again later */
|
||||
if (ORTE_FLAG_TEST(proc, ORTE_PROC_FLAG_IOF_COMPLETE)) {
|
||||
ORTE_FLAG_UNSET(proc, ORTE_PROC_FLAG_ALIVE);
|
||||
}
|
||||
|
||||
/* if the proc called "abort", then we just need to flag that it
|
||||
* came thru here */
|
||||
if (ORTE_FLAG_TEST(proc, ORTE_PROC_FLAG_ABORT)) {
|
||||
@@ -1549,6 +1537,9 @@ void odls_base_default_wait_local_proc(orte_proc_t *proc, void* cbdata)
|
||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
|
||||
ORTE_NAME_PRINT(&proc->name)));
|
||||
state = ORTE_PROC_STATE_CALLED_ABORT;
|
||||
/* regardless of our eventual code path, we need to
|
||||
* flag that this proc has had its waitpid fired */
|
||||
ORTE_FLAG_SET(proc, ORTE_PROC_FLAG_WAITPID);
|
||||
goto MOVEON;
|
||||
}
|
||||
|
||||
@@ -1573,6 +1564,9 @@ void odls_base_default_wait_local_proc(orte_proc_t *proc, void* cbdata)
|
||||
"%s odls:waitpid_fired child %s was ordered to die",
|
||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
|
||||
ORTE_NAME_PRINT(&proc->name)));
|
||||
/* regardless of our eventual code path, we need to
|
||||
* flag that this proc has had its waitpid fired */
|
||||
ORTE_FLAG_SET(proc, ORTE_PROC_FLAG_WAITPID);
|
||||
goto MOVEON;
|
||||
}
|
||||
|
||||
|
Загрузка…
x
Ссылка в новой задаче
Block a user