Clean up a race condition in marking the waitpid as fired. We should always mark the waitpid as fired when we enter the wait_local_proc routine, and also mark the proc as no longer alive if iof_complete has already been received. If other places in the code also update those flags, there is no harm done.
This commit was SVN r32643.
Этот коммит содержится в:
родитель
6916bfc368
Коммит
2b225e3776
@ -603,7 +603,7 @@ static void proc_errors(int fd, short args, void *cbdata)
|
||||
break;
|
||||
}
|
||||
if (!ORTE_FLAG_TEST(jdata, ORTE_JOB_FLAG_ABORTED)) {
|
||||
/* abnormal termination - abort, but only do it once
|
||||
/* abnormal termination - abort, but only do it once
|
||||
* to avoid creating a lot of confusion */
|
||||
default_hnp_abort(jdata);
|
||||
}
|
||||
|
@ -354,9 +354,6 @@ static void proc_errors(int fd, short args, void *cbdata)
|
||||
if (ORTE_PROC_STATE_TERM_NON_ZERO == state) {
|
||||
/* update the state */
|
||||
child->state = state;
|
||||
/* the odls will not have flagged the waitpid as
|
||||
* fired as it leaves that for us to do */
|
||||
ORTE_FLAG_SET(child, ORTE_PROC_FLAG_WAITPID);
|
||||
/* report this as abnormal termination to the HNP, unless we already have
|
||||
* done so for this job */
|
||||
if (!orte_get_attribute(&jdata->attributes, ORTE_JOB_FAIL_NOTIFIED, NULL, OPAL_BOOL)) {
|
||||
|
@ -1509,28 +1509,9 @@ void odls_base_default_wait_local_proc(orte_proc_t *proc, void* cbdata)
|
||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
|
||||
ORTE_NAME_PRINT(&proc->name), (long)proc->pid);
|
||||
|
||||
/* if the proc called "abort", then we just need to flag that it
|
||||
* came thru here */
|
||||
if (ORTE_FLAG_TEST(proc, ORTE_PROC_FLAG_ABORT)) {
|
||||
/* even though the process exited "normally", it happened
|
||||
* via an orte_abort call
|
||||
*/
|
||||
OPAL_OUTPUT_VERBOSE((5, orte_odls_base_framework.framework_output,
|
||||
"%s odls:waitpid_fired child %s died by call to abort",
|
||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
|
||||
ORTE_NAME_PRINT(&proc->name)));
|
||||
state = ORTE_PROC_STATE_CALLED_ABORT;
|
||||
/* since we are going down a different code path, we need to
|
||||
* flag that this proc has had its waitpid fired */
|
||||
ORTE_FLAG_SET(proc, ORTE_PROC_FLAG_WAITPID);
|
||||
/* if IOF_COMPLETE has already been recvd, then we need
|
||||
* to mark this proc as no longer alive */
|
||||
if (ORTE_FLAG_TEST(proc, ORTE_PROC_FLAG_IOF_COMPLETE)) {
|
||||
ORTE_FLAG_UNSET(proc, ORTE_PROC_FLAG_ALIVE);
|
||||
}
|
||||
goto MOVEON;
|
||||
}
|
||||
|
||||
/* regardless of our eventual code path, we need to
|
||||
* flag that this proc has had its waitpid fired */
|
||||
ORTE_FLAG_SET(proc, ORTE_PROC_FLAG_WAITPID);
|
||||
/* if the child was previously flagged as dead, then just
|
||||
* update its exit status and
|
||||
* ensure that its exit state gets reported to avoid hanging
|
||||
@ -1548,6 +1529,29 @@ void odls_base_default_wait_local_proc(orte_proc_t *proc, void* cbdata)
|
||||
goto MOVEON;
|
||||
}
|
||||
|
||||
/* if IOF_COMPLETE has already been recvd, then we need
|
||||
* to mark this proc as no longer alive - we do this
|
||||
* here because some code paths go thru the errmgr
|
||||
* instead of the state machine. There is no harm
|
||||
* if this gets done again later */
|
||||
if (ORTE_FLAG_TEST(proc, ORTE_PROC_FLAG_IOF_COMPLETE)) {
|
||||
ORTE_FLAG_UNSET(proc, ORTE_PROC_FLAG_ALIVE);
|
||||
}
|
||||
|
||||
/* if the proc called "abort", then we just need to flag that it
|
||||
* came thru here */
|
||||
if (ORTE_FLAG_TEST(proc, ORTE_PROC_FLAG_ABORT)) {
|
||||
/* even though the process exited "normally", it happened
|
||||
* via an orte_abort call
|
||||
*/
|
||||
OPAL_OUTPUT_VERBOSE((5, orte_odls_base_framework.framework_output,
|
||||
"%s odls:waitpid_fired child %s died by call to abort",
|
||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
|
||||
ORTE_NAME_PRINT(&proc->name)));
|
||||
state = ORTE_PROC_STATE_CALLED_ABORT;
|
||||
goto MOVEON;
|
||||
}
|
||||
|
||||
/* get the jobdat for this child */
|
||||
if (NULL == (jobdat = orte_get_job_data_object(proc->name.jobid))) {
|
||||
ORTE_ERROR_LOG(ORTE_ERR_NOT_FOUND);
|
||||
|
Загрузка…
x
Ссылка в новой задаче
Block a user