1
1

Clean up a race condition in marking that a proc's waitpid has fired. We should always flag the waitpid as fired when we enter the wait_local_proc routine, and also mark the proc as no longer alive if iof_complete has already been received. If other places in the code also update those flags, there is no harm done.

This commit was SVN r32643.
Этот коммит содержится в:
Ralph Castain 2014-08-29 17:03:31 +00:00
родитель 6916bfc368
Коммит 2b225e3776
3 изменённых файлов: 27 добавлений и 26 удалений

Просмотреть файл

@ -603,7 +603,7 @@ static void proc_errors(int fd, short args, void *cbdata)
break;
}
if (!ORTE_FLAG_TEST(jdata, ORTE_JOB_FLAG_ABORTED)) {
/* abnormal termination - abort, but only do it once
/* abnormal termination - abort, but only do it once
* to avoid creating a lot of confusion */
default_hnp_abort(jdata);
}

Просмотреть файл

@ -354,9 +354,6 @@ static void proc_errors(int fd, short args, void *cbdata)
if (ORTE_PROC_STATE_TERM_NON_ZERO == state) {
/* update the state */
child->state = state;
/* the odls will not have flagged the waitpid as
* fired as it leaves that for us to do */
ORTE_FLAG_SET(child, ORTE_PROC_FLAG_WAITPID);
/* report this as abnormal termination to the HNP, unless we already have
* done so for this job */
if (!orte_get_attribute(&jdata->attributes, ORTE_JOB_FAIL_NOTIFIED, NULL, OPAL_BOOL)) {

Просмотреть файл

@ -1509,28 +1509,9 @@ void odls_base_default_wait_local_proc(orte_proc_t *proc, void* cbdata)
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
ORTE_NAME_PRINT(&proc->name), (long)proc->pid);
/* if the proc called "abort", then we just need to flag that it
* came thru here */
if (ORTE_FLAG_TEST(proc, ORTE_PROC_FLAG_ABORT)) {
/* even though the process exited "normally", it happened
* via an orte_abort call
*/
OPAL_OUTPUT_VERBOSE((5, orte_odls_base_framework.framework_output,
"%s odls:waitpid_fired child %s died by call to abort",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
ORTE_NAME_PRINT(&proc->name)));
state = ORTE_PROC_STATE_CALLED_ABORT;
/* since we are going down a different code path, we need to
* flag that this proc has had its waitpid fired */
ORTE_FLAG_SET(proc, ORTE_PROC_FLAG_WAITPID);
/* if IOF_COMPLETE has already been recvd, then we need
* to mark this proc as no longer alive */
if (ORTE_FLAG_TEST(proc, ORTE_PROC_FLAG_IOF_COMPLETE)) {
ORTE_FLAG_UNSET(proc, ORTE_PROC_FLAG_ALIVE);
}
goto MOVEON;
}
/* regardless of our eventual code path, we need to
* flag that this proc has had its waitpid fired */
ORTE_FLAG_SET(proc, ORTE_PROC_FLAG_WAITPID);
/* if the child was previously flagged as dead, then just
* update its exit status and
* ensure that its exit state gets reported to avoid hanging
@ -1548,6 +1529,29 @@ void odls_base_default_wait_local_proc(orte_proc_t *proc, void* cbdata)
goto MOVEON;
}
/* if IOF_COMPLETE has already been recvd, then we need
* to mark this proc as no longer alive - we do this
* here because some code paths go thru the errmgr
* instead of the state machine. There is no harm
* if this gets done again later */
if (ORTE_FLAG_TEST(proc, ORTE_PROC_FLAG_IOF_COMPLETE)) {
ORTE_FLAG_UNSET(proc, ORTE_PROC_FLAG_ALIVE);
}
/* if the proc called "abort", then we just need to flag that it
* came thru here */
if (ORTE_FLAG_TEST(proc, ORTE_PROC_FLAG_ABORT)) {
/* even though the process exited "normally", it happened
* via an orte_abort call
*/
OPAL_OUTPUT_VERBOSE((5, orte_odls_base_framework.framework_output,
"%s odls:waitpid_fired child %s died by call to abort",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
ORTE_NAME_PRINT(&proc->name)));
state = ORTE_PROC_STATE_CALLED_ABORT;
goto MOVEON;
}
/* get the jobdat for this child */
if (NULL == (jobdat = orte_get_job_data_object(proc->name.jobid))) {
ORTE_ERROR_LOG(ORTE_ERR_NOT_FOUND);