Multiple sigchld reports can occur within a single event callback, so have to reap them until none remain. Also, need to ensure the daemon is flagged as alive prior to calling wait_cb
Refs trac:4717 This commit was SVN r32020. The following Trac tickets were found above: Ticket 4717 --> https://svn.open-mpi.org/trac/ompi/ticket/4717
Этот коммит содержится в:
родитель
42bf7466fc
Коммит
5216bd5558
@ -921,6 +921,7 @@ static void process_launch_list(int fd, short args, void *cbdata)
|
||||
}
|
||||
caddy = (orte_plm_rsh_caddy_t*)item;
|
||||
/* register the sigchild callback */
|
||||
ORTE_FLAG_SET(caddy->daemon, ORTE_PROC_FLAG_ALIVE);
|
||||
orte_wait_cb(caddy->daemon, rsh_wait_daemon, (void*)caddy);
|
||||
|
||||
/* fork a child to exec the rsh/ssh session */
|
||||
|
@ -165,7 +165,7 @@ static void register_callback(int fd, short args, void *cbdata)
|
||||
}
|
||||
}
|
||||
|
||||
/* we just override any existing registration */
|
||||
/* we just override any existing registration */
|
||||
OPAL_LIST_FOREACH(t2, &pending_cbs, orte_wait_tracker_t) {
|
||||
if (t2->child == trk->child) {
|
||||
t2->cbfunc = trk->cbfunc;
|
||||
@ -248,33 +248,32 @@ static void wait_signal_callback(int fd, short event, void *arg)
|
||||
return;
|
||||
}
|
||||
|
||||
/* retrieve the pid */
|
||||
retry:
|
||||
pid = waitpid(-1, &status, WNOHANG);
|
||||
if (-1 == pid && EINTR == errno) {
|
||||
/* try it again */
|
||||
goto retry;
|
||||
}
|
||||
/* if we got garbage, then nothing we can do */
|
||||
if (pid <= 0) {
|
||||
return;
|
||||
}
|
||||
|
||||
/* we are already in an event, so it is safe to access the list */
|
||||
OPAL_LIST_FOREACH(t2, &pending_cbs, orte_wait_tracker_t) {
|
||||
if (pid == t2->child->pid) {
|
||||
/* found it! */
|
||||
t2->child->exit_code = status;
|
||||
if (NULL != t2->cbfunc) {
|
||||
t2->cbfunc(t2->child, t2->cbdata);
|
||||
}
|
||||
opal_list_remove_item(&pending_cbs, &t2->super);
|
||||
OBJ_RELEASE(t2);
|
||||
/* we can have multiple children leave but only get one
|
||||
* sigchild callback, so reap all the waitpids until we
|
||||
* don't get anything valid back */
|
||||
while (1) {
|
||||
pid = waitpid(-1, &status, WNOHANG);
|
||||
if (-1 == pid && EINTR == errno) {
|
||||
/* try it again */
|
||||
continue;
|
||||
}
|
||||
/* if we got garbage, then nothing we can do */
|
||||
if (pid <= 0) {
|
||||
return;
|
||||
}
|
||||
|
||||
/* we are already in an event, so it is safe to access the list */
|
||||
OPAL_LIST_FOREACH(t2, &pending_cbs, orte_wait_tracker_t) {
|
||||
if (pid == t2->child->pid) {
|
||||
/* found it! */
|
||||
t2->child->exit_code = status;
|
||||
if (NULL != t2->cbfunc) {
|
||||
t2->cbfunc(t2->child, t2->cbdata);
|
||||
}
|
||||
opal_list_remove_item(&pending_cbs, &t2->super);
|
||||
OBJ_RELEASE(t2);
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/* if we get here, then this sigchild occurred prior to someone
|
||||
* registering it, or after someone mistakenly removed it. Either
|
||||
* way, there really isn't anything we can do with it */
|
||||
}
|
||||
|
Загрузка…
Ссылка в новой задаче
Block a user