1
1

Multiple SIGCHLD reports can occur within a single event callback, so we have to keep reaping children until none remain. Also, we need to ensure the daemon is flagged as alive prior to calling orte_wait_cb.

Refs trac:4717

This commit was SVN r32020.

The following Trac tickets were found above:
  Ticket 4717 --> https://svn.open-mpi.org/trac/ompi/ticket/4717
Этот коммит содержится в:
Ralph Castain 2014-06-17 18:46:40 +00:00
родитель 42bf7466fc
Коммит 5216bd5558
2 изменённых файлов: 27 добавлений и 27 удалений

Просмотреть файл

@ -921,6 +921,7 @@ static void process_launch_list(int fd, short args, void *cbdata)
} }
caddy = (orte_plm_rsh_caddy_t*)item; caddy = (orte_plm_rsh_caddy_t*)item;
/* register the sigchild callback */ /* register the sigchild callback */
ORTE_FLAG_SET(caddy->daemon, ORTE_PROC_FLAG_ALIVE);
orte_wait_cb(caddy->daemon, rsh_wait_daemon, (void*)caddy); orte_wait_cb(caddy->daemon, rsh_wait_daemon, (void*)caddy);
/* fork a child to exec the rsh/ssh session */ /* fork a child to exec the rsh/ssh session */

Просмотреть файл

@ -165,7 +165,7 @@ static void register_callback(int fd, short args, void *cbdata)
} }
} }
/* we just override any existing registration */ /* we just override any existing registration */
OPAL_LIST_FOREACH(t2, &pending_cbs, orte_wait_tracker_t) { OPAL_LIST_FOREACH(t2, &pending_cbs, orte_wait_tracker_t) {
if (t2->child == trk->child) { if (t2->child == trk->child) {
t2->cbfunc = trk->cbfunc; t2->cbfunc = trk->cbfunc;
@ -248,33 +248,32 @@ static void wait_signal_callback(int fd, short event, void *arg)
return; return;
} }
/* retrieve the pid */ /* we can have multiple children leave but only get one
retry: * sigchild callback, so reap all the waitpids until we
pid = waitpid(-1, &status, WNOHANG); * don't get anything valid back */
if (-1 == pid && EINTR == errno) { while (1) {
/* try it again */ pid = waitpid(-1, &status, WNOHANG);
goto retry; if (-1 == pid && EINTR == errno) {
} /* try it again */
/* if we got garbage, then nothing we can do */ continue;
if (pid <= 0) { }
return; /* if we got garbage, then nothing we can do */
} if (pid <= 0) {
/* we are already in an event, so it is safe to access the list */
OPAL_LIST_FOREACH(t2, &pending_cbs, orte_wait_tracker_t) {
if (pid == t2->child->pid) {
/* found it! */
t2->child->exit_code = status;
if (NULL != t2->cbfunc) {
t2->cbfunc(t2->child, t2->cbdata);
}
opal_list_remove_item(&pending_cbs, &t2->super);
OBJ_RELEASE(t2);
return; return;
} }
/* we are already in an event, so it is safe to access the list */
OPAL_LIST_FOREACH(t2, &pending_cbs, orte_wait_tracker_t) {
if (pid == t2->child->pid) {
/* found it! */
t2->child->exit_code = status;
if (NULL != t2->cbfunc) {
t2->cbfunc(t2->child, t2->cbdata);
}
opal_list_remove_item(&pending_cbs, &t2->super);
OBJ_RELEASE(t2);
break;
}
}
} }
/* if we get here, then this sigchild occurred prior to someone
* registering it, or after someone mistakenly removed it. Either
* way, there really isn't anything we can do with it */
} }