This isn't as big a change as it appears: a change in one place caused a whole bunch of files to require updated #includes due to some arcane linkage. Rework the orte_wait code to reflect the introduction of the state machine. If we are in cleanup mode and just want to kill all our local children, then there is no reason to be polite about it, as that introduces *very* long delays at scale. Just kill the procs and move on.
Refs trac:4717. This commit was SVN r32019. The following Trac tickets were found above: Ticket 4717 --> https://svn.open-mpi.org/trac/ompi/ticket/4717
Этот коммит содержится в:
родитель
390f8f52b4
Коммит
42bf7466fc
@ -460,9 +460,10 @@ static void proc_errors(int fd, short args, void *cbdata)
|
||||
OBJ_RETAIN(pptr);
|
||||
ORTE_FLAG_SET(jdata, ORTE_JOB_FLAG_ABORTED);
|
||||
ORTE_UPDATE_EXIT_STATUS(pptr->exit_code);
|
||||
/* abnormal termination - abort, but only do it once
|
||||
* to avoid creating a lot of confusion */
|
||||
default_hnp_abort(jdata);
|
||||
}
|
||||
/* abnormal termination - abort */
|
||||
default_hnp_abort(jdata);
|
||||
break;
|
||||
|
||||
case ORTE_PROC_STATE_ABORTED_BY_SIG:
|
||||
@ -478,9 +479,10 @@ static void proc_errors(int fd, short args, void *cbdata)
|
||||
OBJ_RETAIN(pptr);
|
||||
ORTE_FLAG_SET(jdata, ORTE_JOB_FLAG_ABORTED);
|
||||
ORTE_UPDATE_EXIT_STATUS(pptr->exit_code);
|
||||
/* abnormal termination - abort, but only do it once
|
||||
* to avoid creating a lot of confusion */
|
||||
default_hnp_abort(jdata);
|
||||
}
|
||||
/* abnormal termination - abort */
|
||||
default_hnp_abort(jdata);
|
||||
break;
|
||||
|
||||
case ORTE_PROC_STATE_TERM_WO_SYNC:
|
||||
@ -502,9 +504,10 @@ static void proc_errors(int fd, short args, void *cbdata)
|
||||
* we overwrite the process' exit code with the default error code
|
||||
*/
|
||||
ORTE_UPDATE_EXIT_STATUS(ORTE_ERROR_DEFAULT_EXIT_CODE);
|
||||
/* abnormal termination - abort, but only do it once
|
||||
* to avoid creating a lot of confusion */
|
||||
default_hnp_abort(jdata);
|
||||
}
|
||||
/* abnormal termination - abort */
|
||||
default_hnp_abort(jdata);
|
||||
break;
|
||||
|
||||
case ORTE_PROC_STATE_FAILED_TO_START:
|
||||
@ -526,14 +529,15 @@ static void proc_errors(int fd, short args, void *cbdata)
|
||||
OBJ_RETAIN(pptr);
|
||||
ORTE_FLAG_SET(jdata, ORTE_JOB_FLAG_ABORTED);
|
||||
ORTE_UPDATE_EXIT_STATUS(pptr->exit_code);
|
||||
/* abnormal termination - abort, but only do it once
|
||||
* to avoid creating a lot of confusion */
|
||||
default_hnp_abort(jdata);
|
||||
}
|
||||
/* if this was a daemon, report it */
|
||||
if (jdata->jobid == ORTE_PROC_MY_NAME->jobid) {
|
||||
/* output a message indicating we failed to launch a daemon */
|
||||
orte_show_help("help-errmgr-base.txt", "failed-daemon-launch", true);
|
||||
}
|
||||
/* abnormal termination - abort */
|
||||
default_hnp_abort(jdata);
|
||||
break;
|
||||
|
||||
case ORTE_PROC_STATE_CALLED_ABORT:
|
||||
@ -549,9 +553,10 @@ static void proc_errors(int fd, short args, void *cbdata)
|
||||
OBJ_RETAIN(pptr);
|
||||
ORTE_FLAG_SET(jdata, ORTE_JOB_FLAG_ABORTED);
|
||||
ORTE_UPDATE_EXIT_STATUS(pptr->exit_code);
|
||||
/* abnormal termination - abort, but only do it once
|
||||
* to avoid creating a lot of confusion */
|
||||
default_hnp_abort(jdata);
|
||||
}
|
||||
/* abnormal termination - abort */
|
||||
default_hnp_abort(jdata);
|
||||
break;
|
||||
|
||||
case ORTE_PROC_STATE_TERM_NON_ZERO:
|
||||
@ -575,9 +580,10 @@ static void proc_errors(int fd, short args, void *cbdata)
|
||||
/* retain the object so it doesn't get free'd */
|
||||
OBJ_RETAIN(pptr);
|
||||
ORTE_FLAG_SET(jdata, ORTE_JOB_FLAG_ABORTED);
|
||||
/* abnormal termination - abort, but only do it once
|
||||
* to avoid creating a lot of confusion */
|
||||
default_hnp_abort(jdata);
|
||||
}
|
||||
/* user requested we abort in this scenario */
|
||||
default_hnp_abort(jdata);
|
||||
} else {
|
||||
/* user requested we consider this normal termination */
|
||||
if (jdata->num_terminated >= jdata->num_procs) {
|
||||
@ -600,11 +606,12 @@ static void proc_errors(int fd, short args, void *cbdata)
|
||||
OBJ_RETAIN(pptr);
|
||||
ORTE_FLAG_SET(jdata, ORTE_JOB_FLAG_ABORTED);
|
||||
ORTE_UPDATE_EXIT_STATUS(pptr->exit_code);
|
||||
/* abnormal termination - abort, but only do it once
|
||||
* to avoid creating a lot of confusion */
|
||||
default_hnp_abort(jdata);
|
||||
}
|
||||
/* remove from dependent routes, if it is one */
|
||||
orte_routed.route_lost(proc);
|
||||
/* kill all jobs */
|
||||
default_hnp_abort(jdata);
|
||||
break;
|
||||
|
||||
case ORTE_PROC_STATE_UNABLE_TO_SEND_MSG:
|
||||
@ -619,8 +626,11 @@ static void proc_errors(int fd, short args, void *cbdata)
|
||||
ORTE_ACTIVATE_JOB_STATE(NULL, ORTE_JOB_STATE_DAEMONS_TERMINATED);
|
||||
break;
|
||||
}
|
||||
/* kill all jobs */
|
||||
default_hnp_abort(jdata);
|
||||
if (!ORTE_FLAG_TEST(jdata, ORTE_JOB_FLAG_ABORTED)) {
|
||||
/* abnormal termination - abort, but only do it once
|
||||
* to avoid creating a lot of confusion */
|
||||
default_hnp_abort(jdata);
|
||||
}
|
||||
break;
|
||||
|
||||
default:
|
||||
|
@ -416,7 +416,7 @@ static void proc_errors(int fd, short args, void *cbdata)
|
||||
|
||||
if (ORTE_PROC_STATE_TERMINATED < state) {
|
||||
/* if we were ordered to terminate, mark this proc as dead and see if
|
||||
* any of our routes or local children remain alive - if not, then
|
||||
* any of our routes or local children remain alive - if not, then
|
||||
* terminate ourselves. */
|
||||
if (orte_orteds_term_ordered) {
|
||||
for (i=0; i < orte_local_children->size; i++) {
|
||||
@ -440,6 +440,8 @@ static void proc_errors(int fd, short args, void *cbdata)
|
||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
|
||||
ORTE_ACTIVATE_JOB_STATE(NULL, ORTE_JOB_STATE_DAEMONS_TERMINATED);
|
||||
}
|
||||
/* no need to alert the HNP - we are already on our way out */
|
||||
goto cleanup;
|
||||
}
|
||||
|
||||
keep_going:
|
||||
|
@ -11,7 +11,7 @@
|
||||
* All rights reserved.
|
||||
* Copyright (c) 2011-2013 Los Alamos National Security, LLC.
|
||||
* All rights reserved.
|
||||
* Copyright (c) 2013 Intel, Inc. All rights reserved.
|
||||
* Copyright (c) 2013-2014 Intel, Inc. All rights reserved.
|
||||
* Copyright (c) 2014 Hochschule Esslingen. All rights reserved.
|
||||
*
|
||||
* $COPYRIGHT$
|
||||
@ -53,6 +53,7 @@
|
||||
|
||||
#include "orte/runtime/orte_cr.h"
|
||||
#include "orte/runtime/orte_globals.h"
|
||||
#include "orte/runtime/orte_wait.h"
|
||||
|
||||
#include "orte/mca/ess/base/base.h"
|
||||
|
||||
|
@ -35,6 +35,7 @@
|
||||
|
||||
#include "opal/class/opal_list.h"
|
||||
#include "opal/mca/event/event.h"
|
||||
#include "opal/dss/dss.h"
|
||||
|
||||
#include "orte/util/show_help.h"
|
||||
#include "opal/util/argv.h"
|
||||
|
@ -12,6 +12,7 @@
|
||||
* Copyright (c) 2007-2012 Cisco Systems, Inc. All rights reserved.
|
||||
* Copyright (c) 2011-2013 Los Alamos National Security, LLC. All rights
|
||||
* reserved.
|
||||
* Copyright (c) 2014 Intel Corporation. All rights reserved.
|
||||
* $COPYRIGHT$
|
||||
*
|
||||
* Additional copyrights may follow
|
||||
@ -38,6 +39,7 @@
|
||||
#include "orte/util/name_fns.h"
|
||||
#include "orte/mca/state/state.h"
|
||||
#include "orte/runtime/orte_globals.h"
|
||||
#include "orte/runtime/orte_wait.h"
|
||||
|
||||
#include "orte/mca/iof/iof.h"
|
||||
#include "orte/mca/iof/base/base.h"
|
||||
|
@ -12,6 +12,7 @@
|
||||
* Copyright (c) 2007 Cisco Systems, Inc. All rights reserved.
|
||||
* Copyright (c) 2011-2013 Los Alamos National Security, LLC. All rights
|
||||
* reserved.
|
||||
* Copyright (c) 2014 Intel Corporation. All rights reserved.
|
||||
* $COPYRIGHT$
|
||||
*
|
||||
* Additional copyrights may follow
|
||||
@ -37,6 +38,8 @@
|
||||
#endif
|
||||
#endif
|
||||
|
||||
#include "opal/dss/dss.h"
|
||||
|
||||
#include "orte/mca/rml/rml.h"
|
||||
#include "orte/mca/errmgr/errmgr.h"
|
||||
#include "orte/util/name_fns.h"
|
||||
|
@ -12,6 +12,7 @@
|
||||
* Copyright (c) 2007 Cisco Systems, Inc. All rights reserved.
|
||||
* Copyright (c) 2012 Los Alamos National Security, LLC
|
||||
* All rights reserved
|
||||
* Copyright (c) 2014 Intel Corporation. All rights reserved.
|
||||
* $COPYRIGHT$
|
||||
*
|
||||
* Additional copyrights may follow
|
||||
@ -30,6 +31,8 @@
|
||||
#include <string.h>
|
||||
#endif /* HAVE_STRING_H */
|
||||
|
||||
#include "opal/dss/dss.h"
|
||||
|
||||
#include "orte/mca/rml/rml.h"
|
||||
#include "orte/mca/rml/rml_types.h"
|
||||
#include "orte/mca/errmgr/errmgr.h"
|
||||
|
@ -29,6 +29,7 @@
|
||||
#endif
|
||||
|
||||
#include "opal/mca/event/event.h"
|
||||
#include "opal/dss/dss.h"
|
||||
|
||||
#include "orte/runtime/orte_globals.h"
|
||||
#include "orte/mca/errmgr/errmgr.h"
|
||||
|
@ -28,6 +28,7 @@
|
||||
#include "orte/util/name_fns.h"
|
||||
#include "orte/mca/state/state.h"
|
||||
#include "orte/runtime/orte_globals.h"
|
||||
#include "orte/runtime/orte_wait.h"
|
||||
|
||||
#include "orte/mca/iof/iof.h"
|
||||
#include "orte/mca/iof/base/base.h"
|
||||
|
@ -1,6 +1,7 @@
|
||||
/*
|
||||
* Copyright (c) 2012 Los Alamos National Security, LLC. All rights
|
||||
* reserved.
|
||||
* Copyright (c) 2014 Intel Corporation. All rights reserved.
|
||||
* $COPYRIGHT$
|
||||
*
|
||||
* Additional copyrights may follow
|
||||
@ -26,6 +27,8 @@
|
||||
#endif
|
||||
#endif
|
||||
|
||||
#include "opal/dss/dss.h"
|
||||
|
||||
#include "orte/mca/rml/rml.h"
|
||||
#include "orte/mca/errmgr/errmgr.h"
|
||||
#include "orte/util/name_fns.h"
|
||||
|
@ -1,6 +1,7 @@
|
||||
/*
|
||||
* Copyright (c) 2012 Los Alamos National Security, LLC.
|
||||
* All rights reserved.
|
||||
* Copyright (c) 2014 Intel Corporation. All rights reserved.
|
||||
*
|
||||
* $COPYRIGHT$
|
||||
*
|
||||
@ -20,6 +21,8 @@
|
||||
#include <string.h>
|
||||
#endif /* HAVE_STRING_H */
|
||||
|
||||
#include "opal/dss/dss.h"
|
||||
|
||||
#include "orte/mca/rml/rml.h"
|
||||
#include "orte/mca/rml/rml_types.h"
|
||||
#include "orte/mca/errmgr/errmgr.h"
|
||||
|
@ -12,6 +12,7 @@
|
||||
* Copyright (c) 2007 Cisco Systems, Inc. All rights reserved.
|
||||
* Copyright (c) 2011 Los Alamos National Security, LLC. All rights
|
||||
* reserved.
|
||||
* Copyright (c) 2014 Intel Corporation. All rights reserved.
|
||||
* $COPYRIGHT$
|
||||
*
|
||||
* Additional copyrights may follow
|
||||
@ -30,6 +31,8 @@
|
||||
#include <string.h>
|
||||
#endif /* HAVE_STRING_H */
|
||||
|
||||
#include "opal/dss/dss.h"
|
||||
|
||||
#include "orte/mca/rml/rml.h"
|
||||
#include "orte/mca/rml/rml_types.h"
|
||||
#include "orte/mca/errmgr/errmgr.h"
|
||||
|
@ -12,6 +12,7 @@
|
||||
* Copyright (c) 2007 Cisco Systems, Inc. All rights reserved.
|
||||
* Copyright (c) 2011 Los Alamos National Security, LLC. All rights
|
||||
* reserved.
|
||||
* Copyright (c) 2014 Intel Corporation. All rights reserved.
|
||||
* $COPYRIGHT$
|
||||
*
|
||||
* Additional copyrights may follow
|
||||
@ -30,6 +31,8 @@
|
||||
#include <string.h>
|
||||
#endif /* HAVE_STRING_H */
|
||||
|
||||
#include "opal/dss/dss.h"
|
||||
|
||||
#include "orte/mca/rml/rml.h"
|
||||
#include "orte/mca/rml/rml_types.h"
|
||||
#include "orte/mca/errmgr/errmgr.h"
|
||||
|
@ -432,17 +432,18 @@ int orte_odls_base_default_construct_child_list(opal_buffer_t *data,
|
||||
|
||||
#if OPAL_ENABLE_FT_CR == 1
|
||||
if (orte_get_attribute(&jdata->attributes, ORTE_JOB_SNAPC_INIT_BAR, (void**)gidptr, )) {
|
||||
coll = orte_grpcomm_base_setup_collective(jdata->snapc_init_barrier);
|
||||
nm = OBJ_NEW(orte_namelist_t);
|
||||
nm->name.jobid = jdata->jobid;
|
||||
nm->name.vpid = ORTE_VPID_WILDCARD;
|
||||
opal_list_append(&coll->participants, &nm->super);
|
||||
coll = orte_grpcomm_base_setup_collective(jdata->snapc_init_barrier);
|
||||
nm = OBJ_NEW(orte_namelist_t);
|
||||
nm->name.jobid = jdata->jobid;
|
||||
nm->name.vpid = ORTE_VPID_WILDCARD;
|
||||
opal_list_append(&coll->participants, &nm->super);
|
||||
|
||||
coll = orte_grpcomm_base_setup_collective(jdata->snapc_fini_barrier);
|
||||
nm = OBJ_NEW(orte_namelist_t);
|
||||
nm->name.jobid = jdata->jobid;
|
||||
nm->name.vpid = ORTE_VPID_WILDCARD;
|
||||
opal_list_append(&coll->participants, &nm->super);
|
||||
coll = orte_grpcomm_base_setup_collective(jdata->snapc_fini_barrier);
|
||||
nm = OBJ_NEW(orte_namelist_t);
|
||||
nm->name.jobid = jdata->jobid;
|
||||
nm->name.vpid = ORTE_VPID_WILDCARD;
|
||||
opal_list_append(&coll->participants, &nm->super);
|
||||
}
|
||||
#endif
|
||||
|
||||
/* progress any pending collectives */
|
||||
@ -1407,7 +1408,12 @@ void orte_odls_base_default_launch_local(int fd, short sd, void *cbdata)
|
||||
}
|
||||
}
|
||||
|
||||
rc = fork_local(app, child, app->env, jobdat);
|
||||
orte_wait_cb(child, odls_base_default_wait_local_proc, NULL);
|
||||
if (ORTE_SUCCESS != (rc = fork_local(app, child, app->env, jobdat))) {
|
||||
orte_wait_cb_cancel(child);
|
||||
child->exit_code = ORTE_ERR_SILENT; /* error message already output */
|
||||
ORTE_ACTIVATE_PROC_STATE(&child->name, ORTE_PROC_STATE_FAILED_TO_START);
|
||||
}
|
||||
/* if we indexed the argv, we need to restore it to
|
||||
* its original form
|
||||
*/
|
||||
@ -1454,21 +1460,6 @@ void orte_odls_base_default_launch_local(int fd, short sd, void *cbdata)
|
||||
chdir(basedir);
|
||||
}
|
||||
|
||||
OPAL_OUTPUT_VERBOSE((5, orte_odls_base_framework.framework_output,
|
||||
"%s odls:launch setting waitpids",
|
||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
|
||||
|
||||
/* setup the waitpids on the children that started */
|
||||
for (idx=0; idx < orte_local_children->size; idx++) {
|
||||
if (NULL == (child = (orte_proc_t*)opal_pointer_array_get_item(orte_local_children, idx))) {
|
||||
continue;
|
||||
}
|
||||
if (child->name.jobid == jobdat->jobid &&
|
||||
ORTE_FLAG_TEST(child, ORTE_PROC_FLAG_ALIVE)) {
|
||||
orte_wait_cb(child->pid, odls_base_default_wait_local_proc, NULL);
|
||||
}
|
||||
}
|
||||
|
||||
GETOUT:
|
||||
/* tell the state machine that all local procs for this job
|
||||
* were launched so that it can do whatever it needs to do,
|
||||
@ -1744,40 +1735,18 @@ CLEANUP:
|
||||
* Wait for a callback indicating the child has completed.
|
||||
*/
|
||||
|
||||
void odls_base_default_wait_local_proc(pid_t pid, int status, void* cbdata)
|
||||
void odls_base_default_wait_local_proc(orte_proc_t *proc, void* cbdata)
|
||||
{
|
||||
orte_proc_t *proc=NULL, *cptr;
|
||||
int i;
|
||||
orte_job_t *jobdat;
|
||||
orte_proc_state_t state=ORTE_PROC_STATE_WAITPID_FIRED;
|
||||
char *abortfile, *jobfam, *job, *vpidstr;
|
||||
|
||||
/* find this child */
|
||||
for (i=0; i < orte_local_children->size; i++) {
|
||||
if (NULL == (cptr = (orte_proc_t*)opal_pointer_array_get_item(orte_local_children, i))) {
|
||||
continue;
|
||||
}
|
||||
if (pid == cptr->pid) {
|
||||
proc = cptr;
|
||||
break;
|
||||
}
|
||||
}
|
||||
if (NULL == proc) {
|
||||
/* get here if we didn't find the child, or if the specified child
|
||||
* is already dead. If the latter, then we have a problem as it
|
||||
* means we are detecting it exiting multiple times
|
||||
*/
|
||||
OPAL_OUTPUT_VERBOSE((5, orte_odls_base_framework.framework_output,
|
||||
"%s odls:wait_local_proc did not find pid %ld in table!",
|
||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
|
||||
(long)pid));
|
||||
return;
|
||||
}
|
||||
orte_proc_t *cptr;
|
||||
|
||||
opal_output_verbose(5, orte_odls_base_framework.framework_output,
|
||||
"%s odls:wait_local_proc child process %s pid %ld terminated",
|
||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
|
||||
ORTE_NAME_PRINT(&proc->name), (long)pid);
|
||||
ORTE_NAME_PRINT(&proc->name), (long)proc->pid);
|
||||
|
||||
/* if the child was previously flagged as dead, then just
|
||||
* ensure that its exit state gets reported to avoid hanging
|
||||
@ -1815,9 +1784,9 @@ void odls_base_default_wait_local_proc(pid_t pid, int status, void* cbdata)
|
||||
}
|
||||
|
||||
/* determine the state of this process */
|
||||
if (WIFEXITED(status)) {
|
||||
if (WIFEXITED(proc->exit_code)) {
|
||||
/* set the exit status appropriately */
|
||||
proc->exit_code = WEXITSTATUS(status);
|
||||
proc->exit_code = WEXITSTATUS(proc->exit_code);
|
||||
|
||||
OPAL_OUTPUT_VERBOSE((5, orte_odls_base_framework.framework_output,
|
||||
"%s odls:waitpid_fired child %s exit code %d",
|
||||
@ -1979,7 +1948,7 @@ void odls_base_default_wait_local_proc(pid_t pid, int status, void* cbdata)
|
||||
* the termination code to exit status translation the
|
||||
* same way
|
||||
*/
|
||||
proc->exit_code = WTERMSIG(status) + 128;
|
||||
proc->exit_code = WTERMSIG(proc->exit_code) + 128;
|
||||
|
||||
OPAL_OUTPUT_VERBOSE((5, orte_odls_base_framework.framework_output,
|
||||
"%s odls:waitpid_fired child process %s terminated with signal",
|
||||
@ -1992,6 +1961,23 @@ void odls_base_default_wait_local_proc(pid_t pid, int status, void* cbdata)
|
||||
ORTE_ACTIVATE_PROC_STATE(&proc->name, state);
|
||||
}
|
||||
|
||||
typedef struct {
|
||||
orte_proc_t *child;
|
||||
orte_odls_base_kill_local_fn_t kill_local;
|
||||
} odls_kill_caddy_t;
|
||||
|
||||
static void kill_cbfunc(int fd, short args, void *cbdata)
|
||||
{
|
||||
odls_kill_caddy_t *cd = (odls_kill_caddy_t*)cbdata;
|
||||
|
||||
if (!ORTE_FLAG_TEST(cd->child, ORTE_PROC_FLAG_ALIVE) || 0 == cd->child->pid) {
|
||||
free(cd);
|
||||
return;
|
||||
}
|
||||
cd->kill_local(cd->child->pid, SIGKILL);
|
||||
free(cd);
|
||||
}
|
||||
|
||||
int orte_odls_base_default_kill_local_procs(opal_pointer_array_t *procs,
|
||||
orte_odls_base_kill_local_fn_t kill_local,
|
||||
orte_odls_base_child_died_fn_t child_died)
|
||||
@ -2116,64 +2102,49 @@ int orte_odls_base_default_kill_local_procs(opal_pointer_array_t *procs,
|
||||
/* cancel the waitpid callback as this induces unmanageable race
|
||||
* conditions when we are deliberately killing the process
|
||||
*/
|
||||
orte_wait_cb_cancel(child->pid);
|
||||
orte_wait_cb_cancel(child);
|
||||
|
||||
/* First send a SIGCONT in case the process is in stopped state.
|
||||
If it is in a stopped state and we do not first change it to
|
||||
running, then SIGTERM will not get delivered. Ignore return
|
||||
value. */
|
||||
OPAL_OUTPUT_VERBOSE((5, orte_odls_base_framework.framework_output,
|
||||
"%s SENDING SIGCONT TO %s",
|
||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
|
||||
ORTE_NAME_PRINT(&child->name)));
|
||||
kill_local(child->pid, SIGCONT);
|
||||
if (!do_cleanup) {
|
||||
odls_kill_caddy_t *cd;
|
||||
|
||||
/* Send a sigterm to the process before sigkill to be nice */
|
||||
OPAL_OUTPUT_VERBOSE((5, orte_odls_base_framework.framework_output,
|
||||
"%s SENDING SIGTERM TO %s",
|
||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
|
||||
ORTE_NAME_PRINT(&child->name)));
|
||||
kill_local(child->pid, SIGTERM);
|
||||
|
||||
/* check to see if it died - the child_died function will continue
|
||||
* to check until we reach the timeout
|
||||
*
|
||||
* In practice, it doesn't matter what child_died reports
|
||||
* - we KILL the process anyway, to be sure it's dead.
|
||||
* However, what it does do is delay the KILL until either
|
||||
* the process is verified dead or the timeout elapsed,
|
||||
* which gives it time enough to shut down.
|
||||
*/
|
||||
if (!child_died(child)) {
|
||||
/* if it still isn't dead, try killing it one more time */
|
||||
/* if we are killing only selected procs, then do so in a gentle
|
||||
fashion. First send a SIGCONT in case the process is in stopped state.
|
||||
If it is in a stopped state and we do not first change it to
|
||||
running, then SIGTERM will not get delivered. Ignore return
|
||||
value. */
|
||||
OPAL_OUTPUT_VERBOSE((5, orte_odls_base_framework.framework_output,
|
||||
"%s SENDING SIGKILL TO %s",
|
||||
"%s SENDING SIGCONT TO %s",
|
||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
|
||||
ORTE_NAME_PRINT(&child->name)));
|
||||
} else {
|
||||
/* Force the SIGKILL just to make sure things are dead
|
||||
* This fixes an issue that, if the application is masking
|
||||
* SIGTERM, then the child_died()
|
||||
* may return 'true' even though waitpid returns with 0.
|
||||
* It does this to avoid a race condition, per documentation
|
||||
* in odls_default_module.c.
|
||||
*/
|
||||
kill_local(child->pid, SIGCONT);
|
||||
|
||||
/* Send a sigterm to the process before sigkill to be nice */
|
||||
OPAL_OUTPUT_VERBOSE((5, orte_odls_base_framework.framework_output,
|
||||
"%s SENDING FORCE SIGKILL TO %s",
|
||||
"%s SENDING SIGTERM TO %s",
|
||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
|
||||
ORTE_NAME_PRINT(&child->name)));
|
||||
kill_local(child->pid, SIGTERM);
|
||||
/* provide a polite delay so the proc has a chance to react */
|
||||
cd = (odls_kill_caddy_t*)malloc(sizeof(odls_kill_caddy_t));
|
||||
OBJ_RETAIN(child); // protect against race conditions
|
||||
cd->child = child;
|
||||
cd->kill_local = kill_local;
|
||||
ORTE_TIMER_EVENT(1, 0, kill_cbfunc, ORTE_SYS_PRI);
|
||||
continue;
|
||||
}
|
||||
|
||||
/* Force the SIGKILL just to make sure things are dead
|
||||
* This fixes an issue that, if the application is masking
|
||||
* SIGTERM, then the child_died()
|
||||
* may return 'true' even though waitpid returns with 0.
|
||||
* It does this to avoid a race condition, per documentation
|
||||
* in odls_default_module.c.
|
||||
*/
|
||||
OPAL_OUTPUT_VERBOSE((5, orte_odls_base_framework.framework_output,
|
||||
"%s SENDING FORCE SIGKILL TO %s",
|
||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
|
||||
ORTE_NAME_PRINT(&child->name)));
|
||||
kill_local(child->pid, SIGKILL);
|
||||
/* Double check that it actually died this time */
|
||||
if (!child_died(child)) {
|
||||
orte_show_help("help-orte-odls-base.txt",
|
||||
"orte-odls-base:could-not-kill",
|
||||
true, orte_process_info.nodename, child->pid);
|
||||
} else
|
||||
OPAL_OUTPUT_VERBOSE((5, orte_odls_base_framework.framework_output,
|
||||
"%s odls:kill_local_proc child %s killed",
|
||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
|
||||
ORTE_NAME_PRINT(&child->name)));
|
||||
|
||||
/* indicate the waitpid fired as this is effectively what
|
||||
* has happened
|
||||
@ -2312,9 +2283,11 @@ int orte_odls_base_default_restart_proc(orte_proc_t *child,
|
||||
"%s restarting app %s",
|
||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), app->app));
|
||||
|
||||
rc = fork_local(app, child, app->env, jobdat);
|
||||
if (ORTE_SUCCESS == rc) {
|
||||
orte_wait_cb(child->pid, odls_base_default_wait_local_proc, NULL);
|
||||
orte_wait_cb(child, odls_base_default_wait_local_proc, NULL);
|
||||
if (ORTE_SUCCESS != (rc = fork_local(app, child, app->env, jobdat))) {
|
||||
orte_wait_cb_cancel(child);
|
||||
child->exit_code = ORTE_ERR_SILENT; /* error message already output */
|
||||
ORTE_ACTIVATE_PROC_STATE(&child->name, ORTE_PROC_STATE_FAILED_TO_START);
|
||||
}
|
||||
|
||||
CLEANUP:
|
||||
|
@ -107,7 +107,7 @@ ORTE_DECLSPEC void orte_odls_base_default_launch_local(int fd, short sd, void *c
|
||||
ORTE_DECLSPEC int
|
||||
orte_odls_base_default_deliver_message(orte_jobid_t job, opal_buffer_t *buffer, orte_rml_tag_t tag);
|
||||
|
||||
ORTE_DECLSPEC void odls_base_default_wait_local_proc(pid_t pid, int status, void* cbdata);
|
||||
ORTE_DECLSPEC void odls_base_default_wait_local_proc(orte_proc_t *proc, void* cbdata);
|
||||
|
||||
/* define a function type to signal a local proc */
|
||||
typedef int (*orte_odls_base_signal_local_fn_t)(pid_t pid, int signum);
|
||||
|
@ -178,13 +178,13 @@ static bool odls_default_child_died(orte_proc_t *child)
|
||||
* the default 1s actually means 'somewhere between 0 and 1s'. */
|
||||
end = time(NULL) + orte_odls_globals.timeout_before_sigkill + 1;
|
||||
do {
|
||||
OPAL_OUTPUT_VERBOSE((2, orte_odls_base_framework.framework_output,
|
||||
OPAL_OUTPUT_VERBOSE((20, orte_odls_base_framework.framework_output,
|
||||
"%s odls:default:WAITPID CHECKING PID %d WITH TIMEOUT %d SECONDS",
|
||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), (int)(child->pid),
|
||||
orte_odls_globals.timeout_before_sigkill + 1));
|
||||
ret = waitpid(child->pid, &child->exit_code, WNOHANG);
|
||||
if (child->pid == ret) {
|
||||
OPAL_OUTPUT_VERBOSE((2, orte_odls_base_framework.framework_output,
|
||||
OPAL_OUTPUT_VERBOSE((20, orte_odls_base_framework.framework_output,
|
||||
"%s odls:default:WAITPID INDICATES PROC %d IS DEAD",
|
||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), (int)(child->pid)));
|
||||
/* It died -- return success */
|
||||
@ -204,14 +204,14 @@ static bool odls_default_child_died(orte_proc_t *child)
|
||||
* which will occasionally trip the timeout for cases that
|
||||
* are right on the edge.)
|
||||
*/
|
||||
OPAL_OUTPUT_VERBOSE((2, orte_odls_base_framework.framework_output,
|
||||
OPAL_OUTPUT_VERBOSE((20, orte_odls_base_framework.framework_output,
|
||||
"%s odls:default:WAITPID INDICATES PID %d MAY HAVE ALREADY EXITED",
|
||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), (int)(child->pid)));
|
||||
/* Do nothing, process still alive */
|
||||
} else if (-1 == ret && ECHILD == errno) {
|
||||
/* The pid no longer exists, so we'll call this "good
|
||||
enough for government work" */
|
||||
OPAL_OUTPUT_VERBOSE((2, orte_odls_base_framework.framework_output,
|
||||
OPAL_OUTPUT_VERBOSE((20, orte_odls_base_framework.framework_output,
|
||||
"%s odls:default:WAITPID INDICATES PID %d NO LONGER EXISTS",
|
||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), (int)(child->pid)));
|
||||
return true;
|
||||
@ -693,7 +693,7 @@ static int odls_default_fork_local_proc(orte_app_context_t* context,
|
||||
if (pid == 0) {
|
||||
close(p[0]);
|
||||
#if HAVE_SETPGID
|
||||
setpgid(0, 0);
|
||||
// setpgid(0, 0);
|
||||
#endif
|
||||
do_child(context, child, environ_copy, jobdat, p[1], opts);
|
||||
/* Does not return */
|
||||
|
@ -12,7 +12,7 @@
|
||||
* Copyright (c) 2007 Cisco Systems, Inc. All rights reserved.
|
||||
* Copyright (c) 2011-2012 Los Alamos National Security, LLC.
|
||||
* All rights reserved.
|
||||
* Copyright (c) 2013 Intel, Inc. All rights reserved.
|
||||
* Copyright (c) 2013-2014 Intel, Inc. All rights reserved.
|
||||
* $COPYRIGHT$
|
||||
*
|
||||
* Additional copyrights may follow
|
||||
@ -36,6 +36,7 @@
|
||||
#include "orte/mca/rml/rml_types.h"
|
||||
#include "orte/mca/routed/routed.h"
|
||||
#include "orte/runtime/orte_globals.h"
|
||||
#include "orte/runtime/orte_wait.h"
|
||||
|
||||
#include "orte/mca/plm/base/base.h"
|
||||
#include "orte/mca/plm/base/plm_private.h"
|
||||
|
@ -256,11 +256,10 @@ static int rsh_init(void)
|
||||
/**
|
||||
* Callback on daemon exit.
|
||||
*/
|
||||
static void rsh_wait_daemon(pid_t pid, int status, void* cbdata)
|
||||
static void rsh_wait_daemon(orte_proc_t *daemon, void* cbdata)
|
||||
{
|
||||
orte_job_t *jdata;
|
||||
orte_plm_rsh_caddy_t *caddy=(orte_plm_rsh_caddy_t*)cbdata;
|
||||
orte_proc_t *daemon=caddy->daemon;
|
||||
|
||||
if (orte_orteds_term_ordered || orte_abnormal_term_ordered) {
|
||||
/* ignore any such report - it will occur if we left the
|
||||
@ -270,7 +269,8 @@ static void rsh_wait_daemon(pid_t pid, int status, void* cbdata)
|
||||
return;
|
||||
}
|
||||
|
||||
if (! WIFEXITED(status) || ! WEXITSTATUS(status) == 0) { /* if abnormal exit */
|
||||
if (! WIFEXITED(daemon->exit_code) ||
|
||||
! WEXITSTATUS(daemon->exit_code) == 0) { /* if abnormal exit */
|
||||
/* if we are not the HNP, send a message to the HNP alerting it
|
||||
* to the failure
|
||||
*/
|
||||
@ -279,10 +279,10 @@ static void rsh_wait_daemon(pid_t pid, int status, void* cbdata)
|
||||
OPAL_OUTPUT_VERBOSE((1, orte_plm_base_framework.framework_output,
|
||||
"%s daemon %d failed with status %d",
|
||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
|
||||
(int)daemon->name.vpid, WEXITSTATUS(status)));
|
||||
(int)daemon->name.vpid, WEXITSTATUS(daemon->exit_code)));
|
||||
buf = OBJ_NEW(opal_buffer_t);
|
||||
opal_dss.pack(buf, &(daemon->name.vpid), 1, ORTE_VPID);
|
||||
opal_dss.pack(buf, &status, 1, OPAL_INT);
|
||||
opal_dss.pack(buf, &daemon->exit_code, 1, OPAL_INT);
|
||||
orte_rml.send_buffer_nb(ORTE_PROC_MY_HNP, buf,
|
||||
ORTE_RML_TAG_REPORT_REMOTE_LAUNCH,
|
||||
orte_rml_send_callback, NULL);
|
||||
@ -294,9 +294,9 @@ static void rsh_wait_daemon(pid_t pid, int status, void* cbdata)
|
||||
OPAL_OUTPUT_VERBOSE((1, orte_plm_base_framework.framework_output,
|
||||
"%s daemon %d failed with status %d",
|
||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
|
||||
(int)daemon->name.vpid, WEXITSTATUS(status)));
|
||||
(int)daemon->name.vpid, WEXITSTATUS(daemon->exit_code)));
|
||||
/* set the exit status */
|
||||
ORTE_UPDATE_EXIT_STATUS(WEXITSTATUS(status));
|
||||
ORTE_UPDATE_EXIT_STATUS(WEXITSTATUS(daemon->exit_code));
|
||||
/* note that this daemon failed */
|
||||
daemon->state = ORTE_PROC_STATE_FAILED_TO_START;
|
||||
/* increment the #daemons terminated so we will exit properly */
|
||||
@ -920,11 +920,14 @@ static void process_launch_list(int fd, short args, void *cbdata)
|
||||
break;
|
||||
}
|
||||
caddy = (orte_plm_rsh_caddy_t*)item;
|
||||
|
||||
/* register the sigchild callback */
|
||||
orte_wait_cb(caddy->daemon, rsh_wait_daemon, (void*)caddy);
|
||||
|
||||
/* fork a child to exec the rsh/ssh session */
|
||||
pid = fork();
|
||||
if (pid < 0) {
|
||||
ORTE_ERROR_LOG(ORTE_ERR_SYS_LIMITS_CHILDREN);
|
||||
orte_wait_cb_cancel(caddy->daemon);
|
||||
continue;
|
||||
}
|
||||
|
||||
@ -941,12 +944,7 @@ static void process_launch_list(int fd, short args, void *cbdata)
|
||||
OPAL_OUTPUT_VERBOSE((1, orte_plm_base_framework.framework_output,
|
||||
"%s plm:rsh: recording launch of daemon %s",
|
||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
|
||||
ORTE_NAME_PRINT(&(caddy->daemon->name))));
|
||||
|
||||
/* setup callback on sigchild - wait until setup above is complete
|
||||
* as the callback can occur in the call to orte_wait_cb
|
||||
*/
|
||||
orte_wait_cb(pid, rsh_wait_daemon, (void*)caddy);
|
||||
ORTE_NAME_PRINT(&(caddy->daemon->name))));
|
||||
num_in_progress++;
|
||||
}
|
||||
}
|
||||
|
@ -4,7 +4,8 @@
|
||||
* reserved.
|
||||
* Copyright (c) 2011-2013 Los Alamos National Security, LLC. All rights
|
||||
* reserved.
|
||||
* Copyright (c) 2013 Cisco Systems, Inc. All rights reserved.
|
||||
* Copyright (c) 2013 Cisco Systems, Inc. All rights reserved.
|
||||
* Copyright (c) 2014 Intel Corporation. All rights reserved.
|
||||
* $COPYRIGHT$
|
||||
*
|
||||
* Additional copyrights may follow
|
||||
@ -18,6 +19,7 @@
|
||||
#include <string.h>
|
||||
#endif
|
||||
|
||||
#include "opal/dss/dss.h"
|
||||
#include "opal/mca/mca.h"
|
||||
#include "opal/mca/base/mca_base_component_repository.h"
|
||||
#include "opal/util/output.h"
|
||||
|
@ -4,6 +4,7 @@
|
||||
* Copyright (c) 2004-2011 The University of Tennessee and The University
|
||||
* of Tennessee Research Foundation. All rights
|
||||
* reserved.
|
||||
* Copyright (c) 2014 Intel Corporation. All rights reserved.
|
||||
* $COPYRIGHT$
|
||||
*
|
||||
* Additional copyrights may follow
|
||||
@ -14,7 +15,7 @@
|
||||
#include "orte_config.h"
|
||||
#include "orte/constants.h"
|
||||
|
||||
#include "opal/dss/dss_types.h"
|
||||
#include "opal/dss/dss.h"
|
||||
#include "opal/util/output.h"
|
||||
|
||||
#include "orte/mca/errmgr/errmgr.h"
|
||||
|
@ -8,8 +8,8 @@
|
||||
* Copyright (c) 2004-2005 The Regents of the University of California.
|
||||
* All rights reserved.
|
||||
* Copyright (c) 2007 Evergrid, Inc. All rights reserved.
|
||||
*
|
||||
* Copyright (c) 2013 Cisco Systems, Inc. All rights reserved.
|
||||
* Copyright (c) 2013 Cisco Systems, Inc. All rights reserved.
|
||||
* Copyright (c) 2014 Intel Corporation. All rights reserved.
|
||||
* $COPYRIGHT$
|
||||
*
|
||||
* Additional copyrights may follow
|
||||
@ -49,6 +49,7 @@
|
||||
#include "opal/util/argv.h"
|
||||
#include "opal/mca/crs/crs.h"
|
||||
#include "opal/mca/crs/base/base.h"
|
||||
#include "opal/dss/dss.h"
|
||||
|
||||
#include "orte/mca/rml/rml.h"
|
||||
#include "orte/mca/rml/rml_types.h"
|
||||
|
@ -20,6 +20,7 @@
|
||||
#endif
|
||||
|
||||
#include "opal/util/output.h"
|
||||
#include "opal/dss/dss.h"
|
||||
|
||||
#include "orte/mca/errmgr/errmgr.h"
|
||||
#include "orte/mca/iof/iof.h"
|
||||
|
@ -20,6 +20,7 @@
|
||||
#endif
|
||||
|
||||
#include "opal/util/output.h"
|
||||
#include "opal/dss/dss.h"
|
||||
|
||||
#include "orte/mca/dfs/dfs.h"
|
||||
#include "orte/mca/errmgr/errmgr.h"
|
||||
|
@ -96,6 +96,7 @@
|
||||
#include "orte/runtime/orte_globals.h"
|
||||
#include "orte/runtime/orte_locks.h"
|
||||
#include "orte/runtime/orte_quit.h"
|
||||
#include "orte/runtime/orte_wait.h"
|
||||
|
||||
#include "orte/orted/orted.h"
|
||||
|
||||
|
@ -30,6 +30,7 @@
|
||||
#endif
|
||||
|
||||
#include "opal/util/argv.h"
|
||||
#include "opal/dss/dss.h"
|
||||
|
||||
#include "orte/mca/errmgr/errmgr.h"
|
||||
#include "orte/runtime/data_type_support/orte_dt_support.h"
|
||||
|
@ -53,7 +53,6 @@
|
||||
#include "orte/util/name_fns.h"
|
||||
#include "orte/util/error_strings.h"
|
||||
#include "orte/runtime/runtime.h"
|
||||
#include "orte/runtime/orte_wait.h"
|
||||
|
||||
|
||||
BEGIN_C_DECLS
|
||||
@ -107,6 +106,15 @@ ORTE_DECLSPEC extern bool orte_in_parallel_debugger;
|
||||
/* error manager callback function */
|
||||
typedef void (*orte_err_cb_fn_t)(orte_process_name_t *proc, orte_proc_state_t state, void *cbdata);
|
||||
|
||||
/* define an object for timer events */
|
||||
typedef struct {
|
||||
opal_object_t super;
|
||||
struct timeval tv;
|
||||
opal_event_t *ev;
|
||||
void *payload;
|
||||
} orte_timer_t;
|
||||
OBJ_CLASS_DECLARATION(orte_timer_t);
|
||||
|
||||
ORTE_DECLSPEC extern int orte_exit_status;
|
||||
|
||||
/* ORTE event priorities - we define these
|
||||
|
@ -13,6 +13,7 @@
|
||||
* reserved.
|
||||
* Copyright (c) 2008 Institut National de Recherche en Informatique
|
||||
* et Automatique. All rights reserved.
|
||||
* Copyright (c) 2014 Intel Corporation. All rights reserved.
|
||||
* $COPYRIGHT$
|
||||
*
|
||||
* Additional copyrights may follow
|
||||
@ -65,11 +66,7 @@
|
||||
|
||||
#include "orte/runtime/orte_wait.h"
|
||||
|
||||
/*********************************************************************
|
||||
*
|
||||
* Timer Object Declaration
|
||||
*
|
||||
********************************************************************/
|
||||
/* Timer Object Declaration */
|
||||
static void timer_const(orte_timer_t *tm)
|
||||
{
|
||||
tm->ev = opal_event_alloc();
|
||||
@ -84,145 +81,57 @@ OBJ_CLASS_INSTANCE(orte_timer_t,
|
||||
timer_const,
|
||||
timer_dest);
|
||||
|
||||
/*********************************************************************
|
||||
*
|
||||
* Wait Object Declarations
|
||||
*
|
||||
********************************************************************/
|
||||
#ifdef HAVE_WAITPID
|
||||
|
||||
static volatile int cb_enabled = true;
|
||||
static opal_mutex_t mutex;
|
||||
static opal_list_t pending_pids;
|
||||
static opal_list_t registered_cb;
|
||||
|
||||
/*********************************************************************
|
||||
*
|
||||
* Local Class Declarations
|
||||
*
|
||||
********************************************************************/
|
||||
struct blk_waitpid_data_t {
|
||||
opal_object_t super;
|
||||
opal_condition_t *cond;
|
||||
volatile int done;
|
||||
volatile int status;
|
||||
volatile int free;
|
||||
};
|
||||
typedef struct blk_waitpid_data_t blk_waitpid_data_t;
|
||||
|
||||
struct pending_pids_item_t {
|
||||
/* Local objects */
|
||||
typedef struct {
|
||||
opal_list_item_t super;
|
||||
pid_t pid;
|
||||
int status;
|
||||
};
|
||||
typedef struct pending_pids_item_t pending_pids_item_t;
|
||||
|
||||
struct registered_cb_item_t {
|
||||
opal_list_item_t super;
|
||||
pid_t pid;
|
||||
orte_wait_fn_t callback;
|
||||
void *data;
|
||||
};
|
||||
typedef struct registered_cb_item_t registered_cb_item_t;
|
||||
|
||||
struct waitpid_callback_data_t {
|
||||
pid_t pid;
|
||||
int status;
|
||||
int options;
|
||||
pid_t ret;
|
||||
opal_mutex_t mutex;
|
||||
opal_condition_t cond;
|
||||
volatile bool done;
|
||||
};
|
||||
typedef struct waitpid_callback_data_t waitpid_callback_data_t;
|
||||
|
||||
|
||||
/*********************************************************************
|
||||
*
|
||||
* Local Class Definitions
|
||||
*
|
||||
********************************************************************/
|
||||
static void
|
||||
blk_waitpid_data_construct(opal_object_t *obj)
|
||||
opal_event_t ev;
|
||||
orte_proc_t *child;
|
||||
orte_wait_fn_t cbfunc;
|
||||
void *cbdata;
|
||||
} orte_wait_tracker_t;
|
||||
static void wccon(orte_wait_tracker_t *p)
|
||||
{
|
||||
blk_waitpid_data_t *data = (blk_waitpid_data_t*) obj;
|
||||
|
||||
data->cond = OBJ_NEW(opal_condition_t);
|
||||
data->done = 0;
|
||||
data->status = 0;
|
||||
data->free = 0;
|
||||
p->child = NULL;
|
||||
p->cbfunc = NULL;
|
||||
p->cbdata = NULL;
|
||||
}
|
||||
|
||||
|
||||
static void
|
||||
blk_waitpid_data_destruct(opal_object_t *obj)
|
||||
static void wcdes(orte_wait_tracker_t *p)
|
||||
{
|
||||
blk_waitpid_data_t *data = (blk_waitpid_data_t*) obj;
|
||||
|
||||
if (NULL != data->cond) OBJ_RELEASE(data->cond);
|
||||
if (NULL != p->child) {
|
||||
OBJ_RELEASE(p->child);
|
||||
}
|
||||
}
|
||||
static OBJ_CLASS_INSTANCE(orte_wait_tracker_t,
|
||||
opal_list_item_t,
|
||||
wccon, wcdes);
|
||||
|
||||
|
||||
static OBJ_CLASS_INSTANCE(blk_waitpid_data_t, opal_object_t,
|
||||
blk_waitpid_data_construct,
|
||||
blk_waitpid_data_destruct);
|
||||
|
||||
static OBJ_CLASS_INSTANCE(pending_pids_item_t, opal_list_item_t, NULL, NULL);
|
||||
|
||||
static OBJ_CLASS_INSTANCE(registered_cb_item_t, opal_list_item_t, NULL, NULL);
|
||||
|
||||
/*********************************************************************
|
||||
*
|
||||
* Local Variables
|
||||
*
|
||||
********************************************************************/
|
||||
/* Local Variables */
|
||||
static opal_event_t handler;
|
||||
static volatile int cb_enabled = true;
|
||||
static opal_list_t pending_cbs;
|
||||
|
||||
/* Local Function Prototypes */
|
||||
static void wait_signal_callback(int fd, short event, void *arg);
|
||||
|
||||
/*********************************************************************
|
||||
*
|
||||
* Local Function Prototypes
|
||||
*
|
||||
********************************************************************/
|
||||
static void blk_waitpid_cb(pid_t wpid, int status, void *data);
|
||||
static pending_pids_item_t* find_pending_pid(pid_t pid, bool create);
|
||||
static registered_cb_item_t* find_waiting_cb(pid_t pid, bool create);
|
||||
static void do_waitall(int options);
|
||||
static void trigger_callback(registered_cb_item_t *cb,
|
||||
pending_pids_item_t *pending);
|
||||
static int register_callback(pid_t pid, orte_wait_fn_t callback,
|
||||
void *data);
|
||||
static int unregister_callback(pid_t pid);
|
||||
void orte_wait_signal_callback(int fd, short event, void *arg);
|
||||
/* Interface Functions */
|
||||
|
||||
/*********************************************************************
|
||||
*
|
||||
* Interface Functions
|
||||
*
|
||||
********************************************************************/
|
||||
|
||||
void
|
||||
orte_wait_disable(void)
|
||||
/* Stop reacting to SIGCHLD: remove the file-local "handler" signal event
 * (the one orte_wait_init() sets up for SIGCHLD) from the event library.
 * orte_wait_enable() adds the same event back. */
void orte_wait_disable(void)
|
||||
{
|
||||
opal_event_del(&handler);
|
||||
}
|
||||
|
||||
void
|
||||
orte_wait_enable(void)
|
||||
/* Resume reacting to SIGCHLD: add the file-local "handler" signal event
 * back to the event library. The NULL second argument means no timeout
 * is attached to the event. */
void orte_wait_enable(void)
|
||||
{
|
||||
opal_event_add(&handler, NULL);
|
||||
}
|
||||
|
||||
int
|
||||
orte_wait_init(void)
|
||||
int orte_wait_init(void)
|
||||
{
|
||||
OBJ_CONSTRUCT(&mutex, opal_mutex_t);
|
||||
OBJ_CONSTRUCT(&pending_pids, opal_list_t);
|
||||
OBJ_CONSTRUCT(&registered_cb, opal_list_t);
|
||||
OBJ_CONSTRUCT(&pending_cbs, opal_list_t);
|
||||
|
||||
opal_event_set(orte_event_base,
|
||||
&handler, SIGCHLD, OPAL_EV_SIGNAL|OPAL_EV_PERSIST,
|
||||
orte_wait_signal_callback,
|
||||
wait_signal_callback,
|
||||
&handler);
|
||||
opal_event_set_priority(&handler, ORTE_SYS_PRI);
|
||||
|
||||
@ -231,434 +140,141 @@ orte_wait_init(void)
|
||||
}
|
||||
|
||||
|
||||
int
|
||||
orte_wait_finalize(void)
|
||||
int orte_wait_finalize(void)
|
||||
{
|
||||
opal_list_item_t *item;
|
||||
|
||||
OPAL_THREAD_LOCK(&mutex);
|
||||
opal_event_del(&handler);
|
||||
|
||||
/* clear out the lists */
|
||||
while (NULL != (item = opal_list_remove_first(&pending_pids))) {
|
||||
OBJ_RELEASE(item);
|
||||
}
|
||||
while (NULL != (item = opal_list_remove_first(&registered_cb))) {
|
||||
OBJ_RELEASE(item);
|
||||
}
|
||||
OPAL_THREAD_UNLOCK(&mutex);
|
||||
|
||||
OBJ_DESTRUCT(&mutex);
|
||||
OBJ_DESTRUCT(&pending_pids);
|
||||
OBJ_DESTRUCT(&registered_cb);
|
||||
/* clear out the pending cbs */
|
||||
OPAL_LIST_DESTRUCT(&pending_cbs);
|
||||
|
||||
return ORTE_SUCCESS;
|
||||
}
|
||||
|
||||
int
|
||||
orte_wait_kill(int sig)
|
||||
static void register_callback(int fd, short args, void *cbdata)
|
||||
{
|
||||
opal_list_item_t* item;
|
||||
orte_wait_tracker_t *trk = (orte_wait_tracker_t*)cbdata;
|
||||
orte_wait_tracker_t *t2;
|
||||
|
||||
OPAL_THREAD_LOCK(&mutex);
|
||||
do_waitall(0);
|
||||
while (NULL != (item = opal_list_remove_first(&registered_cb))) {
|
||||
registered_cb_item_t *cb = (registered_cb_item_t*)item;
|
||||
pending_pids_item_t *pending = find_pending_pid(cb->pid,false);
|
||||
if(NULL == pending) {
|
||||
int status;
|
||||
kill(cb->pid, sig);
|
||||
waitpid(cb->pid,&status,0);
|
||||
} else {
|
||||
OBJ_RELEASE(pending);
|
||||
/* see if this proc is still alive */
|
||||
if (!ORTE_FLAG_TEST(trk->child, ORTE_PROC_FLAG_ALIVE)) {
|
||||
/* already heard this proc is dead, so just do the callback */
|
||||
if (NULL != trk->cbfunc) {
|
||||
trk->cbfunc(trk->child, trk->cbdata);
|
||||
OBJ_RELEASE(trk);
|
||||
return;
|
||||
}
|
||||
OBJ_RELEASE(item);
|
||||
}
|
||||
OPAL_THREAD_UNLOCK(&mutex);
|
||||
return ORTE_SUCCESS;
|
||||
}
|
||||
|
||||
|
||||
pid_t
|
||||
orte_waitpid(pid_t wpid, int *status, int options)
|
||||
{
|
||||
pending_pids_item_t *pending = NULL;
|
||||
blk_waitpid_data_t *data = NULL;
|
||||
struct timespec spintime;
|
||||
pid_t ret;
|
||||
|
||||
if ((wpid <= 0) || (0 != (options & WUNTRACED))) {
|
||||
errno = ORTE_ERR_NOT_IMPLEMENTED;
|
||||
return (pid_t) -1;
|
||||
}
|
||||
|
||||
OPAL_THREAD_LOCK(&mutex);
|
||||
|
||||
do_waitall(options);
|
||||
pending = find_pending_pid(wpid, false);
|
||||
if (NULL != pending) {
|
||||
*status = pending->status;
|
||||
ret = pending->pid;
|
||||
opal_list_remove_item(&pending_pids, (opal_list_item_t*) pending);
|
||||
OBJ_RELEASE(pending);
|
||||
goto cleanup;
|
||||
/* we just override any existing registration */
|
||||
OPAL_LIST_FOREACH(t2, &pending_cbs, orte_wait_tracker_t) {
|
||||
if (t2->child == trk->child) {
|
||||
t2->cbfunc = trk->cbfunc;
|
||||
t2->cbdata = trk->cbdata;
|
||||
OBJ_RELEASE(trk);
|
||||
return;
|
||||
}
|
||||
}
|
||||
|
||||
if (0 == (options & WNOHANG)) {
|
||||
/* blocking - create a blk_waitpid_data_t, register the
|
||||
callback with it, and wait for the trigger. Hold mutex
|
||||
until after we register so that waitpid isn't called before
|
||||
the callback is registered. There is a race condition
|
||||
between starting to sit in the condition_wait and the
|
||||
callback being triggered, so poll for completion on the
|
||||
event just in case. */
|
||||
data = OBJ_NEW(blk_waitpid_data_t);
|
||||
if (NULL == data) {
|
||||
ret = -1;
|
||||
goto cleanup;
|
||||
}
|
||||
|
||||
register_callback(wpid, blk_waitpid_cb, data);
|
||||
|
||||
while (0 == data->done) {
|
||||
spintime.tv_sec = 0;
|
||||
spintime.tv_nsec = 1 * 1000 * 1000; /* 1 milliseconds */
|
||||
opal_condition_timedwait(data->cond,
|
||||
&mutex,
|
||||
&spintime);
|
||||
|
||||
/* if we have pthreads and progress threads and we are the
|
||||
event thread, opal_condition_timedwait won't progress
|
||||
anything, so we need to do it. */
|
||||
if (opal_using_threads()) {
|
||||
opal_mutex_unlock(&mutex);
|
||||
opal_event_loop(orte_event_base, OPAL_EVLOOP_NONBLOCK);
|
||||
opal_mutex_lock(&mutex);
|
||||
}
|
||||
do_waitall(0);
|
||||
}
|
||||
|
||||
ret = wpid;
|
||||
*status = data->status;
|
||||
|
||||
/* Unlock the mutex first, so as to not cause any deadlocks.
|
||||
We aren't going to touch any variables that could cause
|
||||
problems with thread badness, so it's ok to be here without
|
||||
the thread locked. Which is also the reason we go to done
|
||||
instead of cleanup. */
|
||||
OPAL_THREAD_UNLOCK(&mutex);
|
||||
|
||||
while (0 == data->free) {
|
||||
/* don't free the condition variable until we are positive
|
||||
that the broadcast is done being sent. Otherwise,
|
||||
pthreads gets really unhappy when we pull the rug out
|
||||
from under it. Yes, it's spinning. No, we won't spin
|
||||
for long. */
|
||||
|
||||
if (!OPAL_ENABLE_MULTI_THREADS) {
|
||||
opal_event_loop(orte_event_base, OPAL_EVLOOP_NONBLOCK);
|
||||
}
|
||||
}
|
||||
|
||||
OBJ_RELEASE(data);
|
||||
/* see note above while loop for why we jump to done */
|
||||
goto done;
|
||||
|
||||
} else {
|
||||
/* non-blocking - return what waitpid would */
|
||||
ret = waitpid(wpid, status, options);
|
||||
}
|
||||
|
||||
cleanup:
|
||||
OPAL_THREAD_UNLOCK(&mutex);
|
||||
|
||||
done:
|
||||
return ret;
|
||||
/* get here if this is a new registration */
|
||||
opal_list_append(&pending_cbs, &trk->super);
|
||||
}
|
||||
|
||||
int
|
||||
orte_wait_cb(pid_t wpid, orte_wait_fn_t callback, void *data)
|
||||
void orte_wait_cb(orte_proc_t *child, orte_wait_fn_t callback, void *data)
|
||||
{
|
||||
int ret;
|
||||
orte_wait_tracker_t *trk;
|
||||
|
||||
if (wpid <= 0) return ORTE_ERR_NOT_IMPLEMENTED;
|
||||
if (NULL == callback) return ORTE_ERR_BAD_PARAM;
|
||||
if (NULL == child || NULL == callback) {
|
||||
/* bozo protection */
|
||||
ORTE_ERROR_LOG(ORTE_ERR_BAD_PARAM);
|
||||
return;
|
||||
}
|
||||
|
||||
OPAL_THREAD_LOCK(&mutex);
|
||||
ret = register_callback(wpid, callback, data);
|
||||
do_waitall(0);
|
||||
OPAL_THREAD_UNLOCK(&mutex);
|
||||
|
||||
return ret;
|
||||
/* push this into the event library for handling */
|
||||
trk = OBJ_NEW(orte_wait_tracker_t);
|
||||
OBJ_RETAIN(child); // protect against race conditions
|
||||
trk->child = child;
|
||||
trk->cbfunc = callback;
|
||||
trk->cbdata = data;
|
||||
opal_event_set(orte_event_base, &trk->ev, -1, OPAL_EV_WRITE, register_callback, trk);
|
||||
opal_event_set_priority(&trk->ev, ORTE_SYS_PRI);
|
||||
opal_event_active(&trk->ev, OPAL_EV_WRITE, 1);
|
||||
}
|
||||
|
||||
|
||||
int
|
||||
orte_wait_cb_cancel(pid_t wpid)
|
||||
static void cancel_callback(int fd, short args, void *cbdata)
|
||||
{
|
||||
int ret;
|
||||
orte_wait_tracker_t *trk = (orte_wait_tracker_t*)cbdata;
|
||||
orte_wait_tracker_t *t2;
|
||||
|
||||
if (wpid <= 0) return ORTE_ERR_BAD_PARAM;
|
||||
OPAL_LIST_FOREACH(t2, &pending_cbs, orte_wait_tracker_t) {
|
||||
if (t2->child == trk->child) {
|
||||
opal_list_remove_item(&pending_cbs, &t2->super);
|
||||
OBJ_RELEASE(t2);
|
||||
OBJ_RELEASE(trk);
|
||||
return;
|
||||
}
|
||||
}
|
||||
|
||||
OPAL_THREAD_LOCK(&mutex);
|
||||
do_waitall(0);
|
||||
ret = unregister_callback(wpid);
|
||||
OPAL_THREAD_UNLOCK(&mutex);
|
||||
OBJ_RELEASE(trk);
|
||||
}
|
||||
|
||||
return ret;
|
||||
/* Request cancellation of the wait callback registered for a child.
 *
 * child - process whose registration should be dropped. A NULL value is
 *         logged as ORTE_ERR_BAD_PARAM and the call returns without
 *         doing anything else.
 *
 * This function does not touch the list of pending callbacks itself.
 * It packages the child in a tracker object and hands that tracker to
 * cancel_callback() through an event on orte_event_base, which is
 * activated right away at ORTE_SYS_PRI priority.
 */
void orte_wait_cb_cancel(orte_proc_t *child)
|
||||
{
|
||||
orte_wait_tracker_t *trk;
|
||||
|
||||
if (NULL == child) {
|
||||
/* bozo protection - nothing to cancel without a child */
|
||||
ORTE_ERROR_LOG(ORTE_ERR_BAD_PARAM);
|
||||
return;
|
||||
}
|
||||
|
||||
/* push this into the event library for handling */
|
||||
trk = OBJ_NEW(orte_wait_tracker_t);
|
||||
OBJ_RETAIN(child); // protect against race conditions
|
||||
/* the tracker now carries the reference taken just above */
trk->child = child;
|
||||
/* fd of -1: the event is not tied to a descriptor, it only exists
 * to be activated manually below */
opal_event_set(orte_event_base, &trk->ev, -1, OPAL_EV_WRITE, cancel_callback, trk);
|
||||
opal_event_set_priority(&trk->ev, ORTE_SYS_PRI);
|
||||
opal_event_active(&trk->ev, OPAL_EV_WRITE, 1);
|
||||
}
|
||||
|
||||
|
||||
/* callback from the event library whenever a SIGCHLD is received */
|
||||
void
|
||||
orte_wait_signal_callback(int fd, short event, void *arg)
|
||||
static void wait_signal_callback(int fd, short event, void *arg)
|
||||
{
|
||||
opal_event_t *signal = (opal_event_t*) arg;
|
||||
int status;
|
||||
pid_t pid;
|
||||
orte_wait_tracker_t *t2;
|
||||
|
||||
if (SIGCHLD != OPAL_EVENT_SIGNAL(signal)) return;
|
||||
if (SIGCHLD != OPAL_EVENT_SIGNAL(signal)) {
|
||||
return;
|
||||
}
|
||||
|
||||
OPAL_THREAD_LOCK(&mutex);
|
||||
do_waitall(0);
|
||||
OPAL_THREAD_UNLOCK(&mutex);
|
||||
}
|
||||
/* retrieve the pid */
|
||||
retry:
|
||||
pid = waitpid(-1, &status, WNOHANG);
|
||||
if (-1 == pid && EINTR == errno) {
|
||||
/* try it again */
|
||||
goto retry;
|
||||
}
|
||||
/* if we got garbage, then nothing we can do */
|
||||
if (pid <= 0) {
|
||||
return;
|
||||
}
|
||||
|
||||
|
||||
int
|
||||
orte_wait_cb_disable()
|
||||
{
|
||||
OPAL_THREAD_LOCK(&mutex);
|
||||
do_waitall(0);
|
||||
cb_enabled = false;
|
||||
OPAL_THREAD_UNLOCK(&mutex);
|
||||
|
||||
return ORTE_SUCCESS;
|
||||
}
|
||||
|
||||
|
||||
int
|
||||
orte_wait_cb_enable()
|
||||
{
|
||||
OPAL_THREAD_LOCK(&mutex);
|
||||
cb_enabled = true;
|
||||
do_waitall(0);
|
||||
OPAL_THREAD_UNLOCK(&mutex);
|
||||
|
||||
return ORTE_SUCCESS;
|
||||
}
|
||||
|
||||
/*********************************************************************
|
||||
*
|
||||
* Local Functions
|
||||
*
|
||||
* None of these functions should lock mutex. All but blk_waitpid_cb
|
||||
* should only be called if the mutex is already locked.
|
||||
*
|
||||
********************************************************************/
|
||||
static void
|
||||
blk_waitpid_cb(pid_t wpid, int status, void *data)
|
||||
{
|
||||
blk_waitpid_data_t *wp_data = (blk_waitpid_data_t*) data;
|
||||
|
||||
wp_data->status = status;
|
||||
wp_data->done = 1;
|
||||
opal_condition_signal(wp_data->cond);
|
||||
wp_data->free = 1;
|
||||
}
|
||||
|
||||
|
||||
/* -1 will return the first available pid */
|
||||
static pending_pids_item_t *
|
||||
find_pending_pid(pid_t pid, bool create)
|
||||
{
|
||||
opal_list_item_t *item;
|
||||
pending_pids_item_t *pending;
|
||||
|
||||
for (item = opal_list_get_first(&pending_pids) ;
|
||||
item != opal_list_get_end(&pending_pids) ;
|
||||
item = opal_list_get_next(item)) {
|
||||
pending = (pending_pids_item_t*) item;
|
||||
|
||||
if (pending->pid == pid || -1 == pid) {
|
||||
return pending;
|
||||
/* we are already in an event, so it is safe to access the list */
|
||||
OPAL_LIST_FOREACH(t2, &pending_cbs, orte_wait_tracker_t) {
|
||||
if (pid == t2->child->pid) {
|
||||
/* found it! */
|
||||
t2->child->exit_code = status;
|
||||
if (NULL != t2->cbfunc) {
|
||||
t2->cbfunc(t2->child, t2->cbdata);
|
||||
}
|
||||
opal_list_remove_item(&pending_cbs, &t2->super);
|
||||
OBJ_RELEASE(t2);
|
||||
return;
|
||||
}
|
||||
}
|
||||
|
||||
if (create) {
|
||||
pending = OBJ_NEW(pending_pids_item_t);
|
||||
if (NULL == pending) return NULL;
|
||||
|
||||
pending->pid = pid;
|
||||
pending->status = 0;
|
||||
opal_list_append(&pending_pids, (opal_list_item_t*) pending);
|
||||
return pending;
|
||||
}
|
||||
|
||||
return NULL;
|
||||
/* if we get here, then this sigchild occurred prior to someone
|
||||
* registering it, or after someone mistakenly removed it. Either
|
||||
* way, there really isn't anything we can do with it */
|
||||
}
|
||||
|
||||
|
||||
/* pid must be positive */
|
||||
static registered_cb_item_t *
|
||||
find_waiting_cb(pid_t pid, bool create)
|
||||
{
|
||||
opal_list_item_t *item = NULL;
|
||||
registered_cb_item_t *reg_cb = NULL;
|
||||
|
||||
for (item = opal_list_get_first(&registered_cb) ;
|
||||
item != opal_list_get_end(&registered_cb) ;
|
||||
item = opal_list_get_next(item)) {
|
||||
reg_cb = (registered_cb_item_t*) item;
|
||||
|
||||
if (reg_cb->pid == pid) {
|
||||
return reg_cb;
|
||||
}
|
||||
}
|
||||
|
||||
if (create) {
|
||||
reg_cb = OBJ_NEW(registered_cb_item_t);
|
||||
if (NULL == reg_cb) return NULL;
|
||||
|
||||
reg_cb->pid = pid;
|
||||
reg_cb->callback = NULL;
|
||||
reg_cb->data = NULL;
|
||||
opal_list_append(&registered_cb, (opal_list_item_t*) reg_cb);
|
||||
return reg_cb;
|
||||
}
|
||||
|
||||
return NULL;
|
||||
}
|
||||
|
||||
|
||||
static void
|
||||
do_waitall(int options)
|
||||
{
|
||||
if (!cb_enabled) return;
|
||||
while (1) {
|
||||
int status;
|
||||
pid_t ret = waitpid(-1, &status, WNOHANG);
|
||||
pending_pids_item_t *pending;
|
||||
registered_cb_item_t *cb;
|
||||
|
||||
if (-1 == ret && EINTR == errno) continue;
|
||||
if (ret <= 0) break;
|
||||
|
||||
cb = find_waiting_cb(ret, false);
|
||||
if (NULL == cb) {
|
||||
pending = OBJ_NEW(pending_pids_item_t);
|
||||
pending->pid = ret;
|
||||
pending->status = status;
|
||||
opal_list_append(&pending_pids, &pending->super);
|
||||
} else {
|
||||
opal_list_remove_item(&registered_cb, &cb->super);
|
||||
OPAL_THREAD_UNLOCK(&mutex);
|
||||
cb->callback(cb->pid, status, cb->data);
|
||||
OPAL_THREAD_LOCK(&mutex);
|
||||
OBJ_RELEASE(cb);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
static void
|
||||
trigger_callback(registered_cb_item_t *cb, pending_pids_item_t *pending)
|
||||
{
|
||||
assert(cb->pid == pending->pid);
|
||||
|
||||
OPAL_THREAD_UNLOCK(&mutex);
|
||||
cb->callback(cb->pid, pending->status, cb->data);
|
||||
OPAL_THREAD_LOCK(&mutex);
|
||||
|
||||
opal_list_remove_item(&pending_pids, (opal_list_item_t*) pending);
|
||||
opal_list_remove_item(&registered_cb, (opal_list_item_t*) cb);
|
||||
}
|
||||
|
||||
|
||||
static int
|
||||
register_callback(pid_t pid, orte_wait_fn_t callback, void *data)
|
||||
{
|
||||
registered_cb_item_t *reg_cb;
|
||||
pending_pids_item_t *pending;
|
||||
|
||||
/* register the callback */
|
||||
reg_cb = find_waiting_cb(pid, true);
|
||||
if (NULL == reg_cb) return ORTE_ERROR;
|
||||
if (NULL != reg_cb->callback) return ORTE_EXISTS;
|
||||
|
||||
reg_cb->callback = callback;
|
||||
reg_cb->data = data;
|
||||
|
||||
/* make sure we shouldn't trigger right now */
|
||||
pending = find_pending_pid(pid, false);
|
||||
if (NULL != pending) {
|
||||
trigger_callback(reg_cb, pending);
|
||||
}
|
||||
return ORTE_SUCCESS;
|
||||
}
|
||||
|
||||
|
||||
static int
|
||||
unregister_callback(pid_t pid)
|
||||
{
|
||||
registered_cb_item_t *reg_cb;
|
||||
|
||||
/* register the callback */
|
||||
reg_cb = find_waiting_cb(pid, false);
|
||||
if (NULL == reg_cb) return ORTE_ERR_BAD_PARAM;
|
||||
|
||||
opal_list_remove_item(&registered_cb, (opal_list_item_t*) reg_cb);
|
||||
|
||||
return ORTE_SUCCESS;
|
||||
}
|
||||
|
||||
|
||||
|
||||
#else /* no waitpid */
|
||||
|
||||
int
|
||||
orte_wait_init(void) {
|
||||
return ORTE_SUCCESS;
|
||||
}
|
||||
|
||||
int
|
||||
orte_wait_finalize(void)
|
||||
{
|
||||
return ORTE_SUCCESS;
|
||||
}
|
||||
|
||||
pid_t
|
||||
orte_waitpid(pid_t wpid, int *status, int options)
|
||||
{
|
||||
return ORTE_ERR_NOT_SUPPORTED;
|
||||
}
|
||||
|
||||
int
|
||||
orte_wait_cb(pid_t wpid, orte_wait_fn_t callback, void *data)
|
||||
{
|
||||
return ORTE_ERR_NOT_SUPPORTED;
|
||||
}
|
||||
|
||||
int
|
||||
orte_wait_cb_cancel(pid_t wpid)
|
||||
{
|
||||
return ORTE_ERR_NOT_SUPPORTED;
|
||||
}
|
||||
|
||||
int
|
||||
orte_wait_cb_disable(void)
|
||||
{
|
||||
return ORTE_ERR_NOT_SUPPORTED;
|
||||
}
|
||||
|
||||
int
|
||||
orte_wait_cb_enable(void)
|
||||
{
|
||||
return ORTE_ERR_NOT_SUPPORTED;
|
||||
}
|
||||
|
||||
int
|
||||
orte_wait_kill(int sig)
|
||||
{
|
||||
return ORTE_ERR_NOT_SUPPORTED;
|
||||
}
|
||||
|
||||
#endif
|
||||
|
@ -13,6 +13,7 @@
|
||||
* et Automatique. All rights reserved.
|
||||
* Copyright (c) 2011 Los Alamos National Security, LLC.
|
||||
* All rights reserved.
|
||||
* Copyright (c) 2014 Intel, Inc. All rights reserved.
|
||||
* $COPYRIGHT$
|
||||
*
|
||||
* Additional copyrights may follow
|
||||
@ -45,16 +46,15 @@
|
||||
#include "opal/util/output.h"
|
||||
#include "opal/sys/atomic.h"
|
||||
#include "opal/mca/event/event.h"
|
||||
#include "opal/runtime/opal_progress.h"
|
||||
|
||||
#include "orte/types.h"
|
||||
#include "orte/mca/rml/rml_types.h"
|
||||
#include "opal/runtime/opal_progress.h"
|
||||
#include "orte/runtime/orte_globals.h"
|
||||
|
||||
BEGIN_C_DECLS
|
||||
|
||||
/** typedef for callback function used in \c ompi_rte_wait_cb */
|
||||
typedef void (*orte_wait_fn_t)(pid_t wpid, int status, void *data);
|
||||
/** typedef for callback function used in \c orte_wait_cb */
|
||||
typedef void (*orte_wait_fn_t)(orte_proc_t *proc, void *data);
|
||||
|
||||
/**
|
||||
* Disable / re-Enable SIGCHLD handler
|
||||
@ -65,51 +65,17 @@ typedef void (*orte_wait_fn_t)(pid_t wpid, int status, void *data);
|
||||
ORTE_DECLSPEC void orte_wait_enable(void);
|
||||
ORTE_DECLSPEC void orte_wait_disable(void);
|
||||
|
||||
/**
|
||||
* Wait for process termination
|
||||
*
|
||||
* Similar to \c waitpid, \c orte_waitpid utilizes the run-time
|
||||
* event library for process termination notification. The \c
|
||||
* WUNTRACED option is not supported, but the \c WNOHANG option is
|
||||
* supported.
|
||||
*
|
||||
* \note A \c wpid value of \c -1 is not currently supported and will
|
||||
* return an error.
|
||||
*/
|
||||
ORTE_DECLSPEC pid_t orte_waitpid(pid_t wpid, int *status, int options);
|
||||
|
||||
|
||||
/**
|
||||
* Register a callback for process termination
|
||||
*
|
||||
* Register a callback for notification when \c wpid causes a SIGCHLD.
|
||||
* Register a callback for notification when this process causes a SIGCHLD.
|
||||
* \c waitpid() will have already been called on the process at this
|
||||
* time.
|
||||
*
|
||||
* If a thread is already blocked in \c ompi_rte_waitpid for \c wpid,
|
||||
* this function will return \c ORTE_ERR_EXISTS. It is illegal for
|
||||
* multiple callbacks to be registered for a single \c wpid
|
||||
* (OMPI_EXISTS will be returned in this case).
|
||||
*
|
||||
* \warning It is not legal for \c wpid to be -1 when registering a
|
||||
* callback.
|
||||
*/
|
||||
ORTE_DECLSPEC int orte_wait_cb(pid_t wpid, orte_wait_fn_t callback, void *data);
|
||||
ORTE_DECLSPEC void orte_wait_cb(orte_proc_t *proc, orte_wait_fn_t callback, void *data);
|
||||
|
||||
ORTE_DECLSPEC int orte_wait_cb_cancel(pid_t wpid);
|
||||
ORTE_DECLSPEC void orte_wait_cb_cancel(orte_proc_t *proc);
|
||||
|
||||
ORTE_DECLSPEC int orte_wait_cb_disable(void);
|
||||
|
||||
ORTE_DECLSPEC int orte_wait_cb_enable(void);
|
||||
|
||||
/* define an object for timer events */
|
||||
typedef struct {
|
||||
opal_object_t super;
|
||||
struct timeval tv;
|
||||
opal_event_t *ev;
|
||||
void *payload;
|
||||
} orte_timer_t;
|
||||
OBJ_CLASS_DECLARATION(orte_timer_t);
|
||||
|
||||
/* In a few places, we need to barrier until something happens
|
||||
* that changes a flag to indicate we can release - e.g., waiting
|
||||
@ -208,11 +174,6 @@ OBJ_CLASS_DECLARATION(orte_timer_t);
|
||||
*/
|
||||
ORTE_DECLSPEC int orte_wait_init(void);
|
||||
|
||||
/**
|
||||
* Kill all processes we are waiting on.
|
||||
*/
|
||||
ORTE_DECLSPEC int orte_wait_kill(int sig);
|
||||
|
||||
/**
|
||||
* \internal
|
||||
*
|
||||
|
@ -11,6 +11,8 @@
|
||||
#include "orte/types.h"
|
||||
#include "orte/constants.h"
|
||||
|
||||
#include "opal/dss/dss.h"
|
||||
|
||||
#include "orte/mca/errmgr/errmgr.h"
|
||||
|
||||
#include "orte/util/attr.h"
|
||||
|
Загрузка…
Ссылка в новой задаче
Block a user