Ensure that the errmgr does not take action if the process was terminated by a "kill_procs" command, as this can lead to circular logic.
Clean up the kill_procs command by removing a no-longer-used param. We update the process state when the proc actually exits. This commit was SVN r22783.
Этот коммит содержится в:
родитель
ef6c432e22
Коммит
f2c65dc70f
@ -79,7 +79,7 @@ void orte_errmgr_base_error_abort(int error_code, char *fmt, ...)
|
||||
/* if I am a daemon or the HNP... */
|
||||
if (ORTE_PROC_IS_HNP || ORTE_PROC_IS_DAEMON) {
|
||||
/* whack my local procs */
|
||||
orte_odls.kill_local_procs(NULL, false);
|
||||
orte_odls.kill_local_procs(NULL);
|
||||
/* whack any session directories */
|
||||
orte_session_dir_cleanup(ORTE_JOBID_WILDCARD);
|
||||
} else {
|
||||
|
@ -46,6 +46,7 @@ void orte_errmgr_default_proc_aborted(orte_process_name_t *name, int exit_code)
|
||||
{
|
||||
int rc;
|
||||
orte_job_t *jdata;
|
||||
orte_proc_t *proc;
|
||||
int i;
|
||||
|
||||
/* get the job data object for this process */
|
||||
@ -54,6 +55,16 @@ void orte_errmgr_default_proc_aborted(orte_process_name_t *name, int exit_code)
|
||||
goto PROCESS;
|
||||
}
|
||||
|
||||
/* if the proc was terminated by cmd, ignore it */
|
||||
if (NULL == (proc = (orte_proc_t*)opal_pointer_array_get_item(jdata->procs, name->vpid))) {
|
||||
/* nothing we can do */
|
||||
goto PROCESS;
|
||||
}
|
||||
if (ORTE_PROC_STATE_KILLED_BY_CMD == proc->state) {
|
||||
/* don't do anything or else we can enter an infinite loop */
|
||||
return;
|
||||
}
|
||||
|
||||
if (NULL != jdata->err_cbfunc && (ORTE_PROC_STATE_ABORTED & jdata->err_cbstates)) {
|
||||
OPAL_OUTPUT_VERBOSE((1, orte_errmgr_base_output,
|
||||
"%s errmgr:default: proc %s aborted with status %d - calling cbfunc",
|
||||
|
@ -2265,7 +2265,7 @@ int orte_odls_base_default_require_sync(orte_process_name_t *proc,
|
||||
orte_odls_child_t *child;
|
||||
orte_std_cntr_t cnt;
|
||||
int rc;
|
||||
bool found=false;
|
||||
bool found=false, registering=false;
|
||||
int8_t flag;
|
||||
orte_odls_job_t *jobdat, *jdat;
|
||||
|
||||
@ -2320,6 +2320,7 @@ int orte_odls_base_default_require_sync(orte_process_name_t *proc,
|
||||
* unpack the contact info from the buffer and store it
|
||||
*/
|
||||
child->init_recvd = true;
|
||||
registering = true;
|
||||
cnt = 1;
|
||||
if (ORTE_SUCCESS != (rc = opal_dss.unpack(buf, &(child->rml_uri), &cnt, OPAL_STRING))) {
|
||||
ORTE_ERROR_LOG(rc);
|
||||
@ -2386,6 +2387,11 @@ int orte_odls_base_default_require_sync(orte_process_name_t *proc,
|
||||
}
|
||||
OBJ_DESTRUCT(&buffer);
|
||||
|
||||
/* if we are deregistering, then we are done */
|
||||
if (!registering) {
|
||||
goto CLEANUP;
|
||||
}
|
||||
|
||||
/* now check to see if everyone in this job has registered */
|
||||
if (all_children_registered(proc->jobid)) {
|
||||
/* once everyone registers, send their contact info to
|
||||
@ -2948,7 +2954,7 @@ CLEANUP:
|
||||
return;
|
||||
}
|
||||
|
||||
int orte_odls_base_default_kill_local_procs(opal_pointer_array_t *procs, bool set_state,
|
||||
int orte_odls_base_default_kill_local_procs(opal_pointer_array_t *procs,
|
||||
orte_odls_base_kill_local_fn_t kill_local,
|
||||
orte_odls_base_child_died_fn_t child_died)
|
||||
{
|
||||
|
@ -128,7 +128,7 @@ typedef int (*orte_odls_base_kill_local_fn_t)(pid_t pid, int signum);
|
||||
typedef bool (*orte_odls_base_child_died_fn_t)(pid_t pid, unsigned int timeout, int *exit_status);
|
||||
|
||||
ORTE_DECLSPEC int
|
||||
orte_odls_base_default_kill_local_procs(opal_pointer_array_t *procs, bool set_state,
|
||||
orte_odls_base_default_kill_local_procs(opal_pointer_array_t *procs,
|
||||
orte_odls_base_kill_local_fn_t kill_local,
|
||||
orte_odls_base_child_died_fn_t child_died);
|
||||
|
||||
|
@ -88,7 +88,7 @@
|
||||
* External Interface
|
||||
*/
|
||||
static int orte_odls_default_launch_local_procs(opal_buffer_t *data);
|
||||
static int orte_odls_default_kill_local_procs(opal_pointer_array_t *procs, bool set_state);
|
||||
static int orte_odls_default_kill_local_procs(opal_pointer_array_t *procs);
|
||||
static int orte_odls_default_signal_local_procs(const orte_process_name_t *proc, int32_t signal);
|
||||
|
||||
static void set_handler_default(int sig);
|
||||
@ -200,11 +200,11 @@ static int odls_default_kill_local(pid_t pid, int signum)
|
||||
return 0;
|
||||
}
|
||||
|
||||
int orte_odls_default_kill_local_procs(opal_pointer_array_t *procs, bool set_state)
|
||||
int orte_odls_default_kill_local_procs(opal_pointer_array_t *procs)
|
||||
{
|
||||
int rc;
|
||||
|
||||
if (ORTE_SUCCESS != (rc = orte_odls_base_default_kill_local_procs(procs, set_state,
|
||||
if (ORTE_SUCCESS != (rc = orte_odls_base_default_kill_local_procs(procs,
|
||||
odls_default_kill_local, odls_default_child_died))) {
|
||||
ORTE_ERROR_LOG(rc);
|
||||
return rc;
|
||||
|
@ -62,7 +62,7 @@ typedef int (*orte_odls_base_module_launch_local_processes_fn_t)(opal_buffer_t *
|
||||
/**
|
||||
* Kill the local processes on this node
|
||||
*/
|
||||
typedef int (*orte_odls_base_module_kill_local_processes_fn_t)(opal_pointer_array_t *procs, bool set_state);
|
||||
typedef int (*orte_odls_base_module_kill_local_processes_fn_t)(opal_pointer_array_t *procs);
|
||||
|
||||
/**
|
||||
* Signal local processes
|
||||
|
@ -74,11 +74,11 @@ static int odls_process_kill_local( pid_t pid, int sig_num )
|
||||
return 0;
|
||||
}
|
||||
|
||||
static int odls_process_kill_local_procs(opal_pointer_array_t *procs, bool set_state)
|
||||
static int odls_process_kill_local_procs(opal_pointer_array_t *procs)
|
||||
{
|
||||
int rc;
|
||||
|
||||
if (ORTE_SUCCESS != (rc = orte_odls_base_default_kill_local_procs(procs, set_state,
|
||||
if (ORTE_SUCCESS != (rc = orte_odls_base_default_kill_local_procs(procs,
|
||||
odls_process_kill_local, odls_process_child_died))) {
|
||||
ORTE_ERROR_LOG(rc);
|
||||
return rc;
|
||||
|
@ -201,10 +201,8 @@ void orte_daemon_cmd_processor(int fd, short event, void *data)
|
||||
opal_output(0, "%s ORTED_CMD_PROCESSOR: STUCK IN INFINITE LOOP - ABORTING",
|
||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME));
|
||||
OBJ_RELEASE(mev);
|
||||
/* make sure our local procs are dead - but don't update their state
|
||||
* on the HNP as this may be redundant
|
||||
*/
|
||||
orte_odls.kill_local_procs(NULL, false);
|
||||
/* make sure our local procs are dead */
|
||||
orte_odls.kill_local_procs(NULL);
|
||||
|
||||
/* do -not- call finalize as this will send a message to the HNP
|
||||
* indicating clean termination! Instead, just forcibly cleanup
|
||||
@ -407,7 +405,7 @@ static int process_commands(orte_process_name_t* sender,
|
||||
}
|
||||
|
||||
/* kill the procs */
|
||||
if (ORTE_SUCCESS != (ret = orte_odls.kill_local_procs(&procarray, true))) {
|
||||
if (ORTE_SUCCESS != (ret = orte_odls.kill_local_procs(&procarray))) {
|
||||
ORTE_ERROR_LOG(ret);
|
||||
}
|
||||
|
||||
@ -625,7 +623,7 @@ static int process_commands(orte_process_name_t* sender,
|
||||
}
|
||||
/* if we are the HNP, just kill our local procs */
|
||||
if (ORTE_PROC_IS_HNP) {
|
||||
orte_odls.kill_local_procs(NULL, false);
|
||||
orte_odls.kill_local_procs(NULL);
|
||||
return ORTE_SUCCESS;
|
||||
}
|
||||
|
||||
|
@ -865,10 +865,8 @@ static void shutdown_callback(int fd, short flags, void *arg)
|
||||
unlink(log_path);
|
||||
}
|
||||
|
||||
/* make sure our local procs are dead - but don't update their state
|
||||
* on the HNP as this may be redundant
|
||||
*/
|
||||
orte_odls.kill_local_procs(NULL, false);
|
||||
/* make sure our local procs are dead */
|
||||
orte_odls.kill_local_procs(NULL);
|
||||
|
||||
/* whack any lingering session directory files from our jobs */
|
||||
orte_session_dir_cleanup(ORTE_JOBID_WILDCARD);
|
||||
|
@ -1169,7 +1169,7 @@ static void abort_signal_callback(int fd, short flags, void *arg)
|
||||
if (!opal_atomic_trylock(&orte_abort_inprogress_lock)) { /* returns 1 if already locked */
|
||||
if (forcibly_die) {
|
||||
/* kill any local procs */
|
||||
orte_odls.kill_local_procs(NULL, false);
|
||||
orte_odls.kill_local_procs(NULL);
|
||||
|
||||
/* whack any lingering session directory files from our jobs */
|
||||
orte_session_dir_cleanup(ORTE_JOBID_WILDCARD);
|
||||
|
Загрузка…
Ссылка в новой задаче
Block a user