1
1

Ensure that the errmgr does not take action if the process was terminated by a "kill_procs" command, as this can lead to circular logic.

Clean up the kill_procs command by removing the no-longer-used `set_state` parameter. We update the process state when the proc actually exits.

This commit was SVN r22783.
Этот коммит содержится в:
Ralph Castain 2010-03-05 13:22:12 +00:00
родитель ef6c432e22
Коммит f2c65dc70f
10 изменённых файлов: 34 добавлений и 21 удалений

Просмотреть файл

@ -79,7 +79,7 @@ void orte_errmgr_base_error_abort(int error_code, char *fmt, ...)
/* if I am a daemon or the HNP... */
if (ORTE_PROC_IS_HNP || ORTE_PROC_IS_DAEMON) {
/* whack my local procs */
orte_odls.kill_local_procs(NULL, false);
orte_odls.kill_local_procs(NULL);
/* whack any session directories */
orte_session_dir_cleanup(ORTE_JOBID_WILDCARD);
} else {

Просмотреть файл

@ -46,6 +46,7 @@ void orte_errmgr_default_proc_aborted(orte_process_name_t *name, int exit_code)
{
int rc;
orte_job_t *jdata;
orte_proc_t *proc;
int i;
/* get the job data object for this process */
@ -54,6 +55,16 @@ void orte_errmgr_default_proc_aborted(orte_process_name_t *name, int exit_code)
goto PROCESS;
}
/* if the proc was terminated by cmd, ignore it */
if (NULL == (proc = (orte_proc_t*)opal_pointer_array_get_item(jdata->procs, name->vpid))) {
/* nothing we can do */
goto PROCESS;
}
if (ORTE_PROC_STATE_KILLED_BY_CMD == proc->state) {
/* don't do anything or else we can enter an infinite loop */
return;
}
if (NULL != jdata->err_cbfunc && (ORTE_PROC_STATE_ABORTED & jdata->err_cbstates)) {
OPAL_OUTPUT_VERBOSE((1, orte_errmgr_base_output,
"%s errmgr:default: proc %s aborted with status %d - calling cbfunc",

Просмотреть файл

@ -2265,7 +2265,7 @@ int orte_odls_base_default_require_sync(orte_process_name_t *proc,
orte_odls_child_t *child;
orte_std_cntr_t cnt;
int rc;
bool found=false;
bool found=false, registering=false;
int8_t flag;
orte_odls_job_t *jobdat, *jdat;
@ -2320,6 +2320,7 @@ int orte_odls_base_default_require_sync(orte_process_name_t *proc,
* unpack the contact info from the buffer and store it
*/
child->init_recvd = true;
registering = true;
cnt = 1;
if (ORTE_SUCCESS != (rc = opal_dss.unpack(buf, &(child->rml_uri), &cnt, OPAL_STRING))) {
ORTE_ERROR_LOG(rc);
@ -2386,6 +2387,11 @@ int orte_odls_base_default_require_sync(orte_process_name_t *proc,
}
OBJ_DESTRUCT(&buffer);
/* if we are deregistering, then we are done */
if (!registering) {
goto CLEANUP;
}
/* now check to see if everyone in this job has registered */
if (all_children_registered(proc->jobid)) {
/* once everyone registers, send their contact info to
@ -2948,7 +2954,7 @@ CLEANUP:
return;
}
int orte_odls_base_default_kill_local_procs(opal_pointer_array_t *procs, bool set_state,
int orte_odls_base_default_kill_local_procs(opal_pointer_array_t *procs,
orte_odls_base_kill_local_fn_t kill_local,
orte_odls_base_child_died_fn_t child_died)
{

Просмотреть файл

@ -128,7 +128,7 @@ typedef int (*orte_odls_base_kill_local_fn_t)(pid_t pid, int signum);
typedef bool (*orte_odls_base_child_died_fn_t)(pid_t pid, unsigned int timeout, int *exit_status);
ORTE_DECLSPEC int
orte_odls_base_default_kill_local_procs(opal_pointer_array_t *procs, bool set_state,
orte_odls_base_default_kill_local_procs(opal_pointer_array_t *procs,
orte_odls_base_kill_local_fn_t kill_local,
orte_odls_base_child_died_fn_t child_died);

Просмотреть файл

@ -88,7 +88,7 @@
* External Interface
*/
static int orte_odls_default_launch_local_procs(opal_buffer_t *data);
static int orte_odls_default_kill_local_procs(opal_pointer_array_t *procs, bool set_state);
static int orte_odls_default_kill_local_procs(opal_pointer_array_t *procs);
static int orte_odls_default_signal_local_procs(const orte_process_name_t *proc, int32_t signal);
static void set_handler_default(int sig);
@ -200,11 +200,11 @@ static int odls_default_kill_local(pid_t pid, int signum)
return 0;
}
int orte_odls_default_kill_local_procs(opal_pointer_array_t *procs, bool set_state)
int orte_odls_default_kill_local_procs(opal_pointer_array_t *procs)
{
int rc;
if (ORTE_SUCCESS != (rc = orte_odls_base_default_kill_local_procs(procs, set_state,
if (ORTE_SUCCESS != (rc = orte_odls_base_default_kill_local_procs(procs,
odls_default_kill_local, odls_default_child_died))) {
ORTE_ERROR_LOG(rc);
return rc;

Просмотреть файл

@ -62,7 +62,7 @@ typedef int (*orte_odls_base_module_launch_local_processes_fn_t)(opal_buffer_t *
/**
* Kill the local processes on this node
*/
typedef int (*orte_odls_base_module_kill_local_processes_fn_t)(opal_pointer_array_t *procs, bool set_state);
typedef int (*orte_odls_base_module_kill_local_processes_fn_t)(opal_pointer_array_t *procs);
/**
* Signal local processes

Просмотреть файл

@ -74,11 +74,11 @@ static int odls_process_kill_local( pid_t pid, int sig_num )
return 0;
}
static int odls_process_kill_local_procs(opal_pointer_array_t *procs, bool set_state)
static int odls_process_kill_local_procs(opal_pointer_array_t *procs)
{
int rc;
if (ORTE_SUCCESS != (rc = orte_odls_base_default_kill_local_procs(procs, set_state,
if (ORTE_SUCCESS != (rc = orte_odls_base_default_kill_local_procs(procs,
odls_process_kill_local, odls_process_child_died))) {
ORTE_ERROR_LOG(rc);
return rc;

Просмотреть файл

@ -201,10 +201,8 @@ void orte_daemon_cmd_processor(int fd, short event, void *data)
opal_output(0, "%s ORTED_CMD_PROCESSOR: STUCK IN INFINITE LOOP - ABORTING",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME));
OBJ_RELEASE(mev);
/* make sure our local procs are dead - but don't update their state
* on the HNP as this may be redundant
*/
orte_odls.kill_local_procs(NULL, false);
/* make sure our local procs are dead */
orte_odls.kill_local_procs(NULL);
/* do -not- call finalize as this will send a message to the HNP
* indicating clean termination! Instead, just forcibly cleanup
@ -407,7 +405,7 @@ static int process_commands(orte_process_name_t* sender,
}
/* kill the procs */
if (ORTE_SUCCESS != (ret = orte_odls.kill_local_procs(&procarray, true))) {
if (ORTE_SUCCESS != (ret = orte_odls.kill_local_procs(&procarray))) {
ORTE_ERROR_LOG(ret);
}
@ -625,7 +623,7 @@ static int process_commands(orte_process_name_t* sender,
}
/* if we are the HNP, just kill our local procs */
if (ORTE_PROC_IS_HNP) {
orte_odls.kill_local_procs(NULL, false);
orte_odls.kill_local_procs(NULL);
return ORTE_SUCCESS;
}

Просмотреть файл

@ -865,10 +865,8 @@ static void shutdown_callback(int fd, short flags, void *arg)
unlink(log_path);
}
/* make sure our local procs are dead - but don't update their state
* on the HNP as this may be redundant
*/
orte_odls.kill_local_procs(NULL, false);
/* make sure our local procs are dead */
orte_odls.kill_local_procs(NULL);
/* whack any lingering session directory files from our jobs */
orte_session_dir_cleanup(ORTE_JOBID_WILDCARD);

Просмотреть файл

@ -1169,7 +1169,7 @@ static void abort_signal_callback(int fd, short flags, void *arg)
if (!opal_atomic_trylock(&orte_abort_inprogress_lock)) { /* returns 1 if already locked */
if (forcibly_die) {
/* kill any local procs */
orte_odls.kill_local_procs(NULL, false);
orte_odls.kill_local_procs(NULL);
/* whack any lingering session directory files from our jobs */
orte_session_dir_cleanup(ORTE_JOBID_WILDCARD);