From f3936ff9bc9cade486c8aecb10669ab1aeb93065 Mon Sep 17 00:00:00 2001 From: Ralph Castain Date: Mon, 31 Mar 2008 18:15:24 +0000 Subject: [PATCH] Record the daemon's state so that we don't attempt to send "die" messages to a daemon that is known to have failed to start. This commit was SVN r18044. --- orte/mca/plm/base/plm_base_orted_cmds.c | 19 +++++++++++++++++++ orte/mca/plm/rsh/plm_rsh_module.c | 7 ++++++- 2 files changed, 25 insertions(+), 1 deletion(-) diff --git a/orte/mca/plm/base/plm_base_orted_cmds.c b/orte/mca/plm/base/plm_base_orted_cmds.c index b1f329ab0f..028de97850 100644 --- a/orte/mca/plm/base/plm_base_orted_cmds.c +++ b/orte/mca/plm/base/plm_base_orted_cmds.c @@ -169,6 +169,11 @@ int orte_plm_base_orted_exit(void) peer.jobid = ORTE_PROC_MY_NAME->jobid; for(v=1; v < orte_process_info.num_procs; v++) { peer.vpid = v; + /* check to see if this daemon is known to be "dead" */ + if (procs[v]->state > ORTE_PROC_STATE_UNTERMINATED) { + /* don't try to send this */ + continue; + } /* don't worry about errors on the send here - just * issue it and keep going */ @@ -236,11 +241,20 @@ int orte_plm_base_orted_kill_local_procs(orte_jobid_t job) if (orte_abnormal_term_ordered) { orte_vpid_t v; orte_process_name_t peer; + orte_job_t *daemons; + orte_proc_t **procs; OPAL_OUTPUT_VERBOSE((5, orte_plm_globals.output, "%s plm:base:orted_cmd:kill_local_procs abnormal term ordered", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME))); + /* get the job object for the daemons */ + if (NULL == (daemons = orte_get_job_data_object(ORTE_PROC_MY_NAME->jobid))) { + ORTE_ERROR_LOG(ORTE_ERR_NOT_FOUND); + return ORTE_ERR_NOT_FOUND; + } + procs = (orte_proc_t**)daemons->procs->addr; + /* since we cannot know which daemons may/may not be alive, * setup an event so we will time out after giving the send * our best attempt @@ -272,6 +286,11 @@ int orte_plm_base_orted_kill_local_procs(orte_jobid_t job) peer.jobid = ORTE_PROC_MY_NAME->jobid; for(v=1; v < orte_process_info.num_procs; v++) { peer.vpid = v; + /* check to see if this daemon is known to be "dead" */ + if (procs[v]->state > ORTE_PROC_STATE_UNTERMINATED) { + /* don't try to send this */ + continue; + } /* don't worry about errors on the send here - just * issue it and keep going */ diff --git a/orte/mca/plm/rsh/plm_rsh_module.c b/orte/mca/plm/rsh/plm_rsh_module.c index fbdc0a0a7f..b0ba2ba9ba 100644 --- a/orte/mca/plm/rsh/plm_rsh_module.c +++ b/orte/mca/plm/rsh/plm_rsh_module.c @@ -276,8 +276,11 @@ static int orte_plm_rsh_probe(orte_node_t *node, static void orte_plm_rsh_wait_daemon(pid_t pid, int status, void* cbdata) { unsigned long deltat; + orte_proc_t *daemon=(orte_proc_t*)cbdata; if (! WIFEXITED(status) || ! WEXITSTATUS(status) == 0) { /* if abnormal exit */ + /* note that this daemon failed */ + daemon->state = ORTE_PROC_STATE_FAILED_TO_START; /* report that the daemon has failed so we can exit */ orte_plm_base_launch_failed(active_job, true, pid, status, ORTE_JOB_STATE_FAILED_TO_START); } @@ -745,6 +748,8 @@ int orte_plm_rsh_launch(orte_job_t *jdata) } else { /* father */ /* indicate this daemon has been launched */ nodes[nnode]->daemon->state = ORTE_PROC_STATE_LAUNCHED; + /* record the pid */ + nodes[nnode]->daemon->pid = pid; OPAL_THREAD_LOCK(&mca_plm_rsh_component.lock); /* This situation can lead to a deadlock if '--debug-daemons' is set. @@ -760,7 +765,7 @@ int orte_plm_rsh_launch(orte_job_t *jdata) /* setup callback on sigchild - wait until setup above is complete * as the callback can occur in the call to orte_wait_cb */ - orte_wait_cb(pid, orte_plm_rsh_wait_daemon, NULL); + orte_wait_cb(pid, orte_plm_rsh_wait_daemon, (void*)nodes[nnode]->daemon); /* if required - add delay to avoid problems w/ X11 authentication */ if (0 < opal_output_get_verbosity(orte_plm_globals.output)