1
1

Record the daemon's state so that we don't attempt to send "die" messages to a daemon that is known to have failed to start.

This commit was SVN r18044.
Этот коммит содержится в:
Ralph Castain 2008-03-31 18:15:24 +00:00
родитель ee784b601e
Коммит f3936ff9bc
2 изменённых файла: 25 добавлений и 1 удаление

Просмотреть файл

@ -169,6 +169,11 @@ int orte_plm_base_orted_exit(void)
peer.jobid = ORTE_PROC_MY_NAME->jobid;
for(v=1; v < orte_process_info.num_procs; v++) {
peer.vpid = v;
/* check to see if this daemon is known to be "dead" */
if (procs[v]->state > ORTE_PROC_STATE_UNTERMINATED) {
/* don't try to send this */
continue;
}
/* don't worry about errors on the send here - just
* issue it and keep going
*/
@ -236,11 +241,20 @@ int orte_plm_base_orted_kill_local_procs(orte_jobid_t job)
if (orte_abnormal_term_ordered) {
orte_vpid_t v;
orte_process_name_t peer;
orte_job_t *daemons;
orte_proc_t **procs;
OPAL_OUTPUT_VERBOSE((5, orte_plm_globals.output,
"%s plm:base:orted_cmd:kill_local_procs abnormal term ordered",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
/* get the job object for the daemons */
if (NULL == (daemons = orte_get_job_data_object(ORTE_PROC_MY_NAME->jobid))) {
ORTE_ERROR_LOG(ORTE_ERR_NOT_FOUND);
return ORTE_ERR_NOT_FOUND;
}
procs = (orte_proc_t**)daemons->procs->addr;
/* since we cannot know which daemons may/may not be alive,
* setup an event so we will time out after giving the send
* our best attempt
@ -272,6 +286,11 @@ int orte_plm_base_orted_kill_local_procs(orte_jobid_t job)
peer.jobid = ORTE_PROC_MY_NAME->jobid;
for(v=1; v < orte_process_info.num_procs; v++) {
peer.vpid = v;
/* check to see if this daemon is known to be "dead" */
if (procs[v]->state > ORTE_PROC_STATE_UNTERMINATED) {
/* don't try to send this */
continue;
}
/* don't worry about errors on the send here - just
* issue it and keep going
*/

Просмотреть файл

@ -276,8 +276,11 @@ static int orte_plm_rsh_probe(orte_node_t *node,
static void orte_plm_rsh_wait_daemon(pid_t pid, int status, void* cbdata)
{
unsigned long deltat;
orte_proc_t *daemon=(orte_proc_t*)cbdata;
if (! WIFEXITED(status) || ! WEXITSTATUS(status) == 0) { /* if abnormal exit */
/* note that this daemon failed */
daemon->state = ORTE_PROC_STATE_FAILED_TO_START;
/* report that the daemon has failed so we can exit */
orte_plm_base_launch_failed(active_job, true, pid, status, ORTE_JOB_STATE_FAILED_TO_START);
}
@ -745,6 +748,8 @@ int orte_plm_rsh_launch(orte_job_t *jdata)
} else { /* father */
/* indicate this daemon has been launched */
nodes[nnode]->daemon->state = ORTE_PROC_STATE_LAUNCHED;
/* record the pid */
nodes[nnode]->daemon->pid = pid;
OPAL_THREAD_LOCK(&mca_plm_rsh_component.lock);
/* This situation can lead to a deadlock if '--debug-daemons' is set.
@ -760,7 +765,7 @@ int orte_plm_rsh_launch(orte_job_t *jdata)
/* setup callback on sigchild - wait until setup above is complete
* as the callback can occur in the call to orte_wait_cb
*/
orte_wait_cb(pid, orte_plm_rsh_wait_daemon, NULL);
orte_wait_cb(pid, orte_plm_rsh_wait_daemon, (void*)nodes[nnode]->daemon);
/* if required - add delay to avoid problems w/ X11 authentication */
if (0 < opal_output_get_verbosity(orte_plm_globals.output)