Record the daemon's state so that we don't attempt to send "die" messages to a daemon that is known to have failed to start.
This commit was SVN r18044.
Этот коммит содержится в:
родитель
ee784b601e
Коммит
f3936ff9bc
@@ -169,6 +169,11 @@ int orte_plm_base_orted_exit(void)
|
||||
peer.jobid = ORTE_PROC_MY_NAME->jobid;
|
||||
for(v=1; v < orte_process_info.num_procs; v++) {
|
||||
peer.vpid = v;
|
||||
/* check to see if this daemon is known to be "dead" */
|
||||
if (procs[v]->state > ORTE_PROC_STATE_UNTERMINATED) {
|
||||
/* don't try to send this */
|
||||
continue;
|
||||
}
|
||||
/* don't worry about errors on the send here - just
|
||||
* issue it and keep going
|
||||
*/
|
||||
@@ -236,11 +241,20 @@ int orte_plm_base_orted_kill_local_procs(orte_jobid_t job)
|
||||
if (orte_abnormal_term_ordered) {
|
||||
orte_vpid_t v;
|
||||
orte_process_name_t peer;
|
||||
orte_job_t *daemons;
|
||||
orte_proc_t **procs;
|
||||
|
||||
OPAL_OUTPUT_VERBOSE((5, orte_plm_globals.output,
|
||||
"%s plm:base:orted_cmd:kill_local_procs abnormal term ordered",
|
||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
|
||||
|
||||
/* get the job object for the daemons */
|
||||
if (NULL == (daemons = orte_get_job_data_object(ORTE_PROC_MY_NAME->jobid))) {
|
||||
ORTE_ERROR_LOG(ORTE_ERR_NOT_FOUND);
|
||||
return ORTE_ERR_NOT_FOUND;
|
||||
}
|
||||
procs = (orte_proc_t**)daemons->procs->addr;
|
||||
|
||||
/* since we cannot know which daemons may/may not be alive,
|
||||
* setup an event so we will time out after giving the send
|
||||
* our best attempt
|
||||
@@ -272,6 +286,11 @@ int orte_plm_base_orted_kill_local_procs(orte_jobid_t job)
|
||||
peer.jobid = ORTE_PROC_MY_NAME->jobid;
|
||||
for(v=1; v < orte_process_info.num_procs; v++) {
|
||||
peer.vpid = v;
|
||||
/* check to see if this daemon is known to be "dead" */
|
||||
if (procs[v]->state > ORTE_PROC_STATE_UNTERMINATED) {
|
||||
/* don't try to send this */
|
||||
continue;
|
||||
}
|
||||
/* don't worry about errors on the send here - just
|
||||
* issue it and keep going
|
||||
*/
|
||||
|
@@ -276,8 +276,11 @@ static int orte_plm_rsh_probe(orte_node_t *node,
|
||||
static void orte_plm_rsh_wait_daemon(pid_t pid, int status, void* cbdata)
|
||||
{
|
||||
unsigned long deltat;
|
||||
orte_proc_t *daemon=(orte_proc_t*)cbdata;
|
||||
|
||||
if (! WIFEXITED(status) || ! WEXITSTATUS(status) == 0) { /* if abnormal exit */
|
||||
/* note that this daemon failed */
|
||||
daemon->state = ORTE_PROC_STATE_FAILED_TO_START;
|
||||
/* report that the daemon has failed so we can exit */
|
||||
orte_plm_base_launch_failed(active_job, true, pid, status, ORTE_JOB_STATE_FAILED_TO_START);
|
||||
}
|
||||
@@ -745,6 +748,8 @@ int orte_plm_rsh_launch(orte_job_t *jdata)
|
||||
} else { /* father */
|
||||
/* indicate this daemon has been launched */
|
||||
nodes[nnode]->daemon->state = ORTE_PROC_STATE_LAUNCHED;
|
||||
/* record the pid */
|
||||
nodes[nnode]->daemon->pid = pid;
|
||||
|
||||
OPAL_THREAD_LOCK(&mca_plm_rsh_component.lock);
|
||||
/* This situation can lead to a deadlock if '--debug-daemons' is set.
|
||||
@@ -760,7 +765,7 @@ int orte_plm_rsh_launch(orte_job_t *jdata)
|
||||
/* setup callback on sigchild - wait until setup above is complete
|
||||
* as the callback can occur in the call to orte_wait_cb
|
||||
*/
|
||||
- orte_wait_cb(pid, orte_plm_rsh_wait_daemon, NULL);
|
||||
+ orte_wait_cb(pid, orte_plm_rsh_wait_daemon, (void*)nodes[nnode]->daemon);
|
||||
|
||||
/* if required - add delay to avoid problems w/ X11 authentication */
|
||||
if (0 < opal_output_get_verbosity(orte_plm_globals.output)
|
||||
|
Загрузка…
Ссылка в новой задаче
Block a user