Record the daemon's state so that we don't attempt to send "die" messages to a daemon that is known to have failed to start.
This commit was SVN r18044.
Этот коммит содержится в:
родитель
ee784b601e
Коммит
f3936ff9bc
@@ -169,6 +169,11 @@ int orte_plm_base_orted_exit(void)
|
|||||||
peer.jobid = ORTE_PROC_MY_NAME->jobid;
|
peer.jobid = ORTE_PROC_MY_NAME->jobid;
|
||||||
for(v=1; v < orte_process_info.num_procs; v++) {
|
for(v=1; v < orte_process_info.num_procs; v++) {
|
||||||
peer.vpid = v;
|
peer.vpid = v;
|
||||||
|
/* check to see if this daemon is known to be "dead" */
|
||||||
|
if (procs[v]->state > ORTE_PROC_STATE_UNTERMINATED) {
|
||||||
|
/* don't try to send this */
|
||||||
|
continue;
|
||||||
|
}
|
||||||
/* don't worry about errors on the send here - just
|
/* don't worry about errors on the send here - just
|
||||||
* issue it and keep going
|
* issue it and keep going
|
||||||
*/
|
*/
|
||||||
@@ -236,11 +241,20 @@ int orte_plm_base_orted_kill_local_procs(orte_jobid_t job)
|
|||||||
if (orte_abnormal_term_ordered) {
|
if (orte_abnormal_term_ordered) {
|
||||||
orte_vpid_t v;
|
orte_vpid_t v;
|
||||||
orte_process_name_t peer;
|
orte_process_name_t peer;
|
||||||
|
orte_job_t *daemons;
|
||||||
|
orte_proc_t **procs;
|
||||||
|
|
||||||
OPAL_OUTPUT_VERBOSE((5, orte_plm_globals.output,
|
OPAL_OUTPUT_VERBOSE((5, orte_plm_globals.output,
|
||||||
"%s plm:base:orted_cmd:kill_local_procs abnormal term ordered",
|
"%s plm:base:orted_cmd:kill_local_procs abnormal term ordered",
|
||||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
|
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
|
||||||
|
|
||||||
|
/* get the job object for the daemons */
|
||||||
|
if (NULL == (daemons = orte_get_job_data_object(ORTE_PROC_MY_NAME->jobid))) {
|
||||||
|
ORTE_ERROR_LOG(ORTE_ERR_NOT_FOUND);
|
||||||
|
return ORTE_ERR_NOT_FOUND;
|
||||||
|
}
|
||||||
|
procs = (orte_proc_t**)daemons->procs->addr;
|
||||||
|
|
||||||
/* since we cannot know which daemons may/may not be alive,
|
/* since we cannot know which daemons may/may not be alive,
|
||||||
* setup an event so we will time out after giving the send
|
* setup an event so we will time out after giving the send
|
||||||
* our best attempt
|
* our best attempt
|
||||||
@@ -272,6 +286,11 @@ int orte_plm_base_orted_kill_local_procs(orte_jobid_t job)
|
|||||||
peer.jobid = ORTE_PROC_MY_NAME->jobid;
|
peer.jobid = ORTE_PROC_MY_NAME->jobid;
|
||||||
for(v=1; v < orte_process_info.num_procs; v++) {
|
for(v=1; v < orte_process_info.num_procs; v++) {
|
||||||
peer.vpid = v;
|
peer.vpid = v;
|
||||||
|
/* check to see if this daemon is known to be "dead" */
|
||||||
|
if (procs[v]->state > ORTE_PROC_STATE_UNTERMINATED) {
|
||||||
|
/* don't try to send this */
|
||||||
|
continue;
|
||||||
|
}
|
||||||
/* don't worry about errors on the send here - just
|
/* don't worry about errors on the send here - just
|
||||||
* issue it and keep going
|
* issue it and keep going
|
||||||
*/
|
*/
|
||||||
|
@@ -276,8 +276,11 @@ static int orte_plm_rsh_probe(orte_node_t *node,
|
|||||||
static void orte_plm_rsh_wait_daemon(pid_t pid, int status, void* cbdata)
|
static void orte_plm_rsh_wait_daemon(pid_t pid, int status, void* cbdata)
|
||||||
{
|
{
|
||||||
unsigned long deltat;
|
unsigned long deltat;
|
||||||
|
orte_proc_t *daemon=(orte_proc_t*)cbdata;
|
||||||
|
|
||||||
if (! WIFEXITED(status) || ! WEXITSTATUS(status) == 0) { /* if abnormal exit */
|
if (! WIFEXITED(status) || ! WEXITSTATUS(status) == 0) { /* if abnormal exit */
|
||||||
|
/* note that this daemon failed */
|
||||||
|
daemon->state = ORTE_PROC_STATE_FAILED_TO_START;
|
||||||
/* report that the daemon has failed so we can exit */
|
/* report that the daemon has failed so we can exit */
|
||||||
orte_plm_base_launch_failed(active_job, true, pid, status, ORTE_JOB_STATE_FAILED_TO_START);
|
orte_plm_base_launch_failed(active_job, true, pid, status, ORTE_JOB_STATE_FAILED_TO_START);
|
||||||
}
|
}
|
||||||
@@ -745,6 +748,8 @@ int orte_plm_rsh_launch(orte_job_t *jdata)
|
|||||||
} else { /* father */
|
} else { /* father */
|
||||||
/* indicate this daemon has been launched */
|
/* indicate this daemon has been launched */
|
||||||
nodes[nnode]->daemon->state = ORTE_PROC_STATE_LAUNCHED;
|
nodes[nnode]->daemon->state = ORTE_PROC_STATE_LAUNCHED;
|
||||||
|
/* record the pid */
|
||||||
|
nodes[nnode]->daemon->pid = pid;
|
||||||
|
|
||||||
OPAL_THREAD_LOCK(&mca_plm_rsh_component.lock);
|
OPAL_THREAD_LOCK(&mca_plm_rsh_component.lock);
|
||||||
/* This situation can lead to a deadlock if '--debug-daemons' is set.
|
/* This situation can lead to a deadlock if '--debug-daemons' is set.
|
||||||
@@ -760,7 +765,7 @@ int orte_plm_rsh_launch(orte_job_t *jdata)
|
|||||||
/* setup callback on sigchild - wait until setup above is complete
|
/* setup callback on sigchild - wait until setup above is complete
|
||||||
* as the callback can occur in the call to orte_wait_cb
|
* as the callback can occur in the call to orte_wait_cb
|
||||||
*/
|
*/
|
||||||
orte_wait_cb(pid, orte_plm_rsh_wait_daemon, NULL);
|
orte_wait_cb(pid, orte_plm_rsh_wait_daemon, (void*)nodes[nnode]->daemon);
|
||||||
|
|
||||||
/* if required - add delay to avoid problems w/ X11 authentication */
|
/* if required - add delay to avoid problems w/ X11 authentication */
|
||||||
if (0 < opal_output_get_verbosity(orte_plm_globals.output)
|
if (0 < opal_output_get_verbosity(orte_plm_globals.output)
|
||||||
|
Загрузка…
Ссылка в новой задаче
Block a user