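Ensure that mpirun (the HNP) properly accounts for itself when exiting without reply: on ORTE_DAEMON_EXIT_NO_REPLY_CMD it now looks up the daemon job object, marks its own entry as ORTE_PROC_STATE_TERMINATED, and increments the job's num_terminated count so things can exit cleanly.
Move the "received exit" / "received exit_no_reply" debug output to the top of each exit-command case so it is always seen, including when the HNP returns early in the no-reply case. This commit was SVN r20026.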
Этот коммит содержится в:
родитель
9a57db4a81
Коммит
5e6536eeda
@ -617,6 +617,10 @@ static int process_commands(orte_process_name_t* sender,
|
|||||||
|
|
||||||
/**** EXIT COMMAND ****/
|
/**** EXIT COMMAND ****/
|
||||||
case ORTE_DAEMON_EXIT_WITH_REPLY_CMD:
|
case ORTE_DAEMON_EXIT_WITH_REPLY_CMD:
|
||||||
|
if (orte_debug_daemons_flag) {
|
||||||
|
opal_output(0, "%s orted_cmd: received exit",
|
||||||
|
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME));
|
||||||
|
}
|
||||||
/* disable routing - we need to do this
|
/* disable routing - we need to do this
|
||||||
* because daemons exit in an uncoordinated fashion.
|
* because daemons exit in an uncoordinated fashion.
|
||||||
* Thus, our routes are being dismantled, so we can't
|
* Thus, our routes are being dismantled, so we can't
|
||||||
@ -650,11 +654,6 @@ static int process_commands(orte_process_name_t* sender,
|
|||||||
/* if we are not the HNP, send a message to the HNP telling
|
/* if we are not the HNP, send a message to the HNP telling
|
||||||
* it we are leaving - and then trigger our exit
|
* it we are leaving - and then trigger our exit
|
||||||
*/
|
*/
|
||||||
if (orte_debug_daemons_flag) {
|
|
||||||
opal_output(0, "%s orted_cmd: received exit",
|
|
||||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME));
|
|
||||||
}
|
|
||||||
/* send a state update so the HNP knows we are "gone" */
|
|
||||||
{
|
{
|
||||||
opal_buffer_t ack;
|
opal_buffer_t ack;
|
||||||
orte_proc_state_t state=ORTE_PROC_STATE_TERMINATED;
|
orte_proc_state_t state=ORTE_PROC_STATE_TERMINATED;
|
||||||
@ -670,10 +669,7 @@ static int process_commands(orte_process_name_t* sender,
|
|||||||
orte_rml.send_buffer(ORTE_PROC_MY_HNP, &ack, ORTE_RML_TAG_PLM, 0);
|
orte_rml.send_buffer(ORTE_PROC_MY_HNP, &ack, ORTE_RML_TAG_PLM, 0);
|
||||||
OBJ_DESTRUCT(&ack);
|
OBJ_DESTRUCT(&ack);
|
||||||
}
|
}
|
||||||
/* trigger our appropriate exit procedure
|
/* check to see if we need to relay messages */
|
||||||
* NOTE: this event will fire -after- any zero-time events
|
|
||||||
* so any pending relays -do- get sent first
|
|
||||||
*/
|
|
||||||
if (relay_is_required) {
|
if (relay_is_required) {
|
||||||
exit_after_relay = true;
|
exit_after_relay = true;
|
||||||
} else {
|
} else {
|
||||||
@ -684,6 +680,10 @@ static int process_commands(orte_process_name_t* sender,
|
|||||||
|
|
||||||
/**** EXIT_NO_REPLY COMMAND ****/
|
/**** EXIT_NO_REPLY COMMAND ****/
|
||||||
case ORTE_DAEMON_EXIT_NO_REPLY_CMD:
|
case ORTE_DAEMON_EXIT_NO_REPLY_CMD:
|
||||||
|
if (orte_debug_daemons_flag) {
|
||||||
|
opal_output(0, "%s orted_cmd: received exit_no_reply",
|
||||||
|
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME));
|
||||||
|
}
|
||||||
/* disable routing - we need to do this
|
/* disable routing - we need to do this
|
||||||
* because daemons exit in an uncoordinated fashion.
|
* because daemons exit in an uncoordinated fashion.
|
||||||
* Thus, our routes are being dismantled, so we can't
|
* Thus, our routes are being dismantled, so we can't
|
||||||
@ -694,24 +694,25 @@ static int process_commands(orte_process_name_t* sender,
|
|||||||
* flag we are exited - but don't yet exit
|
* flag we are exited - but don't yet exit
|
||||||
*/
|
*/
|
||||||
if (orte_process_info.hnp) {
|
if (orte_process_info.hnp) {
|
||||||
|
orte_job_t *daemons;
|
||||||
|
orte_proc_t **procs;
|
||||||
/* if we are the HNP, ensure our local procs are terminated */
|
/* if we are the HNP, ensure our local procs are terminated */
|
||||||
orte_odls.kill_local_procs(ORTE_JOBID_WILDCARD, false);
|
orte_odls.kill_local_procs(ORTE_JOBID_WILDCARD, false);
|
||||||
|
/* now lookup the daemon job object */
|
||||||
|
if (NULL == (daemons = orte_get_job_data_object(ORTE_PROC_MY_NAME->jobid))) {
|
||||||
|
ORTE_ERROR_LOG(ORTE_ERR_NOT_FOUND);
|
||||||
|
return ORTE_ERR_NOT_FOUND;
|
||||||
|
}
|
||||||
|
procs = (orte_proc_t**)daemons->procs->addr;
|
||||||
|
/* declare us terminated so things can exit cleanly */
|
||||||
|
procs[0]->state = ORTE_PROC_STATE_TERMINATED;
|
||||||
|
daemons->num_terminated++;
|
||||||
/* There is nothing more to do here - actual exit will be
|
/* There is nothing more to do here - actual exit will be
|
||||||
* accomplished by the plm
|
* accomplished by the plm
|
||||||
*/
|
*/
|
||||||
return ORTE_SUCCESS;
|
return ORTE_SUCCESS;
|
||||||
}
|
}
|
||||||
/* if we are not the HNP, don't send any messages - just
|
/* check to see if we need to relay messages */
|
||||||
* trigger our exit
|
|
||||||
*/
|
|
||||||
if (orte_debug_daemons_flag) {
|
|
||||||
opal_output(0, "%s orted_cmd: received exit_no_reply",
|
|
||||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME));
|
|
||||||
}
|
|
||||||
/* trigger our appropriate exit procedure
|
|
||||||
* NOTE: this event will fire -after- any zero-time events
|
|
||||||
* so any pending relays -do- get sent first
|
|
||||||
*/
|
|
||||||
if (relay_is_required) {
|
if (relay_is_required) {
|
||||||
exit_after_relay = true;
|
exit_after_relay = true;
|
||||||
} else {
|
} else {
|
||||||
|
Загрузка…
x
Ссылка в новой задаче
Block a user