1
1

Fix single-node operations so that the HNP correctly exits when the job completes

This commit was SVN r18556.
Этот коммит содержится в:
Ralph Castain 2008-06-03 14:23:04 +00:00
родитель b456fb2d42
Коммит 95578b0528
2 изменённых файлов: 21 добавлений и 5 удалений

Просмотреть файл

@@ -117,9 +117,6 @@ int orte_plm_base_orted_exit(void)
}
procs = (orte_proc_t**)daemons->procs->addr;
procs[0]->state = ORTE_PROC_STATE_TERMINATED;
daemons->num_terminated++;
/* pack the command */
if (ORTE_SUCCESS != (rc = opal_dss.pack(&cmd, &command, 1, ORTE_DAEMON_CMD))) {
ORTE_ERROR_LOG(rc);
@@ -141,6 +138,9 @@ int orte_plm_base_orted_exit(void)
"%s plm:base:orted_cmd:orted_exit abnormal term ordered",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
/* be sure I get the command */
ORTE_MESSAGE_EVENT(ORTE_PROC_MY_NAME, &cmd, ORTE_RML_TAG_DAEMON, orte_daemon_cmd_processor);
/* now send the command one daemon at a time using a non-blocking
* send - let the callback function keep track of how many
* complete - it will delete the event if they all do.

Просмотреть файл

@@ -68,6 +68,7 @@
#include "orte/mca/rml/base/rml_contact.h"
#include "orte/mca/odls/odls.h"
#include "orte/mca/plm/plm.h"
#include "orte/mca/plm/base/plm_private.h"
#include "orte/mca/routed/routed.h"
#include "orte/runtime/runtime.h"
@@ -557,9 +558,24 @@ static int process_commands(orte_process_name_t* sender,
/**** EXIT COMMAND ****/
case ORTE_DAEMON_EXIT_CMD:
if (orte_process_info.hnp) {
/* if we are the HNP, do nothing - we will
* exit at our own sweet time
orte_job_t *daemons;
orte_proc_t **procs;
/* if we are the HNP, ensure our local procs are terminated */
orte_odls.kill_local_procs(ORTE_JOBID_WILDCARD, false);
/* now lookup the daemon job object */
if (NULL == (daemons = orte_get_job_data_object(ORTE_PROC_MY_NAME->jobid))) {
ORTE_ERROR_LOG(ORTE_ERR_NOT_FOUND);
return ORTE_ERR_NOT_FOUND;
}
procs = (orte_proc_t**)daemons->procs->addr;
/* declare us terminated so things can exit cleanly */
procs[0]->state = ORTE_PROC_STATE_TERMINATED;
daemons->num_terminated++;
/* need to check for job complete as otherwise this doesn't
* get triggered in single-daemon systems
*/
orte_plm_base_check_job_completed(daemons);
/* all done! */
return ORTE_SUCCESS;
}
/* eventually, we need to revise this so we only