1
1

If we are abnormally terminating, then don't wait for orteds to report back. Send them a "halt_vm" command, which instructs them to kill their local procs and immediately terminate, doing their best to cleanup on the way out.

Also do a little cleanup on debug output in rshbase.

This commit was SVN r25033.
Этот коммит содержится в:
Ralph Castain 2011-08-09 17:42:19 +00:00
родитель f96db45c17
Коммит f1951e7ccd
10 изменённых файлов: 141 добавлений и 41 удалений

Просмотреть файл

@ -424,9 +424,22 @@ static int plm_alps_terminate_orteds(void)
*/
orte_wait_cb_cancel(alps_pid);
/* tell them to die! */
if (ORTE_SUCCESS != (rc = orte_plm_base_orted_exit(ORTE_DAEMON_EXIT_CMD))) {
ORTE_ERROR_LOG(rc);
/* now tell them to die */
if (orte_abnormal_term_ordered) {
/* cannot know if a daemon is able to
* tell us it died, so just ensure they
* all terminate
*/
if (ORTE_SUCCESS != (rc = orte_plm_base_orted_exit(ORTE_DAEMON_HALT_VM_CMD))) {
ORTE_ERROR_LOG(rc);
}
} else {
/* we need them to "phone home", though,
* so we can know that they have exited
*/
if (ORTE_SUCCESS != (rc = orte_plm_base_orted_exit(ORTE_DAEMON_EXIT_CMD))) {
ORTE_ERROR_LOG(rc);
}
}
return rc;

Просмотреть файл

@ -606,11 +606,24 @@ int plm_ccp_terminate_orteds()
{
int rc;
/* now tell them to die! */
if (ORTE_SUCCESS != (rc = orte_plm_base_orted_exit(ORTE_DAEMON_EXIT_CMD))) {
ORTE_ERROR_LOG(rc);
/* now tell them to die */
if (orte_abnormal_term_ordered) {
/* cannot know if a daemon is able to
* tell us it died, so just ensure they
* all terminate
*/
if (ORTE_SUCCESS != (rc = orte_plm_base_orted_exit(ORTE_DAEMON_HALT_VM_CMD))) {
ORTE_ERROR_LOG(rc);
}
} else {
/* we need them to "phone home", though,
* so we can know that they have exited
*/
if (ORTE_SUCCESS != (rc = orte_plm_base_orted_exit(ORTE_DAEMON_EXIT_CMD))) {
ORTE_ERROR_LOG(rc);
}
}
return rc;
}

Просмотреть файл

@ -378,11 +378,24 @@ static int plm_lsf_terminate_orteds(void)
{
int rc;
/* tell them to die! */
if (ORTE_SUCCESS != (rc = orte_plm_base_orted_exit(ORTE_DAEMON_EXIT_CMD))) {
ORTE_ERROR_LOG(rc);
/* now tell them to die */
if (orte_abnormal_term_ordered) {
/* cannot know if a daemon is able to
* tell us it died, so just ensure they
* all terminate
*/
if (ORTE_SUCCESS != (rc = orte_plm_base_orted_exit(ORTE_DAEMON_HALT_VM_CMD))) {
ORTE_ERROR_LOG(rc);
}
} else {
/* we need them to "phone home", though,
* so we can know that they have exited
*/
if (ORTE_SUCCESS != (rc = orte_plm_base_orted_exit(ORTE_DAEMON_EXIT_CMD))) {
ORTE_ERROR_LOG(rc);
}
}
return rc;
}

Просмотреть файл

@ -1380,9 +1380,22 @@ int orte_plm_process_terminate_orteds(void)
{
int rc;
/* now tell them to die! */
if (ORTE_SUCCESS != (rc = orte_plm_base_orted_exit(ORTE_DAEMON_EXIT_CMD))) {
ORTE_ERROR_LOG(rc);
/* now tell them to die */
if (orte_abnormal_term_ordered) {
/* cannot know if a daemon is able to
* tell us it died, so just ensure they
* all terminate
*/
if (ORTE_SUCCESS != (rc = orte_plm_base_orted_exit(ORTE_DAEMON_HALT_VM_CMD))) {
ORTE_ERROR_LOG(rc);
}
} else {
/* we need them to "phone home", though,
* so we can know that they have exited
*/
if (ORTE_SUCCESS != (rc = orte_plm_base_orted_exit(ORTE_DAEMON_EXIT_CMD))) {
ORTE_ERROR_LOG(rc);
}
}
return rc;

Просмотреть файл

@ -1495,11 +1495,22 @@ int orte_plm_rsh_terminate_orteds(void)
{
int rc;
/* now tell them to die - we need them to "phone home", though,
* so we can know that they have exited
*/
if (ORTE_SUCCESS != (rc = orte_plm_base_orted_exit(ORTE_DAEMON_EXIT_CMD))) {
ORTE_ERROR_LOG(rc);
/* now tell them to die */
if (orte_abnormal_term_ordered) {
/* cannot know if a daemon is able to
* tell us it died, so just ensure they
* all terminate
*/
if (ORTE_SUCCESS != (rc = orte_plm_base_orted_exit(ORTE_DAEMON_HALT_VM_CMD))) {
ORTE_ERROR_LOG(rc);
}
} else {
/* we need them to "phone home", though,
* so we can know that they have exited
*/
if (ORTE_SUCCESS != (rc = orte_plm_base_orted_exit(ORTE_DAEMON_EXIT_CMD))) {
ORTE_ERROR_LOG(rc);
}
}
return rc;

Просмотреть файл

@ -261,7 +261,7 @@ static int spawn(orte_job_t *jdata)
}
OPAL_OUTPUT_VERBOSE((1, orte_plm_globals.output,
"%s plm:rsh: launching job %s",
"%s plm:rshbase: launching job %s",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
ORTE_JOBID_PRINT(jdata->jobid)));
@ -278,7 +278,7 @@ static int spawn(orte_job_t *jdata)
if (0 == map->num_new_daemons) {
/* have all the daemons we need - launch app */
OPAL_OUTPUT_VERBOSE((1, orte_plm_globals.output,
"%s plm:rsh: no new daemons to launch",
"%s plm:rshbase: no new daemons to launch",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
goto launch_apps;
}
@ -383,7 +383,7 @@ static int spawn(orte_job_t *jdata)
/* if this daemon already exists, don't launch it! */
if (node->daemon_launched) {
OPAL_OUTPUT_VERBOSE((1, orte_plm_globals.output,
"%s plm:rsh:launch daemon already exists on node %s",
"%s plm:rshbase:launch daemon already exists on node %s",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
node->name));
continue;
@ -395,7 +395,7 @@ static int spawn(orte_job_t *jdata)
if (NULL == node->daemon) {
ORTE_ERROR_LOG(ORTE_ERR_FATAL);
OPAL_OUTPUT_VERBOSE((1, orte_plm_globals.output,
"%s plm:rsh:launch daemon failed to be defined on node %s",
"%s plm:rshbase:launch daemon failed to be defined on node %s",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
node->name));
rc = ORTE_ERR_FATAL;
@ -413,7 +413,7 @@ static int spawn(orte_job_t *jdata)
}
OPAL_OUTPUT_VERBOSE((1, orte_plm_globals.output,
"%s plm:rsh: launching on node %s",
"%s plm:rshbase: launching on node %s",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
node->name));
@ -439,7 +439,7 @@ static int spawn(orte_job_t *jdata)
node->daemon->pid = pid;
OPAL_OUTPUT_VERBOSE((1, orte_plm_globals.output,
"%s plm:rsh: recording launch of daemon %s",
"%s plm:rshbase: recording launch of daemon %s",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
ORTE_NAME_PRINT(&node->daemon->name)));
@ -477,7 +477,7 @@ static int spawn(orte_job_t *jdata)
failed_job = active_job;
if (ORTE_SUCCESS != (rc = orte_plm_base_launch_apps(active_job))) {
OPAL_OUTPUT_VERBOSE((1, orte_plm_globals.output,
"%s plm:rsh: launch of apps failed for job %s on error %s",
"%s plm:rshbase: launch of apps failed for job %s on error %s",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
ORTE_JOBID_PRINT(active_job), ORTE_ERROR_NAME(rc)));
goto cleanup;
@ -499,9 +499,9 @@ static int spawn(orte_job_t *jdata)
if (orte_timing ) {
if (0 != gettimeofday(&joblaunchstop, NULL)) {
opal_output(0, "plm_rsh: could not obtain job launch stop time");
opal_output(0, "plm:rshbase: could not obtain job launch stop time");
} else {
opal_output(0, "plm_rsh: total job launch time is %ld usec",
opal_output(0, "plm:rshbase: total job launch time is %ld usec",
(joblaunchstop.tv_sec - joblaunchstart.tv_sec)*1000000 +
(joblaunchstop.tv_usec - joblaunchstart.tv_usec));
}
@ -528,11 +528,22 @@ static int terminate_orteds(void)
{
int rc;
/* now tell them to die - we need them to "phone home", though,
* so we can know that they have exited
*/
if (ORTE_SUCCESS != (rc = orte_plm_base_orted_exit(ORTE_DAEMON_EXIT_CMD))) {
ORTE_ERROR_LOG(rc);
/* now tell them to die */
if (orte_abnormal_term_ordered) {
/* cannot know if a daemon is able to
* tell us it died, so just ensure they
* all terminate
*/
if (ORTE_SUCCESS != (rc = orte_plm_base_orted_exit(ORTE_DAEMON_HALT_VM_CMD))) {
ORTE_ERROR_LOG(rc);
}
} else {
/* we need them to "phone home", though,
* so we can know that they have exited
*/
if (ORTE_SUCCESS != (rc = orte_plm_base_orted_exit(ORTE_DAEMON_EXIT_CMD))) {
ORTE_ERROR_LOG(rc);
}
}
return rc;

Просмотреть файл

@ -473,7 +473,7 @@ static int plm_slurm_terminate_orteds(void)
/* tell them to die without sending a reply - we will rely on the
* waitpid to tell us when they have exited!
*/
if (ORTE_SUCCESS != (rc = orte_plm_base_orted_exit(ORTE_DAEMON_EXIT_CMD))) {
if (ORTE_SUCCESS != (rc = orte_plm_base_orted_exit(ORTE_DAEMON_HALT_VM_CMD))) {
ORTE_ERROR_LOG(rc);
}

Просмотреть файл

@ -936,9 +936,22 @@ int orte_plm_submit_terminate_orteds(void)
{
int rc;
/* now tell them to die! */
if (ORTE_SUCCESS != (rc = orte_plm_base_orted_exit(ORTE_DAEMON_EXIT_CMD))) {
ORTE_ERROR_LOG(rc);
/* now tell them to die */
if (orte_abnormal_term_ordered) {
/* cannot know if a daemon is able to
* tell us it died, so just ensure they
* all terminate
*/
if (ORTE_SUCCESS != (rc = orte_plm_base_orted_exit(ORTE_DAEMON_HALT_VM_CMD))) {
ORTE_ERROR_LOG(rc);
}
} else {
/* we need them to "phone home", though,
* so we can know that they have exited
*/
if (ORTE_SUCCESS != (rc = orte_plm_base_orted_exit(ORTE_DAEMON_EXIT_CMD))) {
ORTE_ERROR_LOG(rc);
}
}
return rc;

Просмотреть файл

@ -490,9 +490,22 @@ int plm_tm_terminate_orteds(void)
{
int rc;
/* now tell them to die! */
if (ORTE_SUCCESS != (rc = orte_plm_base_orted_exit(ORTE_DAEMON_EXIT_CMD))) {
ORTE_ERROR_LOG(rc);
/* now tell them to die */
if (orte_abnormal_term_ordered) {
/* cannot know if a daemon is able to
* tell us it died, so just ensure they
* all terminate
*/
if (ORTE_SUCCESS != (rc = orte_plm_base_orted_exit(ORTE_DAEMON_HALT_VM_CMD))) {
ORTE_ERROR_LOG(rc);
}
} else {
/* we need them to "phone home", though,
* so we can know that they have exited
*/
if (ORTE_SUCCESS != (rc = orte_plm_base_orted_exit(ORTE_DAEMON_EXIT_CMD))) {
ORTE_ERROR_LOG(rc);
}
}
return rc;

Просмотреть файл

@ -201,7 +201,7 @@ orte_plm_xgrid_terminate_orteds(void)
{
int rc;
rc = orte_plm_base_orted_exit(ORTE_DAEMON_EXIT_CMD);
rc = orte_plm_base_orted_exit(ORTE_DAEMON_HALT_VM_CMD);
if (ORTE_SUCCESS != rc) {
rc = [mca_plm_xgrid_component.client terminateOrteds];
}