1
1
Tested this functionality quite a bit more and made some fixes:

 * Print far fewer help messages
 * Fix one additional deadlock upon error
 * Change some ORTE_ERROR_LOG messages to silent (because they're
   not errors)
 * Some code got re-indented, sorry...

Discussed and reviewed with Ralph.

This commit was SVN r13375.

The following Trac tickets were found above:
  Ticket 726 --> https://svn.open-mpi.org/trac/ompi/ticket/726
Этот коммит содержится в:
Jeff Squyres 2007-01-30 23:03:13 +00:00
родитель 78a13bc3ea
Коммит 8d872b195a
3 изменённых файла: 95 добавлений и 71 удаление

Просмотреть файл

@ -106,8 +106,9 @@ int orte_pls_base_orted_cancel_operation(void)
/* cancel any waiting receive - we don't want to hear it */ /* cancel any waiting receive - we don't want to hear it */
orte_rml.recv_cancel(ORTE_NAME_WILDCARD, ORTE_RML_TAG_PLS_ORTED_ACK); orte_rml.recv_cancel(ORTE_NAME_WILDCARD, ORTE_RML_TAG_PLS_ORTED_ACK);
/* set the completion status to reflect cancellation */ /* set the completion status to reflect cancellation -- no need to
completion_status = ORTE_ERR_INTERUPTED; print anything */
completion_status = ORTE_ERR_SILENT;
/* declare us "done" so we can exit cleanly */ /* declare us "done" so we can exit cleanly */
opal_condition_signal(&orte_pls_base.orted_cmd_cond); opal_condition_signal(&orte_pls_base.orted_cmd_cond);

Просмотреть файл

@ -91,9 +91,7 @@ Returned value %d instead of ORTE_SUCCESS.
[orterun:proc-aborted-strsignal] [orterun:proc-aborted-strsignal]
%s noticed that job rank %lu with PID %lu on node %s exited on signal %d (%s). %s noticed that job rank %lu with PID %lu on node %s exited on signal %d (%s).
[orterun:abnormal-exit] [orterun:abnormal-exit]
WARNING: %s encountered an abnormal exit. WARNING: %s has exited before it received notification that all
This means that %s exited before it received notification that all
started processes had terminated. You should double check and ensure started processes had terminated. You should double check and ensure
that there are no runaway processes still executing. that there are no runaway processes still executing.
# #
@ -105,12 +103,6 @@ It is dangerous to interrupt %s while it is killing a job (proper
termination may not be guaranteed). Hit control-C again within 1 termination may not be guaranteed). Hit control-C again within 1
second if you really want to kill %s immediately. second if you really want to kill %s immediately.
# #
[orterun:forced-end-failed]
WARNING: %s was ordered to kill a job (probably with control-C), but
was unable to successfully complete that order (returned error %s).
You should double check and ensure that there are no runaway processes
still executing.
#
[orterun:empty-prefix] [orterun:empty-prefix]
A prefix was supplied to %s that only contained slashes. A prefix was supplied to %s that only contained slashes.

Просмотреть файл

@ -436,8 +436,11 @@ int orterun(int argc, char *argv[])
&orterun_globals.lock); &orterun_globals.lock);
} }
/* check to see if the job was aborted */ /* check to see if the job was aborted */
if (ORTE_SUCCESS != (rc = orte_smr.get_job_state(&exit_state, jobid))) { if (ORTE_JOBID_INVALID != jobid &&
ORTE_ERROR_LOG(rc); ORTE_SUCCESS != (rc = orte_smr.get_job_state(&exit_state, jobid))) {
if (ORTE_SUCCESS != rc) {
ORTE_ERROR_LOG(rc);
}
/* define the exit state as abnormal by default */ /* define the exit state as abnormal by default */
exit_state = ORTE_JOB_STATE_ABORTED; exit_state = ORTE_JOB_STATE_ABORTED;
} }
@ -472,16 +475,26 @@ int orterun(int argc, char *argv[])
* be sure to include any descendants so nothing is * be sure to include any descendants so nothing is
* left hanging * left hanging
*/ */
OBJ_CONSTRUCT(&attributes, opal_list_t); if (ORTE_JOBID_INVALID != jobid) {
orte_rmgr.add_attribute(&attributes, ORTE_NS_INCLUDE_DESCENDANTS, ORTE_UNDEF, NULL, ORTE_RMGR_ATTR_OVERRIDE); OBJ_CONSTRUCT(&attributes, opal_list_t);
if (ORTE_SUCCESS != (ret = orte_pls.terminate_orteds(jobid, &orte_abort_timeout, &attributes))) { orte_rmgr.add_attribute(&attributes, ORTE_NS_INCLUDE_DESCENDANTS, ORTE_UNDEF, NULL, ORTE_RMGR_ATTR_OVERRIDE);
opal_show_help("help-orterun.txt", "orterun:daemon-die", true, if (ORTE_SUCCESS != (ret = orte_pls.terminate_orteds(jobid, &orte_abort_timeout, &attributes))) {
orterun_basename, ORTE_ERROR_NAME(ret)); opal_show_help("help-orterun.txt", "orterun:daemon-die", true,
orterun_basename, ORTE_ERROR_NAME(ret));
}
while (NULL != (item = opal_list_remove_first(&attributes))) {
OBJ_RELEASE(item);
}
OBJ_DESTRUCT(&attributes);
} }
while (NULL != (item = opal_list_remove_first(&attributes))) OBJ_RELEASE(item);
OBJ_DESTRUCT(&attributes);
OPAL_THREAD_UNLOCK(&orterun_globals.lock); OPAL_THREAD_UNLOCK(&orterun_globals.lock);
/* If we were forcibly killed, print a warning that the
user may still have some manual cleanup to do. */
if (ORTE_JOBID_INVALID == jobid) {
opal_show_help("help-orterun.txt", "orterun:abnormal-exit",
true, orterun_basename, orterun_basename);
}
} }
} }
@ -732,54 +745,56 @@ static void abort_signal_callback(int fd, short flags, void *arg)
/* If this whole process has already completed, then bail */ /* If this whole process has already completed, then bail */
switch (state) { switch (state) {
case ABORT_SIGNAL_FIRST: case ABORT_SIGNAL_FIRST:
/* This is the first time through */ /* This is the first time through */
state = ABORT_SIGNAL_PROCESSING; state = ABORT_SIGNAL_PROCESSING;
break; break;
case ABORT_SIGNAL_WARNED: case ABORT_SIGNAL_WARNED:
gettimeofday(&now, NULL); gettimeofday(&now, NULL);
a = invoked.tv_sec * 1000000 + invoked.tv_usec; a = invoked.tv_sec * 1000000 + invoked.tv_usec;
b = now.tv_sec * 1000000 + invoked.tv_usec; b = now.tv_sec * 1000000 + invoked.tv_usec;
if (b - a <= 1000000) { if (b - a <= 1000000) {
/* tell the pls to cancel the terminate request - if (!orterun_globals.quiet){
* obviously, something is wrong at this point fprintf(stderr, "%s: forcibly killing job...\n",
*/ orterun_basename);
if (ORTE_SUCCESS != (ret = orte_pls.cancel_operation())) {
ORTE_ERROR_LOG(ret);
}
/* give the user the warning about manual cleanup */
opal_show_help("help-orterun.txt", "orterun:abnormal-exit",
true, orterun_basename, orterun_basename);
/* We are in an event handler; exit_callback() will delete
the handler that is currently running (which is a Bad
Thing), so we can't call it directly. Instead, we have
to exit this handler and setup to call exit_handler()
after this. */
if (NULL != (event = (opal_event_t*)
malloc(sizeof(opal_event_t)))) {
opal_evtimer_set(event, exit_callback, NULL);
now.tv_sec = 0;
now.tv_usec = 0;
opal_evtimer_add(event, &now);
state = ABORT_SIGNAL_DONE;
}
return;
} }
/* Otherwise fall through to PROCESSING and warn again */
case ABORT_SIGNAL_PROCESSING: /* tell the pls to cancel the terminate request -
opal_show_help("help-orterun.txt", "orterun:sigint-while-processing", * obviously, something is wrong at this point
true, orterun_basename, orterun_basename, */
orterun_basename); if (ORTE_SUCCESS != (ret = orte_pls.cancel_operation())) {
gettimeofday(&invoked, NULL); ORTE_ERROR_LOG(ret);
state = ABORT_SIGNAL_WARNED; }
return;
case ABORT_SIGNAL_DONE: /* We are in an event handler; exit_callback() will delete
/* Nothing to do -- return */ the handler that is currently running (which is a Bad
return; Thing), so we can't call it directly. Instead, we have
to exit this handler and setup to call exit_handler()
after this. */
if (NULL != (event = (opal_event_t*)
malloc(sizeof(opal_event_t)))) {
opal_evtimer_set(event, exit_callback, NULL);
now.tv_sec = 0;
now.tv_usec = 0;
opal_evtimer_add(event, &now);
state = ABORT_SIGNAL_DONE;
}
return;
}
/* Otherwise fall through to PROCESSING and warn again */
case ABORT_SIGNAL_PROCESSING:
opal_show_help("help-orterun.txt", "orterun:sigint-while-processing",
true, orterun_basename, orterun_basename,
orterun_basename);
gettimeofday(&invoked, NULL);
state = ABORT_SIGNAL_WARNED;
return;
case ABORT_SIGNAL_DONE:
/* Nothing to do -- return */
return;
} }
if (!orterun_globals.quiet){ if (!orterun_globals.quiet){
@ -794,17 +809,33 @@ static void abort_signal_callback(int fd, short flags, void *arg)
OBJ_CONSTRUCT(&attrs, opal_list_t); OBJ_CONSTRUCT(&attrs, opal_list_t);
orte_rmgr.add_attribute(&attrs, ORTE_NS_INCLUDE_DESCENDANTS, ORTE_UNDEF, NULL, ORTE_RMGR_ATTR_OVERRIDE); orte_rmgr.add_attribute(&attrs, ORTE_NS_INCLUDE_DESCENDANTS, ORTE_UNDEF, NULL, ORTE_RMGR_ATTR_OVERRIDE);
ret = orte_pls.terminate_job(jobid, &orte_abort_timeout, &attrs); ret = orte_pls.terminate_job(jobid, &orte_abort_timeout, &attrs);
while (NULL != (item = opal_list_remove_first(&attrs))) OBJ_RELEASE(item); while (NULL != (item = opal_list_remove_first(&attrs))) {
OBJ_RELEASE(item);
}
OBJ_DESTRUCT(&attrs); OBJ_DESTRUCT(&attrs);
if (ORTE_SUCCESS != ret) { if (ORTE_SUCCESS != ret) {
opal_show_help("help-orterun.txt", "orterun:forced-end-failed", /* If we failed the terminate_job() above, then the
true, orterun_basename, ORTE_ERROR_NAME(ret)); condition variable in the main loop in orterun won't
wake up. So signal it. */
if (NULL != (event = (opal_event_t*)
malloc(sizeof(opal_event_t)))) {
opal_evtimer_set(event, exit_callback, NULL);
now.tv_sec = 0;
now.tv_usec = 0;
opal_evtimer_add(event, &now);
} else {
/* We really don't want to do this, but everything
else has failed... */
orterun_globals.exit = true;
orterun_globals.exit_status = 1;
opal_condition_signal(&orterun_globals.cond);
}
jobid = ORTE_JOBID_INVALID; jobid = ORTE_JOBID_INVALID;
} }
} }
state = ABORT_SIGNAL_DONE; state = ABORT_SIGNAL_DONE;
state = ABORT_SIGNAL_DONE;
} }
/** /**