1
1
Tested this functionality quite a bit more and made some fixes:

 * Print far fewer help messages
 * Fix one additional deadlock upon error
 * Change some ORTE_LOG messages to silent (because they're not
   errors)
 * Some code got re-indented, sorry...

Discussed and reviewed with Ralph.

This commit was SVN r13375.

The following Trac tickets were found above:
  Ticket 726 --> https://svn.open-mpi.org/trac/ompi/ticket/726
Этот коммит содержится в:
Jeff Squyres 2007-01-30 23:03:13 +00:00
родитель 78a13bc3ea
Коммит 8d872b195a
3 изменённых файлов: 95 добавлений и 71 удалений

Просмотреть файл

@ -106,8 +106,9 @@ int orte_pls_base_orted_cancel_operation(void)
/* cancel any waiting receive - we don't want to hear it */
orte_rml.recv_cancel(ORTE_NAME_WILDCARD, ORTE_RML_TAG_PLS_ORTED_ACK);
/* set the completion status to reflect cancellation */
completion_status = ORTE_ERR_INTERUPTED;
/* set the completion status to reflect cancellation -- no need to
print anything */
completion_status = ORTE_ERR_SILENT;
/* declare us "done" so we can exit cleanly */
opal_condition_signal(&orte_pls_base.orted_cmd_cond);

Просмотреть файл

@ -91,9 +91,7 @@ Returned value %d instead of ORTE_SUCCESS.
[orterun:proc-aborted-strsignal]
%s noticed that job rank %lu with PID %lu on node %s exited on signal %d (%s).
[orterun:abnormal-exit]
WARNING: %s encountered an abnormal exit.
This means that %s exited before it received notification that all
WARNING: %s has exited before it received notification that all
started processes had terminated. You should double check and ensure
that there are no runaway processes still executing.
#
@ -105,12 +103,6 @@ It is dangerous to interrupt %s while it is killing a job (proper
termination may not be guaranteed). Hit control-C again within 1
second if you really want to kill %s immediately.
#
[orterun:forced-end-failed]
WARNING: %s was ordered to kill a job (probably with control-C), but
was unable to successfully complete that order (returned error %s).
You should double check and ensure that there are no runaway processes
still executing.
#
[orterun:empty-prefix]
A prefix was supplied to %s that only contained slashes.

Просмотреть файл

@ -436,8 +436,11 @@ int orterun(int argc, char *argv[])
&orterun_globals.lock);
}
/* check to see if the job was aborted */
if (ORTE_SUCCESS != (rc = orte_smr.get_job_state(&exit_state, jobid))) {
ORTE_ERROR_LOG(rc);
if (ORTE_JOBID_INVALID != jobid &&
ORTE_SUCCESS != (rc = orte_smr.get_job_state(&exit_state, jobid))) {
if (ORTE_SUCCESS != rc) {
ORTE_ERROR_LOG(rc);
}
/* define the exit state as abnormal by default */
exit_state = ORTE_JOB_STATE_ABORTED;
}
@ -472,16 +475,26 @@ int orterun(int argc, char *argv[])
* be sure to include any descendants so nothing is
* left hanging
*/
OBJ_CONSTRUCT(&attributes, opal_list_t);
orte_rmgr.add_attribute(&attributes, ORTE_NS_INCLUDE_DESCENDANTS, ORTE_UNDEF, NULL, ORTE_RMGR_ATTR_OVERRIDE);
if (ORTE_SUCCESS != (ret = orte_pls.terminate_orteds(jobid, &orte_abort_timeout, &attributes))) {
opal_show_help("help-orterun.txt", "orterun:daemon-die", true,
orterun_basename, ORTE_ERROR_NAME(ret));
if (ORTE_JOBID_INVALID != jobid) {
OBJ_CONSTRUCT(&attributes, opal_list_t);
orte_rmgr.add_attribute(&attributes, ORTE_NS_INCLUDE_DESCENDANTS, ORTE_UNDEF, NULL, ORTE_RMGR_ATTR_OVERRIDE);
if (ORTE_SUCCESS != (ret = orte_pls.terminate_orteds(jobid, &orte_abort_timeout, &attributes))) {
opal_show_help("help-orterun.txt", "orterun:daemon-die", true,
orterun_basename, ORTE_ERROR_NAME(ret));
}
while (NULL != (item = opal_list_remove_first(&attributes))) {
OBJ_RELEASE(item);
}
OBJ_DESTRUCT(&attributes);
}
while (NULL != (item = opal_list_remove_first(&attributes))) OBJ_RELEASE(item);
OBJ_DESTRUCT(&attributes);
OPAL_THREAD_UNLOCK(&orterun_globals.lock);
/* If we were forcibly killed, print a warning that the
user may still have some manual cleanup to do. */
if (ORTE_JOBID_INVALID == jobid) {
opal_show_help("help-orterun.txt", "orterun:abnormal-exit",
true, orterun_basename, orterun_basename);
}
}
}
@ -732,54 +745,56 @@ static void abort_signal_callback(int fd, short flags, void *arg)
/* If this whole process has already completed, then bail */
switch (state) {
case ABORT_SIGNAL_FIRST:
/* This is the first time through */
state = ABORT_SIGNAL_PROCESSING;
break;
case ABORT_SIGNAL_FIRST:
/* This is the first time through */
state = ABORT_SIGNAL_PROCESSING;
break;
case ABORT_SIGNAL_WARNED:
gettimeofday(&now, NULL);
a = invoked.tv_sec * 1000000 + invoked.tv_usec;
b = now.tv_sec * 1000000 + invoked.tv_usec;
if (b - a <= 1000000) {
/* tell the pls to cancel the terminate request -
* obviously, something is wrong at this point
*/
if (ORTE_SUCCESS != (ret = orte_pls.cancel_operation())) {
ORTE_ERROR_LOG(ret);
}
/* give the user the warning about manual cleanup */
opal_show_help("help-orterun.txt", "orterun:abnormal-exit",
true, orterun_basename, orterun_basename);
case ABORT_SIGNAL_WARNED:
gettimeofday(&now, NULL);
a = invoked.tv_sec * 1000000 + invoked.tv_usec;
b = now.tv_sec * 1000000 + invoked.tv_usec;
if (b - a <= 1000000) {
if (!orterun_globals.quiet){
fprintf(stderr, "%s: forcibly killing job...\n",
orterun_basename);
}
/* tell the pls to cancel the terminate request -
* obviously, something is wrong at this point
*/
if (ORTE_SUCCESS != (ret = orte_pls.cancel_operation())) {
ORTE_ERROR_LOG(ret);
}
/* We are in an event handler; exit_callback() will delete
the handler that is currently running (which is a Bad
Thing), so we can't call it directly. Instead, we have
to exit this handler and setup to call exit_handler()
after this. */
if (NULL != (event = (opal_event_t*)
malloc(sizeof(opal_event_t)))) {
opal_evtimer_set(event, exit_callback, NULL);
now.tv_sec = 0;
now.tv_usec = 0;
opal_evtimer_add(event, &now);
state = ABORT_SIGNAL_DONE;
}
return;
}
/* Otherwise fall through to PROCESSING and warn again */
/* We are in an event handler; exit_callback() will delete
the handler that is currently running (which is a Bad
Thing), so we can't call it directly. Instead, we have
to exit this handler and setup to call exit_handler()
after this. */
if (NULL != (event = (opal_event_t*)
malloc(sizeof(opal_event_t)))) {
opal_evtimer_set(event, exit_callback, NULL);
now.tv_sec = 0;
now.tv_usec = 0;
opal_evtimer_add(event, &now);
state = ABORT_SIGNAL_DONE;
}
return;
}
/* Otherwise fall through to PROCESSING and warn again */
case ABORT_SIGNAL_PROCESSING:
opal_show_help("help-orterun.txt", "orterun:sigint-while-processing",
true, orterun_basename, orterun_basename,
orterun_basename);
gettimeofday(&invoked, NULL);
state = ABORT_SIGNAL_WARNED;
return;
case ABORT_SIGNAL_DONE:
/* Nothing to do -- return */
return;
case ABORT_SIGNAL_PROCESSING:
opal_show_help("help-orterun.txt", "orterun:sigint-while-processing",
true, orterun_basename, orterun_basename,
orterun_basename);
gettimeofday(&invoked, NULL);
state = ABORT_SIGNAL_WARNED;
return;
case ABORT_SIGNAL_DONE:
/* Nothing to do -- return */
return;
}
if (!orterun_globals.quiet){
@ -794,16 +809,32 @@ static void abort_signal_callback(int fd, short flags, void *arg)
OBJ_CONSTRUCT(&attrs, opal_list_t);
orte_rmgr.add_attribute(&attrs, ORTE_NS_INCLUDE_DESCENDANTS, ORTE_UNDEF, NULL, ORTE_RMGR_ATTR_OVERRIDE);
ret = orte_pls.terminate_job(jobid, &orte_abort_timeout, &attrs);
while (NULL != (item = opal_list_remove_first(&attrs))) OBJ_RELEASE(item);
while (NULL != (item = opal_list_remove_first(&attrs))) {
OBJ_RELEASE(item);
}
OBJ_DESTRUCT(&attrs);
if (ORTE_SUCCESS != ret) {
opal_show_help("help-orterun.txt", "orterun:forced-end-failed",
true, orterun_basename, ORTE_ERROR_NAME(ret));
/* If we failed the terminate_job() above, then the
condition variable in the main loop in orterun won't
wake up. So signal it. */
if (NULL != (event = (opal_event_t*)
malloc(sizeof(opal_event_t)))) {
opal_evtimer_set(event, exit_callback, NULL);
now.tv_sec = 0;
now.tv_usec = 0;
opal_evtimer_add(event, &now);
} else {
/* We really don't want to do this, but everything
else has failed... */
orterun_globals.exit = true;
orterun_globals.exit_status = 1;
opal_condition_signal(&orterun_globals.cond);
}
jobid = ORTE_JOBID_INVALID;
}
}
state = ABORT_SIGNAL_DONE;
state = ABORT_SIGNAL_DONE;
}