Refs trac:726
Tested this functionality quite a bit more and made some fixes:

* Print far fewer help messages
* Fix one additional deadlock upon error
* Change some ORTE_LOG messages to silent (because they're not errors)
* Some code got re-indented, sorry...

Discussed and reviewed with Ralph.

This commit was SVN r13375.

The following Trac tickets were found above:
Ticket 726 --> https://svn.open-mpi.org/trac/ompi/ticket/726
Этот коммит содержится в:
родитель
78a13bc3ea
Коммит
8d872b195a
@ -106,8 +106,9 @@ int orte_pls_base_orted_cancel_operation(void)
|
||||
/* cancel any waiting receive - we don't want to hear it */
|
||||
orte_rml.recv_cancel(ORTE_NAME_WILDCARD, ORTE_RML_TAG_PLS_ORTED_ACK);
|
||||
|
||||
/* set the completion status to reflect cancellation */
|
||||
completion_status = ORTE_ERR_INTERUPTED;
|
||||
/* set the completion status to reflect cancellation -- no need to
|
||||
print anything */
|
||||
completion_status = ORTE_ERR_SILENT;
|
||||
|
||||
/* declare us "done" so we can exit cleanly */
|
||||
opal_condition_signal(&orte_pls_base.orted_cmd_cond);
|
||||
|
@ -91,9 +91,7 @@ Returned value %d instead of ORTE_SUCCESS.
|
||||
[orterun:proc-aborted-strsignal]
|
||||
%s noticed that job rank %lu with PID %lu on node %s exited on signal %d (%s).
|
||||
[orterun:abnormal-exit]
|
||||
WARNING: %s encountered an abnormal exit.
|
||||
|
||||
This means that %s exited before it received notification that all
|
||||
WARNING: %s has exited before it received notification that all
|
||||
started processes had terminated. You should double check and ensure
|
||||
that there are no runaway processes still executing.
|
||||
#
|
||||
@ -105,12 +103,6 @@ It is dangerous to interrupt %s while it is killing a job (proper
|
||||
termination may not be guaranteed). Hit control-C again within 1
|
||||
second if you really want to kill %s immediately.
|
||||
#
|
||||
[orterun:forced-end-failed]
|
||||
WARNING: %s was ordered to kill a job (probably with control-C), but
|
||||
was unable to successfully complete that order (returned error %s).
|
||||
You should double check and ensure that there are no runaway processes
|
||||
still executing.
|
||||
#
|
||||
[orterun:empty-prefix]
|
||||
A prefix was supplied to %s that only contained slashes.
|
||||
|
||||
|
@ -436,8 +436,11 @@ int orterun(int argc, char *argv[])
|
||||
&orterun_globals.lock);
|
||||
}
|
||||
/* check to see if the job was aborted */
|
||||
if (ORTE_SUCCESS != (rc = orte_smr.get_job_state(&exit_state, jobid))) {
|
||||
ORTE_ERROR_LOG(rc);
|
||||
if (ORTE_JOBID_INVALID != jobid &&
|
||||
ORTE_SUCCESS != (rc = orte_smr.get_job_state(&exit_state, jobid))) {
|
||||
if (ORTE_SUCCESS != rc) {
|
||||
ORTE_ERROR_LOG(rc);
|
||||
}
|
||||
/* define the exit state as abnormal by default */
|
||||
exit_state = ORTE_JOB_STATE_ABORTED;
|
||||
}
|
||||
@ -472,16 +475,26 @@ int orterun(int argc, char *argv[])
|
||||
* be sure to include any descendants so nothing is
|
||||
* left hanging
|
||||
*/
|
||||
OBJ_CONSTRUCT(&attributes, opal_list_t);
|
||||
orte_rmgr.add_attribute(&attributes, ORTE_NS_INCLUDE_DESCENDANTS, ORTE_UNDEF, NULL, ORTE_RMGR_ATTR_OVERRIDE);
|
||||
if (ORTE_SUCCESS != (ret = orte_pls.terminate_orteds(jobid, &orte_abort_timeout, &attributes))) {
|
||||
opal_show_help("help-orterun.txt", "orterun:daemon-die", true,
|
||||
orterun_basename, ORTE_ERROR_NAME(ret));
|
||||
if (ORTE_JOBID_INVALID != jobid) {
|
||||
OBJ_CONSTRUCT(&attributes, opal_list_t);
|
||||
orte_rmgr.add_attribute(&attributes, ORTE_NS_INCLUDE_DESCENDANTS, ORTE_UNDEF, NULL, ORTE_RMGR_ATTR_OVERRIDE);
|
||||
if (ORTE_SUCCESS != (ret = orte_pls.terminate_orteds(jobid, &orte_abort_timeout, &attributes))) {
|
||||
opal_show_help("help-orterun.txt", "orterun:daemon-die", true,
|
||||
orterun_basename, ORTE_ERROR_NAME(ret));
|
||||
}
|
||||
while (NULL != (item = opal_list_remove_first(&attributes))) {
|
||||
OBJ_RELEASE(item);
|
||||
}
|
||||
OBJ_DESTRUCT(&attributes);
|
||||
}
|
||||
while (NULL != (item = opal_list_remove_first(&attributes))) OBJ_RELEASE(item);
|
||||
OBJ_DESTRUCT(&attributes);
|
||||
OPAL_THREAD_UNLOCK(&orterun_globals.lock);
|
||||
|
||||
/* If we were forcibly killed, print a warning that the
|
||||
user may still have some manual cleanup to do. */
|
||||
if (ORTE_JOBID_INVALID == jobid) {
|
||||
opal_show_help("help-orterun.txt", "orterun:abnormal-exit",
|
||||
true, orterun_basename, orterun_basename);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@ -732,54 +745,56 @@ static void abort_signal_callback(int fd, short flags, void *arg)
|
||||
|
||||
/* If this whole process has already completed, then bail */
|
||||
switch (state) {
|
||||
case ABORT_SIGNAL_FIRST:
|
||||
/* This is the first time through */
|
||||
state = ABORT_SIGNAL_PROCESSING;
|
||||
break;
|
||||
case ABORT_SIGNAL_FIRST:
|
||||
/* This is the first time through */
|
||||
state = ABORT_SIGNAL_PROCESSING;
|
||||
break;
|
||||
|
||||
case ABORT_SIGNAL_WARNED:
|
||||
gettimeofday(&now, NULL);
|
||||
a = invoked.tv_sec * 1000000 + invoked.tv_usec;
|
||||
b = now.tv_sec * 1000000 + invoked.tv_usec;
|
||||
if (b - a <= 1000000) {
|
||||
/* tell the pls to cancel the terminate request -
|
||||
* obviously, something is wrong at this point
|
||||
*/
|
||||
if (ORTE_SUCCESS != (ret = orte_pls.cancel_operation())) {
|
||||
ORTE_ERROR_LOG(ret);
|
||||
}
|
||||
/* give the user the warning about manual cleanup */
|
||||
opal_show_help("help-orterun.txt", "orterun:abnormal-exit",
|
||||
true, orterun_basename, orterun_basename);
|
||||
case ABORT_SIGNAL_WARNED:
|
||||
gettimeofday(&now, NULL);
|
||||
a = invoked.tv_sec * 1000000 + invoked.tv_usec;
|
||||
b = now.tv_sec * 1000000 + invoked.tv_usec;
|
||||
if (b - a <= 1000000) {
|
||||
if (!orterun_globals.quiet){
|
||||
fprintf(stderr, "%s: forcibly killing job...\n",
|
||||
orterun_basename);
|
||||
}
|
||||
|
||||
/* tell the pls to cancel the terminate request -
|
||||
* obviously, something is wrong at this point
|
||||
*/
|
||||
if (ORTE_SUCCESS != (ret = orte_pls.cancel_operation())) {
|
||||
ORTE_ERROR_LOG(ret);
|
||||
}
|
||||
|
||||
/* We are in an event handler; exit_callback() will delete
|
||||
the handler that is currently running (which is a Bad
|
||||
Thing), so we can't call it directly. Instead, we have
|
||||
to exit this handler and setup to call exit_handler()
|
||||
after this. */
|
||||
if (NULL != (event = (opal_event_t*)
|
||||
malloc(sizeof(opal_event_t)))) {
|
||||
opal_evtimer_set(event, exit_callback, NULL);
|
||||
now.tv_sec = 0;
|
||||
now.tv_usec = 0;
|
||||
opal_evtimer_add(event, &now);
|
||||
state = ABORT_SIGNAL_DONE;
|
||||
}
|
||||
return;
|
||||
}
|
||||
/* Otherwise fall through to PROCESSING and warn again */
|
||||
|
||||
/* We are in an event handler; exit_callback() will delete
|
||||
the handler that is currently running (which is a Bad
|
||||
Thing), so we can't call it directly. Instead, we have
|
||||
to exit this handler and setup to call exit_handler()
|
||||
after this. */
|
||||
if (NULL != (event = (opal_event_t*)
|
||||
malloc(sizeof(opal_event_t)))) {
|
||||
opal_evtimer_set(event, exit_callback, NULL);
|
||||
now.tv_sec = 0;
|
||||
now.tv_usec = 0;
|
||||
opal_evtimer_add(event, &now);
|
||||
state = ABORT_SIGNAL_DONE;
|
||||
}
|
||||
return;
|
||||
}
|
||||
/* Otherwise fall through to PROCESSING and warn again */
|
||||
|
||||
case ABORT_SIGNAL_PROCESSING:
|
||||
opal_show_help("help-orterun.txt", "orterun:sigint-while-processing",
|
||||
true, orterun_basename, orterun_basename,
|
||||
orterun_basename);
|
||||
gettimeofday(&invoked, NULL);
|
||||
state = ABORT_SIGNAL_WARNED;
|
||||
return;
|
||||
|
||||
case ABORT_SIGNAL_DONE:
|
||||
/* Nothing to do -- return */
|
||||
return;
|
||||
case ABORT_SIGNAL_PROCESSING:
|
||||
opal_show_help("help-orterun.txt", "orterun:sigint-while-processing",
|
||||
true, orterun_basename, orterun_basename,
|
||||
orterun_basename);
|
||||
gettimeofday(&invoked, NULL);
|
||||
state = ABORT_SIGNAL_WARNED;
|
||||
return;
|
||||
|
||||
case ABORT_SIGNAL_DONE:
|
||||
/* Nothing to do -- return */
|
||||
return;
|
||||
}
|
||||
|
||||
if (!orterun_globals.quiet){
|
||||
@ -794,16 +809,32 @@ static void abort_signal_callback(int fd, short flags, void *arg)
|
||||
OBJ_CONSTRUCT(&attrs, opal_list_t);
|
||||
orte_rmgr.add_attribute(&attrs, ORTE_NS_INCLUDE_DESCENDANTS, ORTE_UNDEF, NULL, ORTE_RMGR_ATTR_OVERRIDE);
|
||||
ret = orte_pls.terminate_job(jobid, &orte_abort_timeout, &attrs);
|
||||
while (NULL != (item = opal_list_remove_first(&attrs))) OBJ_RELEASE(item);
|
||||
while (NULL != (item = opal_list_remove_first(&attrs))) {
|
||||
OBJ_RELEASE(item);
|
||||
}
|
||||
OBJ_DESTRUCT(&attrs);
|
||||
if (ORTE_SUCCESS != ret) {
|
||||
opal_show_help("help-orterun.txt", "orterun:forced-end-failed",
|
||||
true, orterun_basename, ORTE_ERROR_NAME(ret));
|
||||
/* If we failed the terminate_job() above, then the
|
||||
condition variable in the main loop in orterun won't
|
||||
wake up. So signal it. */
|
||||
if (NULL != (event = (opal_event_t*)
|
||||
malloc(sizeof(opal_event_t)))) {
|
||||
opal_evtimer_set(event, exit_callback, NULL);
|
||||
now.tv_sec = 0;
|
||||
now.tv_usec = 0;
|
||||
opal_evtimer_add(event, &now);
|
||||
} else {
|
||||
/* We really don't want to do this, but everything
|
||||
else has failed... */
|
||||
orterun_globals.exit = true;
|
||||
orterun_globals.exit_status = 1;
|
||||
opal_condition_signal(&orterun_globals.cond);
|
||||
}
|
||||
|
||||
jobid = ORTE_JOBID_INVALID;
|
||||
}
|
||||
}
|
||||
|
||||
state = ABORT_SIGNAL_DONE;
|
||||
|
||||
state = ABORT_SIGNAL_DONE;
|
||||
}
|
||||
|
||||
|
Загрузка…
Ссылка в новой задаче
Block a user