Refs trac:726
Tested this functionality quite a bit more and made some fixes:

* Print far fewer help messages
* Fix one additional deadlock upon error
* Change some ORTE_LOG messages to silent (because they're not errors)
* Some code got re-indented, sorry...

Discussed and reviewed with Ralph.

This commit was SVN r13375.

The following Trac tickets were found above:
  Ticket 726 --> https://svn.open-mpi.org/trac/ompi/ticket/726
Этот коммит содержится в:
родитель
78a13bc3ea
Коммит
8d872b195a
@ -106,8 +106,9 @@ int orte_pls_base_orted_cancel_operation(void)
|
|||||||
/* cancel any waiting receive - we don't want to hear it */
|
/* cancel any waiting receive - we don't want to hear it */
|
||||||
orte_rml.recv_cancel(ORTE_NAME_WILDCARD, ORTE_RML_TAG_PLS_ORTED_ACK);
|
orte_rml.recv_cancel(ORTE_NAME_WILDCARD, ORTE_RML_TAG_PLS_ORTED_ACK);
|
||||||
|
|
||||||
/* set the completion status to reflect cancellation */
|
/* set the completion status to reflect cancellation -- no need to
|
||||||
completion_status = ORTE_ERR_INTERUPTED;
|
print anything */
|
||||||
|
completion_status = ORTE_ERR_SILENT;
|
||||||
|
|
||||||
/* declare us "done" so we can exit cleanly */
|
/* declare us "done" so we can exit cleanly */
|
||||||
opal_condition_signal(&orte_pls_base.orted_cmd_cond);
|
opal_condition_signal(&orte_pls_base.orted_cmd_cond);
|
||||||
|
@ -91,9 +91,7 @@ Returned value %d instead of ORTE_SUCCESS.
|
|||||||
[orterun:proc-aborted-strsignal]
|
[orterun:proc-aborted-strsignal]
|
||||||
%s noticed that job rank %lu with PID %lu on node %s exited on signal %d (%s).
|
%s noticed that job rank %lu with PID %lu on node %s exited on signal %d (%s).
|
||||||
[orterun:abnormal-exit]
|
[orterun:abnormal-exit]
|
||||||
WARNING: %s encountered an abnormal exit.
|
WARNING: %s has exited before it received notification that all
|
||||||
|
|
||||||
This means that %s exited before it received notification that all
|
|
||||||
started processes had terminated. You should double check and ensure
|
started processes had terminated. You should double check and ensure
|
||||||
that there are no runaway processes still executing.
|
that there are no runaway processes still executing.
|
||||||
#
|
#
|
||||||
@ -105,12 +103,6 @@ It is dangerous to interrupt %s while it is killing a job (proper
|
|||||||
termination may not be guaranteed). Hit control-C again within 1
|
termination may not be guaranteed). Hit control-C again within 1
|
||||||
second if you really want to kill %s immediately.
|
second if you really want to kill %s immediately.
|
||||||
#
|
#
|
||||||
[orterun:forced-end-failed]
|
|
||||||
WARNING: %s was ordered to kill a job (probably with control-C), but
|
|
||||||
was unable to successfully complete that order (returned error %s).
|
|
||||||
You should double check and ensure that there are no runaway processes
|
|
||||||
still executing.
|
|
||||||
#
|
|
||||||
[orterun:empty-prefix]
|
[orterun:empty-prefix]
|
||||||
A prefix was supplied to %s that only contained slashes.
|
A prefix was supplied to %s that only contained slashes.
|
||||||
|
|
||||||
|
@ -436,8 +436,11 @@ int orterun(int argc, char *argv[])
|
|||||||
&orterun_globals.lock);
|
&orterun_globals.lock);
|
||||||
}
|
}
|
||||||
/* check to see if the job was aborted */
|
/* check to see if the job was aborted */
|
||||||
if (ORTE_SUCCESS != (rc = orte_smr.get_job_state(&exit_state, jobid))) {
|
if (ORTE_JOBID_INVALID != jobid &&
|
||||||
ORTE_ERROR_LOG(rc);
|
ORTE_SUCCESS != (rc = orte_smr.get_job_state(&exit_state, jobid))) {
|
||||||
|
if (ORTE_SUCCESS != rc) {
|
||||||
|
ORTE_ERROR_LOG(rc);
|
||||||
|
}
|
||||||
/* define the exit state as abnormal by default */
|
/* define the exit state as abnormal by default */
|
||||||
exit_state = ORTE_JOB_STATE_ABORTED;
|
exit_state = ORTE_JOB_STATE_ABORTED;
|
||||||
}
|
}
|
||||||
@ -472,16 +475,26 @@ int orterun(int argc, char *argv[])
|
|||||||
* be sure to include any descendants so nothing is
|
* be sure to include any descendants so nothing is
|
||||||
* left hanging
|
* left hanging
|
||||||
*/
|
*/
|
||||||
OBJ_CONSTRUCT(&attributes, opal_list_t);
|
if (ORTE_JOBID_INVALID != jobid) {
|
||||||
orte_rmgr.add_attribute(&attributes, ORTE_NS_INCLUDE_DESCENDANTS, ORTE_UNDEF, NULL, ORTE_RMGR_ATTR_OVERRIDE);
|
OBJ_CONSTRUCT(&attributes, opal_list_t);
|
||||||
if (ORTE_SUCCESS != (ret = orte_pls.terminate_orteds(jobid, &orte_abort_timeout, &attributes))) {
|
orte_rmgr.add_attribute(&attributes, ORTE_NS_INCLUDE_DESCENDANTS, ORTE_UNDEF, NULL, ORTE_RMGR_ATTR_OVERRIDE);
|
||||||
opal_show_help("help-orterun.txt", "orterun:daemon-die", true,
|
if (ORTE_SUCCESS != (ret = orte_pls.terminate_orteds(jobid, &orte_abort_timeout, &attributes))) {
|
||||||
orterun_basename, ORTE_ERROR_NAME(ret));
|
opal_show_help("help-orterun.txt", "orterun:daemon-die", true,
|
||||||
|
orterun_basename, ORTE_ERROR_NAME(ret));
|
||||||
|
}
|
||||||
|
while (NULL != (item = opal_list_remove_first(&attributes))) {
|
||||||
|
OBJ_RELEASE(item);
|
||||||
|
}
|
||||||
|
OBJ_DESTRUCT(&attributes);
|
||||||
}
|
}
|
||||||
while (NULL != (item = opal_list_remove_first(&attributes))) OBJ_RELEASE(item);
|
|
||||||
OBJ_DESTRUCT(&attributes);
|
|
||||||
OPAL_THREAD_UNLOCK(&orterun_globals.lock);
|
OPAL_THREAD_UNLOCK(&orterun_globals.lock);
|
||||||
|
|
||||||
|
/* If we were forcibly killed, print a warning that the
|
||||||
|
user may still have some manual cleanup to do. */
|
||||||
|
if (ORTE_JOBID_INVALID == jobid) {
|
||||||
|
opal_show_help("help-orterun.txt", "orterun:abnormal-exit",
|
||||||
|
true, orterun_basename, orterun_basename);
|
||||||
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -732,54 +745,56 @@ static void abort_signal_callback(int fd, short flags, void *arg)
|
|||||||
|
|
||||||
/* If this whole process has already completed, then bail */
|
/* If this whole process has already completed, then bail */
|
||||||
switch (state) {
|
switch (state) {
|
||||||
case ABORT_SIGNAL_FIRST:
|
case ABORT_SIGNAL_FIRST:
|
||||||
/* This is the first time through */
|
/* This is the first time through */
|
||||||
state = ABORT_SIGNAL_PROCESSING;
|
state = ABORT_SIGNAL_PROCESSING;
|
||||||
break;
|
break;
|
||||||
|
|
||||||
case ABORT_SIGNAL_WARNED:
|
case ABORT_SIGNAL_WARNED:
|
||||||
gettimeofday(&now, NULL);
|
gettimeofday(&now, NULL);
|
||||||
a = invoked.tv_sec * 1000000 + invoked.tv_usec;
|
a = invoked.tv_sec * 1000000 + invoked.tv_usec;
|
||||||
b = now.tv_sec * 1000000 + invoked.tv_usec;
|
b = now.tv_sec * 1000000 + invoked.tv_usec;
|
||||||
if (b - a <= 1000000) {
|
if (b - a <= 1000000) {
|
||||||
/* tell the pls to cancel the terminate request -
|
if (!orterun_globals.quiet){
|
||||||
* obviously, something is wrong at this point
|
fprintf(stderr, "%s: forcibly killing job...\n",
|
||||||
*/
|
orterun_basename);
|
||||||
if (ORTE_SUCCESS != (ret = orte_pls.cancel_operation())) {
|
|
||||||
ORTE_ERROR_LOG(ret);
|
|
||||||
}
|
|
||||||
/* give the user the warning about manual cleanup */
|
|
||||||
opal_show_help("help-orterun.txt", "orterun:abnormal-exit",
|
|
||||||
true, orterun_basename, orterun_basename);
|
|
||||||
|
|
||||||
/* We are in an event handler; exit_callback() will delete
|
|
||||||
the handler that is currently running (which is a Bad
|
|
||||||
Thing), so we can't call it directly. Instead, we have
|
|
||||||
to exit this handler and setup to call exit_handler()
|
|
||||||
after this. */
|
|
||||||
if (NULL != (event = (opal_event_t*)
|
|
||||||
malloc(sizeof(opal_event_t)))) {
|
|
||||||
opal_evtimer_set(event, exit_callback, NULL);
|
|
||||||
now.tv_sec = 0;
|
|
||||||
now.tv_usec = 0;
|
|
||||||
opal_evtimer_add(event, &now);
|
|
||||||
state = ABORT_SIGNAL_DONE;
|
|
||||||
}
|
|
||||||
return;
|
|
||||||
}
|
}
|
||||||
/* Otherwise fall through to PROCESSING and warn again */
|
|
||||||
|
|
||||||
case ABORT_SIGNAL_PROCESSING:
|
/* tell the pls to cancel the terminate request -
|
||||||
opal_show_help("help-orterun.txt", "orterun:sigint-while-processing",
|
* obviously, something is wrong at this point
|
||||||
true, orterun_basename, orterun_basename,
|
*/
|
||||||
orterun_basename);
|
if (ORTE_SUCCESS != (ret = orte_pls.cancel_operation())) {
|
||||||
gettimeofday(&invoked, NULL);
|
ORTE_ERROR_LOG(ret);
|
||||||
state = ABORT_SIGNAL_WARNED;
|
}
|
||||||
return;
|
|
||||||
|
|
||||||
case ABORT_SIGNAL_DONE:
|
/* We are in an event handler; exit_callback() will delete
|
||||||
/* Nothing to do -- return */
|
the handler that is currently running (which is a Bad
|
||||||
return;
|
Thing), so we can't call it directly. Instead, we have
|
||||||
|
to exit this handler and setup to call exit_handler()
|
||||||
|
after this. */
|
||||||
|
if (NULL != (event = (opal_event_t*)
|
||||||
|
malloc(sizeof(opal_event_t)))) {
|
||||||
|
opal_evtimer_set(event, exit_callback, NULL);
|
||||||
|
now.tv_sec = 0;
|
||||||
|
now.tv_usec = 0;
|
||||||
|
opal_evtimer_add(event, &now);
|
||||||
|
state = ABORT_SIGNAL_DONE;
|
||||||
|
}
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
/* Otherwise fall through to PROCESSING and warn again */
|
||||||
|
|
||||||
|
case ABORT_SIGNAL_PROCESSING:
|
||||||
|
opal_show_help("help-orterun.txt", "orterun:sigint-while-processing",
|
||||||
|
true, orterun_basename, orterun_basename,
|
||||||
|
orterun_basename);
|
||||||
|
gettimeofday(&invoked, NULL);
|
||||||
|
state = ABORT_SIGNAL_WARNED;
|
||||||
|
return;
|
||||||
|
|
||||||
|
case ABORT_SIGNAL_DONE:
|
||||||
|
/* Nothing to do -- return */
|
||||||
|
return;
|
||||||
}
|
}
|
||||||
|
|
||||||
if (!orterun_globals.quiet){
|
if (!orterun_globals.quiet){
|
||||||
@ -794,17 +809,33 @@ static void abort_signal_callback(int fd, short flags, void *arg)
|
|||||||
OBJ_CONSTRUCT(&attrs, opal_list_t);
|
OBJ_CONSTRUCT(&attrs, opal_list_t);
|
||||||
orte_rmgr.add_attribute(&attrs, ORTE_NS_INCLUDE_DESCENDANTS, ORTE_UNDEF, NULL, ORTE_RMGR_ATTR_OVERRIDE);
|
orte_rmgr.add_attribute(&attrs, ORTE_NS_INCLUDE_DESCENDANTS, ORTE_UNDEF, NULL, ORTE_RMGR_ATTR_OVERRIDE);
|
||||||
ret = orte_pls.terminate_job(jobid, &orte_abort_timeout, &attrs);
|
ret = orte_pls.terminate_job(jobid, &orte_abort_timeout, &attrs);
|
||||||
while (NULL != (item = opal_list_remove_first(&attrs))) OBJ_RELEASE(item);
|
while (NULL != (item = opal_list_remove_first(&attrs))) {
|
||||||
|
OBJ_RELEASE(item);
|
||||||
|
}
|
||||||
OBJ_DESTRUCT(&attrs);
|
OBJ_DESTRUCT(&attrs);
|
||||||
if (ORTE_SUCCESS != ret) {
|
if (ORTE_SUCCESS != ret) {
|
||||||
opal_show_help("help-orterun.txt", "orterun:forced-end-failed",
|
/* If we failed the terminate_job() above, then the
|
||||||
true, orterun_basename, ORTE_ERROR_NAME(ret));
|
condition variable in the main loop in orterun won't
|
||||||
|
wake up. So signal it. */
|
||||||
|
if (NULL != (event = (opal_event_t*)
|
||||||
|
malloc(sizeof(opal_event_t)))) {
|
||||||
|
opal_evtimer_set(event, exit_callback, NULL);
|
||||||
|
now.tv_sec = 0;
|
||||||
|
now.tv_usec = 0;
|
||||||
|
opal_evtimer_add(event, &now);
|
||||||
|
} else {
|
||||||
|
/* We really don't want to do this, but everything
|
||||||
|
else has failed... */
|
||||||
|
orterun_globals.exit = true;
|
||||||
|
orterun_globals.exit_status = 1;
|
||||||
|
opal_condition_signal(&orterun_globals.cond);
|
||||||
|
}
|
||||||
|
|
||||||
jobid = ORTE_JOBID_INVALID;
|
jobid = ORTE_JOBID_INVALID;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
state = ABORT_SIGNAL_DONE;
|
state = ABORT_SIGNAL_DONE;
|
||||||
state = ABORT_SIGNAL_DONE;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
|
Загрузка…
x
Ссылка в новой задаче
Block a user