Part 1 of the fix for ticket #726. This commit adds logic to orteun
to effect the following: * The first time the user hits ctrl-c, we go into the process of killing the ORTE job (this is not new). * While waiting for the job to actually terminate, if the user hits ctrl-c a second time, we print a warning saying "Hey, I'm still trying to kill the job. If you *really* want me to die immediately, hit ctrl-c again within 1 second." * If the user hits ctrl-c a within 1 second, orterun quits with a warning about how the job may not have actually been killed. Note that none of this logic won't really work until the second part of the fix for #726 is also committed (i.e., make pls.terminate_job() non-blocking). So I'm now throwing the ticket over to Ralph for the second part of the fix... Refs trac:726 This commit was SVN r13040. The following Trac tickets were found above: Ticket 726 --> https://svn.open-mpi.org/trac/ompi/ticket/726
Этот коммит содержится в:
родитель
65b04f295a
Коммит
8a289cf1cb
@ -10,6 +10,7 @@
|
||||
# University of Stuttgart. All rights reserved.
|
||||
# Copyright (c) 2004-2005 The Regents of the University of California.
|
||||
# All rights reserved.
|
||||
# Copyright (c) 2007 Cisco Systems, Inc. All rights reserved.
|
||||
# $COPYRIGHT$
|
||||
#
|
||||
# Additional copyrights may follow
|
||||
@ -95,6 +96,13 @@ WARNING: %s encountered an abnormal exit.
|
||||
This means that %s exited before it received notification that all
|
||||
started processes had terminated. You should double check and ensure
|
||||
that there are no runaway processes still executing.
|
||||
[orterun:sigint-while-processing]
|
||||
WARNING: %s is in the process of killing a job, but has detected an
|
||||
interruption (probably control-C).
|
||||
|
||||
It is dangerous to interrupt %s while it is killing a job (proper
|
||||
termination may not be guaranteed). Hit control-C again within 1
|
||||
second if you really want to kill %s immediately.
|
||||
[orterun:empty-prefix]
|
||||
A prefix was supplied to %s that only contained slashes.
|
||||
|
||||
|
@ -10,7 +10,7 @@
|
||||
* University of Stuttgart. All rights reserved.
|
||||
* Copyright (c) 2004-2005 The Regents of the University of California.
|
||||
* All rights reserved.
|
||||
* Copyright (c) 2006 Cisco Systems, Inc. All rights reserved.
|
||||
* Copyright (c) 2006-2007 Cisco Systems, Inc. All rights reserved.
|
||||
* $COPYRIGHT$
|
||||
*
|
||||
* Additional copyrights may follow
|
||||
@ -706,6 +706,13 @@ static void exit_callback(int fd, short event, void *arg)
|
||||
* the job has been aborted.
|
||||
*/
|
||||
|
||||
typedef enum {
|
||||
ABORT_SIGNAL_FIRST,
|
||||
ABORT_SIGNAL_PROCESSING,
|
||||
ABORT_SIGNAL_WARNED,
|
||||
ABORT_SIGNAL_DONE
|
||||
} abort_signal_state_t;
|
||||
|
||||
static void abort_signal_callback(int fd, short flags, void *arg)
|
||||
{
|
||||
int ret;
|
||||
@ -713,14 +720,54 @@ static void abort_signal_callback(int fd, short flags, void *arg)
|
||||
opal_event_t* event;
|
||||
opal_list_t attrs;
|
||||
opal_list_item_t *item;
|
||||
|
||||
static int signalled = 0;
|
||||
static abort_signal_state_t state;
|
||||
static struct timeval invoked, now;
|
||||
double a, b;
|
||||
|
||||
OPAL_TRACE(1);
|
||||
|
||||
if (0 != signalled++) {
|
||||
return;
|
||||
/* If this whole process has already completed, then bail */
|
||||
switch (state) {
|
||||
case ABORT_SIGNAL_FIRST:
|
||||
/* This is the first time through */
|
||||
state = ABORT_SIGNAL_PROCESSING;
|
||||
break;
|
||||
|
||||
case ABORT_SIGNAL_WARNED:
|
||||
gettimeofday(&now, NULL);
|
||||
a = invoked.tv_sec * 1000000 + invoked.tv_usec;
|
||||
b = now.tv_sec * 1000000 + invoked.tv_usec;
|
||||
if (b - a <= 1000000) {
|
||||
/* We are in an event handler; exit_callback() will delete
|
||||
the handler that is currently running (which is a Bad
|
||||
Thing), so we can't call it directly. Instead, we have
|
||||
to exit this handler and setup to call exit_handler()
|
||||
after this. */
|
||||
if (NULL != (event = (opal_event_t*)
|
||||
malloc(sizeof(opal_event_t)))) {
|
||||
opal_evtimer_set(event, exit_callback, NULL);
|
||||
now.tv_sec = 0;
|
||||
now.tv_usec = 0;
|
||||
opal_evtimer_add(event, &now);
|
||||
state = ABORT_SIGNAL_DONE;
|
||||
}
|
||||
return;
|
||||
}
|
||||
/* Otherwise fall through to PROCESSING and warn again */
|
||||
|
||||
case ABORT_SIGNAL_PROCESSING:
|
||||
opal_show_help("help-orterun.txt", "orterun:sigint-while-processing",
|
||||
true, orterun_basename, orterun_basename,
|
||||
orterun_basename);
|
||||
gettimeofday(&invoked, NULL);
|
||||
state = ABORT_SIGNAL_WARNED;
|
||||
return;
|
||||
|
||||
case ABORT_SIGNAL_DONE:
|
||||
/* Nothing to do -- return */
|
||||
return;
|
||||
}
|
||||
|
||||
if (!orterun_globals.quiet){
|
||||
fprintf(stderr, "%s: killing job...\n\n", orterun_basename);
|
||||
}
|
||||
@ -746,6 +793,7 @@ static void abort_signal_callback(int fd, short flags, void *arg)
|
||||
opal_evtimer_add(event, &tv);
|
||||
}
|
||||
|
||||
state = ABORT_SIGNAL_DONE;
|
||||
}
|
||||
|
||||
|
||||
|
Загрузка…
x
Ссылка в новой задаче
Block a user