Part 1 of the fix for ticket #726. This commit adds logic to orterun
to effect the following: * The first time the user hits ctrl-c, we go into the process of killing the ORTE job (this is not new). * While waiting for the job to actually terminate, if the user hits ctrl-c a second time, we print a warning saying "Hey, I'm still trying to kill the job. If you *really* want me to die immediately, hit ctrl-c again within 1 second." * If the user hits ctrl-c a third time within 1 second, orterun quits with a warning about how the job may not have actually been killed. Note that none of this logic will really work until the second part of the fix for #726 is also committed (i.e., make pls.terminate_job() non-blocking). So I'm now throwing the ticket over to Ralph for the second part of the fix... Refs trac:726 This commit was SVN r13040. The following Trac tickets were found above: Ticket 726 --> https://svn.open-mpi.org/trac/ompi/ticket/726
Этот коммит содержится в:
родитель
65b04f295a
Коммит
8a289cf1cb
@ -10,6 +10,7 @@
|
|||||||
# University of Stuttgart. All rights reserved.
|
# University of Stuttgart. All rights reserved.
|
||||||
# Copyright (c) 2004-2005 The Regents of the University of California.
|
# Copyright (c) 2004-2005 The Regents of the University of California.
|
||||||
# All rights reserved.
|
# All rights reserved.
|
||||||
|
# Copyright (c) 2007 Cisco Systems, Inc. All rights reserved.
|
||||||
# $COPYRIGHT$
|
# $COPYRIGHT$
|
||||||
#
|
#
|
||||||
# Additional copyrights may follow
|
# Additional copyrights may follow
|
||||||
@ -95,6 +96,13 @@ WARNING: %s encountered an abnormal exit.
|
|||||||
This means that %s exited before it received notification that all
|
This means that %s exited before it received notification that all
|
||||||
started processes had terminated. You should double check and ensure
|
started processes had terminated. You should double check and ensure
|
||||||
that there are no runaway processes still executing.
|
that there are no runaway processes still executing.
|
||||||
|
[orterun:sigint-while-processing]
|
||||||
|
WARNING: %s is in the process of killing a job, but has detected an
|
||||||
|
interruption (probably control-C).
|
||||||
|
|
||||||
|
It is dangerous to interrupt %s while it is killing a job (proper
|
||||||
|
termination may not be guaranteed). Hit control-C again within 1
|
||||||
|
second if you really want to kill %s immediately.
|
||||||
[orterun:empty-prefix]
|
[orterun:empty-prefix]
|
||||||
A prefix was supplied to %s that only contained slashes.
|
A prefix was supplied to %s that only contained slashes.
|
||||||
|
|
||||||
|
@ -10,7 +10,7 @@
|
|||||||
* University of Stuttgart. All rights reserved.
|
* University of Stuttgart. All rights reserved.
|
||||||
* Copyright (c) 2004-2005 The Regents of the University of California.
|
* Copyright (c) 2004-2005 The Regents of the University of California.
|
||||||
* All rights reserved.
|
* All rights reserved.
|
||||||
* Copyright (c) 2006 Cisco Systems, Inc. All rights reserved.
|
* Copyright (c) 2006-2007 Cisco Systems, Inc. All rights reserved.
|
||||||
* $COPYRIGHT$
|
* $COPYRIGHT$
|
||||||
*
|
*
|
||||||
* Additional copyrights may follow
|
* Additional copyrights may follow
|
||||||
@ -706,6 +706,13 @@ static void exit_callback(int fd, short event, void *arg)
|
|||||||
* the job has been aborted.
|
* the job has been aborted.
|
||||||
*/
|
*/
|
||||||
|
|
||||||
|
typedef enum {
|
||||||
|
ABORT_SIGNAL_FIRST,
|
||||||
|
ABORT_SIGNAL_PROCESSING,
|
||||||
|
ABORT_SIGNAL_WARNED,
|
||||||
|
ABORT_SIGNAL_DONE
|
||||||
|
} abort_signal_state_t;
|
||||||
|
|
||||||
static void abort_signal_callback(int fd, short flags, void *arg)
|
static void abort_signal_callback(int fd, short flags, void *arg)
|
||||||
{
|
{
|
||||||
int ret;
|
int ret;
|
||||||
@ -713,14 +720,54 @@ static void abort_signal_callback(int fd, short flags, void *arg)
|
|||||||
opal_event_t* event;
|
opal_event_t* event;
|
||||||
opal_list_t attrs;
|
opal_list_t attrs;
|
||||||
opal_list_item_t *item;
|
opal_list_item_t *item;
|
||||||
|
static abort_signal_state_t state;
|
||||||
static int signalled = 0;
|
static struct timeval invoked, now;
|
||||||
|
double a, b;
|
||||||
|
|
||||||
OPAL_TRACE(1);
|
OPAL_TRACE(1);
|
||||||
|
|
||||||
if (0 != signalled++) {
|
/* If this whole process has already completed, then bail */
|
||||||
return;
|
switch (state) {
|
||||||
|
case ABORT_SIGNAL_FIRST:
|
||||||
|
/* This is the first time through */
|
||||||
|
state = ABORT_SIGNAL_PROCESSING;
|
||||||
|
break;
|
||||||
|
|
||||||
|
case ABORT_SIGNAL_WARNED:
|
||||||
|
gettimeofday(&now, NULL);
|
||||||
|
a = invoked.tv_sec * 1000000 + invoked.tv_usec;
|
||||||
|
b = now.tv_sec * 1000000 + invoked.tv_usec;
|
||||||
|
if (b - a <= 1000000) {
|
||||||
|
/* We are in an event handler; exit_callback() will delete
|
||||||
|
the handler that is currently running (which is a Bad
|
||||||
|
Thing), so we can't call it directly. Instead, we have
|
||||||
|
to exit this handler and setup to call exit_handler()
|
||||||
|
after this. */
|
||||||
|
if (NULL != (event = (opal_event_t*)
|
||||||
|
malloc(sizeof(opal_event_t)))) {
|
||||||
|
opal_evtimer_set(event, exit_callback, NULL);
|
||||||
|
now.tv_sec = 0;
|
||||||
|
now.tv_usec = 0;
|
||||||
|
opal_evtimer_add(event, &now);
|
||||||
|
state = ABORT_SIGNAL_DONE;
|
||||||
|
}
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
/* Otherwise fall through to PROCESSING and warn again */
|
||||||
|
|
||||||
|
case ABORT_SIGNAL_PROCESSING:
|
||||||
|
opal_show_help("help-orterun.txt", "orterun:sigint-while-processing",
|
||||||
|
true, orterun_basename, orterun_basename,
|
||||||
|
orterun_basename);
|
||||||
|
gettimeofday(&invoked, NULL);
|
||||||
|
state = ABORT_SIGNAL_WARNED;
|
||||||
|
return;
|
||||||
|
|
||||||
|
case ABORT_SIGNAL_DONE:
|
||||||
|
/* Nothing to do -- return */
|
||||||
|
return;
|
||||||
}
|
}
|
||||||
|
|
||||||
if (!orterun_globals.quiet){
|
if (!orterun_globals.quiet){
|
||||||
fprintf(stderr, "%s: killing job...\n\n", orterun_basename);
|
fprintf(stderr, "%s: killing job...\n\n", orterun_basename);
|
||||||
}
|
}
|
||||||
@ -746,6 +793,7 @@ static void abort_signal_callback(int fd, short flags, void *arg)
|
|||||||
opal_evtimer_add(event, &tv);
|
opal_evtimer_add(event, &tv);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
state = ABORT_SIGNAL_DONE;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
Загрузка…
x
Ссылка в новой задаче
Block a user