1
1

Part 1 of the fix for ticket #726. This commit adds logic to orterun

to effect the following:

 * The first time the user hits ctrl-c, we go into the process of
   killing the ORTE job (this is not new).
 * While waiting for the job to actually terminate, if the user hits
   ctrl-c a second time, we print a warning saying "Hey, I'm still
   trying to kill the job.  If you *really* want me to die
   immediately, hit ctrl-c again within 1 second."
 * If the user hits ctrl-c a third time within 1 second, orterun quits
   with a warning about how the job may not have actually been killed.

Note that none of this logic will really work until the second part
of the fix for #726 is also committed (i.e., make pls.terminate_job()
non-blocking).  So I'm now throwing the ticket over to Ralph for the
second part of the fix...

Refs trac:726

This commit was SVN r13040.

The following Trac tickets were found above:
  Ticket 726 --> https://svn.open-mpi.org/trac/ompi/ticket/726
Этот коммит содержится в:
Jeff Squyres 2007-01-08 20:25:26 +00:00
родитель 65b04f295a
Коммит 8a289cf1cb
2 изменённых файлов: 61 добавлений и 5 удалений

Просмотреть файл

@ -10,6 +10,7 @@
# University of Stuttgart. All rights reserved. # University of Stuttgart. All rights reserved.
# Copyright (c) 2004-2005 The Regents of the University of California. # Copyright (c) 2004-2005 The Regents of the University of California.
# All rights reserved. # All rights reserved.
# Copyright (c) 2007 Cisco Systems, Inc. All rights reserved.
# $COPYRIGHT$ # $COPYRIGHT$
# #
# Additional copyrights may follow # Additional copyrights may follow
@ -95,6 +96,13 @@ WARNING: %s encountered an abnormal exit.
This means that %s exited before it received notification that all This means that %s exited before it received notification that all
started processes had terminated. You should double check and ensure started processes had terminated. You should double check and ensure
that there are no runaway processes still executing. that there are no runaway processes still executing.
[orterun:sigint-while-processing]
WARNING: %s is in the process of killing a job, but has detected an
interruption (probably control-C).
It is dangerous to interrupt %s while it is killing a job (proper
termination may not be guaranteed). Hit control-C again within 1
second if you really want to kill %s immediately.
[orterun:empty-prefix] [orterun:empty-prefix]
A prefix was supplied to %s that only contained slashes. A prefix was supplied to %s that only contained slashes.

Просмотреть файл

@ -10,7 +10,7 @@
* University of Stuttgart. All rights reserved. * University of Stuttgart. All rights reserved.
* Copyright (c) 2004-2005 The Regents of the University of California. * Copyright (c) 2004-2005 The Regents of the University of California.
* All rights reserved. * All rights reserved.
* Copyright (c) 2006 Cisco Systems, Inc. All rights reserved. * Copyright (c) 2006-2007 Cisco Systems, Inc. All rights reserved.
* $COPYRIGHT$ * $COPYRIGHT$
* *
* Additional copyrights may follow * Additional copyrights may follow
@ -706,6 +706,13 @@ static void exit_callback(int fd, short event, void *arg)
* the job has been aborted. * the job has been aborted.
*/ */
/*
 * States of the abort-signal (ctrl-c) handling state machine used by
 * abort_signal_callback().
 *
 * NOTE: ABORT_SIGNAL_FIRST must remain the first enumerator (value 0):
 * the handler keeps its state in a static variable with no explicit
 * initializer, so it starts out zero-initialized.
 */
typedef enum {
/* No abort signal handled yet; the first signal moves to PROCESSING
   and the handler goes on to kill the job. */
ABORT_SIGNAL_FIRST,
/* Job kill is under way; another signal here prints the
   "sigint-while-processing" warning, records the time, and moves
   to WARNED. */
ABORT_SIGNAL_PROCESSING,
/* Warning has been shown; a further signal within 1 second of it
   schedules exit_callback() and moves to DONE, otherwise the
   warning is shown again. */
ABORT_SIGNAL_WARNED,
/* Handling has finished (or exit has been scheduled); further
   signals are ignored. */
ABORT_SIGNAL_DONE
} abort_signal_state_t;
static void abort_signal_callback(int fd, short flags, void *arg) static void abort_signal_callback(int fd, short flags, void *arg)
{ {
int ret; int ret;
@ -713,14 +720,54 @@ static void abort_signal_callback(int fd, short flags, void *arg)
opal_event_t* event; opal_event_t* event;
opal_list_t attrs; opal_list_t attrs;
opal_list_item_t *item; opal_list_item_t *item;
static abort_signal_state_t state;
static int signalled = 0; static struct timeval invoked, now;
double a, b;
OPAL_TRACE(1); OPAL_TRACE(1);
if (0 != signalled++) { /* If this whole process has already completed, then bail */
return; switch (state) {
case ABORT_SIGNAL_FIRST:
/* This is the first time through */
state = ABORT_SIGNAL_PROCESSING;
break;
case ABORT_SIGNAL_WARNED:
gettimeofday(&now, NULL);
a = invoked.tv_sec * 1000000 + invoked.tv_usec;
b = now.tv_sec * 1000000 + invoked.tv_usec;
if (b - a <= 1000000) {
/* We are in an event handler; exit_callback() will delete
the handler that is currently running (which is a Bad
Thing), so we can't call it directly. Instead, we have
to exit this handler and setup to call exit_handler()
after this. */
if (NULL != (event = (opal_event_t*)
malloc(sizeof(opal_event_t)))) {
opal_evtimer_set(event, exit_callback, NULL);
now.tv_sec = 0;
now.tv_usec = 0;
opal_evtimer_add(event, &now);
state = ABORT_SIGNAL_DONE;
}
return;
}
/* Otherwise fall through to PROCESSING and warn again */
case ABORT_SIGNAL_PROCESSING:
opal_show_help("help-orterun.txt", "orterun:sigint-while-processing",
true, orterun_basename, orterun_basename,
orterun_basename);
gettimeofday(&invoked, NULL);
state = ABORT_SIGNAL_WARNED;
return;
case ABORT_SIGNAL_DONE:
/* Nothing to do -- return */
return;
} }
if (!orterun_globals.quiet){ if (!orterun_globals.quiet){
fprintf(stderr, "%s: killing job...\n\n", orterun_basename); fprintf(stderr, "%s: killing job...\n\n", orterun_basename);
} }
@ -746,6 +793,7 @@ static void abort_signal_callback(int fd, short flags, void *arg)
opal_evtimer_add(event, &tv); opal_evtimer_add(event, &tv);
} }
state = ABORT_SIGNAL_DONE;
} }