2004-11-01 16:16:05 +00:00
|
|
|
/*
|
2005-11-05 19:57:48 +00:00
|
|
|
* Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
|
|
|
|
* University Research and Technology
|
|
|
|
* Corporation. All rights reserved.
|
|
|
|
* Copyright (c) 2004-2005 The University of Tennessee and The University
|
|
|
|
* of Tennessee Research Foundation. All rights
|
|
|
|
* reserved.
|
2004-11-28 20:09:25 +00:00
|
|
|
* Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
|
|
|
|
* University of Stuttgart. All rights reserved.
|
2005-03-24 12:43:37 +00:00
|
|
|
* Copyright (c) 2004-2005 The Regents of the University of California.
|
|
|
|
* All rights reserved.
|
2006-03-31 00:31:15 +00:00
|
|
|
* Copyright (c) 2006 Cisco Systems, Inc. All rights reserved.
|
2004-11-22 01:38:40 +00:00
|
|
|
* $COPYRIGHT$
|
|
|
|
*
|
|
|
|
* Additional copyrights may follow
|
|
|
|
*
|
2004-11-01 16:16:05 +00:00
|
|
|
* $HEADER$
|
|
|
|
*/
|
|
|
|
|
|
|
|
#include "ompi_config.h"
|
|
|
|
|
2005-01-20 00:03:23 +00:00
|
|
|
#ifdef HAVE_UNISTD_H
|
2004-11-05 07:52:30 +00:00
|
|
|
#include <unistd.h>
|
2005-01-20 00:03:23 +00:00
|
|
|
#endif
|
2006-03-31 00:31:15 +00:00
|
|
|
#ifdef HAVE_EXECINFO_H
|
|
|
|
#include <execinfo.h>
|
|
|
|
#endif
|
|
|
|
#ifdef HAVE_SYS_TYPES_H
|
|
|
|
#include <sys/types.h>
|
|
|
|
#endif
|
|
|
|
#ifdef HAVE_SYS_PARAM_H
|
|
|
|
#include <sys/param.h>
|
|
|
|
#endif
|
|
|
|
#ifdef HAVE_NETDB_H
|
|
|
|
#include <netdb.h>
|
|
|
|
#endif
|
2004-11-05 07:52:30 +00:00
|
|
|
|
2006-03-31 00:31:15 +00:00
|
|
|
#include "opal/event/event.h"
|
2005-07-04 02:38:44 +00:00
|
|
|
#include "opal/util/show_help.h"
|
2006-02-12 01:33:29 +00:00
|
|
|
#include "orte/util/proc_info.h"
|
2005-07-03 12:07:29 +00:00
|
|
|
#include "orte/runtime/runtime.h"
|
2005-09-12 20:25:01 +00:00
|
|
|
#include "orte/mca/ns/ns.h"
|
|
|
|
#include "orte/mca/rmgr/rmgr.h"
|
2006-03-31 00:31:15 +00:00
|
|
|
#include "ompi/communicator/communicator.h"
|
2005-09-12 20:25:01 +00:00
|
|
|
#include "ompi/proc/proc.h"
|
2006-03-31 00:31:15 +00:00
|
|
|
#include "ompi/runtime/mpiruntime.h"
|
|
|
|
#include "ompi/runtime/params.h"
|
2004-11-01 16:16:05 +00:00
|
|
|
|
|
|
|
#if HAVE_SIGNAL_H
|
|
|
|
#include <signal.h>
|
|
|
|
#endif
|
|
|
|
|
2004-12-14 15:47:31 +00:00
|
|
|
static
|
|
|
|
int
|
|
|
|
abort_procs(ompi_proc_t **procs, int proc_count,
|
2005-03-14 20:57:21 +00:00
|
|
|
orte_jobid_t my_jobid)
|
2004-12-14 15:47:31 +00:00
|
|
|
{
|
|
|
|
int i;
|
|
|
|
int ret = OMPI_SUCCESS;
|
2005-03-14 20:57:21 +00:00
|
|
|
int killret=OMPI_SUCCESS;
|
|
|
|
orte_jobid_t jobid;
|
2004-12-14 15:47:31 +00:00
|
|
|
|
|
|
|
for (i = 0 ; i < proc_count ; ++i) {
|
2005-03-14 20:57:21 +00:00
|
|
|
if (ORTE_SUCCESS != (ret = orte_ns.get_jobid(&jobid, &(procs[i]->proc_name)))) {
|
|
|
|
return ret;
|
|
|
|
}
|
2004-12-14 15:47:31 +00:00
|
|
|
if (jobid == my_jobid) continue;
|
|
|
|
|
2005-04-07 13:05:02 +00:00
|
|
|
killret = orte_rmgr.terminate_job(jobid);
|
|
|
|
|
2004-12-14 15:47:31 +00:00
|
|
|
if (OMPI_SUCCESS != killret) ret = killret;
|
|
|
|
}
|
|
|
|
|
|
|
|
return ret;
|
|
|
|
}
|
|
|
|
|
|
|
|
|
2004-11-01 16:16:05 +00:00
|
|
|
int
|
|
|
|
ompi_mpi_abort(struct ompi_communicator_t* comm,
|
|
|
|
int errcode,
|
|
|
|
bool kill_remote_of_intercomm)
|
|
|
|
{
|
2005-03-14 20:57:21 +00:00
|
|
|
orte_jobid_t my_jobid;
|
2006-03-31 00:31:15 +00:00
|
|
|
int ret = OMPI_SUCCESS;
|
|
|
|
char hostname[MAXHOSTNAMELEN];
|
|
|
|
pid_t pid;
|
2004-11-01 16:16:05 +00:00
|
|
|
|
2005-04-13 18:07:55 +00:00
|
|
|
/* Corner case: if we're being called as a result of the
|
|
|
|
OMPI_ERR_INIT_FINALIZE macro (meaning that this is before
|
2005-04-15 16:38:44 +00:00
|
|
|
MPI_INIT or after MPI_FINALIZE), then just abort nothing MPI or
|
|
|
|
ORTE has been setup yet. */
|
2005-04-13 18:07:55 +00:00
|
|
|
|
2005-04-15 16:38:44 +00:00
|
|
|
if (!ompi_mpi_initialized || ompi_mpi_finalized) {
|
2005-09-27 20:26:38 +00:00
|
|
|
exit(errcode);
|
|
|
|
}
|
|
|
|
|
2006-03-31 00:31:15 +00:00
|
|
|
/* If we're going to print anything, get the hostname and PID of
|
|
|
|
this process */
|
|
|
|
|
|
|
|
if (ompi_mpi_abort_print_stack ||
|
|
|
|
0 != ompi_mpi_abort_delay) {
|
|
|
|
gethostname(hostname, sizeof(hostname));
|
|
|
|
pid = getpid();
|
|
|
|
}
|
|
|
|
|
|
|
|
/* Should we print a stack trace? */
|
|
|
|
|
|
|
|
if (ompi_mpi_abort_print_stack) {
|
|
|
|
#if OMPI_WANT_PRETTY_PRINT_STACKTRACE && ! defined(__WINDOWS__) && defined(HAVE_BACKTRACE)
|
|
|
|
int i;
|
|
|
|
int trace_size;
|
|
|
|
void *trace[32];
|
|
|
|
char **messages = (char **)NULL;
|
|
|
|
|
|
|
|
trace_size = backtrace(trace, 32);
|
|
|
|
messages = backtrace_symbols(trace, trace_size);
|
|
|
|
|
|
|
|
for (i = 0; i < trace_size; ++i) {
|
|
|
|
fprintf(stderr, "[%s:%d] [%d] func:%s\n", hostname, (int) pid,
|
|
|
|
i, messages[i]);
|
|
|
|
fflush(stderr);
|
|
|
|
}
|
|
|
|
#endif
|
|
|
|
}
|
|
|
|
|
|
|
|
/* Should we wait for a while before aborting? */
|
|
|
|
|
|
|
|
if (0 != ompi_mpi_abort_delay) {
|
|
|
|
if (ompi_mpi_abort_delay < 0) {
|
|
|
|
fprintf(stderr ,"[%s:%d] Looping forever in MPI abort\n",
|
|
|
|
hostname, (int) pid);
|
|
|
|
fflush(stderr);
|
|
|
|
while (1) {
|
|
|
|
sleep(5);
|
|
|
|
}
|
|
|
|
} else {
|
|
|
|
fprintf(stderr, "[%s:%d] Delaying for %d seconds in MPI_abort\n",
|
|
|
|
hostname, (int) pid, ompi_mpi_abort_delay);
|
|
|
|
do {
|
|
|
|
sleep(1);
|
|
|
|
} while (--ompi_mpi_abort_delay > 0);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2005-09-27 20:26:38 +00:00
|
|
|
/* BWB - XXX - Should probably publish the error code somewhere */
|
|
|
|
|
|
|
|
/* Kill everyone in the job. We may make this better someday to
|
|
|
|
actually loop over ompi_rte_kill_proc() to only kill the procs
|
|
|
|
in comm, and additionally to somehow use errorcode. */
|
|
|
|
|
|
|
|
if (ORTE_SUCCESS != (ret = orte_ns.get_jobid(&my_jobid,
|
|
|
|
orte_process_info.my_name))) {
|
|
|
|
/* What else can you do? */
|
|
|
|
exit(errcode);
|
2005-04-15 16:38:44 +00:00
|
|
|
}
|
2005-04-13 18:07:55 +00:00
|
|
|
|
2005-04-15 16:38:44 +00:00
|
|
|
/* kill everyone in the remote group execpt our jobid, if
|
|
|
|
requested */
|
|
|
|
if (kill_remote_of_intercomm && OMPI_COMM_IS_INTER(comm)) {
|
|
|
|
abort_procs(comm->c_remote_group->grp_proc_pointers,
|
|
|
|
comm->c_remote_group->grp_proc_count,
|
2004-12-14 15:47:31 +00:00
|
|
|
my_jobid);
|
|
|
|
}
|
|
|
|
|
2005-04-15 16:38:44 +00:00
|
|
|
/* kill everyone in the local group, except our jobid. */
|
|
|
|
abort_procs(comm->c_local_group->grp_proc_pointers,
|
|
|
|
comm->c_local_group->grp_proc_count,
|
|
|
|
my_jobid);
|
2005-04-07 13:05:02 +00:00
|
|
|
|
|
|
|
ret = orte_rmgr.terminate_job(my_jobid);
|
2004-11-01 16:16:05 +00:00
|
|
|
|
2004-12-13 15:41:59 +00:00
|
|
|
if (OMPI_SUCCESS == ret) {
|
2004-11-01 16:16:05 +00:00
|
|
|
while (1) {
|
2004-12-13 15:41:59 +00:00
|
|
|
/* We should never really get here, since
|
|
|
|
ompi_rte_terminate_job shouldn't return until the job
|
|
|
|
is actually dead. But just in case there are some
|
|
|
|
race conditions, keep progressing the event loop until
|
|
|
|
we get killed */
|
2005-07-03 23:09:55 +00:00
|
|
|
if (!OMPI_ENABLE_PROGRESS_THREADS || opal_event_progress_thread()) {
|
|
|
|
opal_event_loop(0);
|
2004-11-01 16:16:05 +00:00
|
|
|
} else {
|
|
|
|
sleep(1000);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
} else {
|
2004-12-13 15:41:59 +00:00
|
|
|
/* If ret isn't OMPI_SUCCESS, then the rest of the job is
|
|
|
|
still running. But we can't really do anything about that, so
|
|
|
|
just exit and let it become Somebody Elses Problem. */
|
2005-09-27 20:26:38 +00:00
|
|
|
exit(errcode);
|
2004-11-01 16:16:05 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
return OMPI_SUCCESS;
|
|
|
|
}
|