2004-11-01 19:16:05 +03:00
/*
2005-11-05 22:57:48 +03:00
* Copyright ( c ) 2004 - 2005 The Trustees of Indiana University and Indiana
* University Research and Technology
* Corporation . All rights reserved .
2011-10-04 18:50:31 +04:00
* Copyright ( c ) 2004 - 2011 The University of Tennessee and The University
2005-11-05 22:57:48 +03:00
* of Tennessee Research Foundation . All rights
* reserved .
2004-11-28 23:09:25 +03:00
* Copyright ( c ) 2004 - 2005 High Performance Computing Center Stuttgart ,
* University of Stuttgart . All rights reserved .
2005-03-24 15:43:37 +03:00
* Copyright ( c ) 2004 - 2005 The Regents of the University of California .
* All rights reserved .
2011-03-07 19:45:45 +03:00
* Copyright ( c ) 2006 - 2011 Cisco Systems , Inc . All rights reserved .
2011-06-15 17:10:13 +04:00
* Copyright ( c ) 2010 - 2011 Oak Ridge National Labs . All rights reserved .
2004-11-22 04:38:40 +03:00
* $ COPYRIGHT $
*
* Additional copyrights may follow
*
2004-11-01 19:16:05 +03:00
* $ HEADER $
*/
# include "ompi_config.h"
2005-01-20 03:03:23 +03:00
# ifdef HAVE_UNISTD_H
2004-11-05 10:52:30 +03:00
# include <unistd.h>
2005-01-20 03:03:23 +03:00
# endif
2006-03-31 04:31:15 +04:00
# ifdef HAVE_SYS_TYPES_H
# include <sys/types.h>
# endif
# ifdef HAVE_SYS_PARAM_H
# include <sys/param.h>
# endif
# ifdef HAVE_NETDB_H
# include <netdb.h>
# endif
2004-11-05 10:52:30 +03:00
2006-09-22 19:04:04 +04:00
# include "opal/mca/backtrace/backtrace.h"
2006-02-12 04:33:29 +03:00
# include "orte/util/proc_info.h"
2005-07-03 16:07:29 +04:00
# include "orte/runtime/runtime.h"
2008-02-28 04:57:57 +03:00
# include "orte/runtime/orte_globals.h"
# include "orte/util/name_fns.h"
2011-03-07 19:45:45 +03:00
# include "orte/util/show_help.h"
2006-09-15 01:29:51 +04:00
# include "orte/mca/errmgr/errmgr.h"
2006-03-31 04:31:15 +04:00
# include "ompi/communicator/communicator.h"
# include "ompi/runtime/mpiruntime.h"
# include "ompi/runtime/params.h"
2008-09-20 15:34:37 +04:00
# include "ompi/debuggers/debuggers.h"
# include "ompi/errhandler/errcode.h"
2004-11-01 19:16:05 +03:00
2007-01-30 01:01:28 +03:00
static bool have_been_invoked = false ;
2004-12-14 18:47:31 +03:00
2004-11-01 19:16:05 +03:00
int
ompi_mpi_abort ( struct ompi_communicator_t * comm ,
int errcode ,
bool kill_remote_of_intercomm )
{
2011-06-15 17:10:13 +04:00
int count = 0 , i , ret ;
2008-09-20 15:34:37 +04:00
char * msg , * host , hostname [ MAXHOSTNAMELEN ] ;
2006-04-01 16:41:48 +04:00
pid_t pid = 0 ;
2006-09-22 19:04:04 +04:00
orte_process_name_t * abort_procs ;
orte_std_cntr_t nabort_procs ;
2007-01-30 01:01:28 +03:00
/* Protection for recursive invocation */
if ( have_been_invoked ) {
return OMPI_SUCCESS ;
}
have_been_invoked = true ;
/* If ORTE is initialized, use its nodename. Otherwise, call
gethostname . */
if ( orte_initialized ) {
2009-03-06 00:56:03 +03:00
host = orte_process_info . nodename ;
2007-01-30 01:01:28 +03:00
} else {
2011-11-30 03:24:52 +04:00
gethostname ( hostname , sizeof ( hostname ) ) ;
2007-01-30 01:01:28 +03:00
host = hostname ;
}
pid = getpid ( ) ;
2011-03-07 19:45:45 +03:00
/* Should we print a stack trace? Not aggregated because they
might be different on all processes . */
2006-03-31 04:31:15 +04:00
if ( ompi_mpi_abort_print_stack ) {
2006-09-22 19:04:04 +04:00
char * * messages ;
int len , i ;
if ( OMPI_SUCCESS = = opal_backtrace_buffer ( & messages , & len ) ) {
for ( i = 0 ; i < len ; + + i ) {
2007-01-30 01:01:28 +03:00
fprintf ( stderr , " [%s:%d] [%d] func:%s \n " , host , ( int ) pid ,
2006-09-22 19:04:04 +04:00
i , messages [ i ] ) ;
fflush ( stderr ) ;
}
free ( messages ) ;
} else {
2007-01-05 01:30:28 +03:00
/* This will print an message if it's unable to print the
backtrace , so we don ' t need an additional " else " clause
if opal_backtrace_print ( ) is not supported . */
opal_backtrace_print ( stderr ) ;
2006-03-31 04:31:15 +04:00
}
}
2008-09-20 15:34:37 +04:00
/* Notify the debugger that we're about to abort */
2008-10-02 01:42:08 +04:00
if ( errcode < 0 | |
asprintf ( & msg , " [%s:%d] aborting with MPI error %s%s " ,
2008-09-20 15:34:37 +04:00
host , ( int ) pid , ompi_mpi_errnum_get_string ( errcode ) ,
ompi_mpi_abort_print_stack ?
" (stack trace available on stderr) " : " " ) < 0 ) {
msg = NULL ;
}
ompi_debugger_notify_abort ( msg ) ;
if ( NULL ! = msg ) {
free ( msg ) ;
}
2006-03-31 04:31:15 +04:00
/* Should we wait for a while before aborting? */
if ( 0 ! = ompi_mpi_abort_delay ) {
if ( ompi_mpi_abort_delay < 0 ) {
2007-01-05 01:30:28 +03:00
fprintf ( stderr , " [%s:%d] Looping forever (MCA parameter mpi_abort_delay is < 0) \n " ,
2007-01-30 01:01:28 +03:00
host , ( int ) pid ) ;
2006-03-31 04:31:15 +04:00
fflush ( stderr ) ;
while ( 1 ) {
sleep ( 5 ) ;
}
} else {
2007-01-05 01:30:28 +03:00
fprintf ( stderr , " [%s:%d] Delaying for %d seconds before aborting \n " ,
2007-01-30 01:01:28 +03:00
host , ( int ) pid , ompi_mpi_abort_delay ) ;
2006-03-31 04:31:15 +04:00
do {
sleep ( 1 ) ;
} while ( - - ompi_mpi_abort_delay > 0 ) ;
}
}
2008-03-24 19:25:14 +03:00
/* If OMPI isn't setup yet/any more, then don't even try killing
everyone . Ditto for ORTE ( e . g . , ORTE may be initialized before
MPI_INIT is over , but ompi_initialized will be false because
communicators are not setup yet ) . Sorry , Charlie . . . */
2007-01-30 01:01:28 +03:00
2008-03-24 19:25:14 +03:00
if ( ! orte_initialized | | ! ompi_mpi_initialized | | ompi_mpi_finalized ) {
2011-03-07 19:45:45 +03:00
if ( orte_show_help_is_available ( ) ) {
orte_show_help ( " help-mpi-runtime.txt " ,
" ompi mpi abort:cannot guarantee all killed " ,
true ,
( ompi_mpi_finalized ?
" After MPI_FINALIZE was invoked " :
( ompi_mpi_init_started ?
" Before MPI_INIT completed " :
" Before MPI_INIT was invoked " ) ) ,
host , ( int ) pid ) ;
} else {
fprintf ( stderr , " [%s:%d] Local abort %s completed successfully; not able to aggregate error messages, and not able to guarantee that all other processes were killed! \n " ,
host , ( int ) pid , ompi_mpi_finalized ?
" after MPI_FINALIZE " : " before MPI_INIT " ) ;
}
2007-01-30 01:01:28 +03:00
exit ( errcode ) ;
}
2006-09-22 19:04:04 +04:00
/* abort local procs in the communicator. If the communicator is
an intercommunicator AND the abort has explicitly requested
that we abort the remote procs , then do that as well . */
nabort_procs = ompi_comm_size ( comm ) ;
2005-09-28 00:26:38 +04:00
2006-09-22 19:04:04 +04:00
if ( kill_remote_of_intercomm ) {
/* ompi_comm_remote_size() returns 0 if not an intercomm, so
this is cool */
nabort_procs + = ompi_comm_remote_size ( comm ) ;
2005-04-15 20:38:44 +04:00
}
2005-04-13 22:07:55 +04:00
2006-10-05 09:07:43 +04:00
abort_procs = ( orte_process_name_t * ) malloc ( sizeof ( orte_process_name_t ) * nabort_procs ) ;
2006-09-22 19:04:04 +04:00
if ( NULL = = abort_procs ) {
/* quick clean orte and get out */
2008-02-28 04:57:57 +03:00
orte_errmgr . abort ( errcode , " Abort unable to malloc memory to kill procs " ) ;
2006-09-22 19:04:04 +04:00
}
/* put all the local procs in the abort list */
for ( i = 0 ; i < ompi_comm_size ( comm ) ; + + i ) {
2008-02-28 04:57:57 +03:00
if ( OPAL_EQUAL ! = orte_util_compare_name_fields ( ORTE_NS_CMP_ALL ,
2006-09-22 19:04:04 +04:00
& comm - > c_local_group - > grp_proc_pointers [ i ] - > proc_name ,
2008-02-28 04:57:57 +03:00
ORTE_PROC_MY_NAME ) ) {
2006-09-22 19:04:04 +04:00
assert ( count < = nabort_procs ) ;
abort_procs [ count + + ] = comm - > c_local_group - > grp_proc_pointers [ i ] - > proc_name ;
} else {
/* don't terminate me just yet */
nabort_procs - - ;
}
2004-12-14 18:47:31 +03:00
}
2006-09-22 19:04:04 +04:00
/* if requested, kill off remote procs too */
if ( kill_remote_of_intercomm ) {
for ( i = 0 ; i < ompi_comm_remote_size ( comm ) ; + + i ) {
2008-02-28 04:57:57 +03:00
if ( OPAL_EQUAL ! = orte_util_compare_name_fields ( ORTE_NS_CMP_ALL ,
2006-09-22 19:04:04 +04:00
& comm - > c_remote_group - > grp_proc_pointers [ i ] - > proc_name ,
2008-02-28 04:57:57 +03:00
ORTE_PROC_MY_NAME ) ) {
2006-09-22 19:04:04 +04:00
assert ( count < = nabort_procs ) ;
abort_procs [ count + + ] =
comm - > c_remote_group - > grp_proc_pointers [ i ] - > proc_name ;
2004-11-01 19:16:05 +03:00
} else {
2006-09-22 19:04:04 +04:00
/* don't terminate me just yet */
nabort_procs - - ;
2004-11-01 19:16:05 +03:00
}
}
}
2006-09-22 19:04:04 +04:00
if ( nabort_procs > 0 ) {
2011-06-15 17:10:13 +04:00
/* This must be implemented for MPI_Abort() to work according to the
* standard language for a ' high - quality ' implementation .
* It would be nifty if we could differentiate between the
* abort scenarios :
* - MPI_Abort ( )
* - MPI_ERRORS_ARE_FATAL
* - Victim of MPI_Abort ( )
*/
/*
* Abort peers in this communicator group . Does not include self .
*/
2011-10-04 18:50:31 +04:00
if ( ORTE_SUCCESS ! = ( ret = orte_errmgr . abort_peers ( abort_procs , nabort_procs ) ) ) {
2011-06-15 17:10:13 +04:00
orte_errmgr . abort ( ret , " Open MPI failed to abort all of the procs requested (%d). " , ret ) ;
2006-09-22 19:04:04 +04:00
}
}
/* now that we've aborted everyone else, gracefully die. */
2008-02-28 04:57:57 +03:00
orte_errmgr . abort ( errcode , NULL ) ;
2006-09-15 01:29:51 +04:00
2004-11-01 19:16:05 +03:00
return OMPI_SUCCESS ;
}