2004-11-01 19:16:05 +03:00
/*
2005-11-05 22:57:48 +03:00
* Copyright ( c ) 2004 - 2005 The Trustees of Indiana University and Indiana
* University Research and Technology
* Corporation . All rights reserved .
2014-07-03 06:38:27 +04:00
* Copyright ( c ) 2004 - 2014 The University of Tennessee and The University
2005-11-05 22:57:48 +03:00
* of Tennessee Research Foundation . All rights
* reserved .
2004-11-28 23:09:25 +03:00
* Copyright ( c ) 2004 - 2005 High Performance Computing Center Stuttgart ,
* University of Stuttgart . All rights reserved .
2005-03-24 15:43:37 +03:00
* Copyright ( c ) 2004 - 2005 The Regents of the University of California .
* All rights reserved .
2014-07-03 06:38:27 +04:00
* Copyright ( c ) 2006 - 2014 Cisco Systems , Inc . All rights reserved .
2012-06-27 05:28:28 +04:00
* Copyright ( c ) 2010 - 2011 Oak Ridge National Labs . All rights reserved .
2014-08-04 06:52:56 +04:00
* Copyright ( c ) 2014 Research Organization for Information Science
* and Technology ( RIST ) . All rights reserved .
2004-11-22 04:38:40 +03:00
* $ COPYRIGHT $
*
* Additional copyrights may follow
*
2004-11-01 19:16:05 +03:00
* $ HEADER $
*/
# include "ompi_config.h"
2005-01-20 03:03:23 +03:00
# ifdef HAVE_UNISTD_H
2004-11-05 10:52:30 +03:00
# include <unistd.h>
2005-01-20 03:03:23 +03:00
# endif
2006-03-31 04:31:15 +04:00
# ifdef HAVE_SYS_TYPES_H
# include <sys/types.h>
# endif
# ifdef HAVE_SYS_PARAM_H
# include <sys/param.h>
# endif
# ifdef HAVE_NETDB_H
# include <netdb.h>
# endif
2014-07-03 06:38:27 +04:00
# include <errno.h>
2004-11-05 10:52:30 +03:00
2006-09-22 19:04:04 +04:00
# include "opal/mca/backtrace/backtrace.h"
2013-01-28 03:25:10 +04:00
2006-03-31 04:31:15 +04:00
# include "ompi/communicator/communicator.h"
# include "ompi/runtime/mpiruntime.h"
# include "ompi/runtime/params.h"
2008-09-20 15:34:37 +04:00
# include "ompi/debuggers/debuggers.h"
# include "ompi/errhandler/errcode.h"
2004-11-01 19:16:05 +03:00
2007-01-30 01:01:28 +03:00
/* Recursion guard for ompi_mpi_abort(): set to true on the first call;
   any later call sees it set and returns immediately. */
static bool have_been_invoked = false ;
2004-12-14 18:47:31 +03:00
2014-07-03 06:38:27 +04:00
/*
 * Local helper function to build an array of all the procs in a
 * communicator, excluding this process, and ask the runtime to abort
 * exactly those procs.
 *
 * Killing just the indicated peers must be implemented for
 * MPI_Abort() to work according to the standard language for
 * a 'high-quality' implementation.
 *
 * It would be nifty if we could differentiate between the
 * abort scenarios (but we don't, currently):
 *      - MPI_Abort()
 *      - MPI_ERRORS_ARE_FATAL
 *      - Victim of MPI_Abort()
 *
 * comm:    communicator whose members are to be killed.  All members
 *          of the local group except the caller are listed, followed
 *          by all members of the remote group (if any).
 * errcode: value handed unchanged to ompi_rte_abort_peers().
 *
 * Returns normally if ompi_rte_abort_peers() returns or if there are
 * no peers to kill; the caller decides what to do next.
 */
static void try_kill_peers(ompi_communicator_t *comm,
                           int errcode)
{
    int nprocs;
    ompi_process_name_t *procs;

    nprocs = ompi_comm_size(comm);
    /* ompi_comm_remote_size() returns 0 if not an intercomm, so
       this is safe */
    nprocs += ompi_comm_remote_size(comm);

    procs = calloc(nprocs, sizeof(*procs));
    if (NULL == procs) {
        /* quick clean orte and get out */
        ompi_rte_abort(errno, " Abort: unable to alloc memory to kill procs ");
        /* Not expected to be reached; never touch a NULL array */
        return;
    }

    /* put all the local group procs in the abort list */
    int rank, i, count;
    rank = ompi_comm_rank(comm);
    for (count = i = 0; i < ompi_comm_size(comm); ++i) {
        if (rank == i) {
            /* Don't include this process in the array */
            --nprocs;
        } else {
            /* count is used as an index, so it must stay strictly
               below the number of slots still expected */
            assert(count < nprocs);
            /* Local ranks must be looked up in the LOCAL group; on an
               intercommunicator the remote group holds different
               processes and may have a different size */
            procs[count++] =
                *OMPI_CAST_RTE_NAME(&ompi_group_get_proc_ptr(comm->c_local_group, i)->super.proc_name);
        }
    }

    /* if requested, kill off remote group procs too */
    for (i = 0; i < ompi_comm_remote_size(comm); ++i) {
        assert(count < nprocs);
        procs[count++] =
            *OMPI_CAST_RTE_NAME(&ompi_group_get_proc_ptr(comm->c_remote_group, i)->super.proc_name);
    }

    if (nprocs > 0) {
        ompi_rte_abort_peers(procs, nprocs, errcode);
    }

    /* We could fall through here if ompi_rte_abort_peers() fails, or
       if (nprocs == 0).  Either way, tidy up and let the caller
       handle it. */
    free(procs);
}
2004-11-01 19:16:05 +03:00
int
ompi_mpi_abort ( struct ompi_communicator_t * comm ,
2014-07-03 04:34:44 +04:00
int errcode )
2004-11-01 19:16:05 +03:00
{
2008-09-20 15:34:37 +04:00
char * msg , * host , hostname [ MAXHOSTNAMELEN ] ;
2006-04-01 16:41:48 +04:00
pid_t pid = 0 ;
2007-01-30 01:01:28 +03:00
/* Protection for recursive invocation */
if ( have_been_invoked ) {
return OMPI_SUCCESS ;
}
have_been_invoked = true ;
2014-07-03 06:38:27 +04:00
/* If MPI is initialized, we know we have a runtime nodename, so
use that . Otherwise , call gethostname . */
if ( ompi_rte_initialized ) {
2013-01-28 03:25:10 +04:00
host = ompi_process_info . nodename ;
2007-01-30 01:01:28 +03:00
} else {
2011-11-30 03:24:52 +04:00
gethostname ( hostname , sizeof ( hostname ) ) ;
2007-01-30 01:01:28 +03:00
host = hostname ;
}
pid = getpid ( ) ;
2011-03-07 19:45:45 +03:00
/* Should we print a stack trace? Not aggregated because they
might be different on all processes . */
2006-03-31 04:31:15 +04:00
if ( ompi_mpi_abort_print_stack ) {
2006-09-22 19:04:04 +04:00
char * * messages ;
int len , i ;
if ( OMPI_SUCCESS = = opal_backtrace_buffer ( & messages , & len ) ) {
for ( i = 0 ; i < len ; + + i ) {
2007-01-30 01:01:28 +03:00
fprintf ( stderr , " [%s:%d] [%d] func:%s \n " , host , ( int ) pid ,
2006-09-22 19:04:04 +04:00
i , messages [ i ] ) ;
fflush ( stderr ) ;
}
free ( messages ) ;
} else {
2007-01-05 01:30:28 +03:00
/* This will print an message if it's unable to print the
backtrace , so we don ' t need an additional " else " clause
if opal_backtrace_print ( ) is not supported . */
2013-12-18 21:57:37 +04:00
opal_backtrace_print ( stderr , NULL , 1 ) ;
2006-03-31 04:31:15 +04:00
}
}
2008-09-20 15:34:37 +04:00
/* Notify the debugger that we're about to abort */
2008-10-02 01:42:08 +04:00
if ( errcode < 0 | |
asprintf ( & msg , " [%s:%d] aborting with MPI error %s%s " ,
2008-09-20 15:34:37 +04:00
host , ( int ) pid , ompi_mpi_errnum_get_string ( errcode ) ,
ompi_mpi_abort_print_stack ?
" (stack trace available on stderr) " : " " ) < 0 ) {
msg = NULL ;
}
ompi_debugger_notify_abort ( msg ) ;
if ( NULL ! = msg ) {
free ( msg ) ;
}
2006-03-31 04:31:15 +04:00
/* Should we wait for a while before aborting? */
if ( 0 ! = ompi_mpi_abort_delay ) {
if ( ompi_mpi_abort_delay < 0 ) {
2007-01-05 01:30:28 +03:00
fprintf ( stderr , " [%s:%d] Looping forever (MCA parameter mpi_abort_delay is < 0) \n " ,
2007-01-30 01:01:28 +03:00
host , ( int ) pid ) ;
2006-03-31 04:31:15 +04:00
fflush ( stderr ) ;
while ( 1 ) {
sleep ( 5 ) ;
}
} else {
2007-01-05 01:30:28 +03:00
fprintf ( stderr , " [%s:%d] Delaying for %d seconds before aborting \n " ,
2007-01-30 01:01:28 +03:00
host , ( int ) pid , ompi_mpi_abort_delay ) ;
2006-03-31 04:31:15 +04:00
do {
sleep ( 1 ) ;
} while ( - - ompi_mpi_abort_delay > 0 ) ;
}
}
2014-07-03 06:38:27 +04:00
/* If the RTE isn't setup yet/any more, then don't even try
killing everyone . Sorry , Charlie . . . */
if ( ! ompi_rte_initialized ) {
fprintf ( stderr , " [%s:%d] Local abort %s completed successfully, but am not able to aggregate error messages, and not able to guarantee that all other processes were killed! \n " ,
2013-01-28 03:25:10 +04:00
host , ( int ) pid , ompi_mpi_finalized ?
2014-07-03 06:38:27 +04:00
" after MPI_FINALIZE started " : " before MPI_INIT completed " ) ;
exit ( errcode = = 0 ? 1 : errcode ) ;
2007-01-30 01:01:28 +03:00
}
2014-07-03 06:38:27 +04:00
/* If OMPI is initialized and we have a non-NULL communicator,
then try to kill just that set of processes */
if ( ompi_mpi_initialized & & ! ompi_mpi_finalized & & NULL ! = comm ) {
try_kill_peers ( comm , errcode ) ;
2006-09-22 19:04:04 +04:00
}
2014-07-03 06:38:27 +04:00
/* We can fall through to here in a few cases:
2006-09-22 19:04:04 +04:00
2014-07-03 06:38:27 +04:00
1. The attempt to kill just a subset of peers via
try_kill_peers ( ) failed ( e . g . , as of July 2014 , ORTE does
returns NOT_IMPLENTED from orte_rte_abort_peers ( ) ) .
2. MPI wasn ' t initialized , was already finalized , or we got a
NULL communicator .
2006-09-22 19:04:04 +04:00
2014-07-03 06:38:27 +04:00
In all of these cases , the only sensible thing left to do is to
kill the entire job . Wah wah . */
2013-01-28 03:25:10 +04:00
ompi_rte_abort ( errcode , NULL ) ;
2014-07-03 06:38:27 +04:00
/* Does not return */
2004-11-01 19:16:05 +03:00
}