2015-06-23 08:45:22 -06:00
/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */
2004-11-01 16:16:05 +00:00
/*
2005-11-05 19:57:48 +00:00
* Copyright ( c ) 2004 - 2005 The Trustees of Indiana University and Indiana
* University Research and Technology
* Corporation . All rights reserved .
2014-07-03 02:38:27 +00:00
* Copyright ( c ) 2004 - 2014 The University of Tennessee and The University
2005-11-05 19:57:48 +00:00
* of Tennessee Research Foundation . All rights
* reserved .
2015-06-23 20:59:57 -07:00
* Copyright ( c ) 2004 - 2005 High Performance Computing Center Stuttgart ,
2004-11-28 20:09:25 +00:00
* University of Stuttgart . All rights reserved .
2005-03-24 12:43:37 +00:00
* Copyright ( c ) 2004 - 2005 The Regents of the University of California .
* All rights reserved .
2016-05-05 14:10:17 -07:00
* Copyright ( c ) 2006 - 2016 Cisco Systems , Inc . All rights reserved .
2012-06-27 01:28:28 +00:00
* Copyright ( c ) 2010 - 2011 Oak Ridge National Labs . All rights reserved .
2014-08-04 02:52:56 +00:00
* Copyright ( c ) 2014 Research Organization for Information Science
* and Technology ( RIST ) . All rights reserved .
2015-06-23 08:45:22 -06:00
* Copyright ( c ) 2015 Los Alamos National Security , LLC . All rights
* reserved .
2015-11-25 15:22:52 +03:00
* Copyright ( c ) 2015 Mellanox Technologies , Inc .
* All rights reserved .
2004-11-22 01:38:40 +00:00
* $ COPYRIGHT $
2015-06-23 20:59:57 -07:00
*
2004-11-22 01:38:40 +00:00
* Additional copyrights may follow
2015-06-23 20:59:57 -07:00
*
2004-11-01 16:16:05 +00:00
* $ HEADER $
*/
# include "ompi_config.h"
2005-01-20 00:03:23 +00:00
# ifdef HAVE_UNISTD_H
2004-11-05 07:52:30 +00:00
# include <unistd.h>
2005-01-20 00:03:23 +00:00
# endif
2006-03-31 00:31:15 +00:00
# ifdef HAVE_SYS_TYPES_H
# include <sys/types.h>
# endif
# ifdef HAVE_SYS_PARAM_H
# include <sys/param.h>
# endif
# ifdef HAVE_NETDB_H
# include <netdb.h>
# endif
2014-07-03 02:38:27 +00:00
# include <errno.h>
2004-11-05 07:52:30 +00:00
2006-09-22 15:04:04 +00:00
# include "opal/mca/backtrace/backtrace.h"
2015-11-25 15:22:52 +03:00
# include "opal/runtime/opal_params.h"
2013-01-27 23:25:10 +00:00
2006-03-31 00:31:15 +00:00
# include "ompi/communicator/communicator.h"
# include "ompi/runtime/mpiruntime.h"
# include "ompi/runtime/params.h"
2008-09-20 11:34:37 +00:00
# include "ompi/debuggers/debuggers.h"
# include "ompi/errhandler/errcode.h"
2004-11-01 16:16:05 +00:00
2007-01-29 22:01:28 +00:00
/* Guards ompi_mpi_abort() against recursive invocation: set to true
   on the first call; any later call returns OMPI_SUCCESS immediately. */
static bool have_been_invoked = false ;
2004-12-14 15:47:31 +00:00
2014-07-03 02:38:27 +00:00
/*
 * Local helper function to build an array of all the procs in a
 * communicator, excluding this process, and hand that array to
 * ompi_rte_abort_peers().
 *
 * Killing just the indicated peers must be implemented for
 * MPI_Abort() to work according to the standard language for
 * a 'high-quality' implementation.
 *
 * It would be nifty if we could differentiate between the
 * abort scenarios (but we don't, currently):
 *      - MPI_Abort()
 *      - MPI_ERRORS_ARE_FATAL
 *      - Victim of MPI_Abort()
 *
 * comm:    communicator whose local-group peers (and, for an
 *          intercommunicator, remote-group procs) are collected.
 * errcode: passed through unchanged to ompi_rte_abort_peers().
 *
 * Returns normally if there are no peers to kill or if
 * ompi_rte_abort_peers() comes back; the caller handles that case.
 */
static void try_kill_peers(ompi_communicator_t *comm,
                           int errcode)
{
    const int local_size = ompi_comm_size(comm);
    /* ompi_comm_remote_size() returns 0 if not an intercomm, so
       adding it unconditionally is safe */
    const int remote_size = ompi_comm_remote_size(comm);
    const int capacity = local_size + remote_size;
    const int my_rank = ompi_comm_rank(comm);
    ompi_process_name_t *procs;
    int count = 0;
    int i;

    procs = (ompi_process_name_t *) calloc(capacity, sizeof(ompi_process_name_t));
    if (NULL == procs) {
        /* quick clean orte and get out */
        ompi_rte_abort(errno, " Abort: unable to alloc memory to kill procs ");
    }

    /* Put all the local group procs in the abort list.  The rank and
       the loop bound both refer to the local group, so the lookup must
       go through c_local_group as well. */
    for (i = 0; i < local_size; ++i) {
        if (my_rank == i) {
            /* Don't include this process in the array */
            continue;
        }
        assert(count < capacity);
        procs[count++] =
            *OMPI_CAST_RTE_NAME(&ompi_group_get_proc_ptr(comm->c_local_group, i, true)->super.proc_name);
    }

    /* For an intercommunicator, add the remote group procs too
       (remote_size is 0 otherwise, so this loop is skipped) */
    for (i = 0; i < remote_size; ++i) {
        assert(count < capacity);
        procs[count++] =
            *OMPI_CAST_RTE_NAME(&ompi_group_get_proc_ptr(comm->c_remote_group, i, true)->super.proc_name);
    }

    if (count > 0) {
        ompi_rte_abort_peers(procs, count, errcode);
    }

    /* We could fall through here if ompi_rte_abort_peers() fails, or
       if there were no peers to kill.  Either way, tidy up and let
       the caller handle it. */
    free(procs);
}
2004-11-01 16:16:05 +00:00
int
ompi_mpi_abort ( struct ompi_communicator_t * comm ,
2014-07-03 00:34:44 +00:00
int errcode )
2004-11-01 16:16:05 +00:00
{
2016-05-05 14:10:17 -07:00
char * host , hostname [ OPAL_MAXHOSTNAMELEN ] ;
2006-04-01 12:41:48 +00:00
pid_t pid = 0 ;
2007-01-29 22:01:28 +00:00
/* Protection for recursive invocation */
if ( have_been_invoked ) {
return OMPI_SUCCESS ;
}
have_been_invoked = true ;
2014-07-03 02:38:27 +00:00
/* If MPI is initialized, we know we have a runtime nodename, so
use that . Otherwise , call gethostname . */
if ( ompi_rte_initialized ) {
2013-01-27 23:25:10 +00:00
host = ompi_process_info . nodename ;
2007-01-29 22:01:28 +00:00
} else {
2011-11-29 23:24:52 +00:00
gethostname ( hostname , sizeof ( hostname ) ) ;
2007-01-29 22:01:28 +00:00
host = hostname ;
}
pid = getpid ( ) ;
2011-03-07 16:45:45 +00:00
/* Should we print a stack trace? Not aggregated because they
might be different on all processes . */
2015-11-25 15:22:52 +03:00
if ( opal_abort_print_stack ) {
2006-09-22 15:04:04 +00:00
char * * messages ;
int len , i ;
2015-11-25 15:22:52 +03:00
if ( OPAL_SUCCESS = = opal_backtrace_buffer ( & messages , & len ) ) {
2006-09-22 15:04:04 +00:00
for ( i = 0 ; i < len ; + + i ) {
2015-06-23 20:59:57 -07:00
fprintf ( stderr , " [%s:%d] [%d] func:%s \n " , host , ( int ) pid ,
2006-09-22 15:04:04 +00:00
i , messages [ i ] ) ;
fflush ( stderr ) ;
}
free ( messages ) ;
} else {
2007-01-04 22:30:28 +00:00
/* This will print an message if it's unable to print the
backtrace , so we don ' t need an additional " else " clause
if opal_backtrace_print ( ) is not supported . */
2013-12-18 17:57:37 +00:00
opal_backtrace_print ( stderr , NULL , 1 ) ;
2006-03-31 00:31:15 +00:00
}
}
/* Should we wait for a while before aborting? */
2015-11-25 15:22:52 +03:00
if ( 0 ! = opal_abort_delay ) {
if ( opal_abort_delay < 0 ) {
fprintf ( stderr , " [%s:%d] Looping forever (MCA parameter opal_abort_delay is < 0) \n " ,
2007-01-29 22:01:28 +00:00
host , ( int ) pid ) ;
2006-03-31 00:31:15 +00:00
fflush ( stderr ) ;
2015-06-23 20:59:57 -07:00
while ( 1 ) {
sleep ( 5 ) ;
2006-03-31 00:31:15 +00:00
}
} else {
2007-01-04 22:30:28 +00:00
fprintf ( stderr , " [%s:%d] Delaying for %d seconds before aborting \n " ,
2015-11-25 15:22:52 +03:00
host , ( int ) pid , opal_abort_delay ) ;
2006-03-31 00:31:15 +00:00
do {
sleep ( 1 ) ;
2015-11-25 15:22:52 +03:00
} while ( - - opal_abort_delay > 0 ) ;
2006-03-31 00:31:15 +00:00
}
}
2014-07-03 02:38:27 +00:00
/* If the RTE isn't setup yet/any more, then don't even try
killing everyone . Sorry , Charlie . . . */
if ( ! ompi_rte_initialized ) {
fprintf ( stderr , " [%s:%d] Local abort %s completed successfully, but am not able to aggregate error messages, and not able to guarantee that all other processes were killed! \n " ,
2015-06-23 20:59:57 -07:00
host , ( int ) pid , ompi_mpi_finalized ?
2014-07-03 02:38:27 +00:00
" after MPI_FINALIZE started " : " before MPI_INIT completed " ) ;
2015-08-27 17:05:58 -07:00
_exit ( errcode = = 0 ? 1 : errcode ) ;
2007-01-29 22:01:28 +00:00
}
2014-07-03 02:38:27 +00:00
/* If OMPI is initialized and we have a non-NULL communicator,
then try to kill just that set of processes */
if ( ompi_mpi_initialized & & ! ompi_mpi_finalized & & NULL ! = comm ) {
try_kill_peers ( comm , errcode ) ;
2006-09-22 15:04:04 +00:00
}
2014-07-03 02:38:27 +00:00
/* We can fall through to here in a few cases:
2006-09-22 15:04:04 +00:00
2014-07-03 02:38:27 +00:00
1. The attempt to kill just a subset of peers via
try_kill_peers ( ) failed ( e . g . , as of July 2014 , ORTE does
returns NOT_IMPLENTED from orte_rte_abort_peers ( ) ) .
2. MPI wasn ' t initialized , was already finalized , or we got a
NULL communicator .
2006-09-22 15:04:04 +00:00
2014-07-03 02:38:27 +00:00
In all of these cases , the only sensible thing left to do is to
kill the entire job . Wah wah . */
2013-01-27 23:25:10 +00:00
ompi_rte_abort ( errcode , NULL ) ;
2014-07-03 02:38:27 +00:00
/* Does not return */
2004-11-01 16:16:05 +00:00
}