1
1

Merge pull request #7800 from abouteiller/mpi-next/errors_abort

MPI4: Add ERRORS_ABORT infrastructure
Этот коммит содержится в:
Austen Lauria 2020-06-29 15:45:29 -04:00 коммит произвёл GitHub
родитель a26e494953 e2f53b76fb
Коммит 3ed466e629
Не найден ключ, соответствующий данной подписи
Идентификатор ключа GPG: 4AEE18F83AFDEB23
21 изменённых файлов: 360 добавлений и 49 удалений

Просмотреть файл

@ -3,7 +3,7 @@
# Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
# University Research and Technology
# Corporation. All rights reserved.
# Copyright (c) 2004-2005 The University of Tennessee and The University
# Copyright (c) 2004-2020 The University of Tennessee and The University
# of Tennessee Research Foundation. All rights
# reserved.
# Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
@ -186,7 +186,7 @@ SEARCH_HEADER[5]="ompi/datatype/convertor.h OMPI_COMM_INTER OMPI_COMM_CART OMPI_
SEARCH_HEADER[6]="ompi/datatype/datatype.h MPI_Datatype DT_MAX_PREDEFINED DT_FLAG_ MAX_DT_COMPONENT_COUNT opal_ddt_count_t dt_type_desc_t ompi_datatype_t ompi_predefined_datatype_t ompi_ddt_init ompi_ddt_finalize ompi_ddt_create_ ompi_ddt_duplicate ompi_ddt_is_predefined ompi_ddt_create_from_packed_description"
SEARCH_HEADER[7]="ompi/datatype/datatype_internal.h DDT_DUMP_STACK DT_ ddt_elem_id_description ddt_elem_desc ddt_elem_desc_t ddt_loop_desc ddt_loop_desc_t ddt_endloop_desc ddt_endloop_desc_t dt_elem_desc CREATE_LOOP_START CREATE_LOOP_END CREATE_ELEM ompi_complex_float_t ompi_complex_double_t ompi_complex_long_double_t ompi_ddt_basicDatatypes BASIC_DDT_FROM_ELEM ompi_ddt_default_convertors_init ompi_ddt_default_convertors_fini SAVE_STACK PUSH_STACK ompi_ddt_safeguard_pointer_debug_breakpoint OMPI_DDT_SAFEGUARD_POINTER GET_FIRST_NON_LOOP UPDATE_INTERNAL_COUNTERS ompi_ddt_print_args"
SEARCH_HEADER[8]="ompi/errhandler/errhandler.h OMPI_ERRHANDLER_LANG_ ompi_errhandler_lang_t OMPI_ERRHANDLER_TYPE_ ompi_errhandler_type_t ompi_errhandler_t ompi_predefined_errhandler_t ompi_mpi_errhandler_null OMPI_ERRHANDLER_CHECK OMPI_ERRHANDLER_RETURN ompi_errhandler_init ompi_errhandler_finalize OMPI_ERRHANDLER_INVOKE ompi_errhandler_invoke ompi_errhandler_request_invoke ompi_errhandler_create ompi_errhandler_is_intrinsic ompi_errhandler_fortran_handler_fn_t OMPI_ERR_INIT_FINALIZE MPI_Errhandler"
SEARCH_HEADER[9]="ompi/errhandler/errhandler_predefined.h ompi_mpi_errors_are_fatal_ ompi_mpi_errors_return_ ompi_mpi_errors_throw_exceptions"
SEARCH_HEADER[9]="ompi/errhandler/errhandler_predefined.h ompi_mpi_errors_are_fatal_ ompi_mpi_errors_return_ ompi_mpi_errors_abort_ ompi_mpi_errors_throw_exceptions"
###
SEARCH_HEADER[10]="ompi/file/file.h OMPI_FILE_ISCLOSED OMPI_FILE_HIDDEN ompi_file_t ompi_predefined_file_t ompi_mpi_file_null ompi_file_f_to_c_table ompi_file_init ompi_file_open ompi_file_set_name ompi_file_close ompi_file_finalize ompi_file_invalid MPI_File MPI_FILE_NULL ompi_mpi_cxx_file_errhandler_invoke" # THE LAST ONE WAS FOR THE CXX INTERFACE
SEARCH_HEADER[11]="ompi/group/group.h ompi_group_sporadic_list_t ompi_group_sporadic_data_t ompi_group_strided_data_t ompi_group_bitmap_data_t ompi_group_t ompi_predefined_group_t OMPI_GROUP_ ompi_group_f_to_c_table ompi_mpi_group_null ompi_group_allocate ompi_group_increment_proc_count ompi_group_decrement_proc_count ompi_group_size ompi_group_rank ompi_set_group_rank ompi_group_translate_ranks ompi_group_free ompi_group_get_proc_ptr ompi_group_calc_ ompi_group_peer_lookup ompi_group_div_ceil MPI_Group"

Просмотреть файл

@ -314,8 +314,8 @@ C++: MPI::Errhandler
MPI allows applications to define their own error handlers. The
default error handler is to abort the MPI job. Error handlers can be
attached to communicators, files, and windows. There are 3 predefined
error handlers (MPI_ERRORS_ARE_FATAL, MPI_ERRORS_RETURN,
attached to communicators, files, and windows. There are 4 predefined
error handlers (MPI_ERRORS_ARE_FATAL, MPI_ERRORS_RETURN, MPI_ERRORS_ABORT,
MPI::ERRORS_THROW_EXCEPTIONS), and applications can create their own
error handlers.

Просмотреть файл

@ -1,6 +1,6 @@
/*
* Copyright (c) 2007-2008 Cisco Systems, Inc. All rights reserved.
* Copyright (c) 2004-2013 The University of Tennessee and The University
* Copyright (c) 2004-2020 The University of Tennessee and The University
* of Tennessee Research Foundation. All rights
* reserved.
* Copyright (c) 2008 Sun Microsystems, Inc. All rights reserved.
@ -384,6 +384,8 @@ int mpidbg_init_per_process(mqs_process *process,
int i = 0;
fill_map(image, "MPI_ERRORS_ARE_FATAL", "ompi_mpi_errors_are_fatal",
&mpidbg_errhandler_name_map[i++]);
fill_map(image, "MPI_ERRORS_ABORT", "ompi_mpi_errors_abort",
&mpidbg_errhandler_name_map[i++]);
fill_map(image, "MPI_ERRORS_RETURN", "ompi_mpi_errors_return",
&mpidbg_errhandler_name_map[i++]);
fill_map(image, "MPI_ERRHANDLER_NULL", "ompi_mpi_errhandler_null",

Просмотреть файл

@ -3,7 +3,7 @@
* Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
* University Research and Technology
* Corporation. All rights reserved.
* Copyright (c) 2004-2017 The University of Tennessee and The University
* Copyright (c) 2004-2020 The University of Tennessee and The University
* of Tennessee Research Foundation. All rights
* reserved.
* Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
@ -85,6 +85,7 @@ static ompi_mpi_errcode_t ompi_err_not_same;
static ompi_mpi_errcode_t ompi_err_no_space;
static ompi_mpi_errcode_t ompi_err_no_such_file;
static ompi_mpi_errcode_t ompi_err_port;
static ompi_mpi_errcode_t ompi_err_proc_aborted;
static ompi_mpi_errcode_t ompi_err_quota;
static ompi_mpi_errcode_t ompi_err_read_only;
static ompi_mpi_errcode_t ompi_err_rma_conflict;
@ -186,6 +187,7 @@ int ompi_mpi_errcode_init (void)
CONSTRUCT_ERRCODE( ompi_err_no_space, MPI_ERR_NO_SPACE, "MPI_ERR_NO_SPACE: no space left on device" );
CONSTRUCT_ERRCODE( ompi_err_no_such_file, MPI_ERR_NO_SUCH_FILE, "MPI_ERR_NO_SUCH_FILE: no such file or directory" );
CONSTRUCT_ERRCODE( ompi_err_port, MPI_ERR_PORT, "MPI_ERR_PORT: invalid port" );
CONSTRUCT_ERRCODE( ompi_err_proc_aborted, MPI_ERR_PROC_ABORTED, "MPI_ERR_PROC_ABORTED: operation failed because a remote peer has aborted" );
CONSTRUCT_ERRCODE( ompi_err_quota, MPI_ERR_QUOTA, "MPI_ERR_QUOTA: out of quota" );
CONSTRUCT_ERRCODE( ompi_err_read_only, MPI_ERR_READ_ONLY, "MPI_ERR_READ_ONLY: file is read only" );
CONSTRUCT_ERRCODE( ompi_err_rma_conflict, MPI_ERR_RMA_CONFLICT, "MPI_ERR_RMA_CONFLICT: rma conflict during operation" );
@ -282,6 +284,7 @@ int ompi_mpi_errcode_finalize(void)
OBJ_DESTRUCT(&ompi_err_no_space);
OBJ_DESTRUCT(&ompi_err_no_such_file);
OBJ_DESTRUCT(&ompi_err_port);
OBJ_DESTRUCT(&ompi_err_proc_aborted);
OBJ_DESTRUCT(&ompi_err_quota);
OBJ_DESTRUCT(&ompi_err_read_only);
OBJ_DESTRUCT(&ompi_err_rma_conflict);

Просмотреть файл

@ -3,7 +3,7 @@
* Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
* University Research and Technology
* Corporation. All rights reserved.
* Copyright (c) 2004-2017 The University of Tennessee and The University
* Copyright (c) 2004-2020 The University of Tennessee and The University
* of Tennessee Research Foundation. All rights
* reserved.
* Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
@ -68,6 +68,9 @@ ompi_predefined_errhandler_t *ompi_mpi_errhandler_null_addr =
ompi_predefined_errhandler_t ompi_mpi_errors_are_fatal = {{{0}}};
ompi_predefined_errhandler_t *ompi_mpi_errors_are_fatal_addr =
&ompi_mpi_errors_are_fatal;
ompi_predefined_errhandler_t ompi_mpi_errors_abort = {{{0}}};
ompi_predefined_errhandler_t *ompi_mpi_errors_abort_addr =
&ompi_mpi_errors_abort;
ompi_predefined_errhandler_t ompi_mpi_errors_return = {{{0}}};
ompi_predefined_errhandler_t *ompi_mpi_errors_return_addr =
&ompi_mpi_errors_return;
@ -127,6 +130,19 @@ int ompi_errhandler_init(void)
opal_string_copy(ompi_mpi_errors_return.eh.eh_name, "MPI_ERRORS_RETURN",
sizeof(ompi_mpi_errors_return.eh.eh_name));
OBJ_CONSTRUCT( &ompi_mpi_errors_abort.eh, ompi_errhandler_t );
if( ompi_mpi_errors_abort.eh.eh_f_to_c_index != OMPI_ERRORS_ABORT_FORTRAN )
return OMPI_ERROR;
ompi_mpi_errors_abort.eh.eh_mpi_object_type = OMPI_ERRHANDLER_TYPE_PREDEFINED;
ompi_mpi_errors_abort.eh.eh_lang = OMPI_ERRHANDLER_LANG_C;
ompi_mpi_errors_abort.eh.eh_comm_fn = ompi_mpi_errors_abort_comm_handler;
ompi_mpi_errors_abort.eh.eh_file_fn = ompi_mpi_errors_abort_file_handler;
ompi_mpi_errors_abort.eh.eh_win_fn = ompi_mpi_errors_abort_win_handler ;
ompi_mpi_errors_abort.eh.eh_fort_fn = NULL;
opal_string_copy(ompi_mpi_errors_abort.eh.eh_name,
"MPI_ERRORS_ABORT",
sizeof(ompi_mpi_errors_abort.eh.eh_name));
/* If we're going to use C++, functions will be fixed up during
MPI::Init. Note that it is proper to use ERRHANDLER_LANG_C here;
the dispatch function is in C (although in libmpi_cxx); the

Просмотреть файл

@ -3,7 +3,7 @@
* Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
* University Research and Technology
* Corporation. All rights reserved.
* Copyright (c) 2004-2011 The University of Tennessee and The University
* Copyright (c) 2004-2020 The University of Tennessee and The University
* of Tennessee Research Foundation. All rights
* reserved.
* Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
@ -49,7 +49,8 @@ BEGIN_C_DECLS
enum {
OMPI_ERRHANDLER_NULL_FORTRAN = 0,
OMPI_ERRORS_ARE_FATAL_FORTRAN,
OMPI_ERRORS_RETURN_FORTRAN
OMPI_ERRORS_RETURN_FORTRAN,
OMPI_ERRORS_ABORT_FORTRAN,
};
@ -167,6 +168,12 @@ OMPI_DECLSPEC extern ompi_predefined_errhandler_t *ompi_mpi_errors_are_fatal_add
OMPI_DECLSPEC extern ompi_predefined_errhandler_t ompi_mpi_errors_return;
OMPI_DECLSPEC extern ompi_predefined_errhandler_t *ompi_mpi_errors_return_addr;
/*
* Global variable for MPI_ERRORS_ABORT (_addr flavor is for F03 bindings)
*/
OMPI_DECLSPEC extern ompi_predefined_errhandler_t ompi_mpi_errors_abort;
OMPI_DECLSPEC extern ompi_predefined_errhandler_t *ompi_mpi_errors_abort_addr;
/**
* Global variable for MPI::ERRORS_THROW_EXCEPTIONS. Will abort if
* MPI_INIT wasn't called as MPI::INIT (_addr flavor is for F03 bindings)

Просмотреть файл

@ -3,7 +3,7 @@
* Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
* University Research and Technology
* Corporation. All rights reserved.
* Copyright (c) 2004-2005 The University of Tennessee and The University
* Copyright (c) 2004-2020 The University of Tennessee and The University
* of Tennessee Research Foundation. All rights
* reserved.
* Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
@ -41,10 +41,10 @@ int ompi_errhandler_invoke(ompi_errhandler_t *errhandler, void *mpi_object,
ompi_win_t *win;
ompi_file_t *file;
/* If we got no errorhandler, then just invoke errors_abort */
/* If we got no errorhandler, then just invoke errors_are_fatal */
if (NULL == errhandler) {
ompi_mpi_errors_are_fatal_comm_handler(NULL, NULL, message);
return err_code;
return err_code;
}
/* Figure out what kind of errhandler it is, figure out if it's

Просмотреть файл

@ -2,7 +2,7 @@
* Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
* University Research and Technology
* Corporation. All rights reserved.
* Copyright (c) 2004-2014 The University of Tennessee and The University
* Copyright (c) 2004-2020 The University of Tennessee and The University
* of Tennessee Research Foundation. All rights
* reserved.
* Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
@ -47,7 +47,7 @@
/*
* Local functions
*/
static void backend_fatal(char *type, struct ompi_communicator_t *comm,
static void backend_abort(int fatal, char *type, struct ompi_communicator_t *comm,
char *name, int *error_code, va_list arglist);
static void out(char *str, char *arg);
@ -68,7 +68,7 @@ void ompi_mpi_errors_are_fatal_comm_handler(struct ompi_communicator_t **comm,
name = NULL;
abort_comm = NULL;
}
backend_fatal("communicator", abort_comm, name, error_code, arglist);
backend_abort(true, "communicator", abort_comm, name, error_code, arglist);
va_end(arglist);
}
@ -89,7 +89,7 @@ void ompi_mpi_errors_are_fatal_file_handler(struct ompi_file_t **file,
name = NULL;
abort_comm = NULL;
}
backend_fatal("file", abort_comm, name, error_code, arglist);
backend_abort(true, "file", abort_comm, name, error_code, arglist);
va_end(arglist);
}
@ -108,7 +108,67 @@ void ompi_mpi_errors_are_fatal_win_handler(struct ompi_win_t **win,
} else {
name = NULL;
}
backend_fatal("win", abort_comm, name, error_code, arglist);
backend_abort(true, "win", abort_comm, name, error_code, arglist);
va_end(arglist);
}
/**
 * MPI_ERRORS_ABORT handler for communicators.
 *
 * Resolves the communicator (and its name) the error was raised on, then
 * hands off to backend_abort() with the "fatal" flag cleared.
 *
 * @param comm        Pointer to the communicator handle; may be NULL or
 *                    point to a NULL handle, in which case no communicator
 *                    or name is forwarded.
 * @param error_code  Pointer to the MPI error code; forwarded unchanged.
 * @param ...         Optional variadic arguments, forwarded as a va_list.
 */
void ompi_mpi_errors_abort_comm_handler(struct ompi_communicator_t **comm,
                                        int *error_code, ...)
{
    struct ompi_communicator_t *target = NULL;
    char *label = NULL;
    va_list ap;

    va_start(ap, error_code);

    /* Only read through the handle when both levels are non-NULL. */
    if (NULL != comm && NULL != *comm) {
        target = *comm;
        label = target->c_name;
    }

    backend_abort(false, "communicator", target, label, error_code, ap);
    va_end(ap);
}
/**
 * MPI_ERRORS_ABORT handler for files.
 *
 * Looks up the file name and the communicator attached to the file, then
 * hands off to backend_abort() with the "fatal" flag cleared.
 *
 * @param file        Pointer to the file handle; may be NULL or point to a
 *                    NULL handle, in which case no name or communicator is
 *                    forwarded.
 * @param error_code  Pointer to the MPI error code; forwarded unchanged.
 * @param ...         Optional variadic arguments, forwarded as a va_list.
 */
void ompi_mpi_errors_abort_file_handler(struct ompi_file_t **file,
                                        int *error_code, ...)
{
    char *name;
    struct ompi_communicator_t *abort_comm;
    va_list arglist;

    va_start(arglist, error_code);

    /* Check the handle itself as well as the pointer to it, mirroring the
     * communicator handler: dereferencing a NULL *file here would crash
     * while we are already trying to report an error. */
    if ( (NULL != file) && (NULL != *file) ) {
        name = (*file)->f_filename;
        abort_comm = (*file)->f_comm;
    } else {
        name = NULL;
        abort_comm = NULL;
    }

    backend_abort(false, "file", abort_comm, name, error_code, arglist);
    va_end(arglist);
}
/**
 * MPI_ERRORS_ABORT handler for windows.
 *
 * Looks up the window name and hands off to backend_abort() with the
 * "fatal" flag cleared. No communicator is derived from the window here;
 * backend_abort() always receives a NULL communicator from this handler.
 *
 * @param win         Pointer to the window handle; may be NULL or point to
 *                    a NULL handle, in which case no name is forwarded.
 * @param error_code  Pointer to the MPI error code; forwarded unchanged.
 * @param ...         Optional variadic arguments, forwarded as a va_list.
 */
void ompi_mpi_errors_abort_win_handler(struct ompi_win_t **win,
                                       int *error_code, ...)
{
    char *name;
    struct ompi_communicator_t *abort_comm = NULL;
    va_list arglist;

    va_start(arglist, error_code);

    /* Check the handle itself as well as the pointer to it, mirroring the
     * communicator handler: dereferencing a NULL *win here would crash
     * while we are already trying to report an error. */
    if ( (NULL != win) && (NULL != *win) ) {
        name = (*win)->w_name;
    } else {
        name = NULL;
    }

    backend_abort(false, "win", abort_comm, name, error_code, arglist);
    va_end(arglist);
}
@ -175,7 +235,7 @@ static void out(char *str, char *arg)
* there's no need to handle the pre-MPI_INIT and post-MPI_FINALIZE
* errors here.
*/
static void backend_fatal_aggregate(char *type,
static void backend_abort_aggregate(int fatal, char *type,
struct ompi_communicator_t *comm,
char *name, int *error_code,
va_list arglist)
@ -199,7 +259,7 @@ static void backend_fatal_aggregate(char *type,
ompi_process_info.nodename,
(int) ompi_process_info.pid) == -1) {
prefix = NULL;
// non-fatal, we could still go on to give useful information here...
// non-abort, we could still go on to give useful information here...
opal_output(0, "%s", "Could not write node and PID to prefix");
opal_output(0, "Node: %s", ompi_process_info.nodename);
opal_output(0, "PID: %d", (int) ompi_process_info.pid);
@ -224,7 +284,7 @@ static void backend_fatal_aggregate(char *type,
if (NULL != name) {
opal_show_help("help-mpi-errors.txt",
"mpi_errors_are_fatal",
fatal? "mpi_errors_are_fatal": "mpi_errors_abort",
false,
usable_prefix,
(NULL == arg) ? "" : "in",
@ -267,7 +327,7 @@ static void backend_fatal_aggregate(char *type,
/*
* Note that this function has to handle pre-MPI_INIT and
* post-MPI_FINALIZE errors, which backend_fatal_aggregate() does not
* post-MPI_FINALIZE errors, which backend_abort_aggregate() does not
* have to handle.
*
* This function also intentionally does not call malloc(), just in
@ -275,7 +335,7 @@ static void backend_fatal_aggregate(char *type,
* we *might* be able to get a message out if we're not further
* corrupting the stack by calling malloc()...
*/
static void backend_fatal_no_aggregate(char *type,
static void backend_abort_no_aggregate(int fatal, char *type,
struct ompi_communicator_t *comm,
char *name, int *error_code,
va_list arglist)
@ -303,7 +363,7 @@ static void backend_fatal_no_aggregate(char *type,
"*** Unfortunately, no further information is available on *which* MPI\n"
"*** function was invoked, sorry. :-(\n", NULL);
}
out("*** Your MPI job will now abort.\n", NULL);
if(fatal) out("*** Your MPI job will now abort.\n", NULL);
} else if (state >= OMPI_MPI_STATE_FINALIZE_PAST_COMM_SELF_DESTRUCT) {
if (NULL != arg) {
out("*** The %s() function was called after MPI_FINALIZE was invoked.\n"
@ -314,7 +374,7 @@ static void backend_fatal_no_aggregate(char *type,
"*** Unfortunately, no further information is available on *which* MPI\n"
"*** function was invoked, sorry. :-(\n", NULL);
}
out("*** Your MPI job will now abort.\n", NULL);
if(fatal) out("*** Your MPI job will now abort.\n", NULL);
}
else {
@ -365,23 +425,30 @@ static void backend_fatal_no_aggregate(char *type,
out("*** Error code: %d (no associated error message)\n", intbuf);
}
}
/* out("*** MPI_ERRORS_ARE_FATAL: your MPI job will now abort\n", NULL); */
out("*** MPI_ERRORS_ARE_FATAL (processes in this %s will now abort,\n", type);
out("*** and potentially your MPI job)\n", NULL);
/* out("*** MPI_ERRORS_ABORT: your MPI job will now abort\n", NULL); */
if(fatal) {
out("*** MPI_ERRORS_ARE_FATAL (processes in this %s will now abort,\n", type);
out("*** and MPI will try to terminate your MPI job as well)\n", NULL);
}
else {
out("*** MPI_ERRORS_ABORT (processes in this %s will now abort,\n", type);
out("*** and potentially the rest of your MPI job)\n", NULL);
}
}
va_end(arglist);
}
static void backend_fatal(char *type, struct ompi_communicator_t *comm,
static void backend_abort(int fatal, char *type, struct ompi_communicator_t *comm,
char *name, int *error_code,
va_list arglist)
{
int err = MPI_ERR_UNKNOWN;
/* We only want aggregation while the rte is initialized */
if (ompi_rte_initialized) {
backend_fatal_aggregate(type, comm, name, error_code, arglist);
backend_abort_aggregate(fatal, type, comm, name, error_code, arglist);
} else {
backend_fatal_no_aggregate(type, comm, name, error_code, arglist);
backend_abort_no_aggregate(fatal, type, comm, name, error_code, arglist);
}
/* In most instances the communicator will be valid. If not, we are either early in
@ -392,9 +459,9 @@ static void backend_fatal(char *type, struct ompi_communicator_t *comm,
comm = &ompi_mpi_comm_self.comm;
}
if (NULL != error_code) {
ompi_mpi_abort(comm, *error_code);
} else {
ompi_mpi_abort(comm, 1);
}
if (NULL != error_code)
err = *error_code;
/* Call abort without a specified comm to force RTE Job termination */
ompi_mpi_abort(fatal? NULL: comm, err);
}

Просмотреть файл

@ -2,7 +2,7 @@
* Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
* University Research and Technology
* Corporation. All rights reserved.
* Copyright (c) 2004-2005 The University of Tennessee and The University
* Copyright (c) 2004-2020 The University of Tennessee and The University
* of Tennessee Research Foundation. All rights
* reserved.
* Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
@ -35,6 +35,16 @@ OMPI_DECLSPEC void ompi_mpi_errors_are_fatal_file_handler(struct ompi_file_t **f
OMPI_DECLSPEC void ompi_mpi_errors_are_fatal_win_handler(struct ompi_win_t **win,
int *error_code, ...);
/**
* Handler function for MPI_ERRORS_ABORT
*/
OMPI_DECLSPEC void ompi_mpi_errors_abort_comm_handler(struct ompi_communicator_t **comm,
int *error_code, ...);
OMPI_DECLSPEC void ompi_mpi_errors_abort_file_handler(struct ompi_file_t **file,
int *error_code, ...);
OMPI_DECLSPEC void ompi_mpi_errors_abort_win_handler(struct ompi_win_t **win,
int *error_code, ...);
/**
* Handler function for MPI_ERRORS_RETURN
*/

Просмотреть файл

@ -3,7 +3,7 @@
# Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
# University Research and Technology
# Corporation. All rights reserved.
# Copyright (c) 2004-2005 The University of Tennessee and The University
# Copyright (c) 2004-2020 The University of Tennessee and The University
# of Tennessee Research Foundation. All rights
# reserved.
# Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
@ -28,7 +28,7 @@
%s *** on %s %s
%s *** %s
%s *** MPI_ERRORS_ARE_FATAL (processes in this %s will now abort,
%s *** and potentially your MPI job)
%s *** and MPI will try to terminate your MPI job as well)
#
[mpi_errors_are_fatal unknown handle]
%s *** An error occurred %s %s
@ -36,5 +36,13 @@
%s *** on a NULL %s
%s *** %s
%s *** MPI_ERRORS_ARE_FATAL (processes in this %s will now abort,
%s *** and potentially your MPI job)
%s *** and MPI will try to terminate your MPI job as well)
#
[mpi_errors_abort]
%s *** An error occurred %s %s
%s *** reported by process [%lu,%lu]
%s *** on %s %s
%s *** %s
%s *** MPI_ERRORS_ABORT (processes in this %s will now abort,
%s *** and potentially the rest of your MPI job)
#

Просмотреть файл

@ -2,7 +2,7 @@
* Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
* University Research and Technology
* Corporation. All rights reserved.
* Copyright (c) 2004-2013 The University of Tennessee and The University
* Copyright (c) 2004-2020 The University of Tennessee and The University
* of Tennessee Research Foundation. All rights
* reserved.
* Copyright (c) 2004-2007 High Performance Computing Center Stuttgart,
@ -658,6 +658,7 @@ enum {
#define MPI_ERR_RMA_SHARED 71
#define MPI_T_ERR_INVALID 72
#define MPI_T_ERR_INVALID_NAME 73
#define MPI_ERR_PROC_ABORTED 74
/* Per MPI-3 p349 47, MPI_ERR_LASTCODE must be >= the last predefined
MPI_ERR_<foo> code. Set the last code to allow some room for adding
@ -1046,6 +1047,7 @@ OMPI_DECLSPEC extern struct ompi_predefined_datatype_t ompi_mpi_c_long_double_co
OMPI_DECLSPEC extern struct ompi_predefined_errhandler_t ompi_mpi_errhandler_null;
OMPI_DECLSPEC extern struct ompi_predefined_errhandler_t ompi_mpi_errors_are_fatal;
OMPI_DECLSPEC extern struct ompi_predefined_errhandler_t ompi_mpi_errors_abort;
OMPI_DECLSPEC extern struct ompi_predefined_errhandler_t ompi_mpi_errors_return;
OMPI_DECLSPEC extern struct ompi_predefined_win_t ompi_mpi_win_null;
@ -1242,6 +1244,7 @@ OMPI_DECLSPEC extern struct ompi_predefined_datatype_t ompi_mpi_ub;
#define MPI_COUNT OMPI_PREDEFINED_GLOBAL(MPI_Datatype, ompi_mpi_count)
#define MPI_ERRORS_ARE_FATAL OMPI_PREDEFINED_GLOBAL(MPI_Errhandler, ompi_mpi_errors_are_fatal)
#define MPI_ERRORS_ABORT OMPI_PREDEFINED_GLOBAL(MPI_Errhandler, ompi_mpi_errors_abort)
#define MPI_ERRORS_RETURN OMPI_PREDEFINED_GLOBAL(MPI_Errhandler, ompi_mpi_errors_return)
/* Typeclass definition for MPI_Type_match_size */

Просмотреть файл

@ -4,6 +4,9 @@
# Copyright (c) 2016-2019 Research Organization for Information Science
# and Technology (RIST). All rights reserved.
# Copyright (c) 2016-2018 FUJITSU LIMITED. All rights reserved.
# Copyright (c) 2020 The University of Tennessee and The University
# of Tennessee Research Foundation. All rights
# reserved.
# $COPYRIGHT$
#
# Additional copyrights may follow
@ -93,6 +96,7 @@ $handles->{MPI_COMM_SELF} = 1;
$handles->{MPI_GROUP_EMPTY} = 1;
$handles->{MPI_ERRORS_ARE_FATAL} = 1;
$handles->{MPI_ERRORS_RETURN} = 2;
$handles->{MPI_ERRORS_ABORT} = 3;
$handles->{MPI_MAX} = 1;
$handles->{MPI_MIN} = 2;
@ -312,6 +316,7 @@ $constants->{MPI_ERR_NOT_SAME} = 40;
$constants->{MPI_ERR_NO_SPACE} = 41;
$constants->{MPI_ERR_NO_SUCH_FILE} = 42;
$constants->{MPI_ERR_PORT} = 43;
$constants->{MPI_ERR_PROC_ABORTED} = 74;
$constants->{MPI_ERR_QUOTA} = 44;
$constants->{MPI_ERR_READ_ONLY} = 45;
$constants->{MPI_ERR_RMA_CONFLICT} = 46;

Просмотреть файл

@ -6,6 +6,9 @@
! Copyright (c) 2015-2019 Research Organization for Information Science
! and Technology (RIST). All rights reserved.
! Copyright (c) 2018 FUJITSU LIMITED. All rights reserved.
! Copyright (c) 2020 The University of Tennessee and The University
! of Tennessee Research Foundation. All rights
! reserved.
! $COPYRIGHT$
!
! This file creates mappings between MPI C types (e.g., MPI_Comm) and
@ -85,6 +88,7 @@ module mpi_f08_types
type(MPI_Group), parameter :: MPI_GROUP_EMPTY = MPI_Group(OMPI_MPI_GROUP_EMPTY)
type(MPI_Errhandler), parameter :: MPI_ERRORS_ARE_FATAL = MPI_Errhandler(OMPI_MPI_ERRORS_ARE_FATAL)
type(MPI_Errhandler), parameter :: MPI_ERRORS_ABORT = MPI_Errhandler(OMPI_MPI_ERRORS_ABORT)
type(MPI_Errhandler), parameter :: MPI_ERRORS_RETURN = MPI_Errhandler(OMPI_MPI_ERRORS_RETURN)
type(MPI_Message), parameter :: MPI_MESSAGE_NO_PROC = MPI_Message(OMPI_MPI_MESSAGE_NO_PROC)

Просмотреть файл

@ -2,7 +2,7 @@
* Copyright (c) 2004-2007 The Trustees of Indiana University and Indiana
* University Research and Technology
* Corporation. All rights reserved.
* Copyright (c) 2004-2005 The University of Tennessee and The University
* Copyright (c) 2004-2020 The University of Tennessee and The University
* of Tennessee Research Foundation. All rights
* reserved.
* Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
@ -160,6 +160,7 @@ JNIEXPORT void JNICALL Java_mpi_Constant_setConstant(JNIEnv *env, jobject obj)
ompi_java_setIntField(env, c, obj, "ERR_NO_SPACE", MPI_ERR_NO_SPACE);
ompi_java_setIntField(env, c, obj, "ERR_NO_SUCH_FILE", MPI_ERR_NO_SUCH_FILE);
ompi_java_setIntField(env, c, obj, "ERR_PORT", MPI_ERR_PORT);
ompi_java_setIntField(env, c, obj, "ERR_PROC_ABORTED", MPI_ERR_PROC_ABORTED);
ompi_java_setIntField(env, c, obj, "ERR_QUOTA", MPI_ERR_QUOTA);
ompi_java_setIntField(env, c, obj, "ERR_READ_ONLY", MPI_ERR_READ_ONLY);
ompi_java_setIntField(env, c, obj, "ERR_RMA_CONFLICT", MPI_ERR_RMA_CONFLICT);

Просмотреть файл

@ -2,7 +2,7 @@
* Copyright (c) 2004-2007 The Trustees of Indiana University and Indiana
* University Research and Technology
* Corporation. All rights reserved.
* Copyright (c) 2004-2005 The University of Tennessee and The University
* Copyright (c) 2004-2020 The University of Tennessee and The University
* of Tennessee Research Foundation. All rights
* reserved.
* Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
@ -59,6 +59,11 @@ JNIEXPORT jlong JNICALL Java_mpi_Errhandler_getFatal(JNIEnv *env, jclass clazz)
return (jlong)MPI_ERRORS_ARE_FATAL;
}
/*
 * JNI entry point for mpi.Errhandler.getAbort(): returns the
 * MPI_ERRORS_ABORT handle converted to a jlong. The env and clazz
 * arguments are not used.
 */
JNIEXPORT jlong JNICALL Java_mpi_Errhandler_getAbort(JNIEnv *env, jclass clazz)
{
    const jlong abort_handle = (jlong)MPI_ERRORS_ABORT;

    return abort_handle;
}
JNIEXPORT jlong JNICALL Java_mpi_Errhandler_getReturn(JNIEnv *env, jclass clazz)
{
return (jlong)MPI_ERRORS_RETURN;

Просмотреть файл

@ -2,7 +2,7 @@
* Copyright (c) 2004-2007 The Trustees of Indiana University and Indiana
* University Research and Technology
* Corporation. All rights reserved.
* Copyright (c) 2004-2005 The University of Tennessee and The University
* Copyright (c) 2004-2020 The University of Tennessee and The University
* of Tennessee Research Foundation. All rights
* reserved.
* Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
@ -98,6 +98,7 @@ class Constant
protected int ERR_NO_SPACE;
protected int ERR_NO_SUCH_FILE;
protected int ERR_PORT;
protected int ERR_PROC_ABORTED;
protected int ERR_QUOTA;
protected int ERR_READ_ONLY;
protected int ERR_RMA_CONFLICT;

Просмотреть файл

@ -2,7 +2,7 @@
* Copyright (c) 2004-2007 The Trustees of Indiana University and Indiana
* University Research and Technology
* Corporation. All rights reserved.
* Copyright (c) 2004-2005 The University of Tennessee and The University
* Copyright (c) 2004-2020 The University of Tennessee and The University
* of Tennessee Research Foundation. All rights
* reserved.
* Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
@ -141,7 +141,7 @@ public final class MPI
MODE_NOSTORE, MODE_NOSUCCEED;
public static final int LOCK_EXCLUSIVE, LOCK_SHARED;
public static final Errhandler ERRORS_ARE_FATAL, ERRORS_RETURN;
public static final Errhandler ERRORS_ARE_FATAL, ERRORS_ABORT, ERRORS_RETURN;
// Error classes and codes
public static final int SUCCESS;
@ -188,6 +188,7 @@ public final class MPI
public static final int ERR_NO_SPACE;
public static final int ERR_NO_SUCH_FILE;
public static final int ERR_PORT;
public static final int ERR_PROC_ABORTED;
public static final int ERR_QUOTA;
public static final int ERR_READ_ONLY;
public static final int ERR_RMA_CONFLICT;
@ -332,6 +333,7 @@ public final class MPI
LOCK_SHARED = c.LOCK_SHARED;
ERRORS_ARE_FATAL = new Errhandler(Errhandler.getFatal());
ERRORS_ABORT = new Errhandler(Errhandler.getAbort());
ERRORS_RETURN = new Errhandler(Errhandler.getReturn());
COMM_WORLD = new Intracomm();
@ -382,6 +384,7 @@ public final class MPI
ERR_NO_SPACE = c.ERR_NO_SPACE;
ERR_NO_SUCH_FILE = c.ERR_NO_SUCH_FILE;
ERR_PORT = c.ERR_PORT;
ERR_PROC_ABORTED = c.ERR_PROC_ABORTED;
ERR_QUOTA = c.ERR_QUOTA;
ERR_READ_ONLY = c.ERR_READ_ONLY;
ERR_RMA_CONFLICT = c.ERR_RMA_CONFLICT;

Просмотреть файл

@ -107,6 +107,7 @@ Standard error return classes for Open MPI:
| MPI_ERR_NO_SPACE | 41 | Not enough space. |
| MPI_ERR_NO_SUCH_FILE | 42 | File (or directory) does not exist. |
| MPI_ERR_PORT | 43 | Invalid port. |
| MPI_ERR_PROC_ABORTED | 74 | Operation failed because a remote peer has aborted. |
| MPI_ERR_QUOTA | 44 | Quota exceeded. |
| MPI_ERR_READ_ONLY | 45 | Read-only file system. |
| MPI_ERR_RMA_CONFLICT | 46 | Conflicting accesses to window. |

Просмотреть файл

@ -3,7 +3,7 @@
* Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
* University Research and Technology
* Corporation. All rights reserved.
* Copyright (c) 2004-2014 The University of Tennessee and The University
* Copyright (c) 2004-2020 The University of Tennessee and The University
* of Tennessee Research Foundation. All rights
* reserved.
* Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
@ -69,6 +69,7 @@ static bool have_been_invoked = false;
* It would be nifty if we could differentiate between the
* abort scenarios (but we don't, currently):
* - MPI_Abort()
* - MPI_ERRORS_ABORT
* - MPI_ERRORS_ARE_FATAL
* - Victim of MPI_Abort()
*/
@ -182,7 +183,7 @@ ompi_mpi_abort(struct ompi_communicator_t* comm,
if (state >= OMPI_MPI_STATE_INIT_COMPLETED &&
state < OMPI_MPI_STATE_FINALIZE_PAST_COMM_SELF_DESTRUCT &&
NULL != comm) {
try_kill_peers(comm, errcode);
try_kill_peers(comm, errcode); /* kill only the specified groups, no return if it worked. */
}
/* We can fall through to here in a few cases:

Просмотреть файл

@ -1,4 +1,4 @@
PROGS = mpi_no_op mpi_barrier hello hello_nodename abort multi_abort simple_spawn \
PROGS = mpi_no_op mpi_barrier hello hello_nodename abort multi_abort comm_abort simple_spawn \
concurrent_spawn spawn_multiple mpi_spin delayed_abort loop_spawn loop_child \
bad_exit pubsub hello_barrier segv accept connect hello_output hello_show_help \
crisscross read_write ziatest slave reduce-hang ziaprobe ziatest bcast_loop \

174
test/simple/comm_abort.c Обычный файл
Просмотреть файл

@ -0,0 +1,174 @@
/* -*- C -*-
* Copyright (c) 2020 The University of Tennessee and The University
* of Tennessee Research Foundation. All rights
* reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
*
* $HEADER$
*
* Test aborting communicators
*/
#include <stdio.h>
#include <unistd.h>
#include "mpi.h"
#define print1(format...) if(0 == rank) printf(format)
/*
 * Test driver for communicator-scoped abort.
 *
 * MPI_COMM_WORLD is split three times into 2-process communicators
 * (color = rank/2), each with a different error handler:
 *   - comm_pair_fatal:  MPI_ERRORS_ARE_FATAL
 *   - comm_pair_return: MPI_ERRORS_RETURN
 *   - comm_pair_abort:  MPI_ERRORS_ABORT
 *
 * Five tests then run in sequence; each one triggers an abort or an error
 * handler on one rank, spins on communication for about one second, and
 * lets the remaining ranks report that they are still running.
 *
 * The final test invokes MPI_ERRORS_ARE_FATAL, so the code after it (and
 * the `return 0`) is only reached if that abort did not happen.
 */
int main(int argc, char* argv[])
{
    int rank, size, more;
    double start, now;
    MPI_Comm comm_pair_fatal, comm_pair_return, comm_pair_abort;

    MPI_Init(&argc, &argv);
    MPI_Comm_rank(MPI_COMM_WORLD, &rank);
    MPI_Comm_size(MPI_COMM_WORLD, &size);

    if (0 == rank && size % 2) {
        fprintf(stderr, "This test requires an even number of processes\n\n");
        MPI_Abort(MPI_COMM_WORLD, size);
    }

    /* Setup: split our world in a set of 2-processes islands */
    MPI_Comm_split(MPI_COMM_WORLD, rank / 2, rank, &comm_pair_fatal);
    MPI_Comm_set_errhandler(comm_pair_fatal, MPI_ERRORS_ARE_FATAL);
    MPI_Comm_split(MPI_COMM_WORLD, rank / 2, rank, &comm_pair_return);
    MPI_Comm_set_errhandler(comm_pair_return, MPI_ERRORS_RETURN);
    MPI_Comm_split(MPI_COMM_WORLD, rank / 2, rank, &comm_pair_abort);
    /* If this code fails to compile, the MPI implementation is not compliant
     * with MPI-4 (TODO: add ref to chapter/line when MPI-4 published). */
    MPI_Comm_set_errhandler(comm_pair_abort, MPI_ERRORS_ABORT);

    MPI_Barrier(MPI_COMM_WORLD);
    print1(
        "This program will test partial abort functionality (communicator scoped abort).\n"
        " Each test will perform a loop of communication on a subcommunicator for about\n"
        " 1 second between printouts, and then, a 1 second cooldown.\n");

    /* ---- Test1: abort on MPI_COMM_SELF from rank 1 ---- */
    print1("\n\n"
           "Test1: MPI_Abort(MPI_COMM_SELF) aborts only one process?\n"
           " In a high quality implementation, all ranks except %d\n"
           " should report their presence.\n", 1);
    if (rank == 1) {
        MPI_Abort(MPI_COMM_SELF, 1);
    }
    /* Spin on communication for 1 second to give the abort time to take
     * effect, if it has any. */
    more = 1; start = MPI_Wtime();
    do {
        now = MPI_Wtime();
        if (now - start > 1.) {
            more = 0;
        }
        if (rank > 1) { /* don't reduce on aborted pairs */
            MPI_Allreduce(MPI_IN_PLACE, &more, 1, MPI_INT, MPI_MIN, comm_pair_fatal);
        }
    } while (more);
    printf(" This is rank %d: still kickin after %d MPI_Abort'ed self\n", rank, 1);
    sleep(1);
    print1("===============================================================\n");

    /* ---- Test2: abort on the MPI_ERRORS_RETURN pair from rank 3 ---- */
    print1("\n\n"
           "Test2: MPI_Abort(comm) aborts all processes in comm?\n"
           " In a high quality implementation, all ranks except %d--%d\n"
           " should report their presence.\n", 1, 3);
    if (rank == 3) {
        MPI_Abort(comm_pair_return, 2);
    }
    /* Spin on communication for 1 second to give the abort time to take
     * effect, if it has any. */
    more = 1; start = MPI_Wtime();
    do {
        now = MPI_Wtime();
        if (now - start > 1.) {
            more = 0;
        }
        if (rank > 3) { /* don't reduce on aborted pairs */
            MPI_Allreduce(MPI_IN_PLACE, &more, 1, MPI_INT, MPI_MIN, comm_pair_fatal);
        }
    } while (more);
    printf(" This is rank %d: still kickin after %d aborted comm pair %d-%d\n", rank, 3, 2, 3);
    /* This process should have aborted, give it an opportunity to do so if no
     * async progress: message to self to spin MPI progress. */
    if (rank == 2) {
        MPI_Sendrecv(&start, 1, MPI_DOUBLE, 0, 0,
                     &now, 1, MPI_DOUBLE, 0, 0,
                     MPI_COMM_SELF, MPI_STATUS_IGNORE);
        printf(" This is rank %d: ERROR: I SHOULD HAVE ABORTED!\n", 2);
    }
    sleep(1);
    print1("===============================================================\n");

    /* ---- Test3: invoke the MPI_ERRORS_ABORT handler from rank 5 ---- */
    print1("\n\n"
           "Test3: MPI_ERRORS_ABORT aborts all processes in comm?\n"
           " In a high quality implementation, all ranks except %d--%d\n"
           " should report their presence.\n", 1, 5);
    if (rank == 5) {
        MPI_Comm_call_errhandler(comm_pair_abort, 3);
    }
    /* Spin on communication for 1 second to give the abort time to take
     * effect, if it has any. */
    more = 1; start = MPI_Wtime();
    do {
        now = MPI_Wtime();
        if (now - start > 1.) {
            more = 0;
        }
        if (rank > 5) { /* don't reduce on aborted pairs */
            MPI_Allreduce(MPI_IN_PLACE, &more, 1, MPI_INT, MPI_MIN, comm_pair_fatal);
        }
    } while (more);
    printf(" This is rank %d: still kickin after %d aborted comm pair %d-%d\n", rank, 5, 4, 5);
    /* This process should have aborted, give it an opportunity to do so if no
     * async progress: message to self to spin MPI progress. */
    if (rank == 4) {
        MPI_Sendrecv(&start, 1, MPI_DOUBLE, 0, 0,
                     &now, 1, MPI_DOUBLE, 0, 0,
                     MPI_COMM_SELF, MPI_STATUS_IGNORE);
        printf(" This is rank %d: ERROR: I SHOULD HAVE ABORTED!\n", 4);
    }
    sleep(1);
    print1("===============================================================\n");

    /* ---- Test4: receive from the rank that aborted in Test1 ---- */
    print1("\n\n"
           "Test4: Communicating with an aborted process %d returns a good error code?\n"
           " In a high quality implementation, rank %d should print an error string;\n"
           " In a higher quality implementation the error should be of class\n"
           " MPI_ERR_PROC_ABORTED.\n", 1, 0);
    if (rank == 0) {
        int err, class, slen;
        char str[MPI_MAX_ERROR_STRING];
        /* remember, 1 aborted in test1 */
        err = MPI_Recv(&more, 1, MPI_INT, 1, 0, comm_pair_return, MPI_STATUS_IGNORE);
        /* Decode the code returned by the receive; this has to happen after
         * the call so that `err` holds a defined value. */
        MPI_Error_class(err, &class);
        MPI_Error_string(err, str, &slen);
        printf(" This is rank %d: Recv(from=%d) returned code=%d: class=%d: %s\n", 0, 1, err, class, str);
    }
    sleep(1);
    print1("===============================================================\n");

    /* ---- Test5: invoke the MPI_ERRORS_ARE_FATAL handler from rank 0 ---- */
    print1("\n\n"
           "Test5: MPI_ERRORS_ARE_FATAL aborts all processes?\n");
    if (rank == 0) {
        MPI_Comm_call_errhandler(comm_pair_fatal, 5);
    }
    /* Spin on communication for 1 second to give the abort time to take
     * effect, if it has any. */
    more = 1; start = MPI_Wtime();
    do {
        now = MPI_Wtime();
        if (now - start > 1.) {
            more = 0;
        }
        if (rank > 5) { /* don't reduce on aborted pairs */
            MPI_Allreduce(MPI_IN_PLACE, &more, 1, MPI_INT, MPI_MIN, comm_pair_fatal);
        }
    } while (more);
    /* Message to self to spin MPI progress, giving the abort one more
     * opportunity to be noticed. */
    MPI_Sendrecv(&start, 1, MPI_DOUBLE, 0, 0,
                 &now, 1, MPI_DOUBLE, 0, 0,
                 MPI_COMM_SELF, MPI_STATUS_IGNORE);
    printf(" This is rank %d: ERROR: I SHOULD HAVE ABORTED!\n", rank);

    /* Should never get there */
    MPI_Finalize();
    return 0;
}