From 6a3e781d5c3820f884c62af908e4f53201270252 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Aur=C3=A9lien=20Bouteiller?= Date: Mon, 4 May 2020 14:39:14 -0400 Subject: [PATCH 1/3] Add ERR_PROC_ABORTED (compliance with mpi-next) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: Aurélien Bouteiller --- ompi/errhandler/errcode.c | 5 ++++- ompi/include/mpi.h.in | 3 ++- ompi/include/mpif-values.pl | 4 ++++ ompi/mpi/fortran/use-mpi-f08/mod/mpi-f08-types.F90 | 3 +++ ompi/mpi/java/c/mpi_Constant.c | 3 ++- ompi/mpi/java/java/Constant.java | 3 ++- ompi/mpi/java/java/MPI.java | 4 +++- ompi/mpi/man/man5/Open-MPI.5.md | 1 + 8 files changed, 21 insertions(+), 5 deletions(-) diff --git a/ompi/errhandler/errcode.c b/ompi/errhandler/errcode.c index 03225085b3..91c430d91f 100644 --- a/ompi/errhandler/errcode.c +++ b/ompi/errhandler/errcode.c @@ -3,7 +3,7 @@ * Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana * University Research and Technology * Corporation. All rights reserved. - * Copyright (c) 2004-2017 The University of Tennessee and The University + * Copyright (c) 2004-2020 The University of Tennessee and The University * of Tennessee Research Foundation. All rights * reserved. 
* Copyright (c) 2004-2005 High Performance Computing Center Stuttgart, @@ -85,6 +85,7 @@ static ompi_mpi_errcode_t ompi_err_not_same; static ompi_mpi_errcode_t ompi_err_no_space; static ompi_mpi_errcode_t ompi_err_no_such_file; static ompi_mpi_errcode_t ompi_err_port; +static ompi_mpi_errcode_t ompi_err_proc_aborted; static ompi_mpi_errcode_t ompi_err_quota; static ompi_mpi_errcode_t ompi_err_read_only; static ompi_mpi_errcode_t ompi_err_rma_conflict; @@ -186,6 +187,7 @@ int ompi_mpi_errcode_init (void) CONSTRUCT_ERRCODE( ompi_err_no_space, MPI_ERR_NO_SPACE, "MPI_ERR_NO_SPACE: no space left on device" ); CONSTRUCT_ERRCODE( ompi_err_no_such_file, MPI_ERR_NO_SUCH_FILE, "MPI_ERR_NO_SUCH_FILE: no such file or directory" ); CONSTRUCT_ERRCODE( ompi_err_port, MPI_ERR_PORT, "MPI_ERR_PORT: invalid port" ); + CONSTRUCT_ERRCODE( ompi_err_proc_aborted, MPI_ERR_PROC_ABORTED, "MPI_ERR_PROC_ABORTED: operation failed because a remote peer has aborted" ); CONSTRUCT_ERRCODE( ompi_err_quota, MPI_ERR_QUOTA, "MPI_ERR_QUOTA: out of quota" ); CONSTRUCT_ERRCODE( ompi_err_read_only, MPI_ERR_READ_ONLY, "MPI_ERR_READ_ONLY: file is read only" ); CONSTRUCT_ERRCODE( ompi_err_rma_conflict, MPI_ERR_RMA_CONFLICT, "MPI_ERR_RMA_CONFLICT: rma conflict during operation" ); @@ -282,6 +284,7 @@ int ompi_mpi_errcode_finalize(void) OBJ_DESTRUCT(&ompi_err_no_space); OBJ_DESTRUCT(&ompi_err_no_such_file); OBJ_DESTRUCT(&ompi_err_port); + OBJ_DESTRUCT(&ompi_err_proc_aborted); OBJ_DESTRUCT(&ompi_err_quota); OBJ_DESTRUCT(&ompi_err_read_only); OBJ_DESTRUCT(&ompi_err_rma_conflict); diff --git a/ompi/include/mpi.h.in b/ompi/include/mpi.h.in index d4851dd8ea..d04e46010a 100644 --- a/ompi/include/mpi.h.in +++ b/ompi/include/mpi.h.in @@ -2,7 +2,7 @@ * Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana * University Research and Technology * Corporation. All rights reserved. 
- * Copyright (c) 2004-2013 The University of Tennessee and The University + * Copyright (c) 2004-2020 The University of Tennessee and The University * of Tennessee Research Foundation. All rights * reserved. * Copyright (c) 2004-2007 High Performance Computing Center Stuttgart, @@ -640,6 +640,7 @@ enum { #define MPI_ERR_RMA_SHARED 71 #define MPI_T_ERR_INVALID 72 #define MPI_T_ERR_INVALID_NAME 73 +#define MPI_ERR_PROC_ABORTED 74 /* Per MPI-3 p349 47, MPI_ERR_LASTCODE must be >= the last predefined MPI_ERR_ code. Set the last code to allow some room for adding diff --git a/ompi/include/mpif-values.pl b/ompi/include/mpif-values.pl index 7e6bd742d2..91133fe8f7 100755 --- a/ompi/include/mpif-values.pl +++ b/ompi/include/mpif-values.pl @@ -4,6 +4,9 @@ # Copyright (c) 2016-2019 Research Organization for Information Science # and Technology (RIST). All rights reserved. # Copyright (c) 2016-2018 FUJITSU LIMITED. All rights reserved. +# Copyright (c) 2020 The University of Tennessee and The University +# of Tennessee Research Foundation. All rights +# reserved. # $COPYRIGHT$ # # Additional copyrights may follow @@ -312,6 +315,7 @@ $constants->{MPI_ERR_NOT_SAME} = 40; $constants->{MPI_ERR_NO_SPACE} = 41; $constants->{MPI_ERR_NO_SUCH_FILE} = 42; $constants->{MPI_ERR_PORT} = 43; +$constants->{MPI_ERR_PROC_ABORTED} = 74; $constants->{MPI_ERR_QUOTA} = 44; $constants->{MPI_ERR_READ_ONLY} = 45; $constants->{MPI_ERR_RMA_CONFLICT} = 46; diff --git a/ompi/mpi/fortran/use-mpi-f08/mod/mpi-f08-types.F90 b/ompi/mpi/fortran/use-mpi-f08/mod/mpi-f08-types.F90 index 7fb8985504..46f8ec3ac4 100644 --- a/ompi/mpi/fortran/use-mpi-f08/mod/mpi-f08-types.F90 +++ b/ompi/mpi/fortran/use-mpi-f08/mod/mpi-f08-types.F90 @@ -6,6 +6,9 @@ ! Copyright (c) 2015-2019 Research Organization for Information Science ! and Technology (RIST). All rights reserved. ! Copyright (c) 2018 FUJITSU LIMITED. All rights reserved. +! Copyright (c) 2020 The University of Tennessee and The University +! 
of Tennessee Research Foundation. All rights +! reserved. ! $COPYRIGHT$ ! ! This file creates mappings between MPI C types (e.g., MPI_Comm) and diff --git a/ompi/mpi/java/c/mpi_Constant.c b/ompi/mpi/java/c/mpi_Constant.c index 20d180b8e6..06884743e6 100644 --- a/ompi/mpi/java/c/mpi_Constant.c +++ b/ompi/mpi/java/c/mpi_Constant.c @@ -2,7 +2,7 @@ * Copyright (c) 2004-2007 The Trustees of Indiana University and Indiana * University Research and Technology * Corporation. All rights reserved. - * Copyright (c) 2004-2005 The University of Tennessee and The University + * Copyright (c) 2004-2020 The University of Tennessee and The University * of Tennessee Research Foundation. All rights * reserved. * Copyright (c) 2004-2005 High Performance Computing Center Stuttgart, @@ -160,6 +160,7 @@ JNIEXPORT void JNICALL Java_mpi_Constant_setConstant(JNIEnv *env, jobject obj) ompi_java_setIntField(env, c, obj, "ERR_NO_SPACE", MPI_ERR_NO_SPACE); ompi_java_setIntField(env, c, obj, "ERR_NO_SUCH_FILE", MPI_ERR_NO_SUCH_FILE); ompi_java_setIntField(env, c, obj, "ERR_PORT", MPI_ERR_PORT); + ompi_java_setIntField(env, c, obj, "ERR_PROC_ABORTED", MPI_ERR_PROC_ABORTED); ompi_java_setIntField(env, c, obj, "ERR_QUOTA", MPI_ERR_QUOTA); ompi_java_setIntField(env, c, obj, "ERR_READ_ONLY", MPI_ERR_READ_ONLY); ompi_java_setIntField(env, c, obj, "ERR_RMA_CONFLICT", MPI_ERR_RMA_CONFLICT); diff --git a/ompi/mpi/java/java/Constant.java b/ompi/mpi/java/java/Constant.java index e3e47fb90a..a5e95708b7 100644 --- a/ompi/mpi/java/java/Constant.java +++ b/ompi/mpi/java/java/Constant.java @@ -2,7 +2,7 @@ * Copyright (c) 2004-2007 The Trustees of Indiana University and Indiana * University Research and Technology * Corporation. All rights reserved. - * Copyright (c) 2004-2005 The University of Tennessee and The University + * Copyright (c) 2004-2020 The University of Tennessee and The University * of Tennessee Research Foundation. All rights * reserved. 
* Copyright (c) 2004-2005 High Performance Computing Center Stuttgart, @@ -98,6 +98,7 @@ class Constant protected int ERR_NO_SPACE; protected int ERR_NO_SUCH_FILE; protected int ERR_PORT; + protected int ERR_PROC_ABORTED; protected int ERR_QUOTA; protected int ERR_READ_ONLY; protected int ERR_RMA_CONFLICT; diff --git a/ompi/mpi/java/java/MPI.java b/ompi/mpi/java/java/MPI.java index a5e96e0b04..9ff0482146 100644 --- a/ompi/mpi/java/java/MPI.java +++ b/ompi/mpi/java/java/MPI.java @@ -2,7 +2,7 @@ * Copyright (c) 2004-2007 The Trustees of Indiana University and Indiana * University Research and Technology * Corporation. All rights reserved. - * Copyright (c) 2004-2005 The University of Tennessee and The University + * Copyright (c) 2004-2020 The University of Tennessee and The University * of Tennessee Research Foundation. All rights * reserved. * Copyright (c) 2004-2005 High Performance Computing Center Stuttgart, @@ -188,6 +188,7 @@ public final class MPI public static final int ERR_NO_SPACE; public static final int ERR_NO_SUCH_FILE; public static final int ERR_PORT; + public static final int ERR_PROC_ABORTED; public static final int ERR_QUOTA; public static final int ERR_READ_ONLY; public static final int ERR_RMA_CONFLICT; @@ -382,6 +383,7 @@ public final class MPI ERR_NO_SPACE = c.ERR_NO_SPACE; ERR_NO_SUCH_FILE = c.ERR_NO_SUCH_FILE; ERR_PORT = c.ERR_PORT; + ERR_PROC_ABORTED = c.ERR_PROC_ABORTED; ERR_QUOTA = c.ERR_QUOTA; ERR_READ_ONLY = c.ERR_READ_ONLY; ERR_RMA_CONFLICT = c.ERR_RMA_CONFLICT; diff --git a/ompi/mpi/man/man5/Open-MPI.5.md b/ompi/mpi/man/man5/Open-MPI.5.md index a2f474e065..0748efc698 100644 --- a/ompi/mpi/man/man5/Open-MPI.5.md +++ b/ompi/mpi/man/man5/Open-MPI.5.md @@ -107,6 +107,7 @@ Standard error return classes for Open MPI: | MPI_ERR_NO_SPACE | 41 | Not enough space. | | MPI_ERR_NO_SUCH_FILE | 42 | File (or directory) does not exist. | | MPI_ERR_PORT | 43 | Invalid port. 
| +| MPI_ERR_PROC_ABORTED | 74 | Operation failed because a remote peer has aborted. | | MPI_ERR_QUOTA | 44 | Quota exceeded. | | MPI_ERR_READ_ONLY | 45 | Read-only file system. | | MPI_ERR_RMA_CONFLICT | 46 | Conflicting accesses to window. | From 9c22ad84aff1a4f687b7f58543afb52d13631bbc Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Aur=C3=A9lien=20Bouteiller?= Date: Tue, 5 May 2020 09:36:44 -0400 Subject: [PATCH 2/3] Add the MPI_ERRORS_ABORT predefined error handler (conformance with mpi-next) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: Aurélien Bouteiller Ordering must match fortran definition index for errhandlers, and we don't want to change the old ones. Signed-off-by: Aurélien Bouteiller --- contrib/check_unnecessary_headers.sh | 4 +- ompi/debuggers/MPI_Handles_interface.txt | 4 +- ompi/debuggers/ompi_mpihandles_dll.c | 4 +- ompi/errhandler/errhandler.c | 18 ++- ompi/errhandler/errhandler.h | 11 +- ompi/errhandler/errhandler_invoke.c | 6 +- ompi/errhandler/errhandler_predefined.c | 115 ++++++++++++++---- ompi/errhandler/errhandler_predefined.h | 12 +- ompi/errhandler/help-mpi-errors.txt | 14 ++- ompi/include/mpi.h.in | 2 + ompi/include/mpif-values.pl | 1 + .../fortran/use-mpi-f08/mod/mpi-f08-types.F90 | 1 + ompi/mpi/java/c/mpi_Errhandler.c | 7 +- ompi/mpi/java/java/MPI.java | 3 +- ompi/runtime/ompi_mpi_abort.c | 5 +- 15 files changed, 164 insertions(+), 43 deletions(-) diff --git a/contrib/check_unnecessary_headers.sh b/contrib/check_unnecessary_headers.sh index 601121af31..af2c786ed8 100644 --- a/contrib/check_unnecessary_headers.sh +++ b/contrib/check_unnecessary_headers.sh @@ -3,7 +3,7 @@ # Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana # University Research and Technology # Corporation. All rights reserved. 
-# Copyright (c) 2004-2005 The University of Tennessee and The University +# Copyright (c) 2004-2020 The University of Tennessee and The University # of Tennessee Research Foundation. All rights # reserved. # Copyright (c) 2004-2005 High Performance Computing Center Stuttgart, @@ -186,7 +186,7 @@ SEARCH_HEADER[5]="ompi/datatype/convertor.h OMPI_COMM_INTER OMPI_COMM_CART OMPI_ SEARCH_HEADER[6]="ompi/datatype/datatype.h MPI_Datatype DT_MAX_PREDEFINED DT_FLAG_ MAX_DT_COMPONENT_COUNT opal_ddt_count_t dt_type_desc_t ompi_datatype_t ompi_predefined_datatype_t ompi_ddt_init ompi_ddt_finalize ompi_ddt_create_ ompi_ddt_duplicate ompi_ddt_is_predefined ompi_ddt_create_from_packed_description" SEARCH_HEADER[7]="ompi/datatype/datatype_internal.h DDT_DUMP_STACK DT_ ddt_elem_id_description ddt_elem_desc ddt_elem_desc_t ddt_loop_desc ddt_loop_desc_t ddt_endloop_desc ddt_endloop_desc_t dt_elem_desc CREATE_LOOP_START CREATE_LOOP_END CREATE_ELEM ompi_complex_float_t ompi_complex_double_t ompi_complex_long_double_t ompi_ddt_basicDatatypes BASIC_DDT_FROM_ELEM ompi_ddt_default_convertors_init ompi_ddt_default_convertors_fini SAVE_STACK PUSH_STACK ompi_ddt_safeguard_pointer_debug_breakpoint OMPI_DDT_SAFEGUARD_POINTER GET_FIRST_NON_LOOP UPDATE_INTERNAL_COUNTERS ompi_ddt_print_args" SEARCH_HEADER[8]="ompi/errhandler/errhandler.h OMPI_ERRHANDLER_LANG_ ompi_errhandler_lang_t OMPI_ERRHANDLER_TYPE_ ompi_errhandler_type_t ompi_errhandler_t ompi_predefined_errhandler_t ompi_mpi_errhandler_null OMPI_ERRHANDLER_CHECK OMPI_ERRHANDLER_RETURN ompi_errhandler_init ompi_errhandler_finalize OMPI_ERRHANDLER_INVOKE ompi_errhandler_invoke ompi_errhandler_request_invoke ompi_errhandler_create ompi_errhandler_is_intrinsic ompi_errhandler_fortran_handler_fn_t OMPI_ERR_INIT_FINALIZE MPI_Errhandler" -SEARCH_HEADER[9]="ompi/errhandler/errhandler_predefined.h ompi_mpi_errors_are_fatal_ ompi_mpi_errors_return_ ompi_mpi_errors_throw_exceptions" +SEARCH_HEADER[9]="ompi/errhandler/errhandler_predefined.h 
ompi_mpi_errors_are_fatal_ ompi_mpi_errors_return_ ompi_mpi_errors_abort_ ompi_mpi_errors_throw_exceptions" ### SEARCH_HEADER[10]="ompi/file/file.h OMPI_FILE_ISCLOSED OMPI_FILE_HIDDEN ompi_file_t ompi_predefined_file_t ompi_mpi_file_null ompi_file_f_to_c_table ompi_file_init ompi_file_open ompi_file_set_name ompi_file_close ompi_file_finalize ompi_file_invalid MPI_File MPI_FILE_NULL ompi_mpi_cxx_file_errhandler_invoke" # THE LAST ONE WAS FOR THE CXX INTERFACE SEARCH_HEADER[11]="ompi/group/group.h ompi_group_sporadic_list_t ompi_group_sporadic_data_t ompi_group_strided_data_t ompi_group_bitmap_data_t ompi_group_t ompi_predefined_group_t OMPI_GROUP_ ompi_group_f_to_c_table ompi_mpi_group_null ompi_group_allocate ompi_group_increment_proc_count ompi_group_decrement_proc_count ompi_group_size ompi_group_rank ompi_set_group_rank ompi_group_translate_ranks ompi_group_free ompi_group_get_proc_ptr ompi_group_calc_ ompi_group_peer_lookup ompi_group_div_ceil MPI_Group" diff --git a/ompi/debuggers/MPI_Handles_interface.txt b/ompi/debuggers/MPI_Handles_interface.txt index f81bef8528..863425bd22 100644 --- a/ompi/debuggers/MPI_Handles_interface.txt +++ b/ompi/debuggers/MPI_Handles_interface.txt @@ -314,8 +314,8 @@ C++: MPI::Errhandler MPI allows applications to define their own error handlers. The default error handler is to abort the MPI job. Error handlers can be -attached to communicators, files, and windows. There are 3 predefined -error handlers (MPI_ERRORS_ARE_FATAL, MPI_ERRORS_RETURN, +attached to communicators, files, and windows. There are 4 predefined +error handlers (MPI_ERRORS_ARE_FATAL, MPI_ERRORS_RETURN, MPI_ERRORS_ABORT, MPI::ERRORS_THROW_EXCEPTIONS), and applications can create their own error handlers. 
diff --git a/ompi/debuggers/ompi_mpihandles_dll.c b/ompi/debuggers/ompi_mpihandles_dll.c index 131040b57f..ff6a65e4cf 100644 --- a/ompi/debuggers/ompi_mpihandles_dll.c +++ b/ompi/debuggers/ompi_mpihandles_dll.c @@ -1,6 +1,6 @@ /* * Copyright (c) 2007-2008 Cisco Systems, Inc. All rights reserved. - * Copyright (c) 2004-2013 The University of Tennessee and The University + * Copyright (c) 2004-2020 The University of Tennessee and The University * of Tennessee Research Foundation. All rights * reserved. * Copyright (c) 2008 Sun Microsystems, Inc. All rights reserved. @@ -384,6 +384,8 @@ int mpidbg_init_per_process(mqs_process *process, int i = 0; fill_map(image, "MPI_ERRORS_ARE_FATAL", "ompi_mpi_errors_are_fatal", &mpidbg_errhandler_name_map[i++]); + fill_map(image, "MPI_ERRORS_ABORT", "ompi_mpi_errors_abort", + &mpidbg_errhandler_name_map[i++]); fill_map(image, "MPI_ERRORS_RETURN", "ompi_mpi_errors_return", &mpidbg_errhandler_name_map[i++]); fill_map(image, "MPI_ERRHANDLER_NULL", "ompi_mpi_errhandler_null", diff --git a/ompi/errhandler/errhandler.c b/ompi/errhandler/errhandler.c index b81a4e3006..50d4c60fd5 100644 --- a/ompi/errhandler/errhandler.c +++ b/ompi/errhandler/errhandler.c @@ -3,7 +3,7 @@ * Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana * University Research and Technology * Corporation. All rights reserved. - * Copyright (c) 2004-2017 The University of Tennessee and The University + * Copyright (c) 2004-2020 The University of Tennessee and The University * of Tennessee Research Foundation. All rights * reserved. 
* Copyright (c) 2004-2005 High Performance Computing Center Stuttgart, @@ -68,6 +68,9 @@ ompi_predefined_errhandler_t *ompi_mpi_errhandler_null_addr = ompi_predefined_errhandler_t ompi_mpi_errors_are_fatal = {{{0}}}; ompi_predefined_errhandler_t *ompi_mpi_errors_are_fatal_addr = &ompi_mpi_errors_are_fatal; +ompi_predefined_errhandler_t ompi_mpi_errors_abort = {{{0}}}; +ompi_predefined_errhandler_t *ompi_mpi_errors_abort_addr = + &ompi_mpi_errors_abort; ompi_predefined_errhandler_t ompi_mpi_errors_return = {{{0}}}; ompi_predefined_errhandler_t *ompi_mpi_errors_return_addr = &ompi_mpi_errors_return; @@ -127,6 +130,19 @@ int ompi_errhandler_init(void) opal_string_copy(ompi_mpi_errors_return.eh.eh_name, "MPI_ERRORS_RETURN", sizeof(ompi_mpi_errors_return.eh.eh_name)); + OBJ_CONSTRUCT( &ompi_mpi_errors_abort.eh, ompi_errhandler_t ); + if( ompi_mpi_errors_abort.eh.eh_f_to_c_index != OMPI_ERRORS_ABORT_FORTRAN ) + return OMPI_ERROR; + ompi_mpi_errors_abort.eh.eh_mpi_object_type = OMPI_ERRHANDLER_TYPE_PREDEFINED; + ompi_mpi_errors_abort.eh.eh_lang = OMPI_ERRHANDLER_LANG_C; + ompi_mpi_errors_abort.eh.eh_comm_fn = ompi_mpi_errors_abort_comm_handler; + ompi_mpi_errors_abort.eh.eh_file_fn = ompi_mpi_errors_abort_file_handler; + ompi_mpi_errors_abort.eh.eh_win_fn = ompi_mpi_errors_abort_win_handler ; + ompi_mpi_errors_abort.eh.eh_fort_fn = NULL; + opal_string_copy(ompi_mpi_errors_abort.eh.eh_name, + "MPI_ERRORS_ABORT", + sizeof(ompi_mpi_errors_abort.eh.eh_name)); + /* If we're going to use C++, functions will be fixed up during MPI::Init. Note that it is proper to use ERRHANDLER_LANG_C here; the dispatch function is in C (although in libmpi_cxx); the diff --git a/ompi/errhandler/errhandler.h b/ompi/errhandler/errhandler.h index 558478225c..1df48c32a4 100644 --- a/ompi/errhandler/errhandler.h +++ b/ompi/errhandler/errhandler.h @@ -3,7 +3,7 @@ * Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana * University Research and Technology * Corporation. 
All rights reserved. - * Copyright (c) 2004-2011 The University of Tennessee and The University + * Copyright (c) 2004-2020 The University of Tennessee and The University * of Tennessee Research Foundation. All rights * reserved. * Copyright (c) 2004-2005 High Performance Computing Center Stuttgart, @@ -49,7 +49,8 @@ BEGIN_C_DECLS enum { OMPI_ERRHANDLER_NULL_FORTRAN = 0, OMPI_ERRORS_ARE_FATAL_FORTRAN, - OMPI_ERRORS_RETURN_FORTRAN + OMPI_ERRORS_RETURN_FORTRAN, + OMPI_ERRORS_ABORT_FORTRAN, }; @@ -167,6 +168,12 @@ OMPI_DECLSPEC extern ompi_predefined_errhandler_t *ompi_mpi_errors_are_fatal_add OMPI_DECLSPEC extern ompi_predefined_errhandler_t ompi_mpi_errors_return; OMPI_DECLSPEC extern ompi_predefined_errhandler_t *ompi_mpi_errors_return_addr; +/* + * Global variable for MPI_ERRORS_ABORT (_addr flavor is for F03 bindings) + */ +OMPI_DECLSPEC extern ompi_predefined_errhandler_t ompi_mpi_errors_abort; +OMPI_DECLSPEC extern ompi_predefined_errhandler_t *ompi_mpi_errors_abort_addr; + /** * Global variable for MPI::ERRORS_THROW_EXCEPTIONS. Will abort if * MPI_INIT wasn't called as MPI::INIT (_addr flavor is for F03 bindings) diff --git a/ompi/errhandler/errhandler_invoke.c b/ompi/errhandler/errhandler_invoke.c index 789308865c..ad96696715 100644 --- a/ompi/errhandler/errhandler_invoke.c +++ b/ompi/errhandler/errhandler_invoke.c @@ -3,7 +3,7 @@ * Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana * University Research and Technology * Corporation. All rights reserved. - * Copyright (c) 2004-2005 The University of Tennessee and The University + * Copyright (c) 2004-2020 The University of Tennessee and The University * of Tennessee Research Foundation. All rights * reserved. 
* Copyright (c) 2004-2005 High Performance Computing Center Stuttgart, @@ -41,10 +41,10 @@ int ompi_errhandler_invoke(ompi_errhandler_t *errhandler, void *mpi_object, ompi_win_t *win; ompi_file_t *file; - /* If we got no errorhandler, then just invoke errors_abort */ + /* If we got no errorhandler, then just invoke errors_are_fatal */ if (NULL == errhandler) { ompi_mpi_errors_are_fatal_comm_handler(NULL, NULL, message); - return err_code; + return err_code; } /* Figure out what kind of errhandler it is, figure out if it's diff --git a/ompi/errhandler/errhandler_predefined.c b/ompi/errhandler/errhandler_predefined.c index f46f79a223..4c9353465f 100644 --- a/ompi/errhandler/errhandler_predefined.c +++ b/ompi/errhandler/errhandler_predefined.c @@ -2,7 +2,7 @@ * Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana * University Research and Technology * Corporation. All rights reserved. - * Copyright (c) 2004-2014 The University of Tennessee and The University + * Copyright (c) 2004-2020 The University of Tennessee and The University * of Tennessee Research Foundation. All rights * reserved. 
* Copyright (c) 2004-2005 High Performance Computing Center Stuttgart, @@ -47,7 +47,7 @@ /* * Local functions */ -static void backend_fatal(char *type, struct ompi_communicator_t *comm, +static void backend_abort(int fatal, char *type, struct ompi_communicator_t *comm, char *name, int *error_code, va_list arglist); static void out(char *str, char *arg); @@ -68,7 +68,7 @@ void ompi_mpi_errors_are_fatal_comm_handler(struct ompi_communicator_t **comm, name = NULL; abort_comm = NULL; } - backend_fatal("communicator", abort_comm, name, error_code, arglist); + backend_abort(true, "communicator", abort_comm, name, error_code, arglist); va_end(arglist); } @@ -89,7 +89,7 @@ void ompi_mpi_errors_are_fatal_file_handler(struct ompi_file_t **file, name = NULL; abort_comm = NULL; } - backend_fatal("file", abort_comm, name, error_code, arglist); + backend_abort(true, "file", abort_comm, name, error_code, arglist); va_end(arglist); } @@ -108,7 +108,67 @@ void ompi_mpi_errors_are_fatal_win_handler(struct ompi_win_t **win, } else { name = NULL; } - backend_fatal("win", abort_comm, name, error_code, arglist); + backend_abort(true, "win", abort_comm, name, error_code, arglist); + va_end(arglist); +} + +void ompi_mpi_errors_abort_comm_handler(struct ompi_communicator_t **comm, + int *error_code, ...) +{ + char *name; + struct ompi_communicator_t *abort_comm; + va_list arglist; + + va_start(arglist, error_code); + + if ( (NULL != comm) && (NULL != *comm) ) { + name = (*comm)->c_name; + abort_comm = *comm; + } else { + name = NULL; + abort_comm = NULL; + } + backend_abort(false, "communicator", abort_comm, name, error_code, arglist); + va_end(arglist); +} + + +void ompi_mpi_errors_abort_file_handler(struct ompi_file_t **file, + int *error_code, ...) 
+{ + char *name; + struct ompi_communicator_t *abort_comm; + va_list arglist; + + va_start(arglist, error_code); + + if (NULL != file) { + name = (*file)->f_filename; + abort_comm = (*file)->f_comm; + } else { + name = NULL; + abort_comm = NULL; + } + backend_abort(false, "file", abort_comm, name, error_code, arglist); + va_end(arglist); +} + + +void ompi_mpi_errors_abort_win_handler(struct ompi_win_t **win, + int *error_code, ...) +{ + char *name; + struct ompi_communicator_t *abort_comm = NULL; + va_list arglist; + + va_start(arglist, error_code); + + if (NULL != win) { + name = (*win)->w_name; + } else { + name = NULL; + } + backend_abort(false, "win", abort_comm, name, error_code, arglist); + va_end(arglist); } @@ -175,7 +235,7 @@ static void out(char *str, char *arg) * there's no need to handle the pre-MPI_INIT and post-MPI_FINALIZE * errors here. */ -static void backend_fatal_aggregate(char *type, +static void backend_abort_aggregate(int fatal, char *type, struct ompi_communicator_t *comm, char *name, int *error_code, va_list arglist) @@ -199,7 +259,7 @@ static void backend_fatal_aggregate(char *type, ompi_process_info.nodename, (int) ompi_process_info.pid) == -1) { prefix = NULL; - // non-fatal, we could still go on to give useful information here... + // non-fatal, we could still go on to give useful information here... opal_output(0, "%s", "Could not write node and PID to prefix"); opal_output(0, "Node: %s", ompi_process_info.nodename); opal_output(0, "PID: %d", (int) ompi_process_info.pid); @@ -224,7 +284,7 @@ static void backend_fatal_aggregate(char *type, if (NULL != name) { opal_show_help("help-mpi-errors.txt", - "mpi_errors_are_fatal", + fatal? "mpi_errors_are_fatal": "mpi_errors_abort", false, usable_prefix, (NULL == arg) ?
"" : "in", @@ -267,7 +327,7 @@ static void backend_fatal_aggregate(char *type, /* * Note that this function has to handle pre-MPI_INIT and - * post-MPI_FINALIZE errors, which backend_fatal_aggregate() does not + * post-MPI_FINALIZE errors, which backend_abort_aggregate() does not * have to handle. * * This function also intentionally does not call malloc(), just in @@ -275,7 +335,7 @@ static void backend_fatal_aggregate(char *type, * we *might* be able to get a message out if we're not further * corrupting the stack by calling malloc()... */ -static void backend_fatal_no_aggregate(char *type, +static void backend_abort_no_aggregate(int fatal, char *type, struct ompi_communicator_t *comm, char *name, int *error_code, va_list arglist) @@ -303,7 +363,7 @@ static void backend_fatal_no_aggregate(char *type, "*** Unfortunately, no further information is available on *which* MPI\n" "*** function was invoked, sorry. :-(\n", NULL); } - out("*** Your MPI job will now abort.\n", NULL); + if(fatal) out("*** Your MPI job will now abort.\n", NULL); } else if (state >= OMPI_MPI_STATE_FINALIZE_PAST_COMM_SELF_DESTRUCT) { if (NULL != arg) { out("*** The %s() function was called after MPI_FINALIZE was invoked.\n" @@ -314,7 +374,7 @@ static void backend_fatal_no_aggregate(char *type, "*** Unfortunately, no further information is available on *which* MPI\n" "*** function was invoked, sorry. 
:-(\n", NULL); } - out("*** Your MPI job will now abort.\n", NULL); + if(fatal) out("*** Your MPI job will now abort.\n", NULL); } else { @@ -365,23 +425,30 @@ static void backend_fatal_no_aggregate(char *type, out("*** Error code: %d (no associated error message)\n", intbuf); } } - /* out("*** MPI_ERRORS_ARE_FATAL: your MPI job will now abort\n", NULL); */ - out("*** MPI_ERRORS_ARE_FATAL (processes in this %s will now abort,\n", type); - out("*** and potentially your MPI job)\n", NULL); - + /* out("*** MPI_ERRORS_ABORT: your MPI job will now abort\n", NULL); */ + if(fatal) { + out("*** MPI_ERRORS_ARE_FATAL (processes in this %s will now abort,\n", type); + out("*** and MPI will try to terminate your MPI job as well)\n", NULL); + } + else { + out("*** MPI_ERRORS_ABORT (processes in this %s will now abort,\n", type); + out("*** and potentially the rest of your MPI job)\n", NULL); + } } va_end(arglist); } -static void backend_fatal(char *type, struct ompi_communicator_t *comm, +static void backend_abort(int fatal, char *type, struct ompi_communicator_t *comm, char *name, int *error_code, va_list arglist) { + int err = MPI_ERR_UNKNOWN; + /* We only want aggregation while the rte is initialized */ if (ompi_rte_initialized) { - backend_fatal_aggregate(type, comm, name, error_code, arglist); + backend_abort_aggregate(fatal, type, comm, name, error_code, arglist); } else { - backend_fatal_no_aggregate(type, comm, name, error_code, arglist); + backend_abort_no_aggregate(fatal, type, comm, name, error_code, arglist); } /* In most instances the communicator will be valid. If not, we are either early in @@ -392,9 +459,9 @@ static void backend_fatal(char *type, struct ompi_communicator_t *comm, comm = &ompi_mpi_comm_self.comm; } - if (NULL != error_code) { - ompi_mpi_abort(comm, *error_code); - } else { - ompi_mpi_abort(comm, 1); - } + if (NULL != error_code) + err = *error_code; + + /* Call abort without a specified comm to force RTE Job termination */ + ompi_mpi_abort(fatal? 
NULL: comm, err); } diff --git a/ompi/errhandler/errhandler_predefined.h b/ompi/errhandler/errhandler_predefined.h index 60fe9f4075..07e306e9a0 100644 --- a/ompi/errhandler/errhandler_predefined.h +++ b/ompi/errhandler/errhandler_predefined.h @@ -2,7 +2,7 @@ * Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana * University Research and Technology * Corporation. All rights reserved. - * Copyright (c) 2004-2005 The University of Tennessee and The University + * Copyright (c) 2004-2020 The University of Tennessee and The University * of Tennessee Research Foundation. All rights * reserved. * Copyright (c) 2004-2005 High Performance Computing Center Stuttgart, @@ -35,6 +35,16 @@ OMPI_DECLSPEC void ompi_mpi_errors_are_fatal_file_handler(struct ompi_file_t **f OMPI_DECLSPEC void ompi_mpi_errors_are_fatal_win_handler(struct ompi_win_t **win, int *error_code, ...); +/** + * Handler function for MPI_ERRORS_ABORT + */ +OMPI_DECLSPEC void ompi_mpi_errors_abort_comm_handler(struct ompi_communicator_t **comm, + int *error_code, ...); +OMPI_DECLSPEC void ompi_mpi_errors_abort_file_handler(struct ompi_file_t **file, + int *error_code, ...); +OMPI_DECLSPEC void ompi_mpi_errors_abort_win_handler(struct ompi_win_t **win, + int *error_code, ...); + /** * Handler function for MPI_ERRORS_RETURN */ diff --git a/ompi/errhandler/help-mpi-errors.txt b/ompi/errhandler/help-mpi-errors.txt index a6dcf7172e..c2dd034b84 100644 --- a/ompi/errhandler/help-mpi-errors.txt +++ b/ompi/errhandler/help-mpi-errors.txt @@ -3,7 +3,7 @@ # Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana # University Research and Technology # Corporation. All rights reserved. -# Copyright (c) 2004-2005 The University of Tennessee and The University +# Copyright (c) 2004-2020 The University of Tennessee and The University # of Tennessee Research Foundation. All rights # reserved. 
# Copyright (c) 2004-2005 High Performance Computing Center Stuttgart, @@ -28,7 +28,7 @@ %s *** on %s %s %s *** %s %s *** MPI_ERRORS_ARE_FATAL (processes in this %s will now abort, -%s *** and potentially your MPI job) +%s *** and MPI will try to terminate your MPI job as well) # [mpi_errors_are_fatal unknown handle] %s *** An error occurred %s %s @@ -36,5 +36,13 @@ %s *** on a NULL %s %s *** %s %s *** MPI_ERRORS_ARE_FATAL (processes in this %s will now abort, -%s *** and potentially your MPI job) +%s *** and MPI will try to terminate your MPI job as well) +# +[mpi_errors_abort] +%s *** An error occurred %s %s +%s *** reported by process [%lu,%lu] +%s *** on %s %s +%s *** %s +%s *** MPI_ERRORS_ABORT (processes in this %s will now abort, +%s *** and potentially the rest of your MPI job) # diff --git a/ompi/include/mpi.h.in b/ompi/include/mpi.h.in index d04e46010a..092a699164 100644 --- a/ompi/include/mpi.h.in +++ b/ompi/include/mpi.h.in @@ -1029,6 +1029,7 @@ OMPI_DECLSPEC extern struct ompi_predefined_datatype_t ompi_mpi_c_long_double_co OMPI_DECLSPEC extern struct ompi_predefined_errhandler_t ompi_mpi_errhandler_null; OMPI_DECLSPEC extern struct ompi_predefined_errhandler_t ompi_mpi_errors_are_fatal; +OMPI_DECLSPEC extern struct ompi_predefined_errhandler_t ompi_mpi_errors_abort; OMPI_DECLSPEC extern struct ompi_predefined_errhandler_t ompi_mpi_errors_return; OMPI_DECLSPEC extern struct ompi_predefined_win_t ompi_mpi_win_null; @@ -1225,6 +1226,7 @@ OMPI_DECLSPEC extern struct ompi_predefined_datatype_t ompi_mpi_ub; #define MPI_COUNT OMPI_PREDEFINED_GLOBAL(MPI_Datatype, ompi_mpi_count) #define MPI_ERRORS_ARE_FATAL OMPI_PREDEFINED_GLOBAL(MPI_Errhandler, ompi_mpi_errors_are_fatal) +#define MPI_ERRORS_ABORT OMPI_PREDEFINED_GLOBAL(MPI_Errhandler, ompi_mpi_errors_abort) #define MPI_ERRORS_RETURN OMPI_PREDEFINED_GLOBAL(MPI_Errhandler, ompi_mpi_errors_return) /* Typeclass definition for MPI_Type_match_size */ diff --git a/ompi/include/mpif-values.pl 
b/ompi/include/mpif-values.pl index 91133fe8f7..8e8a19957f 100755 --- a/ompi/include/mpif-values.pl +++ b/ompi/include/mpif-values.pl @@ -96,6 +96,7 @@ $handles->{MPI_COMM_SELF} = 1; $handles->{MPI_GROUP_EMPTY} = 1; $handles->{MPI_ERRORS_ARE_FATAL} = 1; $handles->{MPI_ERRORS_RETURN} = 2; +$handles->{MPI_ERRORS_ABORT} = 3; $handles->{MPI_MAX} = 1; $handles->{MPI_MIN} = 2; diff --git a/ompi/mpi/fortran/use-mpi-f08/mod/mpi-f08-types.F90 b/ompi/mpi/fortran/use-mpi-f08/mod/mpi-f08-types.F90 index 46f8ec3ac4..ed3fc9388b 100644 --- a/ompi/mpi/fortran/use-mpi-f08/mod/mpi-f08-types.F90 +++ b/ompi/mpi/fortran/use-mpi-f08/mod/mpi-f08-types.F90 @@ -88,6 +88,7 @@ module mpi_f08_types type(MPI_Group), parameter :: MPI_GROUP_EMPTY = MPI_Group(OMPI_MPI_GROUP_EMPTY) type(MPI_Errhandler), parameter :: MPI_ERRORS_ARE_FATAL = MPI_Errhandler(OMPI_MPI_ERRORS_ARE_FATAL) + type(MPI_Errhandler), parameter :: MPI_ERRORS_ABORT = MPI_Errhandler(OMPI_MPI_ERRORS_ABORT) type(MPI_Errhandler), parameter :: MPI_ERRORS_RETURN = MPI_Errhandler(OMPI_MPI_ERRORS_RETURN) type(MPI_Message), parameter :: MPI_MESSAGE_NO_PROC = MPI_Message(OMPI_MPI_MESSAGE_NO_PROC) diff --git a/ompi/mpi/java/c/mpi_Errhandler.c b/ompi/mpi/java/c/mpi_Errhandler.c index 793bcbbb51..de09b13a61 100644 --- a/ompi/mpi/java/c/mpi_Errhandler.c +++ b/ompi/mpi/java/c/mpi_Errhandler.c @@ -2,7 +2,7 @@ * Copyright (c) 2004-2007 The Trustees of Indiana University and Indiana * University Research and Technology * Corporation. All rights reserved. - * Copyright (c) 2004-2005 The University of Tennessee and The University + * Copyright (c) 2004-2020 The University of Tennessee and The University * of Tennessee Research Foundation. All rights * reserved. 
* Copyright (c) 2004-2005 High Performance Computing Center Stuttgart, @@ -59,6 +59,11 @@ JNIEXPORT jlong JNICALL Java_mpi_Errhandler_getFatal(JNIEnv *env, jclass clazz) return (jlong)MPI_ERRORS_ARE_FATAL; } +JNIEXPORT jlong JNICALL Java_mpi_Errhandler_getAbort(JNIEnv *env, jclass clazz) +{ + return (jlong)MPI_ERRORS_ABORT; +} + JNIEXPORT jlong JNICALL Java_mpi_Errhandler_getReturn(JNIEnv *env, jclass clazz) { return (jlong)MPI_ERRORS_RETURN; diff --git a/ompi/mpi/java/java/MPI.java b/ompi/mpi/java/java/MPI.java index 9ff0482146..3764cf340d 100644 --- a/ompi/mpi/java/java/MPI.java +++ b/ompi/mpi/java/java/MPI.java @@ -141,7 +141,7 @@ public final class MPI MODE_NOSTORE, MODE_NOSUCCEED; public static final int LOCK_EXCLUSIVE, LOCK_SHARED; - public static final Errhandler ERRORS_ARE_FATAL, ERRORS_RETURN; + public static final Errhandler ERRORS_ARE_FATAL, ERRORS_ABORT, ERRORS_RETURN; // Error classes and codes public static final int SUCCESS; @@ -333,6 +333,7 @@ public final class MPI LOCK_SHARED = c.LOCK_SHARED; ERRORS_ARE_FATAL = new Errhandler(Errhandler.getFatal()); + ERRORS_ABORT = new Errhandler(Errhandler.getAbort()); ERRORS_RETURN = new Errhandler(Errhandler.getReturn()); COMM_WORLD = new Intracomm(); diff --git a/ompi/runtime/ompi_mpi_abort.c b/ompi/runtime/ompi_mpi_abort.c index a42109b5de..1c9961215c 100644 --- a/ompi/runtime/ompi_mpi_abort.c +++ b/ompi/runtime/ompi_mpi_abort.c @@ -3,7 +3,7 @@ * Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana * University Research and Technology * Corporation. All rights reserved. - * Copyright (c) 2004-2014 The University of Tennessee and The University + * Copyright (c) 2004-2020 The University of Tennessee and The University * of Tennessee Research Foundation. All rights * reserved. 
* Copyright (c) 2004-2005 High Performance Computing Center Stuttgart, @@ -69,6 +69,7 @@ static bool have_been_invoked = false; * It would be nifty if we could differentiate between the * abort scenarios (but we don't, currently): * - MPI_Abort() + * - MPI_ERRORS_ABORT * - MPI_ERRORS_ARE_FATAL * - Victim of MPI_Abort() */ @@ -182,7 +183,7 @@ ompi_mpi_abort(struct ompi_communicator_t* comm, if (state >= OMPI_MPI_STATE_INIT_COMPLETED && state < OMPI_MPI_STATE_FINALIZE_PAST_COMM_SELF_DESTRUCT && NULL != comm) { - try_kill_peers(comm, errcode); + try_kill_peers(comm, errcode); /* kill only the specified groups, no return if it worked. */ } /* We can fall through to here in a few cases: From e2f53b76fbe183ea11497da51bf4ceb00d2b89f0 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Aur=C3=A9lien=20Bouteiller?= Date: Tue, 5 May 2020 14:23:47 -0400 Subject: [PATCH 3/3] Add a tester for the ERRORS_ABORT and communicator abort features MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: Aurélien Bouteiller --- test/simple/Makefile | 2 +- test/simple/comm_abort.c | 174 +++++++++++++++++++++++++++++++++++++++ 2 files changed, 175 insertions(+), 1 deletion(-) create mode 100644 test/simple/comm_abort.c diff --git a/test/simple/Makefile b/test/simple/Makefile index 7f7679bbb1..00e76123e8 100644 --- a/test/simple/Makefile +++ b/test/simple/Makefile @@ -1,4 +1,4 @@ -PROGS = mpi_no_op mpi_barrier hello hello_nodename abort multi_abort simple_spawn \ +PROGS = mpi_no_op mpi_barrier hello hello_nodename abort multi_abort comm_abort simple_spawn \ concurrent_spawn spawn_multiple mpi_spin delayed_abort loop_spawn loop_child \ bad_exit pubsub hello_barrier segv accept connect hello_output hello_show_help \ crisscross read_write ziatest slave reduce-hang ziaprobe ziatest bcast_loop \ diff --git a/test/simple/comm_abort.c b/test/simple/comm_abort.c new file mode 100644 index 0000000000..7acb8fad4b --- /dev/null +++ b/test/simple/comm_abort.c @@ 
-0,0 +1,174 @@
+/* -*- C -*-
+ * Copyright (c) 2020 The University of Tennessee and The University
+ *                    of Tennessee Research Foundation. All rights
+ *                    reserved.
+ * $COPYRIGHT$
+ *
+ * Additional copyrights may follow
+ *
+ * $HEADER$
+ *
+ * Test aborting communicators (MPI_Abort on sub-communicators, MPI_ERRORS_ABORT)
+ */
+
+#include <stdio.h>
+#include <unistd.h>
+#include "mpi.h"
+
+#define print1(...) do { if(0 == rank) printf(__VA_ARGS__); } while(0)
+
+/* Runs Test1..Test5 in order; ranks 0-5 have dedicated roles, so use >= 6 processes. */
+int main(int argc, char* argv[])
+{
+    int rank, size, more;
+    double start, now;
+    MPI_Comm comm_pair_fatal, comm_pair_return, comm_pair_abort;
+
+    MPI_Init(&argc, &argv);
+    MPI_Comm_rank(MPI_COMM_WORLD, &rank);
+    MPI_Comm_size(MPI_COMM_WORLD, &size);
+
+    if(0 == rank && size%2) {
+        fprintf(stderr, "This test requires an even number of processes\n\n");
+        MPI_Abort(MPI_COMM_WORLD, size);
+    }
+
+    /* Setup: split our world in a set of 2-processes islands */
+    MPI_Comm_split(MPI_COMM_WORLD, rank/2, rank, &comm_pair_fatal);
+    MPI_Comm_set_errhandler(comm_pair_fatal, MPI_ERRORS_ARE_FATAL);
+    MPI_Comm_split(MPI_COMM_WORLD, rank/2, rank, &comm_pair_return);
+    MPI_Comm_set_errhandler(comm_pair_return, MPI_ERRORS_RETURN);
+    MPI_Comm_split(MPI_COMM_WORLD, rank/2, rank, &comm_pair_abort);
+    /* If this code fails to compile, the MPI implementation is not compliant
+     * with MPI-4 (TODO: add ref to chapter/line when MPI-4 published). */
+    MPI_Comm_set_errhandler(comm_pair_abort, MPI_ERRORS_ABORT);
+    MPI_Barrier(MPI_COMM_WORLD);
+
+    print1(
+"This program will test partial abort functionality (communicator scoped abort).\n"
+" Each test will perform a loop of communication on a subcommunicator for about\n"
+" 1 second between printouts, and then, a 1 second cooldown.\n");
+
+    print1("\n\n"
+"Test1: MPI_Abort(MPI_COMM_SELF) aborts only one process?\n"
+" In a high quality implementation, all ranks except %d\n"
+" should report their presence.\n", 1);
+    if(rank == 1) {
+        MPI_Abort(MPI_COMM_SELF, 1);
+    }
+    /* Spin on communication for 1 second to give Abort time to take
+     * effect, if any. */
+    more = 1; start = MPI_Wtime();
+    do {
+        now = MPI_Wtime();
+        if(now - start > 1.) more = 0;
+        if(rank > 1) /* don't reduce on aborted pairs */
+            MPI_Allreduce(MPI_IN_PLACE, &more, 1, MPI_INT, MPI_MIN, comm_pair_fatal);
+    } while(more);
+    printf(" This is rank %d: still kickin after %d MPI_Abort'ed self\n", rank, 1);
+
+    sleep(1);
+    print1("===============================================================\n");
+
+    print1("\n\n"
+"Test2: MPI_Abort(comm) aborts all processes in comm?\n"
+" In a high quality implementation, all ranks except %d--%d\n"
+" should report their presence.\n", 1, 3);
+    if(rank == 3) {
+        MPI_Abort(comm_pair_return, 2);
+    }
+    /* Spin on communication for 1 second to give Abort time to take
+     * effect, if any. */
+    more = 1; start = MPI_Wtime();
+    do {
+        now = MPI_Wtime();
+        if(now - start > 1.) more = 0;
+        if(rank > 3) /* don't reduce on aborted pairs */
+            MPI_Allreduce(MPI_IN_PLACE, &more, 1, MPI_INT, MPI_MIN, comm_pair_fatal);
+    } while(more);
+    printf(" This is rank %d: still kickin after %d aborted comm pair %d-%d\n", rank, 3, 2, 3);
+
+    /* This process should have aborted, give it an opportunity to do so if no
+     * async progress: message to self to spin MPI progress. */
+    if(rank == 2) {
+        MPI_Sendrecv(&start, 1, MPI_DOUBLE, 0, 0,
+                     &now, 1, MPI_DOUBLE, 0, 0,
+                     MPI_COMM_SELF, MPI_STATUS_IGNORE);
+        printf(" This is rank %d: ERROR: I SHOULD HAVE ABORTED!\n", 2);
+    }
+
+    sleep(1);
+    print1("===============================================================\n");
+
+    print1("\n\n"
+"Test3: MPI_ERRORS_ABORT aborts all processes in comm?\n"
+" In a high quality implementation, all ranks except %d--%d\n"
+" should report their presence.\n", 1, 5);
+    if(rank == 5) {
+        MPI_Comm_call_errhandler(comm_pair_abort, 3);
+    }
+    /* Spin on communication for 1 second to give Abort time to take
+     * effect, if any. */
+    more = 1; start = MPI_Wtime();
+    do {
+        now = MPI_Wtime();
+        if(now - start > 1.) more = 0;
+        if(rank > 5) /* don't reduce on aborted pairs */
+            MPI_Allreduce(MPI_IN_PLACE, &more, 1, MPI_INT, MPI_MIN, comm_pair_fatal);
+    } while(more);
+    printf(" This is rank %d: still kickin after %d aborted comm pair %d-%d\n", rank, 5, 4, 5);
+
+    /* This process should have aborted, give it an opportunity to do so if no
+     * async progress: message to self to spin MPI progress. */
+    if(rank == 4) {
+        MPI_Sendrecv(&start, 1, MPI_DOUBLE, 0, 0,
+                     &now, 1, MPI_DOUBLE, 0, 0,
+                     MPI_COMM_SELF, MPI_STATUS_IGNORE);
+        printf(" This is rank %d: ERROR: I SHOULD HAVE ABORTED!\n", 4);
+    }
+
+    sleep(1);
+    print1("===============================================================\n");
+
+    print1("\n\n"\
+"Test4: Communicating with an aborted process %d returns a good error code?\n"
+" In a high quality implementation, rank %d should print an error string;\n"
+" In a higher quality implementation the error should be of class\n"
+" MPI_ERR_PROC_ABORTED.\n", 1, 0);
+    if(rank == 0) {
+        int err, eclass, slen;
+        char str[MPI_MAX_ERROR_STRING];
+        /* remember, 1 aborted in test1; Recv first so err is set before it is decoded */
+        err = MPI_Recv(&more, 1, MPI_INT, 1, 0, comm_pair_return, MPI_STATUS_IGNORE);
+        MPI_Error_class(err, &eclass);
+        MPI_Error_string(err, str, &slen);
+        printf(" This is rank %d: Recv(from=%d) returned code=%d: class=%d: %s\n", 0, 1, err, eclass, str);
+    }
+
+    sleep(1);
+    print1("===============================================================\n");
+
+    print1("\n\n"
+"Test5: MPI_ERRORS_ARE_FATAL aborts all processes?\n");
+    if(rank == 0) {
+        MPI_Comm_call_errhandler(comm_pair_fatal, 5);
+    }
+    /* Spin on communication for 1 second to give Abort time to take
+     * effect, if any. */
+    more = 1; start = MPI_Wtime();
+    do {
+        now = MPI_Wtime();
+        if(now - start > 1.) more = 0;
+        if(rank > 5) /* don't reduce on aborted pairs */
+            MPI_Allreduce(MPI_IN_PLACE, &more, 1, MPI_INT, MPI_MIN, comm_pair_fatal);
+    } while(more);
+    MPI_Sendrecv(&start, 1, MPI_DOUBLE, 0, 0,
+                 &now, 1, MPI_DOUBLE, 0, 0,
+                 MPI_COMM_SELF, MPI_STATUS_IGNORE);
+    printf(" This is rank %d: ERROR: I SHOULD HAVE ABORTED!\n", rank);
+
+    /* Should never get there */
+
+    MPI_Finalize();
+    return 0;
+}