From 3cd85a9ec5b68ca5d3000ed6bf94ace4a90c36b8 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Aur=C3=A9lien=20Bouteiller?= Date: Tue, 9 Jun 2020 08:25:28 -0400 Subject: [PATCH] Add the initial_errhandler info key to MPI_INFO_ENV and populate the value from prun populated parameters MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: Aurélien Bouteiller Allow errhandlers to invoke the initial error handler before MPI_INIT Signed-off-by: Aurelien Bouteiller Indentation Signed-off-by: Aurelien Bouteiller --- ompi/errhandler/errhandler.c | 58 ++++++++++++++++++++++++++++++++++-- ompi/errhandler/errhandler.h | 20 +++++++++++++ ompi/info/info.c | 6 ++++ ompi/runtime/ompi_mpi_init.c | 12 ++++---- ompi/runtime/ompi_rte.c | 5 ++++ opal/util/proc.c | 3 +- opal/util/proc.h | 1 + 7 files changed, 95 insertions(+), 10 deletions(-) diff --git a/ompi/errhandler/errhandler.c b/ompi/errhandler/errhandler.c index 50d4c60fd5..6af34c5a42 100644 --- a/ompi/errhandler/errhandler.c +++ b/ompi/errhandler/errhandler.c @@ -78,6 +78,55 @@ ompi_predefined_errhandler_t ompi_mpi_errors_throw_exceptions = {{{0}}}; ompi_predefined_errhandler_t *ompi_mpi_errors_throw_exceptions_addr = &ompi_mpi_errors_throw_exceptions; +static opal_mutex_t errhandler_init_lock = OPAL_MUTEX_STATIC_INIT; +ompi_errhandler_t* ompi_initial_error_handler_eh = NULL; +void (*ompi_initial_error_handler)(struct ompi_communicator_t **comm, int *error_code, ...) = NULL; + +/* + * Initialize the initial errhandler infrastructure only. + * This does not allocate any memory and does not require a corresponding fini. 
+ */ +int ompi_initial_errhandler_init(void) { + opal_mutex_lock(&errhandler_init_lock); + if ( NULL != ompi_initial_error_handler ) { + /* Already initialized (presumably by an API call before MPI_init) */ + opal_mutex_unlock(&errhandler_init_lock); + return OMPI_SUCCESS; + } + + /* If it has been requested from the launch keys, set the initial + * error handler that will be attached by default with predefined + * communicators. We use an env because that can be obtained before + * OPAL and PMIx initialization. + */ + char *env = getenv("OMPI_MCA_mpi_initial_errhandler"); + if( NULL != env ) { + if( 0 == strcasecmp(env, "mpi_errors_are_fatal") ) { + ompi_initial_error_handler = &ompi_mpi_errors_are_fatal_comm_handler; + ompi_initial_error_handler_eh = &ompi_mpi_errors_are_fatal.eh; + } + else if( 0 == strcasecmp(env, "mpi_errors_abort") ) { + ompi_initial_error_handler = &ompi_mpi_errors_abort_comm_handler; + ompi_initial_error_handler_eh = &ompi_mpi_errors_abort.eh; + } + else if( 0 == strcasecmp(env, "mpi_errors_return") ) { + ompi_initial_error_handler = &ompi_mpi_errors_return_comm_handler; + ompi_initial_error_handler_eh = &ompi_mpi_errors_return.eh; + } + else { + /* invalid entry detected, ignore it, set fatal by default */ + opal_output(0, "WARNING: invalid value for launch key 'mpi_initial_errhandler'; defaulting to 'mpi_errors_are_fatal'."); + ompi_initial_error_handler = &ompi_mpi_errors_are_fatal_comm_handler; + ompi_initial_error_handler_eh = &ompi_mpi_errors_are_fatal.eh; + } + } + else { + ompi_initial_error_handler = &ompi_mpi_errors_are_fatal_comm_handler; + ompi_initial_error_handler_eh = &ompi_mpi_errors_are_fatal.eh; + } + opal_mutex_unlock(&errhandler_init_lock); + return OMPI_SUCCESS; +} /* * Initialize OMPI errhandler infrastructure @@ -163,9 +212,12 @@ int ompi_errhandler_init(void) "MPI_ERRORS_THROW_EXCEPTIONS", sizeof(ompi_mpi_errors_throw_exceptions.eh.eh_name)); - /* All done */ - - return OMPI_SUCCESS; + /* Lets initialize the initial 
error handler if not already done */ + char *env = getenv("OMPI_MCA_mpi_initial_errhandler"); + if( NULL != env ) { + ompi_process_info.initial_errhandler = strndup(env, MPI_MAX_INFO_VAL); + } + return ompi_initial_errhandler_init(); } diff --git a/ompi/errhandler/errhandler.h b/ompi/errhandler/errhandler.h index 1df48c32a4..139740089f 100644 --- a/ompi/errhandler/errhandler.h +++ b/ompi/errhandler/errhandler.h @@ -185,6 +185,26 @@ OMPI_DECLSPEC extern ompi_predefined_errhandler_t ompi_mpi_errors_throw_exceptio */ OMPI_DECLSPEC extern opal_pointer_array_t ompi_errhandler_f_to_c_table; +/** + * This function selects the initial error handler. + * It may be called during MPI_INIT, or during the first MPI call + * that raises an error. This function does not allocate memory, + * and will only populate the ompi_initial_error_handler_eh and + * ompi_initial_error_handler pointers with predefined error handler + * and error handler functions aliases. + */ +OMPI_DECLSPEC int ompi_initial_errhandler_init(void); +/** + * The initial error handler pointer. Will be set to alias one of the + * predefined error handlers through launch keys during the first MPI call, + * and will then be attached to predefined communicators. + */ +OMPI_DECLSPEC extern ompi_errhandler_t* ompi_initial_error_handler_eh; +/** + * The initial error handler function pointer. Will be called when an error + * is raised before MPI_INIT or after MPI_FINALIZE. 
+ */ +OMPI_DECLSPEC extern void (*ompi_initial_error_handler)(struct ompi_communicator_t **comm, int *error_code, ...); /** * Forward declaration so that we don't have to include diff --git a/ompi/info/info.c b/ompi/info/info.c index c5bc171f7a..ba51bdc2d3 100644 --- a/ompi/info/info.c +++ b/ompi/info/info.c @@ -131,6 +131,12 @@ int ompi_mpiinfo_init(void) opal_info_set(&ompi_mpi_info_env.info.super, "soft", cptr); free(cptr); + /* the initial error handler, set it as requested (nothing if not + * requested) */ + if (NULL != ompi_process_info.initial_errhandler) { + opal_info_set(&ompi_mpi_info_env.info.super, "mpi_initial_errhandler", ompi_process_info.initial_errhandler); + } + /* local host name */ opal_info_set(&ompi_mpi_info_env.info.super, "host", ompi_process_info.nodename); diff --git a/ompi/runtime/ompi_mpi_init.c b/ompi/runtime/ompi_mpi_init.c index 17d7186400..62f689df76 100644 --- a/ompi/runtime/ompi_mpi_init.c +++ b/ompi/runtime/ompi_mpi_init.c @@ -751,12 +751,6 @@ int ompi_mpi_init(int argc, char **argv, int requested, int *provided, goto error; } - /* initialize info */ - if (OMPI_SUCCESS != (ret = ompi_mpiinfo_init())) { - error = "ompi_info_init() failed"; - goto error; - } - /* initialize error handlers */ if (OMPI_SUCCESS != (ret = ompi_errhandler_init())) { error = "ompi_errhandler_init() failed"; @@ -775,6 +769,12 @@ int ompi_mpi_init(int argc, char **argv, int requested, int *provided, goto error; } + /* initialize info */ + if (OMPI_SUCCESS != (ret = ompi_mpiinfo_init())) { + error = "ompi_info_init() failed"; + goto error; + } + /* initialize groups */ if (OMPI_SUCCESS != (ret = ompi_group_init())) { error = "ompi_group_init() failed"; diff --git a/ompi/runtime/ompi_rte.c b/ompi/runtime/ompi_rte.c index bad581cf25..b8f5932651 100644 --- a/ompi/runtime/ompi_rte.c +++ b/ompi/runtime/ompi_rte.c @@ -935,6 +935,11 @@ int ompi_rte_finalize(void) opal_process_info.initial_wdir = NULL; } + if (NULL != opal_process_info.initial_errhandler) { + 
free(opal_process_info.initial_errhandler); + opal_process_info.initial_errhandler = NULL; + } + /* cleanup our internal nspace hack */ opal_pmix_finalize_nspace_tracker(); diff --git a/opal/util/proc.c b/opal/util/proc.c index 05b2bbea7a..26973fdd61 100644 --- a/opal/util/proc.c +++ b/opal/util/proc.c @@ -51,7 +51,8 @@ opal_process_info_t opal_process_info = { .num_apps = 0, .initial_wdir = NULL, .reincarnation = 0, - .proc_is_bound = false + .proc_is_bound = false, + .initial_errhandler = NULL, }; static opal_proc_t opal_local_proc = { diff --git a/opal/util/proc.h b/opal/util/proc.h index c7b5928794..785a6f7ec9 100644 --- a/opal/util/proc.h +++ b/opal/util/proc.h @@ -126,6 +126,7 @@ typedef struct opal_process_info_t { char *initial_wdir; uint32_t reincarnation; bool proc_is_bound; + char *initial_errhandler; } opal_process_info_t; OPAL_DECLSPEC extern opal_process_info_t opal_process_info;