1
1

mpi: infrastructure to gracefully disable MPI dyn procs

Add ompi_mpi_dynamics_disable() function to disable MPI dynamic
process functionality (i.e., such that if MPI_COMM_SPAWN/etc. are
invoked, you'll get a show_help error explaining that MPI dynamic
process functionality is disabled in this environment -- instead of a
potentially-cryptic network or hardware error).

Fixes 
Этот коммит содержится в:
Jeff Squyres 2015-10-06 15:07:07 -07:00
родитель 5d97d7b5d5
Коммит ac25505e03
15 изменённых файлов: 288 добавлений и 33 удалений

@ -26,8 +26,11 @@
#include "ompi_config.h"
#include <stdio.h>
#include "opal/util/show_help.h"
#include "ompi/mpi/c/bindings.h"
#include "ompi/runtime/params.h"
#include "ompi/runtime/mpiruntime.h"
#include "ompi/communicator/communicator.h"
#include "ompi/errhandler/errhandler.h"
#include "ompi/info/info.h"
@ -89,6 +92,10 @@ int MPI_Comm_accept(const char *port_name, MPI_Info info, int root,
}
}
if (!ompi_mpi_dynamics_is_enabled(FUNC_NAME)) {
return OMPI_ERRHANDLER_INVOKE(comm, OMPI_ERR_NOT_SUPPORTED, FUNC_NAME);
}
/* parse info object. no predefined values for this function in MPI-2
* so let's ignore it for the moment.
* if ( rank == root && MPI_INFO_NULL != info ) {
@ -107,6 +114,14 @@ int MPI_Comm_accept(const char *port_name, MPI_Info info, int root,
OPAL_CR_EXIT_LIBRARY();
if (OPAL_ERR_NOT_SUPPORTED == rc) {
opal_show_help("help-mpi-api.txt",
"MPI function not supported",
true,
FUNC_NAME,
"Underlying runtime environment does not support accept/connect functionality");
}
*newcomm = newcomp;
OMPI_ERRHANDLER_RETURN(rc, comm, rc, FUNC_NAME );
}

@ -26,8 +26,11 @@
#include "ompi_config.h"
#include <stdio.h>
#include "opal/util/show_help.h"
#include "ompi/mpi/c/bindings.h"
#include "ompi/runtime/params.h"
#include "ompi/runtime/mpiruntime.h"
#include "ompi/communicator/communicator.h"
#include "ompi/errhandler/errhandler.h"
#include "ompi/info/info.h"
@ -89,6 +92,10 @@ int MPI_Comm_connect(const char *port_name, MPI_Info info, int root,
}
}
if (!ompi_mpi_dynamics_is_enabled(FUNC_NAME)) {
return OMPI_ERRHANDLER_INVOKE(comm, OMPI_ERR_NOT_SUPPORTED, FUNC_NAME);
}
/* parse info object. No predefined values for this function in MPI-2,
* so let's ignore it for the moment.
*
@ -109,6 +116,14 @@ int MPI_Comm_connect(const char *port_name, MPI_Info info, int root,
OPAL_CR_EXIT_LIBRARY();
if (OPAL_ERR_NOT_SUPPORTED == rc) {
opal_show_help("help-mpi-api.txt",
"MPI function not supported",
true,
FUNC_NAME,
"Underlying runtime environment does not support accept/connect functionality");
}
*newcomm = newcomp;
OMPI_ERRHANDLER_RETURN(rc, comm, rc, FUNC_NAME);
}

@ -37,8 +37,11 @@
#include <netinet/in.h>
#endif
#include "opal/util/show_help.h"
#include "ompi/mpi/c/bindings.h"
#include "ompi/runtime/params.h"
#include "ompi/runtime/mpiruntime.h"
#include "ompi/communicator/communicator.h"
#include "ompi/errhandler/errhandler.h"
#include "ompi/dpm/dpm.h"
@ -75,6 +78,11 @@ int MPI_Comm_join(int fd, MPI_Comm *intercomm)
}
}
if (!ompi_mpi_dynamics_is_enabled(FUNC_NAME)) {
return OMPI_ERRHANDLER_INVOKE(MPI_COMM_WORLD, OMPI_ERR_NOT_SUPPORTED,
FUNC_NAME);
}
OPAL_CR_ENTER_LIBRARY();
/* send my process name */
@ -114,8 +122,7 @@ int MPI_Comm_join(int fd, MPI_Comm *intercomm)
if (send_first) {
/* open a port */
if (OMPI_SUCCESS != (rc = ompi_dpm_open_port(port_name))) {
OPAL_CR_EXIT_LIBRARY();
return rc;
goto error;
}
llen = (uint32_t)(strlen(port_name)+1);
len = htonl(llen);
@ -133,6 +140,18 @@ int MPI_Comm_join(int fd, MPI_Comm *intercomm)
OPAL_CR_EXIT_LIBRARY();
*intercomm = newcomp;
error:
OPAL_CR_EXIT_LIBRARY();
if (OPAL_ERR_NOT_SUPPORTED == rc) {
opal_show_help("help-mpi-api.txt",
"MPI function not supported",
true,
FUNC_NAME,
"Underlying runtime environment does not support join functionality");
}
OMPI_ERRHANDLER_RETURN (rc, MPI_COMM_SELF, rc, FUNC_NAME);
}

@ -26,9 +26,12 @@
#include "ompi_config.h"
#include <stdio.h>
#include "opal/util/show_help.h"
#include "ompi/info/info.h"
#include "ompi/mpi/c/bindings.h"
#include "ompi/runtime/params.h"
#include "ompi/runtime/mpiruntime.h"
#include "ompi/communicator/communicator.h"
#include "ompi/errhandler/errhandler.h"
#include "ompi/dpm/dpm.h"
@ -97,6 +100,10 @@ int MPI_Comm_spawn(const char *command, char *argv[], int maxprocs, MPI_Info inf
}
}
if (!ompi_mpi_dynamics_is_enabled(FUNC_NAME)) {
return OMPI_ERRHANDLER_INVOKE(comm, OMPI_ERR_NOT_SUPPORTED, FUNC_NAME);
}
/* initialize the port name to avoid problems */
memset(port_name, 0, MPI_MAX_PORT_NAME);
@ -132,6 +139,14 @@ int MPI_Comm_spawn(const char *command, char *argv[], int maxprocs, MPI_Info inf
}
error:
if (OPAL_ERR_NOT_SUPPORTED == rc) {
opal_show_help("help-mpi-api.txt",
"MPI function not supported",
true,
FUNC_NAME,
"Underlying runtime environment does not support spawn functionality");
}
/* close the port */
if (rank == root && !non_mpi) {
ompi_dpm_close_port(port_name);

@ -26,8 +26,11 @@
#include "ompi_config.h"
#include <stdio.h>
#include "opal/util/show_help.h"
#include "ompi/mpi/c/bindings.h"
#include "ompi/runtime/params.h"
#include "ompi/runtime/mpiruntime.h"
#include "ompi/communicator/communicator.h"
#include "ompi/errhandler/errhandler.h"
#include "ompi/info/info.h"
@ -130,6 +133,10 @@ int MPI_Comm_spawn_multiple(int count, char *array_of_commands[], char **array_o
}
}
if (!ompi_mpi_dynamics_is_enabled(FUNC_NAME)) {
return OMPI_ERRHANDLER_INVOKE(comm, OMPI_ERR_NOT_SUPPORTED, FUNC_NAME);
}
if (rank == root) {
if (MPI_INFO_NULL == array_of_info[0]) {
non_mpi = false;
@ -173,6 +180,14 @@ int MPI_Comm_spawn_multiple(int count, char *array_of_commands[], char **array_o
}
error:
if (OPAL_ERR_NOT_SUPPORTED == rc) {
opal_show_help("help-mpi-api.txt",
"MPI function not supported",
true,
FUNC_NAME,
"Underlying runtime environment does not support spawn functionality");
}
/* close the port */
if (rank == root && !non_mpi) {
ompi_dpm_close_port(port_name);

@ -27,6 +27,7 @@
#include "opal/class/opal_list.h"
#include "opal/mca/pmix/pmix.h"
#include "opal/util/show_help.h"
#include "ompi/mpi/c/bindings.h"
#include "ompi/runtime/params.h"
@ -69,6 +70,17 @@ int MPI_Lookup_name(const char *service_name, MPI_Info info, char *port_name)
}
}
if (NULL == opal_pmix.lookup) {
opal_show_help("help-mpi-api.txt",
"MPI function not supported",
true,
FUNC_NAME,
"Underlying runtime environment does not support name lookup functionality");
return OMPI_ERRHANDLER_INVOKE(MPI_COMM_WORLD,
OMPI_ERR_NOT_SUPPORTED,
FUNC_NAME);
}
OPAL_CR_ENTER_LIBRARY();
OBJ_CONSTRUCT(&pinfo, opal_list_t);
@ -111,9 +123,19 @@ int MPI_Lookup_name(const char *service_name, MPI_Info info, char *port_name)
if (OPAL_SUCCESS != ret ||
OPAL_STRING != pdat->value.type ||
NULL == pdat->value.data.string) {
if (OPAL_ERR_NOT_SUPPORTED == ret) {
ret = OMPI_ERR_NOT_SUPPORTED;
opal_show_help("help-mpi-api.txt",
"MPI function not supported",
true,
FUNC_NAME,
"Underlying runtime environment does not support name lookup functionality");
} else {
ret = MPI_ERR_NAME;
}
OPAL_CR_EXIT_LIBRARY();
return OMPI_ERRHANDLER_INVOKE(MPI_COMM_WORLD, MPI_ERR_NAME,
FUNC_NAME);
return OMPI_ERRHANDLER_INVOKE(MPI_COMM_WORLD, ret, FUNC_NAME);
}
strncpy ( port_name, pdat->value.data.string, MPI_MAX_PORT_NAME );

@ -13,7 +13,6 @@
* Copyright (c) 2013 Los Alamos National Security, LLC. All rights
* reserved.
* Copyright (c) 2015 Intel, Inc. All rights reserved.
*
* Copyright (c) 2015 Research Organization for Information Science
* and Technology (RIST). All rights reserved.
* Copyright (c) 2015 Cisco Systems, Inc. All rights reserved.
@ -28,6 +27,7 @@
#include "opal/class/opal_list.h"
#include "opal/mca/pmix/pmix.h"
#include "opal/util/show_help.h"
#include "ompi/mpi/c/bindings.h"
#include "ompi/runtime/params.h"
@ -71,6 +71,17 @@ int MPI_Publish_name(const char *service_name, MPI_Info info,
}
}
if (NULL == opal_pmix.publish) {
opal_show_help("help-mpi-api.txt",
"MPI function not supported",
true,
FUNC_NAME,
"Underlying runtime environment does not support name publishing functionality");
return OMPI_ERRHANDLER_INVOKE(MPI_COMM_WORLD,
OMPI_ERR_NOT_SUPPORTED,
FUNC_NAME);
}
OPAL_CR_ENTER_LIBRARY();
OBJ_CONSTRUCT(&values, opal_list_t);
@ -149,15 +160,20 @@ int MPI_Publish_name(const char *service_name, MPI_Info info,
if ( OPAL_SUCCESS != rc ) {
if (OPAL_EXISTS == rc) {
/* already exists - can't publish it */
return OMPI_ERRHANDLER_INVOKE(MPI_COMM_WORLD, MPI_ERR_FILE_EXISTS,
FUNC_NAME);
rc = MPI_ERR_FILE_EXISTS;
} else if (OPAL_ERR_NOT_SUPPORTED == rc) {
/* this PMIX environment doesn't support publishing */
rc = OMPI_ERR_NOT_SUPPORTED;
opal_show_help("help-mpi-api.txt",
"MPI function not supported",
true,
FUNC_NAME,
"Underlying runtime environment does not support name publishing functionality");
} else {
rc = MPI_ERR_INTERN;
}
/* none of the MPI-specific errors occurred - must be some
* kind of internal error
*/
return OMPI_ERRHANDLER_INVOKE(MPI_COMM_WORLD, MPI_ERR_INTERN,
FUNC_NAME);
return OMPI_ERRHANDLER_INVOKE(MPI_COMM_WORLD, rc, FUNC_NAME);
}
return MPI_SUCCESS;

@ -15,6 +15,7 @@
* Copyright (c) 2015 Intel, Inc. All rights reserved.
* Copyright (c) 2015 Research Organization for Information Science
* and Technology (RIST). All rights reserved.
* Copyright (c) 2015 Cisco Systems, Inc. All rights reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
@ -27,6 +28,7 @@
#include "opal/class/opal_list.h"
#include "opal/mca/pmix/pmix.h"
#include "opal/util/argv.h"
#include "opal/util/show_help.h"
#include "ompi/mpi/c/bindings.h"
#include "ompi/runtime/params.h"
@ -71,6 +73,17 @@ int MPI_Unpublish_name(const char *service_name, MPI_Info info,
}
}
if (NULL == opal_pmix.publish) {
opal_show_help("help-mpi-api.txt",
"MPI function not supported",
true,
FUNC_NAME,
"Underlying runtime environment does not support name publishing functionality");
return OMPI_ERRHANDLER_INVOKE(MPI_COMM_WORLD,
OMPI_ERR_NOT_SUPPORTED,
FUNC_NAME);
}
OPAL_CR_ENTER_LIBRARY();
OBJ_CONSTRUCT(&pinfo, opal_list_t);
@ -110,23 +123,24 @@ int MPI_Unpublish_name(const char *service_name, MPI_Info info,
if ( OPAL_SUCCESS != rc ) {
if (OPAL_ERR_NOT_FOUND == rc) {
/* service couldn't be found */
OPAL_CR_EXIT_LIBRARY();
return OMPI_ERRHANDLER_INVOKE(MPI_COMM_WORLD, MPI_ERR_SERVICE,
FUNC_NAME);
}
if (OPAL_ERR_PERM == rc) {
rc = MPI_ERR_SERVICE;
} else if (OPAL_ERR_PERM == rc) {
/* this process didn't own the specified service */
OPAL_CR_EXIT_LIBRARY();
return OMPI_ERRHANDLER_INVOKE(MPI_COMM_WORLD, MPI_ERR_ACCESS,
FUNC_NAME);
rc = MPI_ERR_ACCESS;
} else if (OPAL_ERR_NOT_SUPPORTED == rc) {
/* this PMIX environment doesn't support publishing */
rc = OMPI_ERR_NOT_SUPPORTED;
opal_show_help("help-mpi-api.txt",
"MPI function not supported",
true,
FUNC_NAME,
"Underlying runtime environment does not support name publishing functionality");
} else {
rc = MPI_ERR_INTERN;
}
/* none of the MPI-specific errors occurred - must be some
* kind of internal error
*/
OPAL_CR_EXIT_LIBRARY();
return OMPI_ERRHANDLER_INVOKE(MPI_COMM_WORLD, MPI_ERR_INTERN,
FUNC_NAME);
return OMPI_ERRHANDLER_INVOKE(MPI_COMM_WORLD, rc, FUNC_NAME);
}
OPAL_CR_EXIT_LIBRARY();

@ -2,7 +2,7 @@
#
# Copyright (c) 2006 High Performance Computing Center Stuttgart,
# University of Stuttgart. All rights reserved.
# Copyright (c) 2006-2008 Cisco Systems, Inc. All rights reserved.
# Copyright (c) 2006-2015 Cisco Systems, Inc. All rights reserved.
# $COPYRIGHT$
#
# Additional copyrights may follow
@ -12,8 +12,10 @@
# This is the US/English general help file for Open MPI.
#
[mpi-function-after-finalize]
Calling any MPI-function after calling MPI_Finalize is erroneous.
The only exceptions are MPI_Initialized, MPI_Finalized and MPI_Get_version.
Calling most MPI functions after calling MPI_Finalize is erroneous.
There are a small number of exceptions, such as MPI_Initialized,
MPI_Finalized, and MPI_Get_version.
#
[mpi-initialize-twice]
Calling MPI_Init or MPI_Init_thread twice is erroneous.
@ -25,3 +27,10 @@ with errorcode %d.
NOTE: invoking MPI_ABORT causes Open MPI to kill all MPI processes.
You may or may not see output from other processes, depending on
exactly when Open MPI kills them.
#
[MPI function not supported]
Your application has invoked an MPI function that is not supported in
this environment.
MPI function: %s
Reason: %s

@ -9,7 +9,7 @@
# University of Stuttgart. All rights reserved.
# Copyright (c) 2004-2005 The Regents of the University of California.
# All rights reserved.
# Copyright (c) 2006-2009 Cisco Systems, Inc. All rights reserved.
# Copyright (c) 2006-2015 Cisco Systems, Inc. All rights reserved.
# Copyright (c) 2014 Intel, Inc. All rights reserved.
# $COPYRIGHT$
#
@ -30,6 +30,7 @@ headers += \
libmpi_la_SOURCES += \
runtime/ompi_mpi_abort.c \
runtime/ompi_mpi_dynamics.c \
runtime/ompi_mpi_init.c \
runtime/ompi_mpi_finalize.c \
runtime/ompi_mpi_params.c \

@ -9,7 +9,7 @@
* University of Stuttgart. All rights reserved.
* Copyright (c) 2004-2005 The Regents of the University of California.
* All rights reserved.
* Copyright (c) 2006-2012 Cisco Systems, Inc. All rights reserved.
* Copyright (c) 2006-2015 Cisco Systems, Inc. All rights reserved.
* Copyright (c) 2007 Los Alamos National Security, LLC. All rights
* reserved.
* Copyright (c) 2008 Sun Microsystems, Inc. All rights reserved.
@ -191,6 +191,37 @@ OMPI_DECLSPEC int ompi_mpi_abort(struct ompi_communicator_t* comm,
*/
int ompi_init_preconnect_mpi(void);
/**
 * Called to disable MPI dynamic process support. It should be called
 * by transports and/or environments where MPI dynamic process
 * functionality cannot be supported, and provide a string indicating
 * why the functionality is disabled (because it will be shown in a
 * user help message). For example, "<TRANSPORT> does not support MPI
 * dynamic process functionality."
 *
 * This first-order functionality is fairly coarse-grained and simple:
 * it presents a friendly show-help message to tell users why their
 * MPI dynamic process functionality failed (vs. a potentially-cryptic
 * network or hardware failure message).
 *
 * Someone may choose to implement a more fine-grained approach in the
 * future.
 *
 * Calling this clears the global ompi_mpi_dynamics_enabled flag.
 *
 * @param msg Reason string to show to the user. Must not be NULL.
 *            The implementation keeps its own copy, so the caller
 *            retains ownership of the buffer passed in.
 */
void ompi_mpi_dynamics_disable(const char *msg);
/**
 * Called by the MPI dynamic process functions (e.g., MPI_Comm_spawn)
 * to see if MPI dynamic process support is enabled. If it's not,
 * this function will opal_show_help() a message and return false.
 *
 * @param function Name of the calling MPI function; it is included
 *                 in the help message shown to the user.
 *
 * @return true if MPI dynamic process support is enabled (nothing is
 *         printed); false otherwise, after the help message has been
 *         emitted.
 */
bool ompi_mpi_dynamics_is_enabled(const char *function);
/**
 * Clean up memory / resources by the MPI dynamics process
 * functionality checker.
 *
 * This releases the copy of the reason string that was saved when
 * dynamics were disabled. It is invoked from ompi_mpi_finalize().
 */
void ompi_mpi_dynamics_finalize(void);
END_C_DECLS
#endif /* OMPI_MPI_MPIRUNTIME_H */

64
ompi/runtime/ompi_mpi_dynamics.c Обычный файл

@ -0,0 +1,64 @@
/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */
/*
* Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
* University Research and Technology
* Corporation. All rights reserved.
* Copyright (c) 2004-2006 The University of Tennessee and The University
* of Tennessee Research Foundation. All rights
* reserved.
* Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
* University of Stuttgart. All rights reserved.
* Copyright (c) 2004-2005 The Regents of the University of California.
* All rights reserved.
* Copyright (c) 2006-2015 Cisco Systems, Inc. All rights reserved.
* Copyright (c) 2007-2015 Los Alamos National Security, LLC. All rights
* reserved.
* Copyright (c) 2013 NVIDIA Corporation. All rights reserved.
* Copyright (c) 2013-2014 Intel, Inc. All rights reserved
* $COPYRIGHT$
*
* Additional copyrights may follow
*
* $HEADER$
*/
#include "ompi_config.h"
#include "opal/util/show_help.h"
#include "ompi/runtime/params.h"
#include "ompi/runtime/mpiruntime.h"
/*
 * Heap-allocated copy of the reason passed to
 * ompi_mpi_dynamics_disable(), or NULL if no reason has been recorded.
 * Keeping this pointer either NULL or malloc-owned means it is always
 * safe to hand to free(); it must never point at a string literal.
 */
static char *ompi_mpi_dynamics_disabled_msg = NULL;

/*
 * Reason shown when dynamics are disabled but no caller-supplied
 * message is available: either ompi_mpi_dynamics_enabled was cleared
 * by something other than ompi_mpi_dynamics_disable(), or the copy of
 * the caller's message could not be allocated.
 */
static const char ompi_mpi_dynamics_default_msg[] =
    "MPI dynamic process functionality is disabled in this environment";

/*
 * Disable MPI dynamic process functionality and remember why.
 *
 * msg: non-NULL, NUL-terminated reason that will be shown to the user
 *      by ompi_mpi_dynamics_is_enabled().  The string is copied; the
 *      caller keeps ownership of its buffer.
 *
 * May be called more than once; the most recent reason wins and the
 * previous copy is released.
 */
void ompi_mpi_dynamics_disable(const char *msg)
{
    char *copy = NULL;

    assert(msg);
    ompi_mpi_dynamics_enabled = false;

    /* assert() vanishes in NDEBUG builds, and strdup(NULL) is
       undefined, so guard the copy explicitly.  If msg is NULL or the
       allocation fails we simply fall back to the default reason. */
    if (NULL != msg) {
        copy = strdup(msg);
    }

    /* Copy first, free second: this releases the message from any
       earlier call without leaking it, and stays correct even if msg
       happens to alias the previously stored string. */
    free(ompi_mpi_dynamics_disabled_msg);
    ompi_mpi_dynamics_disabled_msg = copy;
}

/*
 * Report whether MPI dynamic process functionality is usable.
 *
 * function: name of the calling MPI function, used in the help text.
 *
 * Returns true (and prints nothing) when dynamics are enabled.
 * Otherwise shows the "MPI function not supported" help message with
 * the recorded reason and returns false.
 */
bool ompi_mpi_dynamics_is_enabled(const char *function)
{
    const char *reason = ompi_mpi_dynamics_default_msg;

    if (ompi_mpi_dynamics_enabled) {
        return true;
    }

    /* Prefer the caller-supplied reason when one was recorded; never
       pass NULL to the "%s" in the help template. */
    if (NULL != ompi_mpi_dynamics_disabled_msg) {
        reason = ompi_mpi_dynamics_disabled_msg;
    }

    opal_show_help("help-mpi-api.txt",
                   "MPI function not supported",
                   true,
                   function,
                   reason);
    return false;
}

/*
 * Release the reason string recorded by ompi_mpi_dynamics_disable().
 *
 * The stored pointer is always NULL or heap-owned, so it can be freed
 * unconditionally -- including when ompi_mpi_dynamics_enabled is false
 * but ompi_mpi_dynamics_disable() was never called.  Safe to call more
 * than once.
 */
void ompi_mpi_dynamics_finalize(void)
{
    free(ompi_mpi_dynamics_disabled_msg);
    ompi_mpi_dynamics_disabled_msg = NULL;
}

@ -10,7 +10,7 @@
* University of Stuttgart. All rights reserved.
* Copyright (c) 2004-2005 The Regents of the University of California.
* All rights reserved.
* Copyright (c) 2006-2013 Cisco Systems, Inc. All rights reserved.
* Copyright (c) 2006-2015 Cisco Systems, Inc. All rights reserved.
* Copyright (c) 2006-2014 Los Alamos National Security, LLC. All rights
* reserved.
* Copyright (c) 2006 University of Houston. All rights reserved.
@ -423,6 +423,10 @@ int ompi_mpi_finalize(void)
ompi_mpi_main_thread = NULL;
}
/* Clean up memory/resources from the MPI dynamic process
functionality checker */
ompi_mpi_dynamics_finalize();
/* Leave the RTE */
if (OMPI_SUCCESS != (ret = ompi_rte_finalize())) {

@ -10,7 +10,7 @@
* University of Stuttgart. All rights reserved.
* Copyright (c) 2004-2005 The Regents of the University of California.
* All rights reserved.
* Copyright (c) 2006-2009 Cisco Systems, Inc. All rights reserved.
* Copyright (c) 2006-2015 Cisco Systems, Inc. All rights reserved.
* Copyright (c) 2007-2015 Los Alamos National Security, LLC. All rights
* reserved.
* Copyright (c) 2013 NVIDIA Corporation. All rights reserved.
@ -65,6 +65,7 @@ char *ompi_mpi_show_mca_params_string = NULL;
bool ompi_mpi_have_sparse_group_storage = !!(OMPI_GROUP_SPARSE);
bool ompi_mpi_preconnect_mpi = false;
uint32_t ompi_add_procs_cutoff = 1024;
bool ompi_mpi_dynamics_enabled = true;
static bool show_default_mca_params = false;
static bool show_file_mca_params = false;
@ -299,6 +300,14 @@ int ompi_mpi_register_params(void)
&ompi_add_procs_cutoff);
ompi_mpi_dynamics_enabled = true;
(void) mca_base_var_register("ompi", "mpi", NULL, "dynamics_enabled",
"Is the MPI dynamic process functionality enabled (e.g., MPI_COMM_SPAWN)? Default is yes, but certain transports and/or environments may disable it.",
MCA_BASE_VAR_TYPE_BOOL, NULL, 0, 0,
OPAL_INFO_LVL_4,
MCA_BASE_VAR_SCOPE_READONLY,
&ompi_mpi_dynamics_enabled);
return OMPI_SUCCESS;
}

@ -12,7 +12,7 @@
* All rights reserved.
* Copyright (c) 2007-2015 Los Alamos National Security, LLC. All rights
* reserved.
* Copyright (c) 2006-2009 Cisco Systems, Inc. All rights reserved.
* Copyright (c) 2006-2015 Cisco Systems, Inc. All rights reserved.
* Copyright (c) 2013 NVIDIA Corporation. All rights reserved.
* Copyright (c) 2013 Intel, Inc. All rights reserved
* $COPYRIGHT$
@ -134,6 +134,12 @@ OMPI_DECLSPEC extern uint32_t ompi_direct_modex_cutoff;
*/
OMPI_DECLSPEC extern uint32_t ompi_add_procs_cutoff;
/**
* Whether anything in the code base has disabled MPI dynamic process
* functionality or not
*/
OMPI_DECLSPEC extern bool ompi_mpi_dynamics_enabled;
/**
* Register MCA parameters used by the MPI layer.
*