1
1

Update the connect/accept support so we check to see if we have the proper infrastructure and RTE support, including whether we have ompi-server available if the connect/accept spans multiple applications. Print pretty help messages in all cases where we do not have support.

Signed-off-by: Ralph Castain <rhc@open-mpi.org>
Этот коммит содержится в:
Ralph Castain 2017-05-27 10:47:08 -07:00
родитель a6f61132d7
Коммит 9f60cd0fe7
14 изменённых файлов: 115 добавлений и 20 удалений

Просмотреть файл

@ -15,7 +15,7 @@
* Copyright (c) 2009 Sun Microsystems, Inc. All rights reserved. * Copyright (c) 2009 Sun Microsystems, Inc. All rights reserved.
* Copyright (c) 2011-2015 Los Alamos National Security, LLC. All rights * Copyright (c) 2011-2015 Los Alamos National Security, LLC. All rights
* reserved. * reserved.
* Copyright (c) 2013-2016 Intel, Inc. All rights reserved. * Copyright (c) 2013-2017 Intel, Inc. All rights reserved.
* Copyright (c) 2014-2017 Research Organization for Information Science * Copyright (c) 2014-2017 Research Organization for Information Science
* and Technology (RIST). All rights reserved. * and Technology (RIST). All rights reserved.
* $COPYRIGHT$ * $COPYRIGHT$
@ -40,6 +40,7 @@
#include "opal/util/argv.h" #include "opal/util/argv.h"
#include "opal/util/opal_getcwd.h" #include "opal/util/opal_getcwd.h"
#include "opal/util/proc.h" #include "opal/util/proc.h"
#include "opal/util/show_help.h"
#include "opal/dss/dss.h" #include "opal/dss/dss.h"
#include "opal/mca/hwloc/base/base.h" #include "opal/mca/hwloc/base/base.h"
#include "opal/mca/pmix/pmix.h" #include "opal/mca/pmix/pmix.h"
@ -112,6 +113,12 @@ int ompi_dpm_connect_accept(ompi_communicator_t *comm, int root,
if (NULL == opal_pmix.publish || NULL == opal_pmix.connect || if (NULL == opal_pmix.publish || NULL == opal_pmix.connect ||
NULL == opal_pmix.unpublish || NULL == opal_pmix.unpublish ||
(NULL == opal_pmix.lookup && NULL == opal_pmix.lookup_nb)) { (NULL == opal_pmix.lookup && NULL == opal_pmix.lookup_nb)) {
/* print a nice message explaining we don't have support */
opal_show_help("help-mpi-runtime.txt", "noconxcpt", true);
return OMPI_ERR_NOT_SUPPORTED;
}
if (!ompi_rte_connect_accept_support(port_string)) {
/* they will have printed the help message */
return OMPI_ERR_NOT_SUPPORTED; return OMPI_ERR_NOT_SUPPORTED;
} }

Просмотреть файл

@ -1,7 +1,7 @@
/* /*
* Copyright (c) 2012-2013 Los Alamos National Security, LLC. * Copyright (c) 2012-2013 Los Alamos National Security, LLC.
* All rights reserved. * All rights reserved.
* Copyright (c) 2013-2015 Intel, Inc. All rights reserved * Copyright (c) 2013-2017 Intel, Inc. All rights reserved.
* Copyright (c) 2014 Cisco Systems, Inc. All rights reserved. * Copyright (c) 2014 Cisco Systems, Inc. All rights reserved.
* Copyright (c) 2014-2016 Research Organization for Information Science * Copyright (c) 2014-2016 Research Organization for Information Science
* and Technology (RIST). All rights reserved. * and Technology (RIST). All rights reserved.
@ -116,6 +116,9 @@ static inline orte_process_name_t * OMPI_CAST_RTE_NAME(opal_process_name_t * nam
} }
#endif #endif
/* check dynamics support: returns true when the runtime can support a
 * connect/accept operation on the given port string, and false otherwise
 * (a help message is shown for a malformed port or a missing ompi-server) */
OMPI_DECLSPEC bool ompi_rte_connect_accept_support(const char *port);
END_C_DECLS END_C_DECLS
#endif /* MCA_OMPI_RTE_ORTE_H */ #endif /* MCA_OMPI_RTE_ORTE_H */

Просмотреть файл

@ -39,6 +39,7 @@
#include "orte/mca/routed/routed.h" #include "orte/mca/routed/routed.h"
#include "orte/util/name_fns.h" #include "orte/util/name_fns.h"
#include "orte/util/session_dir.h" #include "orte/util/session_dir.h"
#include "orte/util/show_help.h"
#include "orte/runtime/orte_globals.h" #include "orte/runtime/orte_globals.h"
#include "orte/runtime/orte_wait.h" #include "orte/runtime/orte_wait.h"
#include "orte/runtime/orte_data_server.h" #include "orte/runtime/orte_data_server.h"
@ -198,3 +199,47 @@ void ompi_rte_wait_for_debugger(void)
opal_pmix.deregister_evhandler(handler, NULL, NULL); opal_pmix.deregister_evhandler(handler, NULL, NULL);
} }
} }
/**
 * Check whether the runtime can support a connect/accept operation on
 * the given port.
 *
 * @param port  Port string for the operation. A NULL or empty string is
 *              treated as "no port defined".
 *
 * @return true if no additional infrastructure is needed (no HNP URI,
 *         no port defined, or the port belongs to our own job family) or
 *         if a data server URI has been provided; false if the port is
 *         malformed or the data server is required but unavailable (a
 *         help message is shown in both of those cases), or if the
 *         working copy of the port could not be allocated.
 */
bool ompi_rte_connect_accept_support(const char *port)
{
    char *ptr, *tmp;
    orte_process_name_t name;

    /* nothing further to check if we have no HNP URI (i.e., we were
     * not launched by mpirun) or if we are being called without a
     * defined port - guard against a NULL port before inspecting it */
    if (NULL == orte_process_info.my_hnp_uri ||
        NULL == port || '\0' == port[0]) {
        return true;
    }

    /* is the job family in the port different than my own? */
    tmp = strdup(port);  // protect input
    if (NULL == tmp) {
        /* out of memory - we cannot parse the port, so we cannot
         * confirm that we have support */
        return false;
    }
    if (NULL == (ptr = strchr(tmp, ':'))) {
        /* this port didn't come from us! */
        orte_show_help("help-orterun.txt", "orterun:malformedport", true);
        free(tmp);
        return false;
    }
    /* terminate at the ':' so that only the leading process name
     * portion of the port is converted */
    *ptr = '\0';
    if (ORTE_SUCCESS != orte_util_convert_string_to_process_name(&name, tmp)) {
        free(tmp);
        orte_show_help("help-orterun.txt", "orterun:malformedport", true);
        return false;
    }
    free(tmp);

    if (ORTE_JOB_FAMILY(ORTE_PROC_MY_NAME->jobid) == ORTE_JOB_FAMILY(name.jobid)) {
        /* same job family, so our infrastructure is adequate */
        return true;
    }

    /* if the job family of the port is different than our own
     * and we were launched by mpirun, then we require ompi-server
     * support */
    if (NULL == orte_data_server_uri) {
        /* print a pretty help message */
        orte_show_help("help-orterun.txt", "orterun:server-unavailable", true);
        return false;
    }

    return true;
}

Просмотреть файл

@ -12,6 +12,7 @@
# All rights reserved. # All rights reserved.
# Copyright (c) 2007-2015 Cisco Systems, Inc. All rights reserved. # Copyright (c) 2007-2015 Cisco Systems, Inc. All rights reserved.
# Copyright (c) 2013 NVIDIA Corporation. All rights reserved. # Copyright (c) 2013 NVIDIA Corporation. All rights reserved.
# Copyright (c) 2017 Intel, Inc. All rights reserved.
# $COPYRIGHT$ # $COPYRIGHT$
# #
# Additional copyrights may follow # Additional copyrights may follow
@ -93,3 +94,13 @@ Open MPI with --enable-heterogeneous.
[no cuda support] [no cuda support]
The user requested CUDA support with the --mca mpi_cuda_support 1 flag The user requested CUDA support with the --mca mpi_cuda_support 1 flag
but the library was not compiled with any support. but the library was not compiled with any support.
#
[noconxcpt]
The user has called an operation involving MPI_Comm_connect and/or
MPI_Comm_accept, but this environment lacks the necessary infrastructure
support for that operation. Open MPI relies on the PMIx_Publish/Lookup
(or one of its predecessors) APIs for this operation.
This typically happens when launching outside of mpirun where the underlying
resource manager does not provide publish/lookup support. One way of solving
the problem is to simply use mpirun to start the application.

Просмотреть файл

@ -1,5 +1,5 @@
/* /*
* Copyright (c) 2014-2016 Intel, Inc. All rights reserved. * Copyright (c) 2014-2017 Intel, Inc. All rights reserved.
* $COPYRIGHT$ * $COPYRIGHT$
* *
* Additional copyrights may follow * Additional copyrights may follow
@ -57,6 +57,7 @@ OPAL_DECLSPEC void opal_pmix_base_set_evbase(opal_event_base_t *evbase);
typedef struct { typedef struct {
opal_event_base_t *evbase; opal_event_base_t *evbase;
int timeout;
} opal_pmix_base_t; } opal_pmix_base_t;
extern opal_pmix_base_t opal_pmix_base; extern opal_pmix_base_t opal_pmix_base;

Просмотреть файл

@ -193,7 +193,12 @@ int opal_pmix_base_exchange(opal_value_t *indat,
info = OBJ_NEW(opal_value_t); info = OBJ_NEW(opal_value_t);
info->key = strdup(OPAL_PMIX_TIMEOUT); info->key = strdup(OPAL_PMIX_TIMEOUT);
info->type = OPAL_INT; info->type = OPAL_INT;
info->data.integer = timeout; if (0 < opal_pmix_base.timeout) {
/* the user has overridden the default */
info->data.integer = opal_pmix_base.timeout;
} else {
info->data.integer = timeout;
}
opal_list_append(&mlist, &info->super); opal_list_append(&mlist, &info->super);
/* if a non-blocking version of lookup isn't /* if a non-blocking version of lookup isn't

Просмотреть файл

@ -47,6 +47,12 @@ static int opal_pmix_base_frame_register(mca_base_register_flag_t flags)
(void) mca_base_var_register("opal", "pmix", "base", "collect_data", "Collect all data during modex", (void) mca_base_var_register("opal", "pmix", "base", "collect_data", "Collect all data during modex",
MCA_BASE_VAR_TYPE_BOOL, NULL, 0, 0, OPAL_INFO_LVL_9, MCA_BASE_VAR_TYPE_BOOL, NULL, 0, 0, OPAL_INFO_LVL_9,
MCA_BASE_VAR_SCOPE_READONLY, &opal_pmix_collect_all_data); MCA_BASE_VAR_SCOPE_READONLY, &opal_pmix_collect_all_data);
opal_pmix_base.timeout = -1;
(void) mca_base_var_register("opal", "pmix", "base", "exchange_timeout",
"Time (in seconds) to wait for a data exchange to complete",
MCA_BASE_VAR_TYPE_INT, NULL, 0, 0, OPAL_INFO_LVL_3,
MCA_BASE_VAR_SCOPE_READONLY, &opal_pmix_base.timeout);
return OPAL_SUCCESS; return OPAL_SUCCESS;
} }

Просмотреть файл

@ -67,7 +67,9 @@ A request has timed out and will therefore fail:
Operation: %s Operation: %s
Your job may terminate as a result of this problem. You may want to Your job may terminate as a result of this problem. You may want to
adjust the MCA parameter pmix_server_max_wait and try again. adjust the MCA parameter pmix_server_max_wait and try again. If this
occurred during a connect/accept operation, you can adjust that time
using the pmix_base_exchange_timeout parameter.
# #
[noroom] [noroom]
A request for an asynchronous runtime operation cannot be fulfilled A request for an asynchronous runtime operation cannot be fulfilled

Просмотреть файл

@ -138,14 +138,6 @@ void pmix_server_register_params(void)
OPAL_INFO_LVL_9, MCA_BASE_VAR_SCOPE_ALL, OPAL_INFO_LVL_9, MCA_BASE_VAR_SCOPE_ALL,
&orte_pmix_server_globals.timeout); &orte_pmix_server_globals.timeout);
/* register the URI of the UNIVERSAL data server */
orte_pmix_server_globals.server_uri = NULL;
(void) mca_base_var_register ("orte", "pmix", NULL, "server_uri",
"URI of a session-level keyval server for publish/lookup operations",
MCA_BASE_VAR_TYPE_STRING, NULL, 0, 0,
OPAL_INFO_LVL_9, MCA_BASE_VAR_SCOPE_ALL,
&orte_pmix_server_globals.server_uri);
/* whether or not to wait for the universal server */ /* whether or not to wait for the universal server */
orte_pmix_server_globals.wait_for_server = false; orte_pmix_server_globals.wait_for_server = false;
(void) mca_base_var_register ("orte", "pmix", NULL, "wait_for_server", (void) mca_base_var_register ("orte", "pmix", NULL, "wait_for_server",

Просмотреть файл

@ -59,13 +59,13 @@ static int init_server(void)
/* if the universal server wasn't specified, then we use /* if the universal server wasn't specified, then we use
* our own HNP for that purpose */ * our own HNP for that purpose */
if (NULL == orte_pmix_server_globals.server_uri) { if (NULL == orte_data_server_uri) {
orte_pmix_server_globals.server = *ORTE_PROC_MY_HNP; orte_pmix_server_globals.server = *ORTE_PROC_MY_HNP;
} else { } else {
if (0 == strncmp(orte_pmix_server_globals.server_uri, "file", strlen("file")) || if (0 == strncmp(orte_data_server_uri, "file", strlen("file")) ||
0 == strncmp(orte_pmix_server_globals.server_uri, "FILE", strlen("FILE"))) { 0 == strncmp(orte_data_server_uri, "FILE", strlen("FILE"))) {
/* it is a file - get the filename */ /* it is a file - get the filename */
filename = strchr(orte_pmix_server_globals.server_uri, ':'); filename = strchr(orte_data_server_uri, ':');
if (NULL == filename) { if (NULL == filename) {
/* filename is not correctly formatted */ /* filename is not correctly formatted */
orte_show_help("help-orterun.txt", "orterun:ompi-server-filename-bad", true, orte_show_help("help-orterun.txt", "orterun:ompi-server-filename-bad", true,
@ -121,7 +121,6 @@ static int init_server(void)
* as a background job - e.g., in scripts * as a background job - e.g., in scripts
*/ */
if (orte_pmix_server_globals.wait_for_server) { if (orte_pmix_server_globals.wait_for_server) {
opal_output(0, "WAIT");
/* ping the server */ /* ping the server */
struct timeval timeout; struct timeval timeout;
timeout.tv_sec = orte_pmix_server_globals.timeout; timeout.tv_sec = orte_pmix_server_globals.timeout;
@ -141,8 +140,6 @@ static int init_server(void)
} }
} }
opal_output(0, "SERVER READY");
return ORTE_SUCCESS; return ORTE_SUCCESS;
} }

Просмотреть файл

@ -77,6 +77,7 @@ char *orte_coll_transport = NULL;
int orte_mgmt_conduit = -1; int orte_mgmt_conduit = -1;
int orte_coll_conduit = -1; int orte_coll_conduit = -1;
bool orte_no_vm = false; bool orte_no_vm = false;
char *orte_data_server_uri = NULL;
/* ORTE OOB port flags */ /* ORTE OOB port flags */
bool orte_static_ports = false; bool orte_static_ports = false;

Просмотреть файл

@ -457,6 +457,7 @@ ORTE_DECLSPEC extern bool orte_coprocessors_detected;
ORTE_DECLSPEC extern opal_hash_table_t *orte_coprocessors; ORTE_DECLSPEC extern opal_hash_table_t *orte_coprocessors;
ORTE_DECLSPEC extern char *orte_topo_signature; ORTE_DECLSPEC extern char *orte_topo_signature;
ORTE_DECLSPEC extern bool orte_no_vm; ORTE_DECLSPEC extern bool orte_no_vm;
ORTE_DECLSPEC extern char *orte_data_server_uri;
/* ORTE OOB port flags */ /* ORTE OOB port flags */
ORTE_DECLSPEC extern bool orte_static_ports; ORTE_DECLSPEC extern bool orte_static_ports;

Просмотреть файл

@ -788,5 +788,14 @@ int orte_register_params(void)
OPAL_INFO_LVL_9, MCA_BASE_VAR_SCOPE_READONLY, OPAL_INFO_LVL_9, MCA_BASE_VAR_SCOPE_READONLY,
&orte_fwd_mpirun_port); &orte_fwd_mpirun_port);
/* register the URI of the UNIVERSAL data server */
orte_data_server_uri = NULL;
(void) mca_base_var_register ("orte", "pmix", NULL, "server_uri",
"URI of a session-level keyval server for publish/lookup operations",
MCA_BASE_VAR_TYPE_STRING, NULL, 0, 0,
OPAL_INFO_LVL_3, MCA_BASE_VAR_SCOPE_ALL,
&orte_data_server_uri);
return ORTE_SUCCESS; return ORTE_SUCCESS;
} }

Просмотреть файл

@ -458,6 +458,21 @@ Error received: %s
Please check to ensure that the requested server matches the actual server Please check to ensure that the requested server matches the actual server
information, and that the server is in operation. information, and that the server is in operation.
# #
[orterun:server-unavailable]
The user has called an operation involving MPI_Comm_connect and/or
MPI_Comm_accept that spans multiple invocations of mpirun. This requires
the support of the ompi-server tool, which must be executing somewhere
that can be accessed by all participants.
Please ensure the tool is running, and provide each mpirun with the MCA
parameter "pmix_server_uri" pointing to it.
#
[orterun:malformedport]
An operation involving MPI_Comm_connect and/or MPI_Comm_accept was called
with an unrecognized port string. This typically happens when passing the
string on a command line and failing to properly quote it to protect
against the special characters it includes.
#
[orterun:ompi-server-pid-bad] [orterun:ompi-server-pid-bad]
%s was unable to parse the PID of the %s to be used as the ompi-server. %s was unable to parse the PID of the %s to be used as the ompi-server.
The option we were given was: The option we were given was: