Merge pull request #3595 from rhc54/topic/dyn
Update the connect/accept support
Этот коммит содержится в:
Коммит
c99978a767
@ -15,7 +15,7 @@
|
||||
* Copyright (c) 2009 Sun Microsystems, Inc. All rights reserved.
|
||||
* Copyright (c) 2011-2015 Los Alamos National Security, LLC. All rights
|
||||
* reserved.
|
||||
* Copyright (c) 2013-2016 Intel, Inc. All rights reserved.
|
||||
* Copyright (c) 2013-2017 Intel, Inc. All rights reserved.
|
||||
* Copyright (c) 2014-2017 Research Organization for Information Science
|
||||
* and Technology (RIST). All rights reserved.
|
||||
* $COPYRIGHT$
|
||||
@ -40,6 +40,7 @@
|
||||
#include "opal/util/argv.h"
|
||||
#include "opal/util/opal_getcwd.h"
|
||||
#include "opal/util/proc.h"
|
||||
#include "opal/util/show_help.h"
|
||||
#include "opal/dss/dss.h"
|
||||
#include "opal/mca/hwloc/base/base.h"
|
||||
#include "opal/mca/pmix/pmix.h"
|
||||
@ -112,6 +113,12 @@ int ompi_dpm_connect_accept(ompi_communicator_t *comm, int root,
|
||||
if (NULL == opal_pmix.publish || NULL == opal_pmix.connect ||
|
||||
NULL == opal_pmix.unpublish ||
|
||||
(NULL == opal_pmix.lookup && NULL == opal_pmix.lookup_nb)) {
|
||||
/* print a nice message explaining we don't have support */
|
||||
opal_show_help("help-mpi-runtime.txt", "noconxcpt", true);
|
||||
return OMPI_ERR_NOT_SUPPORTED;
|
||||
}
|
||||
if (!ompi_rte_connect_accept_support(port_string)) {
|
||||
/* they will have printed the help message */
|
||||
return OMPI_ERR_NOT_SUPPORTED;
|
||||
}
|
||||
|
||||
|
@ -1,7 +1,7 @@
|
||||
/*
|
||||
* Copyright (c) 2012-2013 Los Alamos National Security, LLC.
|
||||
* All rights reserved.
|
||||
* Copyright (c) 2013-2015 Intel, Inc. All rights reserved
|
||||
* Copyright (c) 2013-2017 Intel, Inc. All rights reserved.
|
||||
* Copyright (c) 2014 Cisco Systems, Inc. All rights reserved.
|
||||
* Copyright (c) 2014-2016 Research Organization for Information Science
|
||||
* and Technology (RIST). All rights reserved.
|
||||
@ -116,6 +116,9 @@ static inline orte_process_name_t * OMPI_CAST_RTE_NAME(opal_process_name_t * nam
|
||||
}
|
||||
#endif
|
||||
|
||||
/* check dynamics support */
|
||||
OMPI_DECLSPEC bool ompi_rte_connect_accept_support(const char *port);
|
||||
|
||||
END_C_DECLS
|
||||
|
||||
#endif /* MCA_OMPI_RTE_ORTE_H */
|
||||
|
@ -39,6 +39,7 @@
|
||||
#include "orte/mca/routed/routed.h"
|
||||
#include "orte/util/name_fns.h"
|
||||
#include "orte/util/session_dir.h"
|
||||
#include "orte/util/show_help.h"
|
||||
#include "orte/runtime/orte_globals.h"
|
||||
#include "orte/runtime/orte_wait.h"
|
||||
#include "orte/runtime/orte_data_server.h"
|
||||
@ -198,3 +199,47 @@ void ompi_rte_wait_for_debugger(void)
|
||||
opal_pmix.deregister_evhandler(handler, NULL, NULL);
|
||||
}
|
||||
}
|
||||
|
||||
bool ompi_rte_connect_accept_support(const char *port)
|
||||
{
|
||||
char *ptr, *tmp;
|
||||
orte_process_name_t name;
|
||||
|
||||
/* were we launched by mpirun, or are we calling
|
||||
* without a defined port? */
|
||||
if (NULL == orte_process_info.my_hnp_uri ||
|
||||
0 == strlen(port)) {
|
||||
return true;
|
||||
}
|
||||
|
||||
/* is the job family in the port different than my own? */
|
||||
tmp = strdup(port); // protect input
|
||||
if (NULL == (ptr = strchr(tmp, ':'))) {
|
||||
/* this port didn't come from us! */
|
||||
orte_show_help("help-orterun.txt", "orterun:malformedport", true);
|
||||
free(tmp);
|
||||
return false;
|
||||
}
|
||||
*ptr = '\0';
|
||||
if (ORTE_SUCCESS != orte_util_convert_string_to_process_name(&name, tmp)) {
|
||||
free(tmp);
|
||||
orte_show_help("help-orterun.txt", "orterun:malformedport", true);
|
||||
return false;
|
||||
}
|
||||
free(tmp);
|
||||
if (ORTE_JOB_FAMILY(ORTE_PROC_MY_NAME->jobid) == ORTE_JOB_FAMILY(name.jobid)) {
|
||||
/* same job family, so our infrastructure is adequate */
|
||||
return true;
|
||||
}
|
||||
|
||||
/* if the job family of the port is different than our own
|
||||
* and we were launched by mpirun, then we require ompi-server
|
||||
* support */
|
||||
if (NULL == orte_data_server_uri) {
|
||||
/* print a pretty help message */
|
||||
orte_show_help("help-orterun.txt", "orterun:server-unavailable", true);
|
||||
return false;
|
||||
}
|
||||
|
||||
return true;
|
||||
}
|
||||
|
@ -12,6 +12,7 @@
|
||||
# All rights reserved.
|
||||
# Copyright (c) 2007-2015 Cisco Systems, Inc. All rights reserved.
|
||||
# Copyright (c) 2013 NVIDIA Corporation. All rights reserved.
|
||||
# Copyright (c) 2017 Intel, Inc. All rights reserved.
|
||||
# $COPYRIGHT$
|
||||
#
|
||||
# Additional copyrights may follow
|
||||
@ -93,3 +94,13 @@ Open MPI with --enable-heterogeneous.
|
||||
[no cuda support]
|
||||
The user requested CUDA support with the --mca mpi_cuda_support 1 flag
|
||||
but the library was not compiled with any support.
|
||||
#
|
||||
[noconxcpt]
|
||||
The user has called an operation involving MPI_Comm_connect and/or MPI_Comm_accept,
|
||||
but this environment lacks the necessary infrastructure support for
|
||||
that operation. Open MPI relies on the PMIx_Publish/Lookup (or one of
|
||||
its predecessors) APIs for this operation.
|
||||
|
||||
This typically happens when launching outside of mpirun where the underlying
|
||||
resource manager does not provide publish/lookup support. One way of solving
|
||||
the problem is to simply use mpirun to start the application.
|
||||
|
@ -1,5 +1,5 @@
|
||||
/*
|
||||
* Copyright (c) 2014-2016 Intel, Inc. All rights reserved.
|
||||
* Copyright (c) 2014-2017 Intel, Inc. All rights reserved.
|
||||
* $COPYRIGHT$
|
||||
*
|
||||
* Additional copyrights may follow
|
||||
@ -57,6 +57,7 @@ OPAL_DECLSPEC void opal_pmix_base_set_evbase(opal_event_base_t *evbase);
|
||||
|
||||
/* Base-level state of the PMIx framework; a single instance is
 * declared below as the extern opal_pmix_base. */
typedef struct {
|
||||
/* event base handle - presumably the one supplied through
 * opal_pmix_base_set_evbase(); TODO confirm */
opal_event_base_t *evbase;
|
||||
/* data-exchange timeout in seconds, registered as the
 * "exchange_timeout" MCA variable with a default of -1; only a
 * value greater than zero overrides the caller-supplied timeout
 * in opal_pmix_base_exchange() */
int timeout;
|
||||
} opal_pmix_base_t;
|
||||
|
||||
extern opal_pmix_base_t opal_pmix_base;
|
||||
|
@ -193,7 +193,12 @@ int opal_pmix_base_exchange(opal_value_t *indat,
|
||||
info = OBJ_NEW(opal_value_t);
|
||||
info->key = strdup(OPAL_PMIX_TIMEOUT);
|
||||
info->type = OPAL_INT;
|
||||
info->data.integer = timeout;
|
||||
if (0 < opal_pmix_base.timeout) {
|
||||
/* the user has overridden the default */
|
||||
info->data.integer = opal_pmix_base.timeout;
|
||||
} else {
|
||||
info->data.integer = timeout;
|
||||
}
|
||||
opal_list_append(&mlist, &info->super);
|
||||
|
||||
/* if a non-blocking version of lookup isn't
|
||||
|
@ -47,6 +47,12 @@ static int opal_pmix_base_frame_register(mca_base_register_flag_t flags)
|
||||
(void) mca_base_var_register("opal", "pmix", "base", "collect_data", "Collect all data during modex",
|
||||
MCA_BASE_VAR_TYPE_BOOL, NULL, 0, 0, OPAL_INFO_LVL_9,
|
||||
MCA_BASE_VAR_SCOPE_READONLY, &opal_pmix_collect_all_data);
|
||||
|
||||
opal_pmix_base.timeout = -1;
|
||||
(void) mca_base_var_register("opal", "pmix", "base", "exchange_timeout",
|
||||
"Time (in seconds) to wait for a data exchange to complete",
|
||||
MCA_BASE_VAR_TYPE_INT, NULL, 0, 0, OPAL_INFO_LVL_3,
|
||||
MCA_BASE_VAR_SCOPE_READONLY, &opal_pmix_base.timeout);
|
||||
return OPAL_SUCCESS;
|
||||
}
|
||||
|
||||
|
@ -67,7 +67,9 @@ A request has timed out and will therefore fail:
|
||||
Operation: %s
|
||||
|
||||
Your job may terminate as a result of this problem. You may want to
|
||||
adjust the MCA parameter pmix_server_max_wait and try again.
|
||||
adjust the MCA parameter pmix_server_max_wait and try again. If this
|
||||
occurred during a connect/accept operation, you can adjust that time
|
||||
using the pmix_base_exchange_timeout parameter.
|
||||
#
|
||||
[noroom]
|
||||
A request for an asynchronous runtime operation cannot be fulfilled
|
||||
|
@ -138,14 +138,6 @@ void pmix_server_register_params(void)
|
||||
OPAL_INFO_LVL_9, MCA_BASE_VAR_SCOPE_ALL,
|
||||
&orte_pmix_server_globals.timeout);
|
||||
|
||||
/* register the URI of the UNIVERSAL data server */
|
||||
orte_pmix_server_globals.server_uri = NULL;
|
||||
(void) mca_base_var_register ("orte", "pmix", NULL, "server_uri",
|
||||
"URI of a session-level keyval server for publish/lookup operations",
|
||||
MCA_BASE_VAR_TYPE_STRING, NULL, 0, 0,
|
||||
OPAL_INFO_LVL_9, MCA_BASE_VAR_SCOPE_ALL,
|
||||
&orte_pmix_server_globals.server_uri);
|
||||
|
||||
/* whether or not to wait for the universal server */
|
||||
orte_pmix_server_globals.wait_for_server = false;
|
||||
(void) mca_base_var_register ("orte", "pmix", NULL, "wait_for_server",
|
||||
|
@ -59,13 +59,13 @@ static int init_server(void)
|
||||
|
||||
/* if the universal server wasn't specified, then we use
|
||||
* our own HNP for that purpose */
|
||||
if (NULL == orte_pmix_server_globals.server_uri) {
|
||||
if (NULL == orte_data_server_uri) {
|
||||
orte_pmix_server_globals.server = *ORTE_PROC_MY_HNP;
|
||||
} else {
|
||||
if (0 == strncmp(orte_pmix_server_globals.server_uri, "file", strlen("file")) ||
|
||||
0 == strncmp(orte_pmix_server_globals.server_uri, "FILE", strlen("FILE"))) {
|
||||
if (0 == strncmp(orte_data_server_uri, "file", strlen("file")) ||
|
||||
0 == strncmp(orte_data_server_uri, "FILE", strlen("FILE"))) {
|
||||
/* it is a file - get the filename */
|
||||
filename = strchr(orte_pmix_server_globals.server_uri, ':');
|
||||
filename = strchr(orte_data_server_uri, ':');
|
||||
if (NULL == filename) {
|
||||
/* filename is not correctly formatted */
|
||||
orte_show_help("help-orterun.txt", "orterun:ompi-server-filename-bad", true,
|
||||
@ -121,7 +121,6 @@ static int init_server(void)
|
||||
* as a background job - e.g., in scripts
|
||||
*/
|
||||
if (orte_pmix_server_globals.wait_for_server) {
|
||||
opal_output(0, "WAIT");
|
||||
/* ping the server */
|
||||
struct timeval timeout;
|
||||
timeout.tv_sec = orte_pmix_server_globals.timeout;
|
||||
@ -141,8 +140,6 @@ static int init_server(void)
|
||||
}
|
||||
}
|
||||
|
||||
opal_output(0, "SERVER READY");
|
||||
|
||||
return ORTE_SUCCESS;
|
||||
}
|
||||
|
||||
|
@ -77,6 +77,7 @@ char *orte_coll_transport = NULL;
|
||||
int orte_mgmt_conduit = -1;
|
||||
int orte_coll_conduit = -1;
|
||||
bool orte_no_vm = false;
|
||||
char *orte_data_server_uri = NULL;
|
||||
|
||||
/* ORTE OOB port flags */
|
||||
bool orte_static_ports = false;
|
||||
|
@ -457,6 +457,7 @@ ORTE_DECLSPEC extern bool orte_coprocessors_detected;
|
||||
ORTE_DECLSPEC extern opal_hash_table_t *orte_coprocessors;
|
||||
ORTE_DECLSPEC extern char *orte_topo_signature;
|
||||
ORTE_DECLSPEC extern bool orte_no_vm;
|
||||
ORTE_DECLSPEC extern char *orte_data_server_uri;
|
||||
|
||||
/* ORTE OOB port flags */
|
||||
ORTE_DECLSPEC extern bool orte_static_ports;
|
||||
|
@ -788,5 +788,14 @@ int orte_register_params(void)
|
||||
OPAL_INFO_LVL_9, MCA_BASE_VAR_SCOPE_READONLY,
|
||||
&orte_fwd_mpirun_port);
|
||||
|
||||
/* register the URI of the UNIVERSAL data server */
|
||||
orte_data_server_uri = NULL;
|
||||
(void) mca_base_var_register ("orte", "pmix", NULL, "server_uri",
|
||||
"URI of a session-level keyval server for publish/lookup operations",
|
||||
MCA_BASE_VAR_TYPE_STRING, NULL, 0, 0,
|
||||
OPAL_INFO_LVL_3, MCA_BASE_VAR_SCOPE_ALL,
|
||||
&orte_data_server_uri);
|
||||
|
||||
|
||||
return ORTE_SUCCESS;
|
||||
}
|
||||
|
@ -458,6 +458,21 @@ Error received: %s
|
||||
Please check to ensure that the requested server matches the actual server
|
||||
information, and that the server is in operation.
|
||||
#
|
||||
[orterun:server-unavailable]
|
||||
The user has called an operation involving MPI_Comm_connect and/or MPI_Comm_accept
|
||||
that spans multiple invocations of mpirun. This requires the support of
|
||||
the ompi-server tool, which must be executing somewhere that can be
|
||||
accessed by all participants.
|
||||
|
||||
Please ensure the tool is running, and provide each mpirun with the MCA
|
||||
parameter "pmix_server_uri" pointing to it.
|
||||
#
|
||||
[orterun:malformedport]
|
||||
An operation involving MPI_Comm_connect and/or MPI_Comm_accept was called with
|
||||
an unrecognized port string. This typically happens when passing the
|
||||
string on a command line and failing to properly quote it to protect
|
||||
against the special characters it includes.
|
||||
#
|
||||
[orterun:ompi-server-pid-bad]
|
||||
%s was unable to parse the PID of the %s to be used as the ompi-server.
|
||||
The option we were given was:
|
||||
|
Загрузка…
x
Ссылка в новой задаче
Block a user