Cleanup singleton detection and data retrieval
Extend the PMIx modex recv macros to cover the full set of immediate/optional combinations. If PMIx_Init cannot reach a server, then declare the MPI proc to be a singleton. Provide full support for info values via PMIx. Catch all the values used in the "info" area of OMPI using data available from PMIx instead of via envars. Update PMIx and PRRTE to sync with their capabilities. PMIx: ensure cleanup of fork/exec children; fix bug in gds/hash that left app info off of list. PRRTE: fix multi-app bugs; port setup_child logic from orte; OMPI env changes; set app->first_rank; ensure common hostname across prun, prte, and pmix; fix "nolocal" support. Silence a warning from btl/vader. Signed-off-by: Ralph Castain <rhc@pmix.org>
Этот коммит содержится в:
родитель
9ffee9859f
Коммит
6b4fb509e9
@ -13,6 +13,7 @@
|
||||
* Copyright (c) 2007 Cisco Systems, Inc. All rights reserved.
|
||||
* Copyright (c) 2017 Research Organization for Information Science
|
||||
* and Technology (RIST). All rights reserved.
|
||||
* Copyright (c) 2020 Intel, Inc. All rights reserved.
|
||||
* $COPYRIGHT$
|
||||
*
|
||||
* Additional copyrights may follow
|
||||
@ -87,6 +88,7 @@
|
||||
#include "ompi/errhandler/errcode.h"
|
||||
#include "ompi/communicator/communicator.h"
|
||||
#include "ompi/mca/pml/pml.h"
|
||||
#include "ompi/runtime/ompi_rte.h"
|
||||
|
||||
/*
|
||||
* Private functions
|
||||
@ -103,8 +105,6 @@ static int set_f(int keyval, MPI_Fint value);
|
||||
int ompi_attr_create_predefined(void)
|
||||
{
|
||||
int ret;
|
||||
char *univ_size;
|
||||
int usize;
|
||||
|
||||
/* Create all the keyvals */
|
||||
|
||||
@ -138,14 +138,8 @@ int ompi_attr_create_predefined(void)
|
||||
return ret;
|
||||
}
|
||||
|
||||
/* If the universe size is set, then use it. Otherwise default
|
||||
* to the size of MPI_COMM_WORLD */
|
||||
univ_size = getenv("OMPI_UNIVERSE_SIZE");
|
||||
if (NULL == univ_size || (usize = strtol(univ_size, NULL, 0)) <= 0) {
|
||||
ret = set_f(MPI_UNIVERSE_SIZE, ompi_comm_size(MPI_COMM_WORLD));
|
||||
} else {
|
||||
ret = set_f(MPI_UNIVERSE_SIZE, usize);
|
||||
}
|
||||
/* set the universe size */
|
||||
ret = set_f(MPI_UNIVERSE_SIZE, ompi_process_info.univ_size);
|
||||
if (OMPI_SUCCESS != ret) {
|
||||
return ret;
|
||||
}
|
||||
|
@ -19,6 +19,7 @@
|
||||
* Copyright (c) 2016-2017 IBM Corporation. All rights reserved.
|
||||
* Copyright (c) 2019 Triad National Security, LLC. All rights
|
||||
* reserved.
|
||||
* Copyright (c) 2020 Intel, Inc. All rights reserved.
|
||||
* $COPYRIGHT$
|
||||
*
|
||||
* Additional copyrights may follow
|
||||
@ -46,11 +47,13 @@
|
||||
#include "opal/util/argv.h"
|
||||
#include "opal/util/opal_getcwd.h"
|
||||
#include "opal/util/output.h"
|
||||
#include "opal/util/printf.h"
|
||||
#include "opal/util/info.h"
|
||||
|
||||
#include "ompi/info/info.h"
|
||||
#include "ompi/runtime/mpiruntime.h"
|
||||
#include "ompi/runtime/params.h"
|
||||
#include "ompi/runtime/ompi_rte.h"
|
||||
|
||||
/*
|
||||
* Global variables
|
||||
@ -85,8 +88,7 @@ opal_pointer_array_t ompi_info_f_to_c_table = {{0}};
|
||||
*/
|
||||
int ompi_mpiinfo_init(void)
|
||||
{
|
||||
const char *val;
|
||||
char *cptr;
|
||||
char *cptr, **tmp;
|
||||
|
||||
/* initialize table */
|
||||
|
||||
@ -107,32 +109,33 @@ int ompi_mpiinfo_init(void)
|
||||
/* fill the env info object */
|
||||
|
||||
/* command for this app_context */
|
||||
if (NULL != (cptr = getenv("OMPI_COMMAND"))) {
|
||||
opal_info_set(&ompi_mpi_info_env.info.super, "command", cptr);
|
||||
}
|
||||
if (NULL != ompi_process_info.command) {
|
||||
tmp = opal_argv_split(ompi_process_info.command, ' ');
|
||||
opal_info_set(&ompi_mpi_info_env.info.super, "command", tmp[0]);
|
||||
|
||||
/* space-separated list of argv for this command */
|
||||
if (NULL != (cptr = getenv("OMPI_ARGV"))) {
|
||||
/* space-separated list of argv for this command */
|
||||
if (1 < opal_argv_count(tmp)) {
|
||||
cptr = opal_argv_join(&tmp[1], ' ');
|
||||
} else {
|
||||
cptr = strdup(tmp[0]);
|
||||
}
|
||||
opal_argv_free(tmp);
|
||||
opal_info_set(&ompi_mpi_info_env.info.super, "argv", cptr);
|
||||
free(cptr);
|
||||
}
|
||||
|
||||
/* max procs for the entire job */
|
||||
if (NULL != (cptr = getenv("OMPI_MCA_num_procs"))) {
|
||||
opal_info_set(&ompi_mpi_info_env.info.super, "maxprocs", cptr);
|
||||
/* Open MPI does not support the "soft" option, so set it to maxprocs */
|
||||
opal_info_set(&ompi_mpi_info_env.info.super, "soft", cptr);
|
||||
}
|
||||
opal_asprintf(&cptr, "%u", ompi_process_info.num_procs);
|
||||
opal_info_set(&ompi_mpi_info_env.info.super, "maxprocs", cptr);
|
||||
/* Open MPI does not support the "soft" option, so set it to maxprocs */
|
||||
opal_info_set(&ompi_mpi_info_env.info.super, "soft", cptr);
|
||||
free(cptr);
|
||||
|
||||
/* local host name */
|
||||
val = opal_gethostname();
|
||||
opal_info_set(&ompi_mpi_info_env.info.super, "host", val);
|
||||
opal_info_set(&ompi_mpi_info_env.info.super, "host", ompi_process_info.nodename);
|
||||
|
||||
/* architecture name */
|
||||
if (NULL != (cptr = getenv("OMPI_MCA_cpu_type"))) {
|
||||
opal_info_set(&ompi_mpi_info_env.info.super, "arch", cptr);
|
||||
}
|
||||
#ifdef HAVE_SYS_UTSNAME_H
|
||||
else {
|
||||
{
|
||||
struct utsname sysname;
|
||||
uname(&sysname);
|
||||
cptr = sysname.machine;
|
||||
@ -140,12 +143,9 @@ int ompi_mpiinfo_init(void)
|
||||
}
|
||||
#endif
|
||||
|
||||
/* initial working dir of this process - only set when
|
||||
* run by mpiexec as we otherwise have no reliable way
|
||||
* of determining the value
|
||||
*/
|
||||
if (NULL != (cptr = getenv("OMPI_MCA_initial_wdir"))) {
|
||||
opal_info_set(&ompi_mpi_info_env.info.super, "wdir", cptr);
|
||||
/* initial working dir of this process, if provided */
|
||||
if (NULL != ompi_process_info.initial_wdir) {
|
||||
opal_info_set(&ompi_mpi_info_env.info.super, "wdir", ompi_process_info.initial_wdir);
|
||||
}
|
||||
|
||||
/* provide the REQUESTED thread level - may be different
|
||||
@ -172,25 +172,25 @@ int ompi_mpiinfo_init(void)
|
||||
/**** now some OMPI-specific values that other MPIs may not provide ****/
|
||||
|
||||
/* the number of app_contexts in this job */
|
||||
if (NULL != (cptr = getenv("OMPI_NUM_APP_CTX"))) {
|
||||
opal_info_set(&ompi_mpi_info_env.info.super, "ompi_num_apps", cptr);
|
||||
}
|
||||
opal_asprintf(&cptr, "%u", ompi_process_info.num_apps);
|
||||
opal_info_set(&ompi_mpi_info_env.info.super, "ompi_num_apps", cptr);
|
||||
free(cptr);
|
||||
|
||||
/* space-separated list of first MPI rank of each app_context */
|
||||
if (NULL != (cptr = getenv("OMPI_FIRST_RANKS"))) {
|
||||
opal_info_set(&ompi_mpi_info_env.info.super, "ompi_first_rank", cptr);
|
||||
if (NULL != ompi_process_info.app_ldrs) {
|
||||
opal_info_set(&ompi_mpi_info_env.info.super, "ompi_first_rank", ompi_process_info.app_ldrs);
|
||||
}
|
||||
|
||||
/* space-separated list of num procs for each app_context */
|
||||
if (NULL != (cptr = getenv("OMPI_APP_CTX_NUM_PROCS"))) {
|
||||
opal_info_set(&ompi_mpi_info_env.info.super, "ompi_np", cptr);
|
||||
if (NULL != ompi_process_info.app_sizes) {
|
||||
opal_info_set(&ompi_mpi_info_env.info.super, "ompi_np", ompi_process_info.app_sizes);
|
||||
}
|
||||
|
||||
/* location of the directory containing any prepositioned files
|
||||
* the user may have requested
|
||||
*/
|
||||
if (NULL != (cptr = getenv("OMPI_FILE_LOCATION"))) {
|
||||
opal_info_set(&ompi_mpi_info_env.info.super, "ompi_positioned_file_dir", cptr);
|
||||
if (NULL != ompi_process_info.proc_session_dir) {
|
||||
opal_info_set(&ompi_mpi_info_env.info.super, "ompi_positioned_file_dir", ompi_process_info.proc_session_dir);
|
||||
}
|
||||
|
||||
/* All done */
|
||||
@ -334,9 +334,9 @@ static void info_constructor(ompi_info_t *info)
|
||||
info);
|
||||
info->i_freed = false;
|
||||
|
||||
/*
|
||||
/*
|
||||
* If the user doesn't want us to ever free it, then add an extra
|
||||
* RETAIN here
|
||||
* RETAIN here
|
||||
*/
|
||||
if (ompi_debug_no_free_handles) {
|
||||
OBJ_RETAIN(&(info->super));
|
||||
|
@ -504,23 +504,8 @@ int ompi_mpi_init(int argc, char **argv, int requested, int *provided,
|
||||
|
||||
OMPI_TIMING_NEXT("initialization");
|
||||
|
||||
/* if we were not externally started, then we need to setup
|
||||
* some envars so the MPI_INFO_ENV can get the cmd name
|
||||
* and argv (but only if the user supplied a non-NULL argv!), and
|
||||
* the requested thread level
|
||||
*/
|
||||
if (NULL == getenv("OMPI_COMMAND") && NULL != argv && NULL != argv[0]) {
|
||||
opal_setenv("OMPI_COMMAND", argv[0], true, &environ);
|
||||
}
|
||||
if (NULL == getenv("OMPI_ARGV") && 1 < argc) {
|
||||
char *tmp;
|
||||
tmp = opal_argv_join(&argv[1], ' ');
|
||||
opal_setenv("OMPI_ARGV", tmp, true, &environ);
|
||||
free(tmp);
|
||||
}
|
||||
|
||||
/* Setup RTE */
|
||||
if (OMPI_SUCCESS != (ret = ompi_rte_init(NULL, NULL))) {
|
||||
if (OMPI_SUCCESS != (ret = ompi_rte_init(&argc, &argv))) {
|
||||
error = "ompi_mpi_init: ompi_rte_init failed";
|
||||
goto error;
|
||||
}
|
||||
|
@ -67,10 +67,6 @@ bool ompi_singleton = false;
|
||||
|
||||
static pmix_proc_t myprocid;
|
||||
|
||||
static bool added_transport_keys = false;
|
||||
static bool added_num_procs = false;
|
||||
static bool added_app_ctx = false;
|
||||
static char* pre_condition_transports_print(uint64_t *unique_key);
|
||||
static int _setup_top_session_dir(char **sdir);
|
||||
static int _setup_job_session_dir(char **sdir);
|
||||
static int _setup_proc_session_dir(char **sdir);
|
||||
@ -504,13 +500,12 @@ int ompi_rte_init(int *pargc, char ***pargv)
|
||||
int u32, *u32ptr;
|
||||
uint16_t u16, *u16ptr;
|
||||
char **peers=NULL;
|
||||
char *envar, *ev1, *ev2;
|
||||
char *ev1;
|
||||
char *val;
|
||||
size_t i;
|
||||
uint64_t unique_key[2];
|
||||
char *string_key;
|
||||
pmix_value_t pval;
|
||||
pmix_status_t rc;
|
||||
char **tmp;
|
||||
|
||||
u32ptr = &u32;
|
||||
u16ptr = &u16;
|
||||
@ -537,15 +532,17 @@ int ompi_rte_init(int *pargc, char ***pargv)
|
||||
|
||||
/* initialize the selected module */
|
||||
if (!PMIx_Initialized() && (PMIX_SUCCESS != (ret = PMIx_Init(&myprocid, NULL, 0)))) {
|
||||
/* we cannot run - this could be due to being direct launched
|
||||
* without the required PMI support being built, so print
|
||||
* out a help message indicating it */
|
||||
opal_show_help("help-mpi-runtime.txt", "no-pmi", true, PMIx_Error_string(ret));
|
||||
return OPAL_ERR_SILENT;
|
||||
}
|
||||
/* if our nspace starts with "singleton", then we are a singleton */
|
||||
if (0 == strncmp(myprocid.nspace, "singleton", strlen("singleton"))) {
|
||||
ompi_singleton = true;
|
||||
/* if we get PMIX_ERR_UNREACH indicating that we cannot reach the
|
||||
* server, then we assume we are operating as a singleton */
|
||||
if (PMIX_ERR_UNREACH == ret) {
|
||||
ompi_singleton = true;
|
||||
} else {
|
||||
/* we cannot run - this could be due to being direct launched
|
||||
* without the required PMI support being built, so print
|
||||
* out a help message indicating it */
|
||||
opal_show_help("help-mpi-runtime.txt", "no-pmi", true, PMIx_Error_string(ret));
|
||||
return OPAL_ERR_SILENT;
|
||||
}
|
||||
}
|
||||
|
||||
/* setup the process name fields - also registers the new nspace */
|
||||
@ -567,23 +564,35 @@ int ompi_rte_init(int *pargc, char ***pargv)
|
||||
}
|
||||
opal_process_info.nodename = ev1; // ev1 is an allocated string
|
||||
}
|
||||
ompi_process_info.nodename = opal_process_info.nodename;
|
||||
pmix_process_info.nodename = opal_process_info.nodename;
|
||||
|
||||
/* get our local rank from PMI */
|
||||
/* get our local rank from PMIx */
|
||||
OPAL_MODEX_RECV_VALUE_OPTIONAL(rc, PMIX_LOCAL_RANK,
|
||||
&pmix_process_info.my_name, &u16ptr, PMIX_UINT16);
|
||||
if (PMIX_SUCCESS != rc) {
|
||||
ret = opal_pmix_convert_status(rc);
|
||||
error = "local rank";
|
||||
goto error;
|
||||
if (ompi_singleton) {
|
||||
/* just assume 0 */
|
||||
u16 = 0;
|
||||
} else {
|
||||
ret = opal_pmix_convert_status(rc);
|
||||
error = "local rank";
|
||||
goto error;
|
||||
}
|
||||
}
|
||||
pmix_process_info.my_local_rank = u16;
|
||||
|
||||
/* get our node rank from PMI */
|
||||
/* get our node rank from PMIx */
|
||||
OPAL_MODEX_RECV_VALUE_OPTIONAL(rc, PMIX_NODE_RANK,
|
||||
&pmix_process_info.my_name, &u16ptr, PMIX_UINT16);
|
||||
if (PMIX_SUCCESS != rc) {
|
||||
u16 = 0;
|
||||
if (ompi_singleton) {
|
||||
/* just assume 0 */
|
||||
u16 = 0;
|
||||
} else {
|
||||
ret = opal_pmix_convert_status(rc);
|
||||
error = "node rank";
|
||||
goto error;
|
||||
}
|
||||
}
|
||||
pmix_process_info.my_node_rank = u16;
|
||||
|
||||
@ -593,27 +602,43 @@ int ompi_rte_init(int *pargc, char ***pargv)
|
||||
OPAL_MODEX_RECV_VALUE_OPTIONAL(rc, PMIX_JOB_SIZE,
|
||||
&pname, &u32ptr, PMIX_UINT32);
|
||||
if (PMIX_SUCCESS != rc) {
|
||||
ret = opal_pmix_convert_status(rc);
|
||||
error = "job size";
|
||||
goto error;
|
||||
if (ompi_singleton) {
|
||||
/* just assume 1 */
|
||||
u32 = 1;
|
||||
} else {
|
||||
ret = opal_pmix_convert_status(rc);
|
||||
error = "job size";
|
||||
goto error;
|
||||
}
|
||||
}
|
||||
pmix_process_info.num_procs = u32;
|
||||
|
||||
/* push into the environ for pickup in MPI layer for
|
||||
* MPI-3 required info key
|
||||
*/
|
||||
if (NULL == getenv(OPAL_MCA_PREFIX"opal_ess_num_procs")) {
|
||||
opal_asprintf(&ev1, OPAL_MCA_PREFIX"opal_ess_num_procs=%d", pmix_process_info.num_procs);
|
||||
putenv(ev1);
|
||||
added_num_procs = true;
|
||||
/* get universe size */
|
||||
OPAL_MODEX_RECV_VALUE_OPTIONAL(rc, PMIX_UNIV_SIZE,
|
||||
&pname, &u32ptr, PMIX_UINT32);
|
||||
if (PMIX_SUCCESS != rc) {
|
||||
if (ompi_singleton) {
|
||||
/* just assume 1 */
|
||||
u32 = 1;
|
||||
} else {
|
||||
/* default to job size */
|
||||
u32 = pmix_process_info.num_procs;
|
||||
}
|
||||
}
|
||||
if (NULL == getenv("OMPI_APP_CTX_NUM_PROCS")) {
|
||||
opal_asprintf(&ev2, "OMPI_APP_CTX_NUM_PROCS=%d", pmix_process_info.num_procs);
|
||||
putenv(ev2);
|
||||
added_app_ctx = true;
|
||||
pmix_process_info.univ_size = u32;
|
||||
|
||||
/* get number of app contexts */
|
||||
pname.jobid = pmix_process_info.my_name.jobid;
|
||||
pname.vpid = OPAL_VPID_WILDCARD;
|
||||
OPAL_MODEX_RECV_VALUE_OPTIONAL(rc, PMIX_JOB_NUM_APPS,
|
||||
&pname, &u32ptr, PMIX_UINT32);
|
||||
if (PMIX_SUCCESS == rc) {
|
||||
pmix_process_info.num_apps = u32;
|
||||
} else {
|
||||
pmix_process_info.num_apps = 1;
|
||||
}
|
||||
|
||||
/* get our app number from PMI - ok if not found */
|
||||
/* get our app number from PMIx - ok if not found */
|
||||
OPAL_MODEX_RECV_VALUE_OPTIONAL(rc, PMIX_APPNUM,
|
||||
&pmix_process_info.my_name, &u32ptr, PMIX_UINT32);
|
||||
if (PMIX_SUCCESS == rc) {
|
||||
@ -622,8 +647,48 @@ int ompi_rte_init(int *pargc, char ***pargv)
|
||||
pmix_process_info.app_num = 0;
|
||||
}
|
||||
|
||||
/* if more than one app context, get the number of procs and first rank of each */
|
||||
if (1 == pmix_process_info.num_apps) {
|
||||
pmix_process_info.app_ldrs = strdup("0");
|
||||
opal_asprintf(&pmix_process_info.app_sizes, "%u", pmix_process_info.num_procs);
|
||||
} else {
|
||||
OPAL_MODEX_RECV_VALUE_OPTIONAL(rc, "OMPI_APP_SIZES", &pname, &val, PMIX_STRING);
|
||||
if (PMIX_SUCCESS != rc) {
|
||||
/* assume it is just us */
|
||||
opal_asprintf(&pmix_process_info.app_sizes, "%u", pmix_process_info.num_procs);
|
||||
} else {
|
||||
pmix_process_info.app_sizes = val;
|
||||
}
|
||||
OPAL_MODEX_RECV_VALUE_OPTIONAL(rc, "OMPI_FIRST_RANKS", &pname, &val, PMIX_STRING);
|
||||
if (PMIX_SUCCESS != rc) {
|
||||
/* assume it is just us */
|
||||
pmix_process_info.app_ldrs = strdup("0");
|
||||
} else {
|
||||
pmix_process_info.app_ldrs = val;
|
||||
}
|
||||
}
|
||||
|
||||
/* get our command - defaults to our appnum */
|
||||
OPAL_MODEX_RECV_VALUE_OPTIONAL(rc, PMIX_APP_ARGV,
|
||||
&pname, (char**)&ev1, PMIX_STRING);
|
||||
if (PMIX_SUCCESS == rc) {
|
||||
pmix_process_info.command = ev1; // ev1 is an allocated string
|
||||
} else if (NULL != pargv) {
|
||||
tmp = *pargv;
|
||||
if (NULL != tmp) {
|
||||
pmix_process_info.command = opal_argv_join(tmp, ' ');
|
||||
}
|
||||
}
|
||||
|
||||
/* get our reincarnation number */
|
||||
OPAL_MODEX_RECV_VALUE_OPTIONAL(rc, PMIX_REINCARNATION,
|
||||
&OPAL_PROC_MY_NAME, &u32ptr, PMIX_UINT32);
|
||||
if (PMIX_SUCCESS == rc) {
|
||||
pmix_process_info.reincarnation = u32;
|
||||
}
|
||||
|
||||
/* get the number of local peers - required for wireup of
|
||||
* shared memory BTL */
|
||||
* shared memory BTL, defaults to local node */
|
||||
OPAL_MODEX_RECV_VALUE_OPTIONAL(rc, PMIX_LOCAL_SIZE,
|
||||
&pname, &u32ptr, PMIX_UINT32);
|
||||
if (PMIX_SUCCESS == rc) {
|
||||
@ -634,24 +699,6 @@ int ompi_rte_init(int *pargc, char ***pargv)
|
||||
goto error;
|
||||
}
|
||||
|
||||
/* setup transport keys in case the MPI layer needs them -
|
||||
* we can use the jobfam and stepid as unique keys
|
||||
* because they are unique values assigned by the RM
|
||||
*/
|
||||
if (NULL == getenv(OPAL_MCA_PREFIX"opal_precondition_transports")) {
|
||||
unique_key[0] = (pmix_process_info.my_name.jobid & 0xff00) >> 16;
|
||||
unique_key[1] = pmix_process_info.my_name.jobid & 0x00ff;
|
||||
if (NULL == (string_key = pre_condition_transports_print(unique_key))) {
|
||||
OPAL_ERROR_LOG(OPAL_ERR_OUT_OF_RESOURCE);
|
||||
return OPAL_ERR_OUT_OF_RESOURCE;
|
||||
}
|
||||
opal_asprintf(&envar, OPAL_MCA_PREFIX"opal_precondition_transports=%s", string_key);
|
||||
putenv(envar);
|
||||
added_transport_keys = true;
|
||||
/* cannot free the envar as that messes up our environ */
|
||||
free(string_key);
|
||||
}
|
||||
|
||||
/* retrieve temp directories info */
|
||||
OPAL_MODEX_RECV_VALUE_OPTIONAL(rc, PMIX_TMPDIR, &pname, &val, PMIX_STRING);
|
||||
if (OPAL_SUCCESS == rc && NULL != val) {
|
||||
@ -692,14 +739,24 @@ int ompi_rte_init(int *pargc, char ***pargv)
|
||||
}
|
||||
}
|
||||
|
||||
/* get our initial working directory - defaults to getting the value
|
||||
* for our app */
|
||||
OPAL_MODEX_RECV_VALUE_OPTIONAL(rc, PMIX_WDIR, &pname, &val, PMIX_STRING);
|
||||
if (PMIX_SUCCESS == rc && NULL != val) {
|
||||
pmix_process_info.initial_wdir = val;
|
||||
val = NULL;
|
||||
}
|
||||
|
||||
/* identify our location */
|
||||
val = NULL;
|
||||
OPAL_MODEX_RECV_VALUE_OPTIONAL(rc, PMIX_LOCALITY_STRING,
|
||||
&pmix_process_info.my_name, &val, PMIX_STRING);
|
||||
if (PMIX_SUCCESS == rc && NULL != val) {
|
||||
pmix_process_info.cpuset = val;
|
||||
pmix_proc_is_bound = true;
|
||||
} else {
|
||||
pmix_process_info.cpuset = NULL;
|
||||
pmix_proc_is_bound = false;
|
||||
}
|
||||
|
||||
/* get our local peers */
|
||||
@ -710,7 +767,7 @@ int ompi_rte_init(int *pargc, char ***pargv)
|
||||
error = "num local peers";
|
||||
goto error;
|
||||
}
|
||||
/* retrieve the local peers */
|
||||
/* retrieve the local peers - defaults to local node */
|
||||
OPAL_MODEX_RECV_VALUE(rc, PMIX_LOCAL_PEERS,
|
||||
&pname, &val, PMIX_STRING);
|
||||
if (PMIX_SUCCESS == rc && NULL != val) {
|
||||
@ -762,11 +819,6 @@ int ompi_rte_init(int *pargc, char ***pargv)
|
||||
opal_argv_free(peers);
|
||||
}
|
||||
|
||||
/* poor attempt to detect we are bound */
|
||||
if (NULL != getenv("SLURM_CPU_BIND_TYPE")) {
|
||||
pmix_proc_is_bound = true;
|
||||
}
|
||||
|
||||
/* set the remaining opal_process_info fields. Note that
|
||||
* the OPAL layer will have initialized these to NULL, and
|
||||
* anyone between us would not have strdup'd the string, so
|
||||
@ -816,19 +868,6 @@ static bool check_file(const char *root, const char *path)
|
||||
|
||||
int ompi_rte_finalize(void)
|
||||
{
|
||||
/* remove the envars that we pushed into environ
|
||||
* so we leave that structure intact
|
||||
*/
|
||||
if (added_transport_keys) {
|
||||
unsetenv(OPAL_MCA_PREFIX"opal_precondition_transports");
|
||||
}
|
||||
if (added_num_procs) {
|
||||
unsetenv(OPAL_MCA_PREFIX"opal_ess_num_procs");
|
||||
}
|
||||
if (added_app_ctx) {
|
||||
unsetenv("OMPI_APP_CTX_NUM_PROCS");
|
||||
}
|
||||
|
||||
/* shutdown pmix */
|
||||
PMIx_Finalize(NULL, 0);
|
||||
|
||||
@ -837,10 +876,43 @@ int ompi_rte_finalize(void)
|
||||
opal_os_dirpath_destroy(pmix_process_info.job_session_dir,
|
||||
false, check_file);
|
||||
free(pmix_process_info.job_session_dir);
|
||||
pmix_process_info.job_session_dir = NULL;
|
||||
}
|
||||
|
||||
free (pmix_process_info.cpuset);
|
||||
pmix_process_info.cpuset = NULL;
|
||||
if (NULL != pmix_process_info.top_session_dir) {
|
||||
free(pmix_process_info.top_session_dir);
|
||||
pmix_process_info.top_session_dir = NULL;
|
||||
}
|
||||
|
||||
if (NULL != pmix_process_info.proc_session_dir) {
|
||||
free(pmix_process_info.proc_session_dir);
|
||||
pmix_process_info.proc_session_dir = NULL;
|
||||
}
|
||||
|
||||
if (NULL != pmix_process_info.app_sizes) {
|
||||
free(pmix_process_info.app_sizes);
|
||||
pmix_process_info.app_sizes = NULL;
|
||||
}
|
||||
|
||||
if (NULL != pmix_process_info.app_ldrs) {
|
||||
free(pmix_process_info.app_ldrs);
|
||||
pmix_process_info.app_ldrs = NULL;
|
||||
}
|
||||
|
||||
if (NULL != pmix_process_info.cpuset) {
|
||||
free(pmix_process_info.cpuset);
|
||||
pmix_process_info.cpuset = NULL;
|
||||
}
|
||||
|
||||
if (NULL != pmix_process_info.command) {
|
||||
free(pmix_process_info.command);
|
||||
pmix_process_info.command = NULL;
|
||||
}
|
||||
|
||||
if (NULL != pmix_process_info.initial_wdir) {
|
||||
free(pmix_process_info.initial_wdir);
|
||||
pmix_process_info.initial_wdir = NULL;
|
||||
}
|
||||
|
||||
/* cleanup our internal nspace hack */
|
||||
opal_pmix_finalize_nspace_tracker();
|
||||
@ -941,72 +1013,6 @@ void ompi_rte_wait_for_debugger(void)
|
||||
PMIx_Deregister_event_handler(handler, NULL, NULL);
|
||||
}
|
||||
|
||||
static char* pre_condition_transports_print(uint64_t *unique_key)
|
||||
{
|
||||
unsigned int *int_ptr;
|
||||
size_t i, j, string_key_len, written_len;
|
||||
char *string_key = NULL, *format = NULL;
|
||||
|
||||
/* string is two 64 bit numbers printed in hex with a dash between
|
||||
* and zero padding.
|
||||
*/
|
||||
string_key_len = (sizeof(uint64_t) * 2) * 2 + strlen("-") + 1;
|
||||
string_key = (char*) malloc(string_key_len);
|
||||
if (NULL == string_key) {
|
||||
return NULL;
|
||||
}
|
||||
|
||||
string_key[0] = '\0';
|
||||
written_len = 0;
|
||||
|
||||
/* get a format string based on the length of an unsigned int. We
|
||||
* want to have zero padding for sizeof(unsigned int) * 2
|
||||
* characters -- when printing as a hex number, each byte is
|
||||
* represented by 2 hex characters. Format will contain something
|
||||
* that looks like %08lx, where the number 8 might be a different
|
||||
* number if the system has a different sized long (8 would be for
|
||||
* sizeof(int) == 4)).
|
||||
*/
|
||||
opal_asprintf(&format, "%%0%dx", (int)(sizeof(unsigned int)) * 2);
|
||||
|
||||
/* print the first number */
|
||||
int_ptr = (unsigned int*) &unique_key[0];
|
||||
for (i = 0 ; i < sizeof(uint64_t) / sizeof(unsigned int) ; ++i) {
|
||||
if (0 == int_ptr[i]) {
|
||||
/* inject some energy */
|
||||
for (j=0; j < sizeof(unsigned int); j++) {
|
||||
int_ptr[i] |= j << j;
|
||||
}
|
||||
}
|
||||
snprintf(string_key + written_len,
|
||||
string_key_len - written_len,
|
||||
format, int_ptr[i]);
|
||||
written_len = strlen(string_key);
|
||||
}
|
||||
|
||||
/* print the middle dash */
|
||||
snprintf(string_key + written_len, string_key_len - written_len, "-");
|
||||
written_len = strlen(string_key);
|
||||
|
||||
/* print the second number */
|
||||
int_ptr = (unsigned int*) &unique_key[1];
|
||||
for (i = 0 ; i < sizeof(uint64_t) / sizeof(unsigned int) ; ++i) {
|
||||
if (0 == int_ptr[i]) {
|
||||
/* inject some energy */
|
||||
for (j=0; j < sizeof(unsigned int); j++) {
|
||||
int_ptr[i] |= j << j;
|
||||
}
|
||||
}
|
||||
snprintf(string_key + written_len,
|
||||
string_key_len - written_len,
|
||||
format, int_ptr[i]);
|
||||
written_len = strlen(string_key);
|
||||
}
|
||||
free(format);
|
||||
|
||||
return string_key;
|
||||
}
|
||||
|
||||
static int _setup_top_session_dir(char **sdir)
|
||||
{
|
||||
char *tmpdir;
|
||||
|
@ -243,7 +243,6 @@ typedef uint16_t ompi_local_rank_t;
|
||||
|
||||
typedef struct {
|
||||
opal_process_name_t my_name;
|
||||
char *my_hnp_uri;
|
||||
char *nodename;
|
||||
pid_t pid;
|
||||
char *top_session_dir;
|
||||
@ -254,7 +253,14 @@ typedef struct {
|
||||
int32_t num_local_peers;
|
||||
uint32_t num_procs;
|
||||
uint32_t app_num;
|
||||
uint32_t univ_size;
|
||||
char *app_sizes;
|
||||
char *app_ldrs;
|
||||
char *cpuset;
|
||||
char *command;
|
||||
uint32_t num_apps;
|
||||
char *initial_wdir;
|
||||
uint32_t reincarnation;
|
||||
} pmix_process_info_t;
|
||||
OMPI_DECLSPEC extern pmix_process_info_t pmix_process_info;
|
||||
#define ompi_process_info pmix_process_info
|
||||
|
@ -19,7 +19,7 @@
|
||||
* reserved.
|
||||
* Copyright (c) 2020 Google, LLC. All rights reserved.
|
||||
*
|
||||
* Copyright (c) 2019 Intel, Inc. All rights reserved.
|
||||
* Copyright (c) 2019-2020 Intel, Inc. All rights reserved.
|
||||
* $COPYRIGHT$
|
||||
*
|
||||
* Additional copyrights may follow
|
||||
@ -125,7 +125,7 @@ struct mca_btl_vader_component_t {
|
||||
char *my_segment; /**< this rank's base pointer */
|
||||
size_t segment_size; /**< size of my_segment */
|
||||
int32_t num_smp_procs; /**< current number of smp procs on this host */
|
||||
int32_t local_rank; /**< current rank index at add_procs() time */
|
||||
opal_atomic_int32_t local_rank; /**< current rank index at add_procs() time */
|
||||
opal_free_list_t vader_frags_eager; /**< free list of vader send frags */
|
||||
opal_free_list_t vader_frags_max_send; /**< free list of vader max send frags (large fragments) */
|
||||
opal_free_list_t vader_frags_user; /**< free list of small inline frags */
|
||||
|
@ -405,44 +405,6 @@ typedef struct {
|
||||
} \
|
||||
} while(0);
|
||||
|
||||
/**
|
||||
* Provide a simplified macro for retrieving modex data
|
||||
* from another process:
|
||||
*
|
||||
* r - the integer return status from the modex op (int)
|
||||
* s - string key (char*)
|
||||
* p - pointer to the opal_process_name_t of the proc that posted
|
||||
* the data (opal_process_name_t*)
|
||||
* d - pointer to a location wherein the data object
|
||||
* is to be returned (char**)
|
||||
* sz - pointer to a location wherein the number of bytes
|
||||
* in the data object can be returned (size_t)
|
||||
*/
|
||||
#define OPAL_MODEX_RECV_STRING(r, s, p, d, sz) \
|
||||
do { \
|
||||
pmix_proc_t _proc; \
|
||||
pmix_value_t *_kv = NULL; \
|
||||
OPAL_OUTPUT_VERBOSE((1, opal_pmix_verbose_output, \
|
||||
"%s[%s:%d] MODEX RECV STRING FOR PROC %s KEY %s", \
|
||||
OPAL_NAME_PRINT(OPAL_PROC_MY_NAME), \
|
||||
__FILE__, __LINE__, \
|
||||
OPAL_NAME_PRINT(*(p)), (s))); \
|
||||
*(d) = NULL; \
|
||||
*(sz) = 0; \
|
||||
OPAL_PMIX_CONVERT_NAME(&_proc, (p)); \
|
||||
(r) = PMIx_Get(&(_proc), (s), NULL, 0, &(_kv)); \
|
||||
if (NULL == _kv) { \
|
||||
(r) = PMIX_ERR_NOT_FOUND; \
|
||||
} else if (PMIX_SUCCESS == (r)) { \
|
||||
*(d) = (uint8_t*)_kv->data.bo.bytes; \
|
||||
*(sz) = _kv->data.bo.size; \
|
||||
_kv->data.bo.bytes = NULL; /* protect the data */ \
|
||||
} \
|
||||
if (NULL != _kv) { \
|
||||
PMIX_VALUE_RELEASE(_kv); \
|
||||
} \
|
||||
} while(0);
|
||||
|
||||
/**
|
||||
* Provide a simplified macro for retrieving modex data
|
||||
* from another process:
|
||||
@ -484,6 +446,44 @@ typedef struct {
|
||||
} \
|
||||
} while(0);
|
||||
|
||||
/**
|
||||
* Provide a simplified macro for retrieving modex data
|
||||
* from another process:
|
||||
*
|
||||
* r - the integer return status from the modex op (int)
|
||||
* s - string key (char*)
|
||||
* p - pointer to the opal_process_name_t of the proc that posted
|
||||
* the data (opal_process_name_t*)
|
||||
* d - pointer to a location wherein the data object
|
||||
* is to be returned (char**)
|
||||
* sz - pointer to a location wherein the number of bytes
|
||||
* in the data object can be returned (size_t)
|
||||
*/
|
||||
#define OPAL_MODEX_RECV_STRING(r, s, p, d, sz) \
|
||||
do { \
|
||||
pmix_proc_t _proc; \
|
||||
pmix_value_t *_kv = NULL; \
|
||||
OPAL_OUTPUT_VERBOSE((1, opal_pmix_verbose_output, \
|
||||
"%s[%s:%d] MODEX RECV STRING FOR PROC %s KEY %s", \
|
||||
OPAL_NAME_PRINT(OPAL_PROC_MY_NAME), \
|
||||
__FILE__, __LINE__, \
|
||||
OPAL_NAME_PRINT(*(p)), (s))); \
|
||||
*(d) = NULL; \
|
||||
*(sz) = 0; \
|
||||
OPAL_PMIX_CONVERT_NAME(&_proc, (p)); \
|
||||
(r) = PMIx_Get(&(_proc), (s), NULL, 0, &(_kv)); \
|
||||
if (NULL == _kv) { \
|
||||
(r) = PMIX_ERR_NOT_FOUND; \
|
||||
} else if (PMIX_SUCCESS == (r)) { \
|
||||
*(d) = (uint8_t*)_kv->data.bo.bytes; \
|
||||
*(sz) = _kv->data.bo.size; \
|
||||
_kv->data.bo.bytes = NULL; /* protect the data */ \
|
||||
} \
|
||||
if (NULL != _kv) { \
|
||||
PMIX_VALUE_RELEASE(_kv); \
|
||||
} \
|
||||
} while(0);
|
||||
|
||||
/**
|
||||
* Provide a simplified macro for retrieving modex data
|
||||
* from another process:
|
||||
@ -497,24 +497,25 @@ typedef struct {
|
||||
* sz - pointer to a location wherein the number of bytes
|
||||
* in the data object can be returned (size_t)
|
||||
*/
|
||||
#define OPAL_MODEX_RECV(r, s, p, d, sz) \
|
||||
do { \
|
||||
char *_key; \
|
||||
_key = mca_base_component_to_string((s)); \
|
||||
OPAL_OUTPUT_VERBOSE((1, opal_pmix_verbose_output, \
|
||||
"%s[%s:%d] MODEX RECV FOR PROC %s KEY %s", \
|
||||
OPAL_NAME_PRINT(OPAL_PROC_MY_NAME), \
|
||||
__FILE__, __LINE__, \
|
||||
OPAL_NAME_PRINT(*(p)), _key)); \
|
||||
if (NULL == _key) { \
|
||||
OPAL_ERROR_LOG(OPAL_ERR_OUT_OF_RESOURCE); \
|
||||
(r) = OPAL_ERR_OUT_OF_RESOURCE; \
|
||||
} else { \
|
||||
OPAL_MODEX_RECV_STRING((r), _key, (p), (d), (sz)); \
|
||||
free(_key); \
|
||||
} \
|
||||
#define OPAL_MODEX_RECV_OPTIONAL(r, s, p, d, sz) \
|
||||
do { \
|
||||
char *_key; \
|
||||
_key = mca_base_component_to_string((s)); \
|
||||
OPAL_OUTPUT_VERBOSE((1, opal_pmix_verbose_output, \
|
||||
"%s[%s:%d] MODEX RECV FOR PROC %s KEY %s", \
|
||||
OPAL_NAME_PRINT(OPAL_PROC_MY_NAME), \
|
||||
__FILE__, __LINE__, \
|
||||
OPAL_NAME_PRINT(*(p)), _key)); \
|
||||
if (NULL == _key) { \
|
||||
OPAL_ERROR_LOG(OPAL_ERR_OUT_OF_RESOURCE); \
|
||||
(r) = OPAL_ERR_OUT_OF_RESOURCE; \
|
||||
} else { \
|
||||
OPAL_MODEX_RECV_STRING_OPTIONAL((r), _key, (p), (d), (sz)); \
|
||||
free(_key); \
|
||||
} \
|
||||
} while(0);
|
||||
|
||||
|
||||
/**
|
||||
* Provide a simplified macro for retrieving modex data
|
||||
* from another process:
|
||||
@ -547,6 +548,38 @@ typedef struct {
|
||||
} while(0);
|
||||
|
||||
|
||||
/**
|
||||
* Provide a simplified macro for retrieving modex data
|
||||
* from another process:
|
||||
*
|
||||
* r - the integer return status from the modex op (int)
|
||||
* s - the MCA component that posted the data (mca_base_component_t*)
|
||||
* p - pointer to the opal_process_name_t of the proc that posted
|
||||
* the data (opal_process_name_t*)
|
||||
* d - pointer to a location wherein the data object
|
||||
* is to be returned (char**)
|
||||
* sz - pointer to a location wherein the number of bytes
|
||||
* in the data object can be returned (size_t)
|
||||
*/
|
||||
#define OPAL_MODEX_RECV(r, s, p, d, sz) \
|
||||
do { \
|
||||
char *_key; \
|
||||
_key = mca_base_component_to_string((s)); \
|
||||
OPAL_OUTPUT_VERBOSE((1, opal_pmix_verbose_output, \
|
||||
"%s[%s:%d] MODEX RECV FOR PROC %s KEY %s", \
|
||||
OPAL_NAME_PRINT(OPAL_PROC_MY_NAME), \
|
||||
__FILE__, __LINE__, \
|
||||
OPAL_NAME_PRINT(*(p)), _key)); \
|
||||
if (NULL == _key) { \
|
||||
OPAL_ERROR_LOG(OPAL_ERR_OUT_OF_RESOURCE); \
|
||||
(r) = OPAL_ERR_OUT_OF_RESOURCE; \
|
||||
} else { \
|
||||
OPAL_MODEX_RECV_STRING((r), _key, (p), (d), (sz)); \
|
||||
free(_key); \
|
||||
} \
|
||||
} while(0);
|
||||
|
||||
|
||||
#define PMIX_ERROR_LOG(r) \
|
||||
opal_output(0, "[%s:%d] PMIx Error: %s", __FILE__, __LINE__, PMIx_Error_string((r)))
|
||||
|
||||
|
@ -1 +1 @@
|
||||
Subproject commit 8b0a8360e7bb11e0ab48698eefb2715e549d4b1e
|
||||
Subproject commit 305150c29f06d8e780b630f0b5992877005ca1dd
|
2
prrte
2
prrte
@ -1 +1 @@
|
||||
Subproject commit bf8d0192740f01cd7c86bda9c887fe3d7064585d
|
||||
Subproject commit 21ccf39445358ab10c6ced09fa1e80f0047c98c2
|
Загрузка…
Ссылка в новой задаче
Block a user