1
1

Merge pull request #4133 from rhc54/topic/modex

Optimize discovery of HWLOC topology
Этот коммит содержится в:
Ralph Castain 2017-08-22 21:00:49 -07:00 коммит произвёл GitHub
родитель 50f471e31e e02c39385a
Коммит f6fd699d44
12 изменённых файлов: 197 добавлений и 52 удалений

Просмотреть файл

@ -20,7 +20,7 @@
* All rights reserved.
* Copyright (c) 2015-2017 Research Organization for Information Science
* and Technology (RIST). All rights reserved.
* Copyright (c) 2015 Intel, Inc. All rights reserved.
* Copyright (c) 2015-2017 Intel, Inc. All rights reserved.
* Copyright (c) 2016-2017 IBM Corporation. All rights reserved.
* $COPYRIGHT$
*
@ -151,7 +151,7 @@ int ompi_comm_init(void)
because MPI_COMM_WORLD has some predefined attributes. */
ompi_attr_hash_init(&ompi_mpi_comm_world.comm.c_keyhash);
/* Check for the binding policy used. We are only interested in
/* Check for the binding policy used. We are only interested in
whether mapby-node has been set right now (could be extended later)
and only on MPI_COMM_WORLD, since for all other sub-communicators
it is virtually impossible to identify their layout across nodes
@ -161,9 +161,9 @@ int ompi_comm_init(void)
opal_process_name_t wildcard = {ORTE_PROC_MY_NAME->jobid, OPAL_VPID_WILDCARD};
char *str=NULL;
int rc;
OPAL_MODEX_RECV_VALUE_OPTIONAL(rc, OPAL_PMIX_MAPBY, &wildcard, &str, OPAL_STRING);
if ( 0 == rc ) {
if ( 0 == rc && NULL != str) {
if ( strstr ( str, "BYNODE") ) {
OMPI_COMM_SET_MAPBY_NODE(&ompi_mpi_comm_world.comm);
}

Просмотреть файл

@ -314,14 +314,14 @@ int opal_hwloc_base_get_topology(void)
FILE *file = fopen("/proc/self/maps", "r");
if (file) {
char line[256];
opal_output(opal_hwloc_base_framework.framework_output,
"Dumping /proc/self/maps");
opal_output(0, "Dumping /proc/self/maps");
while (fgets(line, sizeof(line), file) != NULL) {
char *end = strchr(line, '\n');
if (end)
if (end) {
*end = '\0';
opal_output(opal_hwloc_base_framework.framework_output,
line);
}
opal_output(0, "%s", line);
}
fclose(file);
}
@ -338,9 +338,15 @@ int opal_hwloc_base_get_topology(void)
/* if that isn't available, then try to retrieve
* the xml representation from the PMIx data store */
opal_output_verbose(1, opal_hwloc_base_framework.framework_output,
"hwloc:base getting topology XML string");
OPAL_MODEX_RECV_VALUE_OPTIONAL(rc, OPAL_PMIX_LOCAL_TOPO,
&wildcard_rank, &val, OPAL_STRING);
"hwloc:base[%s:%d] getting topology XML string",
__FILE__, __LINE__);
#if HWLOC_API_VERSION >= 0x20000
OPAL_MODEX_RECV_VALUE_IMMEDIATE(rc, OPAL_PMIX_HWLOC_XML_V2,
&wildcard_rank, &val, OPAL_STRING);
#else
OPAL_MODEX_RECV_VALUE_IMMEDIATE(rc, OPAL_PMIX_HWLOC_XML_V1,
&wildcard_rank, &val, OPAL_STRING);
#endif
} else {
opal_output_verbose(1, opal_hwloc_base_framework.framework_output,
"hwloc:base PMIx not available");

Просмотреть файл

@ -161,6 +161,47 @@ extern int opal_pmix_base_exchange(opal_value_t *info,
OPAL_LIST_DESTRUCT(&(_ilist)); \
} while(0);
/**
* Provide a simplified macro for retrieving modex data
* from another process when we want the PMIx module
* to request it from the server if not found, but do not
* want the server to go find it if the server doesn't
* already have it:
*
* r - the integer return status from the modex op (int)
* s - string key (char*)
* p - pointer to the opal_process_name_t of the proc that posted
* the data (opal_process_name_t*)
* d - pointer to a location wherein the data object
* is to be returned
* t - the expected data type
*/
#define OPAL_MODEX_RECV_VALUE_IMMEDIATE(r, s, p, d, t) \
do { \
opal_value_t *_kv, *_info; \
opal_list_t _ilist; \
opal_output_verbose(1, opal_pmix_verbose_output, \
"%s[%s:%d] MODEX RECV VALUE IMMEDIATE FOR PROC %s KEY %s", \
OPAL_NAME_PRINT(OPAL_PROC_MY_NAME), \
__FILE__, __LINE__, \
OPAL_NAME_PRINT(*(p)), (s)); \
OBJ_CONSTRUCT(&(_ilist), opal_list_t); \
_info = OBJ_NEW(opal_value_t); \
_info->key = strdup(OPAL_PMIX_IMMEDIATE); \
_info->type = OPAL_BOOL; \
_info->data.flag = true; \
opal_list_append(&(_ilist), &(_info)->super); \
if (OPAL_SUCCESS == ((r) = opal_pmix.get((p), (s), &(_ilist), &(_kv)))) { \
if (NULL == _kv) { \
(r) = OPAL_ERR_NOT_FOUND; \
} else { \
(r) = opal_value_unload(_kv, (void**)(d), (t)); \
OBJ_RELEASE(_kv); \
} \
} \
OPAL_LIST_DESTRUCT(&(_ilist)); \
} while(0);
/**
* Provide a simplified macro for retrieving modex data
* from another process:

Просмотреть файл

@ -234,6 +234,8 @@ typedef uint32_t pmix_rank_t;
#define PMIX_HWLOC_SHMEM_ADDR "pmix.hwlocaddr" // (size_t) address of HWLOC shared memory segment
#define PMIX_HWLOC_SHMEM_SIZE "pmix.hwlocsize" // (size_t) size of HWLOC shared memory segment
#define PMIX_HWLOC_SHMEM_FILE "pmix.hwlocfile" // (char*) path to HWLOC shared memory file
#define PMIX_HWLOC_XML_V1 "pmix.hwlocxml1" // (char*) XML representation of local topology using HWLOC v1.x format
#define PMIX_HWLOC_XML_V2 "pmix.hwlocxml2" // (char*) XML representation of local topology using HWLOC v2.x format
/* request-related info */
#define PMIX_COLLECT_DATA "pmix.collect" // (bool) collect data and return it at the end of the operation
@ -347,6 +349,7 @@ typedef uint32_t pmix_rank_t;
#define PMIX_TIME_REMAINING "pmix.time.remaining" // (char*) query number of seconds (uint32_t) remaining in allocation
// for the specified nspace
/* log attributes */
#define PMIX_LOG_STDERR "pmix.log.stderr" // (char*) log string to stderr
#define PMIX_LOG_STDOUT "pmix.log.stdout" // (char*) log string to stdout

Просмотреть файл

@ -568,7 +568,7 @@ PMIX_EXPORT pmix_status_t PMIx_Init(pmix_proc_t *proc,
/* look for a debugger attach key */
(void)strncpy(wildcard.nspace, pmix_globals.myid.nspace, PMIX_MAX_NSLEN);
wildcard.rank = PMIX_RANK_WILDCARD;
PMIX_INFO_LOAD(&ginfo, PMIX_IMMEDIATE, NULL, PMIX_BOOL);
PMIX_INFO_LOAD(&ginfo, PMIX_OPTIONAL, NULL, PMIX_BOOL);
if (PMIX_SUCCESS == PMIx_Get(&wildcard, PMIX_DEBUG_STOP_IN_INIT, &ginfo, 1, &val)) {
PMIX_VALUE_FREE(val, 1); // cleanup memory
/* if the value was found, then we need to wait for debugger attach here */

Просмотреть файл

@ -398,6 +398,45 @@ static pmix_status_t process_values(pmix_value_t **v, pmix_cb_t *cb)
return PMIX_SUCCESS;
}
/**
 * Completion callback given to PMIx_Query_info_nb when a "get" is
 * redirected to the query interface.
 *
 * Converts the query answer into a single pmix_value_t and forwards
 * it, together with a status, to the value callback stored in the
 * query caddy.
 *
 * Status passed to the value callback:
 *   - a non-success incoming status is passed through unchanged
 *   - PMIX_ERR_NOT_FOUND if the query succeeded but returned no info
 *   - PMIX_ERR_INVALID_VAL if anything other than exactly one info
 *     entry was returned
 *   - PMIX_ERR_NOMEM if the value could not be allocated
 *   - otherwise, whatever pmix_value_xfer returns for the copy
 *
 * @param status          status reported for the query
 * @param info            array of returned info entries (may be NULL)
 * @param ninfo           number of entries in info
 * @param cbdata          the pmix_query_caddy_t created by the requester;
 *                        released here
 * @param release_fn      optional function to invoke once we are done
 *                        with info
 * @param release_cbdata  argument for release_fn
 */
static void infocb(pmix_status_t status,
                   pmix_info_t *info, size_t ninfo,
                   void *cbdata,
                   pmix_release_cbfunc_t release_fn,
                   void *release_cbdata)
{
    pmix_query_caddy_t *caddy = (pmix_query_caddy_t*)cbdata;
    pmix_value_t *result = NULL;
    pmix_status_t ret;

    /* map the query outcome onto a status/value pair */
    if (PMIX_SUCCESS != status) {
        ret = status;
    } else if (NULL == info) {
        ret = PMIX_ERR_NOT_FOUND;
    } else if (1 != ninfo) {
        /* there should be only one returned value */
        ret = PMIX_ERR_INVALID_VAL;
    } else {
        PMIX_VALUE_CREATE(result, 1);
        if (NULL != result) {
            ret = pmix_value_xfer(result, &info[0].value);
        } else {
            ret = PMIX_ERR_NOMEM;
        }
    }

    /* deliver the answer to the original requester, if it wants one */
    if (NULL != caddy->valcbfunc) {
        caddy->valcbfunc(ret, result, caddy->cbdata);
    }

    /* the value is freed only after the callback has returned */
    PMIX_RELEASE(caddy);
    PMIX_VALUE_FREE(result, 1);

    /* let the provider of the info array reclaim it */
    if (NULL != release_fn) {
        release_fn(release_cbdata);
    }
}
static void _getnbfn(int fd, short flags, void *cbdata)
{
pmix_cb_t *cb = (pmix_cb_t*)cbdata;
@ -409,7 +448,9 @@ static void _getnbfn(int fd, short flags, void *cbdata)
char *tmp;
pmix_proc_t proc;
bool optional = false;
bool immediate = false;
struct timeval tv;
pmix_query_caddy_t *cd;
/* cb was passed to us from another thread - acquire it */
PMIX_ACQUIRE_OBJECT(cb);
@ -431,6 +472,11 @@ static void _getnbfn(int fd, short flags, void *cbdata)
cb->info[n].value.data.flag) {
optional = true;
}
} else if (0 == strncmp(cb->info[n].key, PMIX_IMMEDIATE, PMIX_MAX_KEYLEN)) {
if (PMIX_UNDEF == cb->info[n].value.type ||
cb->info[n].value.data.flag) {
immediate = true;
}
} else if (0 == strncmp(cb->info[n].key, PMIX_TIMEOUT, PMIX_MAX_KEYLEN)) {
/* set a timer to kick us out if we don't
* have an answer within their window */
@ -473,6 +519,25 @@ static void _getnbfn(int fd, short flags, void *cbdata)
*/
goto request;
} else {
/* if immediate was given, then we are being directed to
* check with the server even though the caller is looking for
* job-level info. In some cases, a server may elect not
* to provide info at init to save memory */
if (immediate) {
/* the direct modex request doesn't pass a key as it
* was intended to support non-job-level information.
* So instead, we will use the PMIx_Query function
* to request the information */
cd = PMIX_NEW(pmix_query_caddy_t);
cd->cbdata = cb->cbdata;
cd->valcbfunc = cb->cbfunc.valuefn;
PMIX_QUERY_CREATE(cd->queries, 1);
cd->nqueries = 1;
pmix_argv_append_nosize(&cd->queries[0].keys, cb->key);
PMIx_Query_info_nb(cd->queries, 1, infocb, cd);
PMIX_RELEASE(cb);
return;
}
/* we should have had this info, so respond with the error */
goto respond;
}
@ -494,25 +559,25 @@ static void _getnbfn(int fd, short flags, void *cbdata)
respond:
/* if a callback was provided, execute it */
if (NULL != cb->cbfunc.valuefn) {
if (NULL != val) {
/* if this is a compressed string, then uncompress it */
if (PMIX_COMPRESSED_STRING == val->type) {
pmix_util_uncompress_string(&tmp, (uint8_t*)val->data.bo.bytes, val->data.bo.size);
if (NULL == tmp) {
PMIX_ERROR_LOG(PMIX_ERR_NOMEM);
rc = PMIX_ERR_NOMEM;
PMIX_VALUE_RELEASE(val);
val = NULL;
} else {
PMIX_VALUE_DESTRUCT(val);
PMIX_VAL_ASSIGN(val, string, tmp);
}
}
}
cb->cbfunc.valuefn(rc, val, cb->cbdata);
if (NULL != val) {
/* if this is a compressed string, then uncompress it */
if (PMIX_COMPRESSED_STRING == val->type) {
pmix_util_uncompress_string(&tmp, (uint8_t*)val->data.bo.bytes, val->data.bo.size);
if (NULL == tmp) {
PMIX_ERROR_LOG(PMIX_ERR_NOMEM);
rc = PMIX_ERR_NOMEM;
PMIX_VALUE_RELEASE(val);
val = NULL;
} else {
PMIX_VALUE_DESTRUCT(val);
PMIX_VAL_ASSIGN(val, string, tmp);
}
}
}
cb->cbfunc.valuefn(rc, val, cb->cbdata);
}
if (NULL != val) {
PMIX_VALUE_RELEASE(val);
PMIX_VALUE_RELEASE(val);
}
PMIX_RELEASE(cb);
return;

Просмотреть файл

@ -245,6 +245,7 @@ static void qcon(pmix_query_caddy_t *p)
p->info = NULL;
p->ninfo = 0;
p->cbfunc = NULL;
p->valcbfunc = NULL;
p->cbdata = NULL;
p->relcbfunc = NULL;
}

Просмотреть файл

@ -219,6 +219,7 @@ typedef struct {
pmix_info_t *info;
size_t ninfo;
pmix_info_cbfunc_t cbfunc;
pmix_value_cbfunc_t valcbfunc;
pmix_release_cbfunc_t relcbfunc;
void *cbdata;
} pmix_query_caddy_t;

Просмотреть файл

@ -161,6 +161,9 @@ BEGIN_C_DECLS
#define OPAL_PMIX_HWLOC_SHMEM_ADDR "pmix.hwlocaddr" // (size_t) address of HWLOC shared memory segment
#define OPAL_PMIX_HWLOC_SHMEM_SIZE "pmix.hwlocsize" // (size_t) size of HWLOC shared memory segment
#define OPAL_PMIX_HWLOC_SHMEM_FILE "pmix.hwlocfile" // (char*) path to HWLOC shared memory file
#define OPAL_PMIX_HWLOC_XML_V1 "pmix.hwlocxml1" // (char*) XML representation of local topology using HWLOC v1.x format
#define OPAL_PMIX_HWLOC_XML_V2 "pmix.hwlocxml2" // (char*) XML representation of local topology using HWLOC v2.x format
/* request-related info */
#define OPAL_PMIX_COLLECT_DATA "pmix.collect" // (bool) collect data and return it at the end of the operation

Просмотреть файл

@ -116,14 +116,14 @@ static int init(void)
FILE *file = fopen("/proc/self/maps", "r");
if (file) {
char line[256];
opal_output(0, orte_rtc_base_framework.framework_output,
"%s Dumping /proc/self/maps", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME));
opal_output(0, "%s Dumping /proc/self/maps",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME));
while (fgets(line, sizeof(line), file) != NULL) {
char *end = strchr(line, '\n');
if (end)
if (end) {
*end = '\0';
opal_output(0, orte_rtc_base_framework.framework_output,
"%s", line);
}
opal_output(0, "%s", line);
}
fclose(file);
}

Просмотреть файл

@ -242,24 +242,7 @@ int pmix_server_init(void)
/* ensure the PMIx server uses the proper rendezvous directory */
opal_setenv("PMIX_SERVER_TMPDIR", orte_process_info.proc_session_dir, true, &environ);
/* pass the server the local topology - we do this so the procs won't read the
* topology themselves as this could overwhelm the local
* system on large-scale SMPs */
OBJ_CONSTRUCT(&info, opal_list_t);
if (NULL != opal_hwloc_topology) {
char *xmlbuffer=NULL;
int len;
kv = OBJ_NEW(opal_value_t);
kv->key = strdup(OPAL_PMIX_LOCAL_TOPO);
if (0 != opal_hwloc_base_topology_export_xmlbuffer(opal_hwloc_topology, &xmlbuffer, &len)) {
OBJ_RELEASE(kv);
OBJ_DESTRUCT(&info);
return ORTE_ERROR;
}
kv->data.string = xmlbuffer;
kv->type = OPAL_STRING;
opal_list_append(&info, &kv->super);
}
/* tell the server our temp directory */
kv = OBJ_NEW(opal_value_t);
kv->key = strdup(OPAL_PMIX_SERVER_TMPDIR);

Просмотреть файл

@ -35,6 +35,7 @@
#include "opal/util/argv.h"
#include "opal/util/output.h"
#include "opal/dss/dss.h"
#include "opal/mca/hwloc/hwloc-internal.h"
#include "opal/mca/pstat/pstat.h"
#include "orte/mca/errmgr/errmgr.h"
@ -635,6 +636,47 @@ static void _query(int sd, short args, void *cbdata)
} else {
opal_list_append(results, &kv->super);
}
} else if (0 == strcmp(q->keys[n], OPAL_PMIX_HWLOC_XML_V1)) {
if (NULL != opal_hwloc_topology) {
char *xmlbuffer=NULL;
int len;
kv = OBJ_NEW(opal_value_t);
kv->key = strdup(OPAL_PMIX_HWLOC_XML_V1);
#if HWLOC_API_VERSION < 0x20000
/* get this from the v1.x API */
if (0 != hwloc_topology_export_xmlbuffer(opal_hwloc_topology, &xmlbuffer, &len)) {
OBJ_RELEASE(kv);
continue;
}
#else
/* get it from the v2 API */
if (0 != hwloc_topology_export_xmlbuffer(opal_hwloc_topology, &xmlbuffer, &len,
HWLOC_TOPOLOGY_EXPORT_XML_FLAG_V1)) {
OBJ_RELEASE(kv);
continue;
}
#endif
kv->data.string = xmlbuffer;
kv->type = OPAL_STRING;
opal_list_append(results, &kv->super);
}
} else if (0 == strcmp(q->keys[n], OPAL_PMIX_HWLOC_XML_V2)) {
/* we cannot provide it if we are using v1.x */
#if HWLOC_API_VERSION >= 0x20000
if (NULL != opal_hwloc_topology) {
char *xmlbuffer=NULL;
int len;
kv = OBJ_NEW(opal_value_t);
kv->key = strdup(OPAL_PMIX_HWLOC_XML_V2);
if (0 != hwloc_topology_export_xmlbuffer(opal_hwloc_topology, &xmlbuffer, &len, 0)) {
OBJ_RELEASE(kv);
continue;
}
kv->data.string = xmlbuffer;
kv->type = OPAL_STRING;
opal_list_append(results, &kv->super);
}
#endif
}
}
}