1
1

Merge pull request #890 from rhc54/topic/fixpmi

Revert "Revert "Fix the handling of cpusets so we get the correct cpu…
Этот коммит содержится в:
rhc54 2015-09-11 09:25:24 -07:00
родитель 898a0a038c dc5796b8a1
Коммит c31093ff19
14 изменённых файлов: 230 добавлений и 126 удалений

Просмотреть файл

@ -140,7 +140,7 @@ static int ompi_proc_complete_init_single (ompi_proc_t *proc)
/* get the locality information - all RTEs are required
* to provide this information at startup */
OPAL_MODEX_RECV_VALUE(ret, OPAL_PMIX_LOCALITY, &proc->super.proc_name, &u16ptr, OPAL_UINT16);
OPAL_MODEX_RECV_VALUE_OPTIONAL(ret, OPAL_PMIX_LOCALITY, &proc->super.proc_name, &u16ptr, OPAL_UINT16);
if (OPAL_SUCCESS != ret) {
proc->super.proc_flags = OPAL_PROC_NON_LOCAL;
} else {
@ -149,10 +149,10 @@ static int ompi_proc_complete_init_single (ompi_proc_t *proc)
/* we can retrieve the hostname at no cost because it
* was provided at startup */
OPAL_MODEX_RECV_VALUE(ret, OPAL_PMIX_HOSTNAME, &proc->super.proc_name,
(char**)&(proc->super.proc_hostname), OPAL_STRING);
OPAL_MODEX_RECV_VALUE_OPTIONAL(ret, OPAL_PMIX_HOSTNAME, &proc->super.proc_name,
(char**)&(proc->super.proc_hostname), OPAL_STRING);
if (OPAL_SUCCESS != ret) {
return ret;
return ret;
}
#if OPAL_ENABLE_HETEROGENEOUS_SUPPORT
/* get the remote architecture - this might force a modex except
@ -345,7 +345,7 @@ int ompi_proc_complete_init(void)
/* the runtime is required to fill in locality for all local processes by this
* point. only local processes will have locality set */
OPAL_MODEX_RECV_VALUE(ret, OPAL_PMIX_LOCALITY, &proc_name, &u16ptr, OPAL_UINT16);
OPAL_MODEX_RECV_VALUE_OPTIONAL(ret, OPAL_PMIX_LOCALITY, &proc_name, &u16ptr, OPAL_UINT16);
if (OPAL_SUCCESS == ret) {
locality = u16;
}

Просмотреть файл

@ -61,9 +61,10 @@ static int cray_fence_nb(opal_list_t *procs, int collect_data,
opal_pmix_op_cbfunc_t cbfunc, void *cbdata);
static int cray_commit(void);
static int cray_get(const opal_process_name_t *id,
const char *key,
const char *key, opal_list_t *info,
opal_value_t **kv);
static int cray_get_nb(const opal_process_name_t *id, const char *key,
opal_list_t *info,
opal_pmix_value_cbfunc_t cbfunc, void *cbdata);
static int cray_publish(opal_list_t *info);
static int cray_publish_nb(opal_list_t *info,
@ -735,7 +736,7 @@ static int cray_fence_nb(opal_list_t *procs, int collect_data,
return OPAL_ERR_NOT_IMPLEMENTED;
}
static int cray_get(const opal_process_name_t *id, const char *key, opal_value_t **kv)
static int cray_get(const opal_process_name_t *id, const char *key, opal_list_t *info, opal_value_t **kv)
{
int rc;
opal_list_t vals;
@ -762,7 +763,7 @@ static int cray_get(const opal_process_name_t *id, const char *key, opal_value_t
}
static int cray_get_nb(const opal_process_name_t *id, const char *key,
opal_pmix_value_cbfunc_t cbfunc, void *cbdata)
opal_list_t *info, opal_pmix_value_cbfunc_t cbfunc, void *cbdata)
{
return OPAL_ERR_NOT_IMPLEMENTED;
}

Просмотреть файл

@ -108,6 +108,43 @@ extern int opal_pmix_verbose_output;
free(_key); \
} while(0);
/**
* Provide a simplified macro for retrieving modex data
* from another process when we don't want the PMIx module
* to request it from the server if not found:
*
* r - the integer return status from the modex op (int)
* s - string key (char*)
* p - pointer to the opal_process_name_t of the proc that posted
* the data (opal_process_name_t*)
* d - pointer to a location wherein the data object
* is to be returned
* t - the expected data type
*/
#define OPAL_MODEX_RECV_VALUE_OPTIONAL(r, s, p, d, t) \
do { \
opal_value_t *_kv, *_info; \
opal_list_t _ilist; \
OPAL_OUTPUT_VERBOSE((1, opal_pmix_verbose_output, \
"%s[%s:%d] MODEX RECV VALUE OPTIONAL FOR PROC %s KEY %s", \
OPAL_NAME_PRINT(OPAL_PROC_MY_NAME), \
__FILE__, __LINE__, \
OPAL_NAME_PRINT(*(p)), (s))); \
OBJ_CONSTRUCT(&(_ilist), opal_list_t); \
_info = OBJ_NEW(opal_value_t); \
_info->key = strdup(OPAL_PMIX_OPTIONAL); \
_info->type = OPAL_BOOL; \
_info->data.flag = true; \
opal_list_append(&(_ilist), &(_info)->super); \
if (OPAL_SUCCESS != ((r) = opal_pmix.get((p), (s), &(_ilist), &(_kv)))) { \
*(d) = NULL; \
} else { \
(r) = opal_value_unload(_kv, (void**)(d), (t)); \
OBJ_RELEASE(_kv); \
} \
OPAL_LIST_DESTRUCT(&(_ilist)); \
} while(0);
/**
* Provide a simplified macro for retrieving modex data
* from another process:
@ -128,7 +165,7 @@ extern int opal_pmix_verbose_output;
OPAL_NAME_PRINT(OPAL_PROC_MY_NAME), \
__FILE__, __LINE__, \
OPAL_NAME_PRINT(*(p)), (s))); \
if (OPAL_SUCCESS != ((r) = opal_pmix.get((p), (s), &(_kv)))) { \
if (OPAL_SUCCESS != ((r) = opal_pmix.get((p), (s), NULL, &(_kv)))) { \
*(d) = NULL; \
} else { \
(r) = opal_value_unload(_kv, (void**)(d), (t)); \
@ -157,7 +194,7 @@ extern int opal_pmix_verbose_output;
OPAL_NAME_PRINT(OPAL_PROC_MY_NAME), \
__FILE__, __LINE__, \
OPAL_NAME_PRINT(*(p)), (s))); \
if (OPAL_SUCCESS == ((r) = opal_pmix.get((p), (s), &(_kv))) && \
if (OPAL_SUCCESS == ((r) = opal_pmix.get((p), (s), NULL, &(_kv))) && \
NULL != _kv) { \
*(d) = _kv->data.bo.bytes; \
*(sz) = _kv->data.bo.size; \
@ -301,23 +338,24 @@ typedef int (*opal_pmix_base_module_put_fn_t)(opal_pmix_scope_t scope,
opal_value_t *val);
/* Retrieve information for the specified _key_ as published by the rank
* and jobid i the provided opal_process_name, returning a pointer to the value in the
* given address.
* and jobid i the provided opal_process_name, and subject to any provided
* constraints, returning a pointer to the value in the given address.
*
* This is a blocking operation - the caller will block until
* the specified data has been _PMIx_Put_ by the specified rank. The caller is
* responsible for freeing all memory associated with the returned value when
* no longer required. */
typedef int (*opal_pmix_base_module_get_fn_t)(const opal_process_name_t *proc,
const char *key,
const char *key, opal_list_t *info,
opal_value_t **val);
/* Retrieve information for the specified _key_ as published by the given rank
* and jobid in the opal_process_name_t. This is a non-blocking operation - the
* and jobid in the opal_process_name_t, and subject to any provided
* constraints. This is a non-blocking operation - the
* callback function will be executed once the specified data has been _PMIx_Put_
* by the specified proc and retrieved by the local server. */
typedef int (*opal_pmix_base_module_get_nb_fn_t)(const opal_process_name_t *proc,
const char *key,
const char *key, opal_list_t *info,
opal_pmix_value_cbfunc_t cbfunc, void *cbdata);
/* Publish the given data to the "universal" nspace

Просмотреть файл

@ -160,12 +160,15 @@ BEGIN_C_DECLS
/* request-related info */
#define PMIX_COLLECT_DATA "pmix.collect" // (bool) collect data and return it at the end of the operation
#define PMIX_TIMEOUT "pmix.timeout" // (int) time in sec before specified operation should time out
#define PMIX_WAIT "pmix.wait" // (int) caller requests that the server wait until the specified #values are found
#define PMIX_WAIT "pmix.wait" // (int) caller requests that the server wait until at least the specified
// #values are found (0 => all and is the default)
#define PMIX_COLLECTIVE_ALGO "pmix.calgo" // (char*) comma-delimited list of algorithms to use for collective
#define PMIX_COLLECTIVE_ALGO_REQD "pmix.calreqd" // (bool) if true, indicates that the requested choice of algo is mandatory
#define PMIX_NOTIFY_COMPLETION "pmix.notecomp" // (bool) notify parent process upon termination of child job
#define PMIX_RANGE "pmix.range" // (int) pmix_data_range_t value for calls to publish/lookup/unpublish
#define PMIX_PERSISTENCE "pmix.persist" // (int) pmix_persistence_t value for calls to publish
#define PMIX_OPTIONAL "pmix.optional" // (bool) look only in the immediate data store for the requested value - do
// not request data from the server if not found
/* attributes used by host server to pass data to the server convenience library - the
* data will then be parsed and provided to the local clients */

Просмотреть файл

@ -117,6 +117,7 @@ pmix_status_t PMIx_Get_nb(const pmix_proc_t *proc, const char *key,
pmix_status_t rc;
char *nm;
pmix_nspace_t *ns, *nptr;
size_t n;
if (NULL == proc) {
return PMIX_ERR_BAD_PARAM;
@ -252,7 +253,7 @@ pmix_status_t PMIx_Get_nb(const pmix_proc_t *proc, const char *key,
* key to eventually be found, so all we can do is return
* the error */
pmix_output_verbose(2, pmix_globals.debug_output,
"Error requesting key=%s for rank = %d, namespace = %s\n",
"Error requesting key=%s for rank = %d, namespace = %s",
key, proc->rank, nm);
return rc;
}
@ -265,6 +266,18 @@ pmix_status_t PMIx_Get_nb(const pmix_proc_t *proc, const char *key,
return PMIX_ERR_NOT_FOUND;
}
/* we also have to check the user's directives to see if they do not want
* us to attempt to retrieve it from the server */
for (n=0; n < ninfo; n++) {
if (0 == strcmp(info[n].key, PMIX_OPTIONAL) &&
info[n].value.data.flag) {
/* they don't want us to try and retrieve it */
pmix_output_verbose(2, pmix_globals.debug_output,
"PMIx_Get key=%s for rank = %d, namespace = %s was not found - request was optional",
key, proc->rank, nm);
return PMIX_ERR_NOT_FOUND;
}
}
/* see if we already have a request in place with the server for data from
* this nspace:rank. If we do, then no need to ask again as the
* request will return _all_ data from that proc */

Просмотреть файл

@ -77,18 +77,18 @@ OPAL_MODULE_DECLSPEC int pmix1_client_init(void);
OPAL_MODULE_DECLSPEC int pmix1_client_finalize(void);
OPAL_MODULE_DECLSPEC int pmix1_initialized(void);
OPAL_MODULE_DECLSPEC int pmix1_abort(int flag, const char *msg,
opal_list_t *procs);
opal_list_t *procs);
OPAL_MODULE_DECLSPEC int pmix1_commit(void);
OPAL_MODULE_DECLSPEC int pmix1_fence(opal_list_t *procs, int collect_data);
OPAL_MODULE_DECLSPEC int pmix1_fencenb(opal_list_t *procs, int collect_data,
opal_pmix_op_cbfunc_t cbfunc, void *cbdata);
opal_pmix_op_cbfunc_t cbfunc, void *cbdata);
OPAL_MODULE_DECLSPEC int pmix1_put(opal_pmix_scope_t scope,
opal_value_t *val);
OPAL_MODULE_DECLSPEC int pmix1_get(const opal_process_name_t *proc,
const char *key, opal_value_t **val);
OPAL_MODULE_DECLSPEC int pmix1_getnb(const opal_process_name_t *proc,
const char *key,
opal_pmix_value_cbfunc_t cbfunc, void *cbdata);
OPAL_MODULE_DECLSPEC int pmix1_get(const opal_process_name_t *proc, const char *key,
opal_list_t *info, opal_value_t **val);
OPAL_MODULE_DECLSPEC int pmix1_getnb(const opal_process_name_t *proc, const char *key,
opal_list_t *info,
opal_pmix_value_cbfunc_t cbfunc, void *cbdata);
OPAL_MODULE_DECLSPEC int pmix1_publish(opal_list_t *info);
OPAL_MODULE_DECLSPEC int pmix1_publishnb(opal_list_t *info,
opal_pmix_op_cbfunc_t cbfunc, void *cbdata);
@ -100,17 +100,17 @@ OPAL_MODULE_DECLSPEC int pmix1_unpublishnb(char **keys, opal_list_t *info,
opal_pmix_op_cbfunc_t cbfunc, void *cbdata);
OPAL_MODULE_DECLSPEC int pmix1_spawn(opal_list_t *job_info, opal_list_t *apps, opal_jobid_t *jobid);
OPAL_MODULE_DECLSPEC int pmix1_spawnnb(opal_list_t *job_info, opal_list_t *apps,
opal_pmix_spawn_cbfunc_t cbfunc, void *cbdata);
opal_pmix_spawn_cbfunc_t cbfunc, void *cbdata);
OPAL_MODULE_DECLSPEC int pmix1_connect(opal_list_t *procs);
OPAL_MODULE_DECLSPEC int pmix1_connectnb(opal_list_t *procs,
opal_pmix_op_cbfunc_t cbfunc,
void *cbdata);
opal_pmix_op_cbfunc_t cbfunc,
void *cbdata);
OPAL_MODULE_DECLSPEC int pmix1_disconnect(opal_list_t *procs);
OPAL_MODULE_DECLSPEC int pmix1_disconnectnb(opal_list_t *procs,
opal_pmix_op_cbfunc_t cbfunc,
void *cbdata);
opal_pmix_op_cbfunc_t cbfunc,
void *cbdata);
OPAL_MODULE_DECLSPEC int pmix1_resolve_peers(const char *nodename, opal_jobid_t jobid,
opal_list_t *procs);
opal_list_t *procs);
OPAL_MODULE_DECLSPEC int pmix1_resolve_nodes(opal_jobid_t jobid, char **nodelist);
/**** COMMON FUNCTIONS ****/
@ -123,32 +123,32 @@ OPAL_MODULE_DECLSPEC int pmix1_server_finalize(void);
OPAL_MODULE_DECLSPEC int pmix1_server_gen_regex(const char *input, char **regex);
OPAL_MODULE_DECLSPEC int pmix1_server_gen_ppn(const char *input, char **ppn);
OPAL_MODULE_DECLSPEC int pmix1_server_register_nspace(opal_jobid_t jobid,
int nlocalprocs,
opal_list_t *info,
opal_pmix_op_cbfunc_t cbfunc,
void *cbdata);
int nlocalprocs,
opal_list_t *info,
opal_pmix_op_cbfunc_t cbfunc,
void *cbdata);
OPAL_MODULE_DECLSPEC int pmix1_server_register_client(const opal_process_name_t *proc,
uid_t uid, gid_t gid,
void *server_object,
opal_pmix_op_cbfunc_t cbfunc,
void *cbdata);
uid_t uid, gid_t gid,
void *server_object,
opal_pmix_op_cbfunc_t cbfunc,
void *cbdata);
OPAL_MODULE_DECLSPEC int pmix1_server_setup_fork(const opal_process_name_t *proc, char ***env);
OPAL_MODULE_DECLSPEC int pmix1_server_dmodex(const opal_process_name_t *proc,
opal_pmix_modex_cbfunc_t cbfunc, void *cbdata);
opal_pmix_modex_cbfunc_t cbfunc, void *cbdata);
OPAL_MODULE_DECLSPEC int pmix1_server_notify_error(int status,
opal_list_t *procs,
opal_list_t *error_procs,
opal_list_t *info,
opal_pmix_op_cbfunc_t cbfunc, void *cbdata);
opal_list_t *procs,
opal_list_t *error_procs,
opal_list_t *info,
opal_pmix_op_cbfunc_t cbfunc, void *cbdata);
/**** COMPONENT UTILITY FUNCTIONS ****/
OPAL_MODULE_DECLSPEC pmix_status_t pmix1_convert_opalrc(int rc);
OPAL_MODULE_DECLSPEC int pmix1_convert_rc(pmix_status_t rc);
OPAL_MODULE_DECLSPEC void pmix1_value_load(pmix_value_t *v,
opal_value_t *kv);
opal_value_t *kv);
OPAL_MODULE_DECLSPEC int pmix1_value_unload(opal_value_t *kv,
const pmix_value_t *v);
const pmix_value_t *v);
END_C_DECLS

Просмотреть файл

@ -290,13 +290,16 @@ int pmix1_put(opal_pmix_scope_t scope,
return pmix1_convert_rc(rc);
}
int pmix1_get(const opal_process_name_t *proc,
const char *key, opal_value_t **val)
int pmix1_get(const opal_process_name_t *proc, const char *key,
opal_list_t *info, opal_value_t **val)
{
int ret;
pmix_value_t *kv;
pmix_status_t rc;
pmix_proc_t p, *pptr;
size_t ninfo, n;
pmix_info_t *pinfo;
opal_value_t *ival;
opal_output_verbose(1, opal_pmix_base_framework.framework_output,
"%s PMIx_client get on proc %s key %s",
@ -326,8 +329,25 @@ int pmix1_get(const opal_process_name_t *proc,
pptr = NULL;
}
if (NULL != info) {
ninfo = opal_list_get_size(info);
if (0 < ninfo) {
PMIX_INFO_CREATE(pinfo, ninfo);
n=0;
OPAL_LIST_FOREACH(ival, info, opal_value_t) {
(void)strncpy(pinfo[n].key, ival->key, PMIX_MAX_KEYLEN);
pmix1_value_load(&pinfo[n].value, ival);
}
} else {
pinfo = NULL;
}
} else {
pinfo = NULL;
ninfo = 0;
}
/* pass the request down */
rc = PMIx_Get(pptr, key, NULL, 0, &kv);
rc = PMIx_Get(pptr, key, pinfo, ninfo, &kv);
if (PMIX_SUCCESS == rc) {
if (NULL == kv) {
ret = OPAL_SUCCESS;
@ -339,6 +359,7 @@ int pmix1_get(const opal_process_name_t *proc,
} else {
ret = pmix1_convert_rc(rc);
}
PMIX_INFO_FREE(pinfo, ninfo);
return ret;
}
@ -362,11 +383,14 @@ static void val_cbfunc(pmix_status_t status,
}
int pmix1_getnb(const opal_process_name_t *proc, const char *key,
opal_list_t *info,
opal_pmix_value_cbfunc_t cbfunc, void *cbdata)
{
pmix1_opcaddy_t *op;
pmix_status_t rc;
char *tmp;
size_t n;
opal_value_t *ival;
opal_output_verbose(1, opal_pmix_base_framework.framework_output,
"%s PMIx_client get_nb on proc %s key %s",
@ -389,9 +413,20 @@ int pmix1_getnb(const opal_process_name_t *proc, const char *key,
op->p.rank = PMIX_RANK_WILDCARD;
}
if (NULL != info) {
op->sz = opal_list_get_size(info);
if (0 < op->sz) {
PMIX_INFO_CREATE(op->info, op->sz);
n=0;
OPAL_LIST_FOREACH(ival, info, opal_value_t) {
(void)strncpy(op->info[n].key, ival->key, PMIX_MAX_KEYLEN);
pmix1_value_load(&op->info[n].value, ival);
}
}
}
/* call the library function */
rc = PMIx_Get_nb(&op->p, key, NULL, 0, val_cbfunc, op);
rc = PMIx_Get_nb(&op->p, key, op->info, op->sz, val_cbfunc, op);
if (PMIX_SUCCESS != rc) {
OBJ_RELEASE(op);
}

Просмотреть файл

@ -98,12 +98,15 @@ BEGIN_C_DECLS
/* request-related info */
#define OPAL_PMIX_COLLECT_DATA "pmix.collect" // (bool) collect data and return it at the end of the operation
#define OPAL_PMIX_TIMEOUT "pmix.timeout" // (int) time in sec before specified operation should time out
#define OPAL_PMIX_WAIT "pmix.wait" // (int) caller requests that the server wait until the specified #values are found
#define OPAL_PMIX_WAIT "pmix.wait" // (int) caller requests that the server wait until at least the specified
// #values are found (0 => all and is the default)
#define OPAL_PMIX_COLLECTIVE_ALGO "pmix.calgo" // (char*) comma-delimited list of algorithms to use for collective
#define OPAL_PMIX_COLLECTIVE_ALGO_REQD "pmix.calreqd" // (bool) if true, indicates that the requested choice of algo is mandatory
#define OPAL_PMIX_NOTIFY_COMPLETION "pmix.notecomp" // (bool) notify parent process upon termination of child job
#define OPAL_PMIX_RANGE "pmix.range" // (int) opal_pmix_data_range_t value for calls to publish/lookup/unpublish
#define OPAL_PMIX_PERSISTENCE "pmix.persist" // (int) opal_pmix_persistence_t value for calls to publish
#define OPAL_PMIX_OPTIONAL "pmix.optional" // (bool) look only in the immediate data store for the requested value - do
// not request data from the server if not found
/* attribute used by host server to pass data to the server convenience library - the
* data will then be parsed and provided to the local clients */

Просмотреть файл

@ -39,7 +39,7 @@ static int s1_fence(opal_list_t *procs, int collect_data);
static int s1_put(opal_pmix_scope_t scope,
opal_value_t *kv);
static int s1_get(const opal_process_name_t *id,
const char *key,
const char *key, opal_list_t *info,
opal_value_t **kv);
static int s1_publish(opal_list_t *info);
static int s1_lookup(opal_list_t *data, opal_list_t *info);
@ -588,7 +588,7 @@ static int s1_fence(opal_list_t *procs, int collect_data)
}
static int s1_get(const opal_process_name_t *id,
const char *key,
const char *key, opal_list_t *info,
opal_value_t **kv)
{
int rc;

Просмотреть файл

@ -46,7 +46,7 @@ static int s2_fence(opal_list_t *procs, int collect_data);
static int s2_put(opal_pmix_scope_t scope,
opal_value_t *kv);
static int s2_get(const opal_process_name_t *id,
const char *key,
const char *key, opal_list_t *info,
opal_value_t **kv);
static int s2_publish(opal_list_t *info);
static int s2_lookup(opal_list_t *data, opal_list_t *info);
@ -607,7 +607,7 @@ static int s2_fence(opal_list_t *procs, int collect_data)
}
static int s2_get(const opal_process_name_t *id,
const char *key,
const char *key, opal_list_t *info,
opal_value_t **kv)
{
int rc;

Просмотреть файл

@ -88,10 +88,9 @@ static int rte_init(void)
char *rmluri;
opal_value_t *kv;
char *val;
size_t sz;
int u32, *u32ptr;
uint16_t u16, *u16ptr;
char **peers=NULL, *mycpuset;
char **peers=NULL, *mycpuset, **cpusets=NULL;
opal_process_name_t name;
size_t i;
@ -153,8 +152,8 @@ static int rte_init(void)
/* get our app number from PMI - ok if not found */
OPAL_MODEX_RECV_VALUE(ret, OPAL_PMIX_APPNUM,
ORTE_PROC_MY_NAME, &u32ptr, OPAL_UINT32);
OPAL_MODEX_RECV_VALUE_OPTIONAL(ret, OPAL_PMIX_APPNUM,
ORTE_PROC_MY_NAME, &u32ptr, OPAL_UINT32);
if (OPAL_SUCCESS == ret) {
orte_process_info.app_num = u32;
} else {
@ -190,8 +189,8 @@ static int rte_init(void)
}
/* retrieve our topology */
OPAL_MODEX_RECV_VALUE(ret, OPAL_PMIX_LOCAL_TOPO,
ORTE_PROC_MY_NAME, &val, OPAL_STRING);
OPAL_MODEX_RECV_VALUE_OPTIONAL(ret, OPAL_PMIX_LOCAL_TOPO,
ORTE_PROC_MY_NAME, &val, OPAL_STRING);
if (OPAL_SUCCESS == ret && NULL != val) {
/* load the topology */
if (0 != hwloc_topology_init(&opal_hwloc_topology)) {
@ -259,84 +258,77 @@ static int rte_init(void)
/* get our local peers */
if (0 < orte_process_info.num_local_peers) {
/* if my local rank if too high, then that's an error */
if (orte_process_info.num_local_peers < orte_process_info.my_local_rank) {
ret = ORTE_ERR_BAD_PARAM;
error = "num local peers";
goto error;
}
/* retrieve the local peers */
OPAL_MODEX_RECV_VALUE(ret, OPAL_PMIX_LOCAL_PEERS,
ORTE_PROC_MY_NAME, &val, OPAL_STRING);
if (OPAL_SUCCESS == ret && NULL != val) {
peers = opal_argv_split(val, ',');
free(val);
/* and their cpusets, if available */
OPAL_MODEX_RECV_VALUE_OPTIONAL(ret, OPAL_PMIX_LOCAL_CPUSETS, ORTE_PROC_MY_NAME, &val, OPAL_STRING);
if (OPAL_SUCCESS == ret && NULL != val) {
cpusets = opal_argv_split(val, ':');
free(val);
} else {
cpusets = NULL;
}
} else {
peers = NULL;
cpusets = NULL;
}
} else {
peers = NULL;
}
/* get our cpuset */
OPAL_MODEX_RECV_VALUE(ret, OPAL_PMIX_CPUSET, ORTE_PROC_MY_NAME, &val, OPAL_STRING);
if (OPAL_SUCCESS != ret || NULL == val) {
/* if we don't have a cpuset, or it is NULL, then we declare our local
* peers to be on the same node and everyone else to be non-local */
mycpuset = NULL;
} else {
mycpuset = val;
cpusets = NULL;
}
/* set the locality */
name.jobid = ORTE_PROC_MY_NAME->jobid;
for (sz=0; sz < orte_process_info.num_procs; sz++) {
kv = OBJ_NEW(opal_value_t);
kv->key = strdup(OPAL_PMIX_LOCALITY);
kv->type = OPAL_UINT16;
name.vpid = sz;
if (sz == ORTE_PROC_MY_NAME->vpid) {
/* we are fully local to ourselves */
u16 = OPAL_PROC_ALL_LOCAL;
} else if (NULL == peers) {
/* nobody is local to us */
u16 = OPAL_PROC_NON_LOCAL;
if (NULL != peers) {
/* indentify our cpuset */
if (NULL != cpusets) {
mycpuset = cpusets[orte_process_info.my_local_rank];
} else {
for (i=0; NULL != peers[i]; i++) {
if (sz == strtoul(peers[i], NULL, 10)) {
break;
}
}
if (NULL == peers[i]) {
/* not a local peer */
u16 = OPAL_PROC_NON_LOCAL;
} else if (NULL == mycpuset) {
mycpuset = NULL;
}
name.jobid = ORTE_PROC_MY_NAME->jobid;
for (i=0; NULL != peers[i]; i++) {
kv = OBJ_NEW(opal_value_t);
kv->key = strdup(OPAL_PMIX_LOCALITY);
kv->type = OPAL_UINT16;
name.vpid = strtoul(peers[i], NULL, 10);
if (name.vpid == ORTE_PROC_MY_NAME->vpid) {
/* we are fully local to ourselves */
u16 = OPAL_PROC_ALL_LOCAL;
} else if (NULL == mycpuset || NULL == cpusets[i] ||
0 == strcmp(cpusets[i], "UNBOUND")) {
/* all we can say is that it shares our node */
u16 = OPAL_PROC_ON_CLUSTER | OPAL_PROC_ON_CU | OPAL_PROC_ON_NODE;
} else {
/* attempt to get their cpuset */
OPAL_MODEX_RECV_VALUE(ret, OPAL_PMIX_CPUSET, &name, &val, OPAL_STRING);
if (OPAL_SUCCESS == ret && NULL != val) {
/* we have it, so compute the locality */
u16 = opal_hwloc_base_get_relative_locality(opal_hwloc_topology,
mycpuset, val);
free(val);
} else {
/* all we can say is that it shares our node */
u16 = OPAL_PROC_ON_CLUSTER | OPAL_PROC_ON_CU | OPAL_PROC_ON_NODE;
}
/* we have it, so compute the locality */
u16 = opal_hwloc_base_get_relative_locality(opal_hwloc_topology, mycpuset, cpusets[i]);
}
}
kv->data.uint16 = u16;
ret = opal_pmix.store_local(&name, kv);
if (OPAL_SUCCESS != ret) {
error = "local store of locality";
if (NULL != mycpuset) {
free(mycpuset);
OPAL_OUTPUT_VERBOSE((1, orte_ess_base_framework.framework_output,
"%s ess:pmi:locality: proc %s locality %x",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
ORTE_NAME_PRINT(&name), u16));
kv->data.uint16 = u16;
ret = opal_pmix.store_local(&name, kv);
if (OPAL_SUCCESS != ret) {
error = "local store of locality";
opal_argv_free(peers);
opal_argv_free(cpusets);
goto error;
}
opal_argv_free(peers);
goto error;
OBJ_RELEASE(kv);
}
OBJ_RELEASE(kv);
opal_argv_free(peers);
opal_argv_free(cpusets);
}
if (NULL != mycpuset){
free(mycpuset);
}
opal_argv_free(peers);
/* now that we have all required info, complete the setup */
if (ORTE_SUCCESS != (ret = orte_ess_base_app_setup(false))) {

Просмотреть файл

@ -40,7 +40,7 @@ void orte_oob_base_send_nb(int fd, short args, void *cbdata)
bool msg_sent;
mca_oob_base_component_t *component;
bool reachable;
opal_value_t *kv;
char *uri;
/* done with this. release it now */
OBJ_RELEASE(cd);
@ -61,11 +61,14 @@ void orte_oob_base_send_nb(int fd, short args, void *cbdata)
ORTE_NAME_PRINT(&msg->dst));
/* for direct launched procs, the URI might be in the database,
* so check there next - if it is, the peer object will be added
* to our hash table
* to our hash table. However, we don't want to chase up to the
* server after it, so indicate it is optional
*/
if (OPAL_SUCCESS == opal_pmix.get(&msg->dst, OPAL_PMIX_PROC_URI, &kv)) {
if (NULL != kv) {
process_uri(kv->data.string);
OPAL_MODEX_RECV_VALUE_OPTIONAL(rc, OPAL_PMIX_PROC_URI, &msg->dst,
(char**)&uri, OPAL_STRING);
if (OPAL_SUCCESS == rc ) {
if (NULL != uri) {
process_uri(uri);
if (OPAL_SUCCESS != opal_hash_table_get_value_uint64(&orte_oob_base.peers,
ui64, (void**)&pr) ||
NULL == pr) {

Просмотреть файл

@ -481,6 +481,7 @@ static void pmix_server_dmdx_recv(int status, orte_process_name_t* sender,
/* ask our local pmix server for the data */
if (OPAL_SUCCESS != (rc = opal_pmix.server_dmodex_request(&idreq, modex_resp, req))) {
ORTE_ERROR_LOG(rc);
opal_hotel_checkout(&orte_pmix_server_globals.reqs, req->room_num);
OBJ_RELEASE(req);
send_error(rc, &idreq, sender);

Просмотреть файл

@ -236,6 +236,7 @@ int orte_pmix_server_register_nspace(orte_job_t *jdata)
/* construct the list of local peers, while adding
* each proc's locality info */
list = NULL;
procs = NULL;
vpid = ORTE_VPID_MAX;
for (i=0; i < node->procs->size; i++) {
if (NULL == (pptr = (orte_proc_t*)opal_pointer_array_get_item(node->procs, i))) {
@ -249,13 +250,15 @@ int orte_pmix_server_register_nspace(orte_job_t *jdata)
/* note that we have to pass the cpuset for each local
* peer so locality can be computed */
tmp = NULL;
kv = OBJ_NEW(opal_value_t);
kv->key = strdup(OPAL_PMIX_CPUSET);
kv->type = OPAL_STRING;
if (orte_get_attribute(&pptr->attributes, ORTE_PROC_CPU_BITMAP, (void**)&tmp, OPAL_STRING)) {
kv->data.string = tmp;
if (NULL != tmp) {
opal_argv_append_nosize(&procs, tmp);
} else {
opal_argv_append_nosize(&procs, "UNBOUND");
}
} else {
opal_argv_append_nosize(&procs, "UNBOUND");
}
opal_list_append(info, &kv->super);
/* go ahead and register this client */
if (OPAL_SUCCESS != (rc = opal_pmix.server_register_client(&pptr->name, uid, gid,
(void*)pptr, NULL, NULL))) {
@ -276,6 +279,18 @@ int orte_pmix_server_register_nspace(orte_job_t *jdata)
kv->data.string = tmp;
opal_list_append(info, &kv->super);
}
/* construct the list of cpusets for transmission */
if (NULL != procs) {
tmp = opal_argv_join(procs, ':');
opal_argv_free(procs);
procs = NULL;
/* pass the list of cpusets */
kv = OBJ_NEW(opal_value_t);
kv->key = strdup(OPAL_PMIX_LOCAL_CPUSETS);
kv->type = OPAL_STRING;
kv->data.string = tmp;
opal_list_append(info, &kv->super);
}
/* pass the local ldr */
kv = OBJ_NEW(opal_value_t);
kv->key = strdup(OPAL_PMIX_LOCALLDR);