Fix shared memory operations by resolving local peers
Этот коммит содержится в:
родитель
305dc5317b
Коммит
38ba54366c
@ -292,7 +292,7 @@ pmix_status_t pmix_pending_resolve(pmix_nspace_t *nptr, int rank, pmix_dmdx_loca
|
|||||||
if( NULL == lcd ){
|
if( NULL == lcd ){
|
||||||
PMIX_LIST_FOREACH(cd, &pmix_server_globals.local_reqs, pmix_dmdx_local_t) {
|
PMIX_LIST_FOREACH(cd, &pmix_server_globals.local_reqs, pmix_dmdx_local_t) {
|
||||||
if (0 != strncmp(nptr->nspace, cd->proc.nspace, PMIX_MAX_NSLEN) ||
|
if (0 != strncmp(nptr->nspace, cd->proc.nspace, PMIX_MAX_NSLEN) ||
|
||||||
rank != cd->proc.rank) {
|
rank != cd->proc.rank) {
|
||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
lcd = cd;
|
lcd = cd;
|
||||||
@ -913,7 +913,7 @@ static void dmdx_cbfunc(pmix_status_t status,
|
|||||||
caddy = PMIX_NEW(pmix_dmdx_reply_caddy_t);
|
caddy = PMIX_NEW(pmix_dmdx_reply_caddy_t);
|
||||||
caddy->status = status;
|
caddy->status = status;
|
||||||
/* point to the callers cbfunc */
|
/* point to the callers cbfunc */
|
||||||
caddy->relcbfunc - release_fn;
|
caddy->relcbfunc = release_fn;
|
||||||
caddy->cbdata = release_cbdata;
|
caddy->cbdata = release_cbdata;
|
||||||
|
|
||||||
caddy->data = data;
|
caddy->data = data;
|
||||||
|
@ -91,7 +91,7 @@ static int rte_init(void)
|
|||||||
size_t sz;
|
size_t sz;
|
||||||
int u32, *u32ptr;
|
int u32, *u32ptr;
|
||||||
uint16_t u16, *u16ptr;
|
uint16_t u16, *u16ptr;
|
||||||
char **peers=NULL, **cpusets=NULL, *mycpuset;
|
char **peers=NULL, *mycpuset;
|
||||||
opal_process_name_t name;
|
opal_process_name_t name;
|
||||||
size_t i;
|
size_t i;
|
||||||
|
|
||||||
@ -230,6 +230,11 @@ static int rte_init(void)
|
|||||||
goto error;
|
goto error;
|
||||||
}
|
}
|
||||||
free(val);
|
free(val);
|
||||||
|
/* filter the cpus thru any default cpu set */
|
||||||
|
if (OPAL_SUCCESS != (ret = opal_hwloc_base_filter_cpus(opal_hwloc_topology))) {
|
||||||
|
error = "filtering topology";
|
||||||
|
goto error;
|
||||||
|
}
|
||||||
} else {
|
} else {
|
||||||
/* it wasn't passed down to us, so go get it */
|
/* it wasn't passed down to us, so go get it */
|
||||||
if (OPAL_SUCCESS != (ret = opal_hwloc_base_get_topology())) {
|
if (OPAL_SUCCESS != (ret = opal_hwloc_base_get_topology())) {
|
||||||
@ -256,32 +261,16 @@ static int rte_init(void)
|
|||||||
/* get our local peers */
|
/* get our local peers */
|
||||||
if (0 < orte_process_info.num_local_peers) {
|
if (0 < orte_process_info.num_local_peers) {
|
||||||
/* retrieve the local peers */
|
/* retrieve the local peers */
|
||||||
OPAL_MODEX_RECV_VALUE(ret, OPAL_PMIX_LOCAL_PEERS, NULL, &val, OPAL_STRING);
|
OPAL_MODEX_RECV_VALUE(ret, OPAL_PMIX_LOCAL_PEERS,
|
||||||
|
ORTE_PROC_MY_NAME, &val, OPAL_STRING);
|
||||||
if (OPAL_SUCCESS == ret && NULL != val) {
|
if (OPAL_SUCCESS == ret && NULL != val) {
|
||||||
peers = opal_argv_split(val, ',');
|
peers = opal_argv_split(val, ',');
|
||||||
free(val);
|
free(val);
|
||||||
/* and their cpusets */
|
|
||||||
OPAL_MODEX_RECV_VALUE(ret, OPAL_PMIX_LOCAL_CPUSETS, NULL, &val, OPAL_STRING);
|
|
||||||
if (OPAL_SUCCESS == ret && NULL != val) {
|
|
||||||
cpusets = opal_argv_split(val, ':');
|
|
||||||
free(val);
|
|
||||||
if (opal_argv_count(peers) != opal_argv_count(cpusets)) {
|
|
||||||
ORTE_ERROR_LOG(ORTE_ERR_BAD_PARAM);
|
|
||||||
opal_argv_free(peers);
|
|
||||||
opal_argv_free(cpusets);
|
|
||||||
error = "mismatch #local peers and #cpusets";
|
|
||||||
goto error;
|
|
||||||
}
|
|
||||||
} else {
|
|
||||||
cpusets = NULL;
|
|
||||||
}
|
|
||||||
} else {
|
} else {
|
||||||
peers = NULL;
|
peers = NULL;
|
||||||
cpusets = NULL;
|
|
||||||
}
|
}
|
||||||
} else {
|
} else {
|
||||||
peers = NULL;
|
peers = NULL;
|
||||||
cpusets = NULL;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
/* get our cpuset */
|
/* get our cpuset */
|
||||||
@ -301,7 +290,10 @@ static int rte_init(void)
|
|||||||
kv->key = strdup(OPAL_PMIX_LOCALITY);
|
kv->key = strdup(OPAL_PMIX_LOCALITY);
|
||||||
kv->type = OPAL_UINT16;
|
kv->type = OPAL_UINT16;
|
||||||
name.vpid = sz;
|
name.vpid = sz;
|
||||||
if (NULL == peers) {
|
if (sz == ORTE_PROC_MY_NAME->vpid) {
|
||||||
|
/* we are fully local to ourselves */
|
||||||
|
u16 = OPAL_PROC_ALL_LOCAL;
|
||||||
|
} else if (NULL == peers) {
|
||||||
/* nobody is local to us */
|
/* nobody is local to us */
|
||||||
u16 = OPAL_PROC_NON_LOCAL;
|
u16 = OPAL_PROC_NON_LOCAL;
|
||||||
} else {
|
} else {
|
||||||
@ -313,30 +305,44 @@ static int rte_init(void)
|
|||||||
if (NULL == peers[i]) {
|
if (NULL == peers[i]) {
|
||||||
/* not a local peer */
|
/* not a local peer */
|
||||||
u16 = OPAL_PROC_NON_LOCAL;
|
u16 = OPAL_PROC_NON_LOCAL;
|
||||||
} else if (NULL == mycpuset || NULL == cpusets || NULL == cpusets[i]) {
|
} else if (NULL == mycpuset) {
|
||||||
|
/* all we can say is that it shares our node */
|
||||||
u16 = OPAL_PROC_ON_CLUSTER | OPAL_PROC_ON_CU | OPAL_PROC_ON_NODE;
|
u16 = OPAL_PROC_ON_CLUSTER | OPAL_PROC_ON_CU | OPAL_PROC_ON_NODE;
|
||||||
} else if (NULL != cpusets && NULL != cpusets[i]) {
|
|
||||||
u16 = opal_hwloc_base_get_relative_locality(opal_hwloc_topology,
|
|
||||||
mycpuset, cpusets[i]);
|
|
||||||
} else {
|
} else {
|
||||||
u16 = OPAL_PROC_ON_CLUSTER | OPAL_PROC_ON_CU | OPAL_PROC_ON_NODE;
|
/* attempt to get their cpuset */
|
||||||
|
OPAL_MODEX_RECV_VALUE(ret, OPAL_PMIX_CPUSET, &name, &val, OPAL_STRING);
|
||||||
|
if (OPAL_SUCCESS == ret && NULL != val) {
|
||||||
|
/* we have it, so compute the locality */
|
||||||
|
u16 = opal_hwloc_base_get_relative_locality(opal_hwloc_topology,
|
||||||
|
mycpuset, val);
|
||||||
|
free(val);
|
||||||
|
} else {
|
||||||
|
/* all we can say is that it shares our node */
|
||||||
|
u16 = OPAL_PROC_ON_CLUSTER | OPAL_PROC_ON_CU | OPAL_PROC_ON_NODE;
|
||||||
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
kv->data.uint16 = u16;
|
kv->data.uint16 = u16;
|
||||||
ret = opal_pmix.store_local(&name, kv);
|
ret = opal_pmix.store_local(&name, kv);
|
||||||
if (OPAL_SUCCESS != ret) {
|
if (OPAL_SUCCESS != ret) {
|
||||||
error = "local store of locality";
|
error = "local store of locality";
|
||||||
opal_argv_free(cpusets);
|
if (NULL != mycpuset) {
|
||||||
|
free(mycpuset);
|
||||||
|
}
|
||||||
opal_argv_free(peers);
|
opal_argv_free(peers);
|
||||||
goto error;
|
goto error;
|
||||||
}
|
}
|
||||||
OBJ_RELEASE(kv);
|
OBJ_RELEASE(kv);
|
||||||
}
|
}
|
||||||
|
if (NULL != mycpuset){
|
||||||
|
free(mycpuset);
|
||||||
|
}
|
||||||
#else
|
#else
|
||||||
/* get our local peers */
|
/* get our local peers */
|
||||||
if (0 < orte_process_info.num_local_peers) {
|
if (0 < orte_process_info.num_local_peers) {
|
||||||
/* retrieve the local peers */
|
/* retrieve the local peers */
|
||||||
OPAL_MODEX_RECV_VALUE(ret, OPAL_PMIX_LOCAL_PEERS, NULL, &val, OPAL_STRING);
|
OPAL_MODEX_RECV_VALUE(ret, OPAL_PMIX_LOCAL_PEERS,
|
||||||
|
ORTE_PROC_MY_NAME, &val, OPAL_STRING);
|
||||||
if (OPAL_SUCCESS == ret && NULL != val) {
|
if (OPAL_SUCCESS == ret && NULL != val) {
|
||||||
peers = opal_argv_split(val, ',');
|
peers = opal_argv_split(val, ',');
|
||||||
free(val);
|
free(val);
|
||||||
@ -353,8 +359,11 @@ static int rte_init(void)
|
|||||||
kv->key = strdup(OPAL_PMIX_LOCALITY);
|
kv->key = strdup(OPAL_PMIX_LOCALITY);
|
||||||
kv->type = OPAL_UINT16;
|
kv->type = OPAL_UINT16;
|
||||||
name.vpid = sz;
|
name.vpid = sz;
|
||||||
if (NULL == peers) {
|
if (sz == ORTE_PROC_MY_NAME->vpid) {
|
||||||
/* nobody is local to us */
|
/* we are fully local to ourselves */
|
||||||
|
u16 = OPAL_PROC_ALL_LOCAL;
|
||||||
|
} else if (NULL == peers) {
|
||||||
|
/* nobody is local to us */
|
||||||
u16 = OPAL_PROC_NON_LOCAL;
|
u16 = OPAL_PROC_NON_LOCAL;
|
||||||
} else {
|
} else {
|
||||||
for (i=0; NULL != peers[i]; i++) {
|
for (i=0; NULL != peers[i]; i++) {
|
||||||
@ -370,8 +379,8 @@ static int rte_init(void)
|
|||||||
u16 = OPAL_PROC_ON_CLUSTER | OPAL_PROC_ON_CU | OPAL_PROC_ON_NODE;
|
u16 = OPAL_PROC_ON_CLUSTER | OPAL_PROC_ON_CU | OPAL_PROC_ON_NODE;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
/* store this data internally - not to be pushed outside of
|
/* store this data internally - not to be pushed outside of
|
||||||
* ourselves as it only has meaning relative to us */
|
* ourselves as it only has meaning relative to us */
|
||||||
ret = opal_pmix.store_local(&name, kv);
|
ret = opal_pmix.store_local(&name, kv);
|
||||||
if (OPAL_SUCCESS != ret) {
|
if (OPAL_SUCCESS != ret) {
|
||||||
ORTE_ERROR_LOG(ret);
|
ORTE_ERROR_LOG(ret);
|
||||||
@ -384,7 +393,6 @@ static int rte_init(void)
|
|||||||
}
|
}
|
||||||
#endif
|
#endif
|
||||||
opal_argv_free(peers);
|
opal_argv_free(peers);
|
||||||
opal_argv_free(cpusets);
|
|
||||||
|
|
||||||
/* we don't need to force the routed system to pick the
|
/* we don't need to force the routed system to pick the
|
||||||
* "direct" component as that should happen automatically
|
* "direct" component as that should happen automatically
|
||||||
|
@ -255,10 +255,7 @@ int orte_pmix_server_register_nspace(orte_job_t *jdata)
|
|||||||
kv->key = strdup(OPAL_PMIX_CPUSET);
|
kv->key = strdup(OPAL_PMIX_CPUSET);
|
||||||
kv->type = OPAL_STRING;
|
kv->type = OPAL_STRING;
|
||||||
if (orte_get_attribute(&pptr->attributes, ORTE_PROC_CPU_BITMAP, (void**)&tmp, OPAL_STRING)) {
|
if (orte_get_attribute(&pptr->attributes, ORTE_PROC_CPU_BITMAP, (void**)&tmp, OPAL_STRING)) {
|
||||||
(void)asprintf(&kv->data.string, "%s:%s", ORTE_NAME_PRINT(&pptr->name), tmp);
|
kv->data.string = tmp;
|
||||||
free(tmp);
|
|
||||||
} else {
|
|
||||||
(void)asprintf(&kv->data.string, "%s", ORTE_NAME_PRINT(&pptr->name));
|
|
||||||
}
|
}
|
||||||
opal_list_append(info, &kv->super);
|
opal_list_append(info, &kv->super);
|
||||||
/* go ahead and register this client */
|
/* go ahead and register this client */
|
||||||
|
Загрузка…
x
Ссылка в новой задаче
Block a user