
Fix shared memory operations by resolving local peers

This commit is contained in:
Ralph Castain 2015-08-30 12:00:22 -07:00
parent 305dc5317b
commit 38ba54366c
3 changed files with 43 additions and 38 deletions

View file

@@ -292,7 +292,7 @@ pmix_status_t pmix_pending_resolve(pmix_nspace_t *nptr, int rank, pmix_dmdx_loca
     if( NULL == lcd ){
         PMIX_LIST_FOREACH(cd, &pmix_server_globals.local_reqs, pmix_dmdx_local_t) {
             if (0 != strncmp(nptr->nspace, cd->proc.nspace, PMIX_MAX_NSLEN) ||
                 rank != cd->proc.rank) {
                 continue;
             }
             lcd = cd;
@@ -913,7 +913,7 @@ static void dmdx_cbfunc(pmix_status_t status,
     caddy = PMIX_NEW(pmix_dmdx_reply_caddy_t);
     caddy->status = status;
     /* point to the callers cbfunc */
-    caddy->relcbfunc - release_fn;
+    caddy->relcbfunc = release_fn;
     caddy->cbdata = release_cbdata;
     caddy->data = data;
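The fix in dmdx_cbfunc() is easy to miss: `caddy->relcbfunc - release_fn;` is a subtraction whose result is thrown away, so the callback field was never assigned. An expression statement like this is legal C, so it compiles silently unless warnings such as -Wunused-value are enabled. A strictly portable sketch of the before/after behavior, using plain ints in place of the actual function-pointer types:

    #include <stdio.h>

    int main(void)
    {
        /* Stand-ins for caddy->relcbfunc and release_fn; plain ints keep the
         * sketch strictly portable, but the effect is identical for the
         * function-pointer fields in dmdx_cbfunc(). */
        int relcbfunc = 0;
        int release_fn = 42;

        relcbfunc - release_fn;   /* the typo: subtraction, result discarded */
        printf("after '-': relcbfunc = %d\n", relcbfunc);   /* still 0 */

        relcbfunc = release_fn;   /* the fix: an actual assignment */
        printf("after '=': relcbfunc = %d\n", relcbfunc);   /* now 42 */
        return 0;
    }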

View file

@@ -91,7 +91,7 @@ static int rte_init(void)
     size_t sz;
     int u32, *u32ptr;
     uint16_t u16, *u16ptr;
-    char **peers=NULL, **cpusets=NULL, *mycpuset;
+    char **peers=NULL, *mycpuset;
     opal_process_name_t name;
     size_t i;
@@ -230,6 +230,11 @@ static int rte_init(void)
             goto error;
         }
         free(val);
+        /* filter the cpus thru any default cpu set */
+        if (OPAL_SUCCESS != (ret = opal_hwloc_base_filter_cpus(opal_hwloc_topology))) {
+            error = "filtering topology";
+            goto error;
+        }
     } else {
         /* it wasn't passed down to us, so go get it */
         if (OPAL_SUCCESS != (ret = opal_hwloc_base_get_topology())) {
@@ -256,32 +261,16 @@ static int rte_init(void)
     /* get our local peers */
     if (0 < orte_process_info.num_local_peers) {
         /* retrieve the local peers */
-        OPAL_MODEX_RECV_VALUE(ret, OPAL_PMIX_LOCAL_PEERS, NULL, &val, OPAL_STRING);
+        OPAL_MODEX_RECV_VALUE(ret, OPAL_PMIX_LOCAL_PEERS,
+                              ORTE_PROC_MY_NAME, &val, OPAL_STRING);
         if (OPAL_SUCCESS == ret && NULL != val) {
             peers = opal_argv_split(val, ',');
             free(val);
-            /* and their cpusets */
-            OPAL_MODEX_RECV_VALUE(ret, OPAL_PMIX_LOCAL_CPUSETS, NULL, &val, OPAL_STRING);
-            if (OPAL_SUCCESS == ret && NULL != val) {
-                cpusets = opal_argv_split(val, ':');
-                free(val);
-                if (opal_argv_count(peers) != opal_argv_count(cpusets)) {
-                    ORTE_ERROR_LOG(ORTE_ERR_BAD_PARAM);
-                    opal_argv_free(peers);
-                    opal_argv_free(cpusets);
-                    error = "mismatch #local peers and #cpusets";
-                    goto error;
-                }
-            } else {
-                cpusets = NULL;
-            }
         } else {
             peers = NULL;
-            cpusets = NULL;
         }
     } else {
         peers = NULL;
-        cpusets = NULL;
     }
     /* get our cpuset */
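This hunk removes the aggregated OPAL_PMIX_LOCAL_CPUSETS blob. The old code split one colon-delimited string and matched it index-for-index against the comma-separated peer list, failing hard ("mismatch #local peers and #cpusets") whenever the two arrays disagreed. The replacement (next hunk) resolves each peer's cpuset on demand by process name. A self-contained sketch of the two lookup shapes, with a toy modex_get() standing in for the OPAL_MODEX_RECV_VALUE machinery and invented cpuset strings:

    #include <stdio.h>
    #include <string.h>

    /* Toy stand-in for OPAL_MODEX_RECV_VALUE(..., OPAL_PMIX_CPUSET, &name, ...):
     * per-peer cpuset strings keyed by vpid (all data here is invented). */
    static const char *modex_get(int vpid)
    {
        static const char *cpusets[] = { "0-3", "4-7", "8-11" };
        return (0 <= vpid && vpid < 3) ? cpusets[vpid] : NULL;
    }

    int main(void)
    {
        /* Old shape: one aggregated blob, split on ':' and matched by index
         * against the peer list -- the two arrays had to stay in lock-step. */
        char blob[] = "0-3:4-7:8-11";
        int idx = 0;
        for (char *tok = strtok(blob, ":"); NULL != tok; tok = strtok(NULL, ":")) {
            printf("old: peer %d -> cpuset %s\n", idx++, tok);
        }

        /* New shape: ask for each peer's cpuset directly by its process name,
         * so there is no parallel array that can fall out of sync. */
        for (int vpid = 0; vpid < 3; vpid++) {
            const char *cs = modex_get(vpid);
            printf("new: peer %d -> cpuset %s\n", vpid,
                   cs ? cs : "(unknown: node-level locality only)");
        }
        return 0;
    }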
@@ -301,7 +290,10 @@ static int rte_init(void)
         kv->key = strdup(OPAL_PMIX_LOCALITY);
         kv->type = OPAL_UINT16;
         name.vpid = sz;
-        if (NULL == peers) {
+        if (sz == ORTE_PROC_MY_NAME->vpid) {
+            /* we are fully local to ourselves */
+            u16 = OPAL_PROC_ALL_LOCAL;
+        } else if (NULL == peers) {
             /* nobody is local to us */
             u16 = OPAL_PROC_NON_LOCAL;
         } else {
@@ -313,30 +305,44 @@ static int rte_init(void)
             if (NULL == peers[i]) {
                 /* not a local peer */
                 u16 = OPAL_PROC_NON_LOCAL;
-            } else if (NULL == mycpuset || NULL == cpusets || NULL == cpusets[i]) {
+            } else if (NULL == mycpuset) {
+                /* all we can say is that it shares our node */
                 u16 = OPAL_PROC_ON_CLUSTER | OPAL_PROC_ON_CU | OPAL_PROC_ON_NODE;
-            } else if (NULL != cpusets && NULL != cpusets[i]) {
-                u16 = opal_hwloc_base_get_relative_locality(opal_hwloc_topology,
-                                                            mycpuset, cpusets[i]);
             } else {
-                u16 = OPAL_PROC_ON_CLUSTER | OPAL_PROC_ON_CU | OPAL_PROC_ON_NODE;
+                /* attempt to get their cpuset */
+                OPAL_MODEX_RECV_VALUE(ret, OPAL_PMIX_CPUSET, &name, &val, OPAL_STRING);
+                if (OPAL_SUCCESS == ret && NULL != val) {
+                    /* we have it, so compute the locality */
+                    u16 = opal_hwloc_base_get_relative_locality(opal_hwloc_topology,
+                                                                mycpuset, val);
+                    free(val);
+                } else {
+                    /* all we can say is that it shares our node */
+                    u16 = OPAL_PROC_ON_CLUSTER | OPAL_PROC_ON_CU | OPAL_PROC_ON_NODE;
+                }
             }
         }
         kv->data.uint16 = u16;
         ret = opal_pmix.store_local(&name, kv);
         if (OPAL_SUCCESS != ret) {
             error = "local store of locality";
-            opal_argv_free(cpusets);
+            if (NULL != mycpuset) {
+                free(mycpuset);
+            }
             opal_argv_free(peers);
             goto error;
         }
         OBJ_RELEASE(kv);
     }
+    if (NULL != mycpuset){
+        free(mycpuset);
+    }
 #else
     /* get our local peers */
     if (0 < orte_process_info.num_local_peers) {
         /* retrieve the local peers */
-        OPAL_MODEX_RECV_VALUE(ret, OPAL_PMIX_LOCAL_PEERS, NULL, &val, OPAL_STRING);
+        OPAL_MODEX_RECV_VALUE(ret, OPAL_PMIX_LOCAL_PEERS,
+                              ORTE_PROC_MY_NAME, &val, OPAL_STRING);
         if (OPAL_SUCCESS == ret && NULL != val) {
             peers = opal_argv_split(val, ',');
             free(val);
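The locality value stored for each peer is a uint16_t bitmask of OR-able levels: OPAL_PROC_ALL_LOCAL marks the process as sharing every level with itself (the new self short-circuit above), OPAL_PROC_NON_LOCAL clears all bits, and a peer whose cpuset cannot be fetched gets only the cluster/CU/node bits. A minimal sketch of how such flags compose and get tested; the flag values below are invented for illustration and are not the real OPAL constants:

    #include <stdint.h>
    #include <stdio.h>

    /* Invented stand-ins for the OPAL_PROC_* locality flags; the real values
     * live in OPAL's hwloc support headers and differ from these. */
    #define PROC_ON_CLUSTER  0x0001
    #define PROC_ON_CU       0x0002
    #define PROC_ON_NODE     0x0004
    #define PROC_ON_SOCKET   0x0008
    #define PROC_ON_CORE     0x0010
    #define PROC_NON_LOCAL   0x0000   /* no level shared */
    #define PROC_ALL_LOCAL   0xffff   /* every level shared: the process itself */

    int main(void)
    {
        uint16_t self = PROC_ALL_LOCAL;
        /* Peer on our node whose cpuset could not be fetched: node-level
         * bits only, nothing finer can be claimed. */
        uint16_t peer = PROC_ON_CLUSTER | PROC_ON_CU | PROC_ON_NODE;

        /* Consumers test individual bits, e.g. to decide whether a
         * shared-memory transport is usable with a given peer. */
        printf("self shares our node? %s\n", (self & PROC_ON_NODE) ? "yes" : "no");
        printf("peer shares our node? %s\n", (peer & PROC_ON_NODE) ? "yes" : "no");
        printf("peer shares our core? %s\n", (peer & PROC_ON_CORE) ? "yes" : "no");
        return 0;
    }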
@@ -353,8 +359,11 @@ static int rte_init(void)
         kv->key = strdup(OPAL_PMIX_LOCALITY);
         kv->type = OPAL_UINT16;
         name.vpid = sz;
-        if (NULL == peers) {
-            /* nobody is local to us */
+        if (sz == ORTE_PROC_MY_NAME->vpid) {
+            /* we are fully local to ourselves */
+            u16 = OPAL_PROC_ALL_LOCAL;
+        } else if (NULL == peers) {
+            /* nobody is local to us */
             u16 = OPAL_PROC_NON_LOCAL;
         } else {
             for (i=0; NULL != peers[i]; i++) {
@@ -370,8 +379,8 @@ static int rte_init(void)
                     u16 = OPAL_PROC_ON_CLUSTER | OPAL_PROC_ON_CU | OPAL_PROC_ON_NODE;
                 }
             }
             /* store this data internally - not to be pushed outside of
              * ourselves as it only has meaning relative to us */
             ret = opal_pmix.store_local(&name, kv);
             if (OPAL_SUCCESS != ret) {
                 ORTE_ERROR_LOG(ret);
@@ -384,7 +393,6 @@ static int rte_init(void)
         }
     }
 #endif
     opal_argv_free(peers);
-    opal_argv_free(cpusets);
     /* we don't need to force the routed system to pick the
      * "direct" component as that should happen automatically

View file

@@ -255,10 +255,7 @@ int orte_pmix_server_register_nspace(orte_job_t *jdata)
                 kv->key = strdup(OPAL_PMIX_CPUSET);
                 kv->type = OPAL_STRING;
                 if (orte_get_attribute(&pptr->attributes, ORTE_PROC_CPU_BITMAP, (void**)&tmp, OPAL_STRING)) {
-                    (void)asprintf(&kv->data.string, "%s:%s", ORTE_NAME_PRINT(&pptr->name), tmp);
-                    free(tmp);
-                } else {
-                    (void)asprintf(&kv->data.string, "%s", ORTE_NAME_PRINT(&pptr->name));
+                    kv->data.string = tmp;
                 }
                 opal_list_append(info, &kv->super);
                 /* go ahead and register this client */
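On the server side, the registered cpuset value changes shape to match: the old code fused the process name and bitmap into one "name:bitmap" string (emitting a bare name when no bitmap existed), which consumers then had to split apart again, while the new code stores the raw bitmap string as-is, since the process name already serves as the lookup key. A small sketch of the two encodings, with a hypothetical ORTE name string (asprintf needs _GNU_SOURCE on glibc):

    #define _GNU_SOURCE   /* for asprintf on glibc */
    #include <stdio.h>
    #include <stdlib.h>
    #include <string.h>

    int main(void)
    {
        const char *procname = "[[1234,1],0]";   /* hypothetical ORTE name string */
        char *tmp = strdup("0-3");               /* the ORTE_PROC_CPU_BITMAP attribute */
        char *old_value = NULL;

        /* Old encoding: name and bitmap fused into "name:bitmap", which every
         * consumer then had to re-parse. */
        if (0 > asprintf(&old_value, "%s:%s", procname, tmp)) {
            return 1;
        }
        printf("old value: %s\n", old_value);

        /* New encoding: the bitmap string is stored as-is; the process name is
         * already the lookup key, so repeating it in the value was redundant. */
        char *new_value = tmp;   /* ownership handed to the kval, no copy, no parse */
        printf("new value: %s\n", new_value);

        free(old_value);
        free(new_value);
        return 0;
    }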