Merge pull request #586 from hppritcha/topic/pmix_cray_loc_fix
pmix/cray: fix locality setting
Этот коммит содержится в:
Коммит
00dafb39f6
@@ -20,6 +20,7 @@
|
|||||||
#include "opal/types.h"
|
#include "opal/types.h"
|
||||||
|
|
||||||
#include "opal_stdint.h"
|
#include "opal_stdint.h"
|
||||||
|
#include "opal/mca/hwloc/base/base.h"
|
||||||
#include "opal/util/opal_environ.h"
|
#include "opal/util/opal_environ.h"
|
||||||
#include "opal/util/output.h"
|
#include "opal/util/output.h"
|
||||||
#include "opal/util/proc.h"
|
#include "opal/util/proc.h"
|
||||||
@@ -105,7 +106,6 @@ static int *pmix_lranks = NULL;
|
|||||||
static opal_process_name_t pmix_pname;
|
static opal_process_name_t pmix_pname;
|
||||||
static uint32_t pmix_jobid = -1;
|
static uint32_t pmix_jobid = -1;
|
||||||
|
|
||||||
|
|
||||||
static char* pmix_error(int pmix_err);
|
static char* pmix_error(int pmix_err);
|
||||||
#define OPAL_PMI_ERROR(pmi_err, pmi_func) \
|
#define OPAL_PMI_ERROR(pmi_err, pmi_func) \
|
||||||
do { \
|
do { \
|
||||||
@@ -316,7 +316,7 @@ static int cray_fence(opal_process_name_t *procs, size_t nprocs)
|
|||||||
int rc, cnt;
|
int rc, cnt;
|
||||||
int32_t i;
|
int32_t i;
|
||||||
int *all_lens = NULL;
|
int *all_lens = NULL;
|
||||||
opal_value_t *kp;
|
opal_value_t *kp, kvn;
|
||||||
opal_buffer_t *send_buffer = NULL;
|
opal_buffer_t *send_buffer = NULL;
|
||||||
opal_buffer_t *buf = NULL;
|
opal_buffer_t *buf = NULL;
|
||||||
void *sbuf_ptr;
|
void *sbuf_ptr;
|
||||||
@@ -330,6 +330,9 @@ static int cray_fence(opal_process_name_t *procs, size_t nprocs)
|
|||||||
int32_t rcv_nbytes_tot;
|
int32_t rcv_nbytes_tot;
|
||||||
bytes_and_rank_t s_bytes_and_rank;
|
bytes_and_rank_t s_bytes_and_rank;
|
||||||
bytes_and_rank_t *r_bytes_and_ranks = NULL;
|
bytes_and_rank_t *r_bytes_and_ranks = NULL;
|
||||||
|
opal_hwloc_locality_t locality;
|
||||||
|
opal_list_t vals;
|
||||||
|
char *cpuset = NULL;
|
||||||
|
|
||||||
opal_output_verbose(2, opal_pmix_base_framework.framework_output,
|
opal_output_verbose(2, opal_pmix_base_framework.framework_output,
|
||||||
"%s pmix:cray executing fence on %u procs cache_global %p cache_local %p",
|
"%s pmix:cray executing fence on %u procs cache_global %p cache_local %p",
|
||||||
@@ -447,10 +450,90 @@ static int cray_fence(opal_process_name_t *procs, size_t nprocs)
|
|||||||
opal_output_verbose(2, opal_pmix_base_framework.framework_output,
|
opal_output_verbose(2, opal_pmix_base_framework.framework_output,
|
||||||
"%s pmix:cray kvs_fence complete",
|
"%s pmix:cray kvs_fence complete",
|
||||||
OPAL_NAME_PRINT(OPAL_PROC_MY_NAME));
|
OPAL_NAME_PRINT(OPAL_PROC_MY_NAME));
|
||||||
|
|
||||||
|
#if OPAL_HAVE_HWLOC
|
||||||
|
/* fetch my cpuset */
|
||||||
|
OBJ_CONSTRUCT(&vals, opal_list_t);
|
||||||
|
if (OPAL_SUCCESS == (rc = opal_dstore.fetch(opal_dstore_internal, &pmix_pname,
|
||||||
|
OPAL_DSTORE_CPUSET, &vals))) {
|
||||||
|
kp = (opal_value_t*)opal_list_get_first(&vals);
|
||||||
|
cpuset = strdup(kp->data.string);
|
||||||
|
} else {
|
||||||
|
cpuset = NULL;
|
||||||
|
}
|
||||||
|
OPAL_LIST_DESTRUCT(&vals);
|
||||||
|
#endif
|
||||||
|
|
||||||
|
/* we only need to set locality for each local rank as "not found"
|
||||||
|
* equates to "non-local" */
|
||||||
|
for (i=0; i < pmix_nlranks; i++) {
|
||||||
|
id.vpid = pmix_lranks[i];
|
||||||
|
id.jobid = pmix_jobid;
|
||||||
|
opal_output_verbose(2, opal_pmix_base_framework.framework_output,
|
||||||
|
"%s checking out if %s is local to me",
|
||||||
|
OPAL_NAME_PRINT(OPAL_PROC_MY_NAME),
|
||||||
|
OPAL_NAME_PRINT(id));
|
||||||
|
/* fetch cpuset for this vpid */
|
||||||
|
#if OPAL_HAVE_HWLOC
|
||||||
|
OBJ_CONSTRUCT(&vals, opal_list_t);
|
||||||
|
if (OPAL_SUCCESS != (rc = opal_dstore.fetch(opal_dstore_internal, &pmix_pname,
|
||||||
|
OPAL_DSTORE_CPUSET, &vals))) {
|
||||||
|
opal_output_verbose(2, opal_pmix_base_framework.framework_output,
|
||||||
|
"%s cpuset for local proc %s not found",
|
||||||
|
OPAL_NAME_PRINT(OPAL_PROC_MY_NAME),
|
||||||
|
OPAL_NAME_PRINT(id));
|
||||||
|
OPAL_LIST_DESTRUCT(&vals);
|
||||||
|
/* even though the cpuset wasn't found, we at least know it is
|
||||||
|
* on the same node with us */
|
||||||
|
locality = OPAL_PROC_ON_CLUSTER | OPAL_PROC_ON_CU | OPAL_PROC_ON_NODE;
|
||||||
|
} else {
|
||||||
|
kp = (opal_value_t*)opal_list_get_first(&vals);
|
||||||
|
if (NULL == kp->data.string) {
|
||||||
|
/* if we share a node, but we don't know anything more, then
|
||||||
|
* mark us as on the node as this is all we know
|
||||||
|
*/
|
||||||
|
locality = OPAL_PROC_ON_CLUSTER | OPAL_PROC_ON_CU | OPAL_PROC_ON_NODE;
|
||||||
|
} else {
|
||||||
|
/* determine relative location on our node */
|
||||||
|
locality = opal_hwloc_base_get_relative_locality(opal_hwloc_topology,
|
||||||
|
cpuset,
|
||||||
|
kp->data.string);
|
||||||
|
}
|
||||||
|
OPAL_LIST_DESTRUCT(&vals);
|
||||||
|
}
|
||||||
|
#else
|
||||||
|
/* all we know is we share a node */
|
||||||
|
locality = OPAL_PROC_ON_CLUSTER | OPAL_PROC_ON_CU | OPAL_PROC_ON_NODE;
|
||||||
|
#endif
|
||||||
|
OPAL_OUTPUT_VERBOSE((1, opal_pmix_base_framework.framework_output,
|
||||||
|
"%s pmix:cray proc %s locality %s",
|
||||||
|
OPAL_NAME_PRINT(OPAL_PROC_MY_NAME),
|
||||||
|
OPAL_NAME_PRINT(id),
|
||||||
|
opal_hwloc_base_print_locality(locality)));
|
||||||
|
|
||||||
|
OBJ_CONSTRUCT(&kvn, opal_value_t);
|
||||||
|
kvn.key = strdup(OPAL_DSTORE_LOCALITY);
|
||||||
|
kvn.type = OPAL_UINT16;
|
||||||
|
kvn.data.uint16 = locality;
|
||||||
|
(void)opal_dstore.store(opal_dstore_internal, &id, &kvn);
|
||||||
|
OBJ_DESTRUCT(&kvn);
|
||||||
|
}
|
||||||
|
|
||||||
fn_exit:
|
fn_exit:
|
||||||
if (all_lens != NULL) free(all_lens);
|
#if OPAL_HAVE_HWLOC
|
||||||
if (rcv_buff != NULL) free(rcv_buff);
|
if (NULL != cpuset) {
|
||||||
if (r_bytes_and_ranks != NULL) free(r_bytes_and_ranks);
|
free(cpuset);
|
||||||
|
}
|
||||||
|
#endif
|
||||||
|
if (all_lens != NULL) {
|
||||||
|
free(all_lens);
|
||||||
|
}
|
||||||
|
if (rcv_buff != NULL) {
|
||||||
|
free(rcv_buff);
|
||||||
|
}
|
||||||
|
if (r_bytes_and_ranks != NULL) {
|
||||||
|
free(r_bytes_and_ranks);
|
||||||
|
}
|
||||||
return rc;
|
return rc;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
Загрузка…
x
Ссылка в новой задаче
Block a user