From 9658256a98c1aa16b488209cee9b7409e2b45957 Mon Sep 17 00:00:00 2001 From: Ralph Castain Date: Sat, 13 Dec 2014 18:44:09 -0800 Subject: [PATCH] Restore the passing of the complete job map to the local proc on first get_attr so the info can be used by the MPI layer without continual calls back to the server. We'll find a more memory efficient method later. --- opal/mca/pmix/native/pmix_native.c | 102 ++++++++++++++++-- orte/orted/pmix/pmix_server_sendrecv.c | 144 +++++++++++++++++-------- 2 files changed, 195 insertions(+), 51 deletions(-) diff --git a/opal/mca/pmix/native/pmix_native.c b/opal/mca/pmix/native/pmix_native.c index ce1a458037..1ff007c998 100644 --- a/opal/mca/pmix/native/pmix_native.c +++ b/opal/mca/pmix/native/pmix_native.c @@ -1027,7 +1027,7 @@ static bool native_get_attr(const char *attr, opal_value_t **kv) uint32_t i, myrank; opal_process_name_t id; char *cpuset; - opal_buffer_t buf; + opal_buffer_t buf, buf2; opal_output_verbose(2, opal_pmix_base_framework.framework_output, "%s pmix:native get_attr called", @@ -1166,7 +1166,95 @@ static bool native_get_attr(const char *attr, opal_value_t **kv) } cnt=1; continue; + } else if (0 == strcmp(PMIX_PROC_MAP, kp->key)) { + opal_output_verbose(2, opal_pmix_base_framework.framework_output, + "%s received proc map", + OPAL_NAME_PRINT(OPAL_PROC_MY_NAME)); + /* transfer the byte object for unpacking */ + OBJ_CONSTRUCT(&buf, opal_buffer_t); + opal_dss.load(&buf, kp->data.bo.bytes, kp->data.bo.size); + kp->data.bo.bytes = NULL; // protect the data region + kp->data.bo.size = 0; + OBJ_RELEASE(kp); + /* get the jobid */ + cnt=1; + if (OPAL_SUCCESS != (rc = opal_dss.unpack(&buf, &kp, &cnt, OPAL_VALUE))) { + OPAL_ERROR_LOG(rc); + OBJ_DESTRUCT(&buf); + cnt = 1; + return false; + } + if (0 != strcmp(PMIX_JOBID, kp->key)) { + OPAL_ERROR_LOG(OPAL_ERR_BAD_PARAM); + OBJ_DESTRUCT(&buf); + OBJ_RELEASE(kp); + cnt = 1; + return false; + } + id.jobid = kp->data.uint32; + OBJ_RELEASE(kp); + /* unpack the data for each rank */ + cnt=1; + while (OPAL_SUCCESS == (rc = opal_dss.unpack(&buf, &kp, &cnt, OPAL_VALUE))) { + if (0 != strcmp(PMIX_RANK, kp->key)) { + OPAL_ERROR_LOG(OPAL_ERR_BAD_PARAM); + OBJ_DESTRUCT(&buf); + OBJ_RELEASE(kp); + cnt = 1; + return false; + } + id.vpid = kp->data.uint32; + /* unpack the blob for this rank */ + cnt=1; + if (OPAL_SUCCESS != (rc = opal_dss.unpack(&buf, &kp, &cnt, OPAL_VALUE))) { + OPAL_ERROR_LOG(rc); + OBJ_DESTRUCT(&buf); + cnt = 1; + return false; + } + if (0 != strcmp(PMIX_PROC_MAP, kp->key)) { + OPAL_ERROR_LOG(OPAL_ERR_BAD_PARAM); + OBJ_DESTRUCT(&buf); + OBJ_RELEASE(kp); + cnt = 1; + return false; + } + /* transfer the byte object for unpacking */ + OBJ_CONSTRUCT(&buf2, opal_buffer_t); + opal_dss.load(&buf2, kp->data.bo.bytes, kp->data.bo.size); + kp->data.bo.bytes = NULL; // protect the data region + kp->data.bo.size = 0; + OBJ_RELEASE(kp); + /* unpack and store the map */ + cnt=1; + while (OPAL_SUCCESS == (rc = opal_dss.unpack(&buf2, &kp, &cnt, OPAL_VALUE))) { + opal_output_verbose(2, opal_pmix_base_framework.framework_output, + "%s storing key %s for peer %s", + OPAL_NAME_PRINT(OPAL_PROC_MY_NAME), + kp->key, OPAL_NAME_PRINT(id)); + if (OPAL_SUCCESS != (rc = opal_dstore.store(opal_dstore_internal, &id, kp))) { + OPAL_ERROR_LOG(rc); + OBJ_RELEASE(kp); + OBJ_DESTRUCT(&buf2); + return false; + } + } + OBJ_DESTRUCT(&buf2); + if (OPAL_ERR_UNPACK_READ_PAST_END_OF_BUFFER != rc) { + OPAL_ERROR_LOG(rc); + return false; + } + cnt=1; + } + OBJ_DESTRUCT(&buf); + if (OPAL_ERR_UNPACK_READ_PAST_END_OF_BUFFER != rc) { + OPAL_ERROR_LOG(rc); + return false; + } + cnt=1; + continue; } + /* otherwise, it is a single piece of info, so store it */ if (OPAL_SUCCESS != (rc = opal_dstore.store(opal_dstore_internal, &OPAL_PROC_MY_NAME, kp))) { OPAL_ERROR_LOG(rc); OBJ_RELEASE(kp); @@ -1216,6 +1304,8 @@ static bool native_get_attr(const char *attr, opal_value_t **kv) /* baseline all the procs as nonlocal */ myrank = native_pname.vpid; + id.jobid = native_pname.jobid; + /* we only need to set locality for each local rank as "not found" * equates to "non local" */ ranks = opal_argv_split(lclpeers->data.string, ','); @@ -1224,15 +1314,15 @@ static bool native_get_attr(const char *attr, opal_value_t **kv) if (myrank == vid) { continue; } - native_pname.vpid = vid; + id.vpid = vid; #if OPAL_HAVE_HWLOC OBJ_CONSTRUCT(&vals, opal_list_t); - if (OPAL_SUCCESS != (rc = opal_dstore.fetch(opal_dstore_internal, &native_pname, + if (OPAL_SUCCESS != (rc = opal_dstore.fetch(opal_dstore_internal, &id, OPAL_DSTORE_CPUSET, &vals))) { opal_output_verbose(2, opal_pmix_base_framework.framework_output, "%s cpuset for local proc %s not found", OPAL_NAME_PRINT(OPAL_PROC_MY_NAME), - OPAL_NAME_PRINT(native_pname)); + OPAL_NAME_PRINT(id)); OPAL_LIST_DESTRUCT(&vals); /* even though the cpuset wasn't found, we at least know it is * on the same node with us */ @@ -1259,14 +1349,14 @@ static bool native_get_attr(const char *attr, opal_value_t **kv) OPAL_OUTPUT_VERBOSE((1, opal_pmix_base_framework.framework_output, "%s pmix:native proc %s locality %s", OPAL_NAME_PRINT(OPAL_PROC_MY_NAME), - OPAL_NAME_PRINT(native_pname), + OPAL_NAME_PRINT(id), opal_hwloc_base_print_locality(locality))); OBJ_CONSTRUCT(&kvn, opal_value_t); kvn.key = strdup(OPAL_DSTORE_LOCALITY); kvn.type = OPAL_UINT16; kvn.data.uint16 = locality; - (void)opal_dstore.store(opal_dstore_internal, &native_pname, &kvn); + (void)opal_dstore.store(opal_dstore_internal, &id, &kvn); OBJ_DESTRUCT(&kvn); } opal_argv_free(ranks); diff --git a/orte/orted/pmix/pmix_server_sendrecv.c b/orte/orted/pmix/pmix_server_sendrecv.c index 8c437429f5..7887430a3a 100644 --- a/orte/orted/pmix/pmix_server_sendrecv.c +++ b/orte/orted/pmix/pmix_server_sendrecv.c @@ -333,7 +333,7 @@ static int stuff_proc_values(opal_buffer_t *reply, orte_job_t *jdata, orte_proc_ int i; char **list; orte_process_name_t name; - opal_buffer_t buf; + opal_buffer_t buf, buf2; /* convenience def */ node = proc->node; @@ -388,17 +388,6 @@ static int stuff_proc_values(opal_buffer_t *reply, orte_job_t *jdata, orte_proc_ return rc; } OBJ_DESTRUCT(&kv); - /* appnum */ - OBJ_CONSTRUCT(&kv, opal_value_t); - kv.key = strdup(PMIX_APPNUM); - kv.type = OPAL_UINT32; - kv.data.uint32 = proc->app_idx; - if (OPAL_SUCCESS != (rc = opal_dss.pack(reply, &kp, 1, OPAL_VALUE))) { - ORTE_ERROR_LOG(rc); - OBJ_DESTRUCT(&kv); - return rc; - } - OBJ_DESTRUCT(&kv); /* rank */ OBJ_CONSTRUCT(&kv, opal_value_t); kv.key = strdup(PMIX_RANK); @@ -409,29 +398,6 @@ static int stuff_proc_values(opal_buffer_t *reply, orte_job_t *jdata, orte_proc_ OBJ_DESTRUCT(&kv); return rc; } - OBJ_DESTRUCT(&kv); - /* global rank */ - OBJ_CONSTRUCT(&kv, opal_value_t); - kv.key = strdup(PMIX_GLOBAL_RANK); - kv.type = OPAL_UINT32; - kv.data.uint32 = proc->name.vpid + jdata->offset; - if (OPAL_SUCCESS != (rc = opal_dss.pack(reply, &kp, 1, OPAL_VALUE))) { - ORTE_ERROR_LOG(rc); - OBJ_DESTRUCT(&kv); - return rc; - } - OBJ_DESTRUCT(&kv); - /* app rank */ - OBJ_CONSTRUCT(&kv, opal_value_t); - kv.key = strdup(PMIX_APP_RANK); - kv.type = OPAL_UINT32; - kv.data.uint32 = proc->app_rank; - if (OPAL_SUCCESS != (rc = opal_dss.pack(reply, &kp, 1, OPAL_VALUE))) { - ORTE_ERROR_LOG(rc); - OBJ_DESTRUCT(&kv); - return rc; - } - OBJ_DESTRUCT(&kv); /* offset */ OBJ_CONSTRUCT(&kv, opal_value_t); kv.key = strdup(PMIX_NPROC_OFFSET); @@ -443,22 +409,110 @@ static int stuff_proc_values(opal_buffer_t *reply, orte_job_t *jdata, orte_proc_ return rc; } OBJ_DESTRUCT(&kv); - /* local rank */ + /* pass a blob - for each proc in this job, include the info describing + * it so the recipient has a complete picture */ + OBJ_CONSTRUCT(&buf, opal_buffer_t); + /* jobid, for simplicity when unpacking */ OBJ_CONSTRUCT(&kv, opal_value_t); - kv.key = strdup(PMIX_LOCAL_RANK); - kv.type = OPAL_UINT16; - kv.data.uint16 = proc->local_rank; - if (OPAL_SUCCESS != (rc = opal_dss.pack(reply, &kp, 1, OPAL_VALUE))) { + kv.key = strdup(PMIX_JOBID); + kv.type = OPAL_UINT32; + kv.data.uint32 = proc->name.jobid; + if (OPAL_SUCCESS != (rc = opal_dss.pack(&buf, &kp, 1, OPAL_VALUE))) { ORTE_ERROR_LOG(rc); OBJ_DESTRUCT(&kv); return rc; } - OBJ_DESTRUCT(&kv); - /* node rank */ + for (i=0; i < jdata->procs->size; i++) { + if (NULL == (pptr = (orte_proc_t*)opal_pointer_array_get_item(jdata->procs, i))) { + continue; + } + /* rank */ + OBJ_CONSTRUCT(&kv, opal_value_t); + kv.key = strdup(PMIX_RANK); + kv.type = OPAL_UINT32; + kv.data.uint32 = pptr->name.vpid; + if (OPAL_SUCCESS != (rc = opal_dss.pack(&buf, &kp, 1, OPAL_VALUE))) { + ORTE_ERROR_LOG(rc); + OBJ_DESTRUCT(&kv); + return rc; + } + OBJ_DESTRUCT(&kv); + /* create the buffer for this rank */ + OBJ_CONSTRUCT(&buf2, opal_buffer_t); + /* appnum */ + OBJ_CONSTRUCT(&kv, opal_value_t); + kv.key = strdup(PMIX_APPNUM); + kv.type = OPAL_UINT32; + kv.data.uint32 = pptr->app_idx; + if (OPAL_SUCCESS != (rc = opal_dss.pack(&buf2, &kp, 1, OPAL_VALUE))) { + ORTE_ERROR_LOG(rc); + OBJ_DESTRUCT(&kv); + return rc; + } + OBJ_DESTRUCT(&kv); + /* global rank */ + OBJ_CONSTRUCT(&kv, opal_value_t); + kv.key = strdup(PMIX_GLOBAL_RANK); + kv.type = OPAL_UINT32; + kv.data.uint32 = pptr->name.vpid + jdata->offset; + if (OPAL_SUCCESS != (rc = opal_dss.pack(&buf2, &kp, 1, OPAL_VALUE))) { + ORTE_ERROR_LOG(rc); + OBJ_DESTRUCT(&kv); + return rc; + } + OBJ_DESTRUCT(&kv); + /* app rank */ + OBJ_CONSTRUCT(&kv, opal_value_t); + kv.key = strdup(PMIX_APP_RANK); + kv.type = OPAL_UINT32; + kv.data.uint32 = pptr->app_rank; + if (OPAL_SUCCESS != (rc = opal_dss.pack(&buf2, &kp, 1, OPAL_VALUE))) { + ORTE_ERROR_LOG(rc); + OBJ_DESTRUCT(&kv); + return rc; + } + OBJ_DESTRUCT(&kv); + /* local rank */ + OBJ_CONSTRUCT(&kv, opal_value_t); + kv.key = strdup(PMIX_LOCAL_RANK); + kv.type = OPAL_UINT16; + kv.data.uint16 = pptr->local_rank; + if (OPAL_SUCCESS != (rc = opal_dss.pack(&buf2, &kp, 1, OPAL_VALUE))) { + ORTE_ERROR_LOG(rc); + OBJ_DESTRUCT(&kv); + return rc; + } + OBJ_DESTRUCT(&kv); + /* node rank */ + OBJ_CONSTRUCT(&kv, opal_value_t); + kv.key = strdup(PMIX_NODE_RANK); + kv.type = OPAL_UINT16; + kv.data.uint16 = pptr->node_rank; + if (OPAL_SUCCESS != (rc = opal_dss.pack(&buf2, &kp, 1, OPAL_VALUE))) { + ORTE_ERROR_LOG(rc); + OBJ_DESTRUCT(&kv); + return rc; + } + OBJ_DESTRUCT(&kv); + /* add the rank's blob */ + OBJ_CONSTRUCT(&kv, opal_value_t); + kv.key = strdup(PMIX_PROC_MAP); + kv.type = OPAL_BYTE_OBJECT; + opal_dss.unload(&buf2, (void**)&kv.data.bo.bytes, &kv.data.bo.size); + OBJ_DESTRUCT(&buf2); + if (OPAL_SUCCESS != (rc = opal_dss.pack(&buf, &kp, 1, OPAL_VALUE))) { + ORTE_ERROR_LOG(rc); + OBJ_DESTRUCT(&kv); + return rc; + } + OBJ_DESTRUCT(&kv); + } + /* now pass the blob as the proc-map key */ OBJ_CONSTRUCT(&kv, opal_value_t); - kv.key = strdup(PMIX_NODE_RANK); - kv.type = OPAL_UINT16; - kv.data.uint16 = proc->node_rank; + kv.key = strdup(PMIX_PROC_MAP); + kv.type = OPAL_BYTE_OBJECT; + opal_dss.unload(&buf, (void**)&kv.data.bo.bytes, &kv.data.bo.size); + OBJ_DESTRUCT(&buf); if (OPAL_SUCCESS != (rc = opal_dss.pack(reply, &kp, 1, OPAL_VALUE))) { ORTE_ERROR_LOG(rc); OBJ_DESTRUCT(&kv);