Handle the non-blocking fence case correctly, and ensure we always pass back at least the hostname of the process whose info is being requested, so that the ompi_proc_t can correctly initialize it when we are in a non-blocking fence scenario with np < cutoff.
This commit was SVN r32578.
Этот коммит содержится в:
родитель
8f1b9b463e
Коммит
6ff2a60829
@ -50,6 +50,7 @@ BEGIN_C_DECLS
|
||||
#define OPAL_DSTORE_NODE_SIZE PMIX_NODE_SIZE
|
||||
#define OPAL_DSTORE_MAX_PROCS PMIX_MAX_PROCS
|
||||
#define OPAL_DSTORE_NPROC_OFFSET PMIX_NPROC_OFFSET
|
||||
#define OPAL_DSTORE_HOSTNAME PMIX_HOSTNAME
|
||||
|
||||
/* some OPAL-appropriate key definitions */
|
||||
#define OPAL_DSTORE_LOCALITY "opal.locality" // (uint16_t) relative locality of a peer
|
||||
@ -57,7 +58,6 @@ BEGIN_C_DECLS
|
||||
#define OPAL_DSTORE_JOB_SDIR "opal.job.session.dir" // (char*) job-level session dir
|
||||
#define OPAL_DSTORE_MY_SDIR "opal.my.session.dir" // (char*) session dir for this proc
|
||||
#define OPAL_DSTORE_URI "opal.uri" // (char*) uri of specified proc
|
||||
#define OPAL_DSTORE_HOSTNAME "opal.hostname" // (char*) hostname of specified proc
|
||||
#define OPAL_DSTORE_ARCH "opal.arch" // (uint32_t) arch for specified proc
|
||||
#define OPAL_DSTORE_HOSTID "opal.hostid" // (uint32_t) hostid of specified proc
|
||||
#define OPAL_DSTORE_NODEID "opal.nodeid" // (uint32_t) nodeid of specified proc
|
||||
|
@ -660,6 +660,17 @@ static int native_fence_nb(opal_process_name_t *procs, size_t nprocs,
|
||||
return rc;
|
||||
}
|
||||
}
|
||||
/* provide our URI */
|
||||
if (OPAL_SUCCESS != (rc = opal_dss.pack(msg, &local_uri, 1, OPAL_STRING))) {
|
||||
OPAL_ERROR_LOG(rc);
|
||||
OBJ_RELEASE(msg);
|
||||
return rc;
|
||||
}
|
||||
/* only do it once */
|
||||
if (NULL != local_uri) {
|
||||
free(local_uri);
|
||||
local_uri = NULL;
|
||||
}
|
||||
|
||||
/* if we haven't already done it, ensure we have committed our values */
|
||||
if (NULL != mca_pmix_native_component.cache_local) {
|
||||
@ -729,6 +740,7 @@ static int native_get(const opal_identifier_t *id,
|
||||
int32_t cnt;
|
||||
opal_list_t vals;
|
||||
opal_value_t *kp;
|
||||
bool found;
|
||||
|
||||
opal_output_verbose(2, opal_pmix_base_framework.framework_output,
|
||||
"%s pmix:native getting value for proc %s key %s",
|
||||
@ -789,39 +801,46 @@ static int native_get(const opal_identifier_t *id,
|
||||
OBJ_RELEASE(cb);
|
||||
return rc;
|
||||
}
|
||||
if (OPAL_SUCCESS == ret) {
|
||||
cnt = 1;
|
||||
while (OPAL_SUCCESS == (rc = opal_dss.unpack(&cb->data, &bptr, &cnt, OPAL_BUFFER))) {
|
||||
while (OPAL_SUCCESS == (rc = opal_dss.unpack(bptr, &kp, &cnt, OPAL_VALUE))) {
|
||||
if (OPAL_SUCCESS != (ret = opal_dstore.store(opal_dstore_internal, id, kp))) {
|
||||
OPAL_ERROR_LOG(ret);
|
||||
}
|
||||
if (0 == strcmp(key, kp->key)) {
|
||||
*kv = kp;
|
||||
} else {
|
||||
OBJ_RELEASE(kp);
|
||||
}
|
||||
found = false;
|
||||
cnt = 1;
|
||||
while (OPAL_SUCCESS == (rc = opal_dss.unpack(&cb->data, &bptr, &cnt, OPAL_BUFFER))) {
|
||||
while (OPAL_SUCCESS == (rc = opal_dss.unpack(bptr, &kp, &cnt, OPAL_VALUE))) {
|
||||
opal_output_verbose(2, opal_pmix_base_framework.framework_output,
|
||||
"%s pmix:native retrieved %s from server",
|
||||
OPAL_NAME_PRINT(OPAL_PROC_MY_NAME), kp->key);
|
||||
if (OPAL_SUCCESS != (ret = opal_dstore.store(opal_dstore_internal, id, kp))) {
|
||||
OPAL_ERROR_LOG(ret);
|
||||
}
|
||||
if (OPAL_ERR_UNPACK_READ_PAST_END_OF_BUFFER != rc) {
|
||||
OPAL_ERROR_LOG(rc);
|
||||
if (0 == strcmp(key, kp->key)) {
|
||||
*kv = kp;
|
||||
found = true;
|
||||
} else {
|
||||
OBJ_RELEASE(kp);
|
||||
}
|
||||
OBJ_RELEASE(bptr);
|
||||
cnt = 1;
|
||||
}
|
||||
if (OPAL_ERR_UNPACK_READ_PAST_END_OF_BUFFER != rc) {
|
||||
OPAL_ERROR_LOG(rc);
|
||||
} else {
|
||||
rc = OPAL_SUCCESS;
|
||||
}
|
||||
OBJ_RELEASE(bptr);
|
||||
cnt = 1;
|
||||
}
|
||||
if (OPAL_ERR_UNPACK_READ_PAST_END_OF_BUFFER != rc) {
|
||||
OPAL_ERROR_LOG(rc);
|
||||
} else {
|
||||
rc = ret;
|
||||
rc = OPAL_SUCCESS;
|
||||
}
|
||||
OBJ_RELEASE(cb);
|
||||
|
||||
opal_output_verbose(2, opal_pmix_base_framework.framework_output,
|
||||
"%s pmix:native get completed",
|
||||
OPAL_NAME_PRINT(OPAL_PROC_MY_NAME));
|
||||
|
||||
if (found) {
|
||||
return OPAL_SUCCESS;
|
||||
}
|
||||
*kv = NULL;
|
||||
if (OPAL_SUCCESS == rc) {
|
||||
rc = ret;
|
||||
}
|
||||
return rc;
|
||||
}
|
||||
|
||||
|
@ -79,6 +79,7 @@ typedef void (*opal_pmix_cbfunc_t)(int status, opal_value_t *kv, void *cbdata);
|
||||
|
||||
#define PMIX_CPUSET "pmix.cpuset" // (char*) hwloc bitmap applied to proc upon launch
|
||||
#define PMIX_CREDENTIAL "pmix.cred" // (opal_byte_object*) security credential assigned to proc
|
||||
#define PMIX_HOSTNAME "pmix.hname" // (char*) name of the host this proc is on
|
||||
/* scratch directory locations for use by applications */
|
||||
#define PMIX_TMPDIR "pmix.tmpdir" // (char*) top-level tmp dir assigned to session
|
||||
/* information about relative ranks as assigned */
|
||||
|
@ -763,6 +763,23 @@ static void pmix_server_dmdx_recv(int status, orte_process_name_t* sender,
|
||||
OBJ_RELEASE(reply);
|
||||
return;
|
||||
}
|
||||
/* always make sure to pass the hostname */
|
||||
OBJ_CONSTRUCT(&buf, opal_buffer_t);
|
||||
if (OPAL_SUCCESS != (rc = opal_dss.pack(&buf, &proc->node->name, 1, OPAL_STRING))) {
|
||||
ORTE_ERROR_LOG(rc);
|
||||
OBJ_RELEASE(reply);
|
||||
OBJ_DESTRUCT(&buf);
|
||||
return;
|
||||
}
|
||||
/* pack the blob */
|
||||
bptr = &buf;
|
||||
if (OPAL_SUCCESS != (rc = opal_dss.pack(reply, &bptr, 1, OPAL_BUFFER))) {
|
||||
ORTE_ERROR_LOG(rc);
|
||||
OBJ_RELEASE(reply);
|
||||
OBJ_DESTRUCT(&buf);
|
||||
return;
|
||||
}
|
||||
OBJ_DESTRUCT(&buf);
|
||||
/* pack the status */
|
||||
if (found) {
|
||||
ret = OPAL_SUCCESS;
|
||||
|
@ -616,7 +616,7 @@ static void process_message(pmix_server_peer_t *peer)
|
||||
int32_t cnt;
|
||||
pmix_cmd_t cmd;
|
||||
opal_buffer_t *reply, xfer, *bptr, buf;
|
||||
opal_value_t kv, *kvp, *kvp2;
|
||||
opal_value_t kv, *kvp, *kvp2, *kp;
|
||||
opal_identifier_t id, idreq;
|
||||
orte_process_name_t name;
|
||||
orte_job_t *jdata;
|
||||
@ -885,6 +885,12 @@ static void process_message(pmix_server_peer_t *peer)
|
||||
OBJ_DESTRUCT(&xfer);
|
||||
return;
|
||||
}
|
||||
/* regardless of where this proc is located, we need to ensure
|
||||
* that the hostname it is on is *always* returned. Otherwise,
|
||||
* the non-blocking fence operation will cause us to fail if
|
||||
* the number of procs is below the cutoff as we will immediately
|
||||
* attempt to retrieve the hostname for each proc, but they may
|
||||
* not have posted their data by that time */
|
||||
if (ORTE_FLAG_TEST(proc, ORTE_PROC_FLAG_LOCAL)) {
|
||||
opal_output_verbose(2, pmix_server_output,
|
||||
"%s recvd GET PROC %s IS LOCAL",
|
||||
@ -913,6 +919,32 @@ static void process_message(pmix_server_peer_t *peer)
|
||||
OBJ_DESTRUCT(&xfer);
|
||||
return;
|
||||
}
|
||||
/* pass the hostname */
|
||||
OBJ_CONSTRUCT(&buf, opal_buffer_t);
|
||||
OBJ_CONSTRUCT(&kv, opal_value_t);
|
||||
kv.key = strdup(PMIX_HOSTNAME);
|
||||
kv.type = OPAL_STRING;
|
||||
kv.data.string = strdup(proc->node->name);
|
||||
kp = &kv;
|
||||
if (OPAL_SUCCESS != (rc = opal_dss.pack(&buf, &kp, 1, OPAL_VALUE))) {
|
||||
ORTE_ERROR_LOG(rc);
|
||||
OBJ_RELEASE(reply);
|
||||
OBJ_DESTRUCT(&xfer);
|
||||
OBJ_DESTRUCT(&buf);
|
||||
OBJ_DESTRUCT(&kv);
|
||||
return;
|
||||
}
|
||||
OBJ_DESTRUCT(&kv);
|
||||
/* pack the blob */
|
||||
bptr = &buf;
|
||||
if (OPAL_SUCCESS != (rc = opal_dss.pack(reply, &bptr, 1, OPAL_BUFFER))) {
|
||||
ORTE_ERROR_LOG(rc);
|
||||
OBJ_RELEASE(reply);
|
||||
OBJ_DESTRUCT(&xfer);
|
||||
OBJ_DESTRUCT(&buf);
|
||||
return;
|
||||
}
|
||||
OBJ_DESTRUCT(&buf);
|
||||
/* local blob */
|
||||
if (NULL != kvp) {
|
||||
opal_output_verbose(2, pmix_server_output,
|
||||
@ -1001,6 +1033,25 @@ static void process_message(pmix_server_peer_t *peer)
|
||||
* so don't repack them */
|
||||
opal_dss.copy_payload(reply, &buf);
|
||||
OBJ_DESTRUCT(&buf);
|
||||
/* pass the hostname */
|
||||
OBJ_CONSTRUCT(&buf, opal_buffer_t);
|
||||
if (OPAL_SUCCESS != (rc = opal_dss.pack(&buf, &proc->node->name, 1, OPAL_STRING))) {
|
||||
ORTE_ERROR_LOG(rc);
|
||||
OBJ_RELEASE(reply);
|
||||
OBJ_DESTRUCT(&xfer);
|
||||
OBJ_DESTRUCT(&buf);
|
||||
return;
|
||||
}
|
||||
/* pack the blob */
|
||||
bptr = &buf;
|
||||
if (OPAL_SUCCESS != (rc = opal_dss.pack(reply, &bptr, 1, OPAL_BUFFER))) {
|
||||
ORTE_ERROR_LOG(rc);
|
||||
OBJ_RELEASE(reply);
|
||||
OBJ_DESTRUCT(&xfer);
|
||||
OBJ_DESTRUCT(&buf);
|
||||
return;
|
||||
}
|
||||
OBJ_DESTRUCT(&buf);
|
||||
PMIX_SERVER_QUEUE_SEND(peer, tag, reply);
|
||||
OBJ_DESTRUCT(&xfer);
|
||||
return;
|
||||
|
Загрузка…
x
Ссылка в новой задаче
Block a user