1
1

Handle the non-blocking fence case correctly, and ensure that we always pass back at least the hostname of the process whose info is being requested, so that the ompi_proc_t can be correctly initialized when we are in a non-blocking fence scenario with np < cutoff.

This commit was SVN r32578.
Этот коммит содержится в:
Ralph Castain 2014-08-22 14:26:24 +00:00
родитель 8f1b9b463e
Коммит 6ff2a60829
5 изменённых файлов: 110 добавлений и 22 удалений

Просмотреть файл

@ -50,6 +50,7 @@ BEGIN_C_DECLS
#define OPAL_DSTORE_NODE_SIZE PMIX_NODE_SIZE
#define OPAL_DSTORE_MAX_PROCS PMIX_MAX_PROCS
#define OPAL_DSTORE_NPROC_OFFSET PMIX_NPROC_OFFSET
#define OPAL_DSTORE_HOSTNAME PMIX_HOSTNAME
/* some OPAL-appropriate key definitions */
#define OPAL_DSTORE_LOCALITY "opal.locality" // (uint16_t) relative locality of a peer
@ -57,7 +58,6 @@ BEGIN_C_DECLS
#define OPAL_DSTORE_JOB_SDIR "opal.job.session.dir" // (char*) job-level session dir
#define OPAL_DSTORE_MY_SDIR "opal.my.session.dir" // (char*) session dir for this proc
#define OPAL_DSTORE_URI "opal.uri" // (char*) uri of specified proc
#define OPAL_DSTORE_HOSTNAME "opal.hostname" // (char*) hostname of specified proc
#define OPAL_DSTORE_ARCH "opal.arch" // (uint32_t) arch for specified proc
#define OPAL_DSTORE_HOSTID "opal.hostid" // (uint32_t) hostid of specified proc
#define OPAL_DSTORE_NODEID "opal.nodeid" // (uint32_t) nodeid of specified proc

Просмотреть файл

@ -660,6 +660,17 @@ static int native_fence_nb(opal_process_name_t *procs, size_t nprocs,
return rc;
}
}
/* provide our URI */
if (OPAL_SUCCESS != (rc = opal_dss.pack(msg, &local_uri, 1, OPAL_STRING))) {
OPAL_ERROR_LOG(rc);
OBJ_RELEASE(msg);
return rc;
}
/* only do it once */
if (NULL != local_uri) {
free(local_uri);
local_uri = NULL;
}
/* if we haven't already done it, ensure we have committed our values */
if (NULL != mca_pmix_native_component.cache_local) {
@ -729,6 +740,7 @@ static int native_get(const opal_identifier_t *id,
int32_t cnt;
opal_list_t vals;
opal_value_t *kp;
bool found;
opal_output_verbose(2, opal_pmix_base_framework.framework_output,
"%s pmix:native getting value for proc %s key %s",
@ -789,39 +801,46 @@ static int native_get(const opal_identifier_t *id,
OBJ_RELEASE(cb);
return rc;
}
if (OPAL_SUCCESS == ret) {
cnt = 1;
while (OPAL_SUCCESS == (rc = opal_dss.unpack(&cb->data, &bptr, &cnt, OPAL_BUFFER))) {
while (OPAL_SUCCESS == (rc = opal_dss.unpack(bptr, &kp, &cnt, OPAL_VALUE))) {
if (OPAL_SUCCESS != (ret = opal_dstore.store(opal_dstore_internal, id, kp))) {
OPAL_ERROR_LOG(ret);
}
if (0 == strcmp(key, kp->key)) {
*kv = kp;
} else {
OBJ_RELEASE(kp);
}
found = false;
cnt = 1;
while (OPAL_SUCCESS == (rc = opal_dss.unpack(&cb->data, &bptr, &cnt, OPAL_BUFFER))) {
while (OPAL_SUCCESS == (rc = opal_dss.unpack(bptr, &kp, &cnt, OPAL_VALUE))) {
opal_output_verbose(2, opal_pmix_base_framework.framework_output,
"%s pmix:native retrieved %s from server",
OPAL_NAME_PRINT(OPAL_PROC_MY_NAME), kp->key);
if (OPAL_SUCCESS != (ret = opal_dstore.store(opal_dstore_internal, id, kp))) {
OPAL_ERROR_LOG(ret);
}
if (OPAL_ERR_UNPACK_READ_PAST_END_OF_BUFFER != rc) {
OPAL_ERROR_LOG(rc);
if (0 == strcmp(key, kp->key)) {
*kv = kp;
found = true;
} else {
OBJ_RELEASE(kp);
}
OBJ_RELEASE(bptr);
cnt = 1;
}
if (OPAL_ERR_UNPACK_READ_PAST_END_OF_BUFFER != rc) {
OPAL_ERROR_LOG(rc);
} else {
rc = OPAL_SUCCESS;
}
OBJ_RELEASE(bptr);
cnt = 1;
}
if (OPAL_ERR_UNPACK_READ_PAST_END_OF_BUFFER != rc) {
OPAL_ERROR_LOG(rc);
} else {
rc = ret;
rc = OPAL_SUCCESS;
}
OBJ_RELEASE(cb);
opal_output_verbose(2, opal_pmix_base_framework.framework_output,
"%s pmix:native get completed",
OPAL_NAME_PRINT(OPAL_PROC_MY_NAME));
if (found) {
return OPAL_SUCCESS;
}
*kv = NULL;
if (OPAL_SUCCESS == rc) {
rc = ret;
}
return rc;
}

Просмотреть файл

@ -79,6 +79,7 @@ typedef void (*opal_pmix_cbfunc_t)(int status, opal_value_t *kv, void *cbdata);
#define PMIX_CPUSET "pmix.cpuset" // (char*) hwloc bitmap applied to proc upon launch
#define PMIX_CREDENTIAL "pmix.cred" // (opal_byte_object*) security credential assigned to proc
#define PMIX_HOSTNAME "pmix.hname" // (char*) name of the host this proc is on
/* scratch directory locations for use by applications */
#define PMIX_TMPDIR "pmix.tmpdir" // (char*) top-level tmp dir assigned to session
/* information about relative ranks as assigned */

Просмотреть файл

@ -763,6 +763,23 @@ static void pmix_server_dmdx_recv(int status, orte_process_name_t* sender,
OBJ_RELEASE(reply);
return;
}
/* always make sure to pass the hostname */
OBJ_CONSTRUCT(&buf, opal_buffer_t);
if (OPAL_SUCCESS != (rc = opal_dss.pack(&buf, &proc->node->name, 1, OPAL_STRING))) {
ORTE_ERROR_LOG(rc);
OBJ_RELEASE(reply);
OBJ_DESTRUCT(&buf);
return;
}
/* pack the blob */
bptr = &buf;
if (OPAL_SUCCESS != (rc = opal_dss.pack(reply, &bptr, 1, OPAL_BUFFER))) {
ORTE_ERROR_LOG(rc);
OBJ_RELEASE(reply);
OBJ_DESTRUCT(&buf);
return;
}
OBJ_DESTRUCT(&buf);
/* pack the status */
if (found) {
ret = OPAL_SUCCESS;

Просмотреть файл

@ -616,7 +616,7 @@ static void process_message(pmix_server_peer_t *peer)
int32_t cnt;
pmix_cmd_t cmd;
opal_buffer_t *reply, xfer, *bptr, buf;
opal_value_t kv, *kvp, *kvp2;
opal_value_t kv, *kvp, *kvp2, *kp;
opal_identifier_t id, idreq;
orte_process_name_t name;
orte_job_t *jdata;
@ -885,6 +885,12 @@ static void process_message(pmix_server_peer_t *peer)
OBJ_DESTRUCT(&xfer);
return;
}
/* regardless of where this proc is located, we need to ensure
* that the hostname it is on is *always* returned. Otherwise,
* the non-blocking fence operation will cause us to fail if
* the number of procs is below the cutoff as we will immediately
* attempt to retrieve the hostname for each proc, but they may
* not have posted their data by that time */
if (ORTE_FLAG_TEST(proc, ORTE_PROC_FLAG_LOCAL)) {
opal_output_verbose(2, pmix_server_output,
"%s recvd GET PROC %s IS LOCAL",
@ -913,6 +919,32 @@ static void process_message(pmix_server_peer_t *peer)
OBJ_DESTRUCT(&xfer);
return;
}
/* pass the hostname */
OBJ_CONSTRUCT(&buf, opal_buffer_t);
OBJ_CONSTRUCT(&kv, opal_value_t);
kv.key = strdup(PMIX_HOSTNAME);
kv.type = OPAL_STRING;
kv.data.string = strdup(proc->node->name);
kp = &kv;
if (OPAL_SUCCESS != (rc = opal_dss.pack(&buf, &kp, 1, OPAL_VALUE))) {
ORTE_ERROR_LOG(rc);
OBJ_RELEASE(reply);
OBJ_DESTRUCT(&xfer);
OBJ_DESTRUCT(&buf);
OBJ_DESTRUCT(&kv);
return;
}
OBJ_DESTRUCT(&kv);
/* pack the blob */
bptr = &buf;
if (OPAL_SUCCESS != (rc = opal_dss.pack(reply, &bptr, 1, OPAL_BUFFER))) {
ORTE_ERROR_LOG(rc);
OBJ_RELEASE(reply);
OBJ_DESTRUCT(&xfer);
OBJ_DESTRUCT(&buf);
return;
}
OBJ_DESTRUCT(&buf);
/* local blob */
if (NULL != kvp) {
opal_output_verbose(2, pmix_server_output,
@ -1001,6 +1033,25 @@ static void process_message(pmix_server_peer_t *peer)
* so don't repack them */
opal_dss.copy_payload(reply, &buf);
OBJ_DESTRUCT(&buf);
/* pass the hostname */
OBJ_CONSTRUCT(&buf, opal_buffer_t);
if (OPAL_SUCCESS != (rc = opal_dss.pack(&buf, &proc->node->name, 1, OPAL_STRING))) {
ORTE_ERROR_LOG(rc);
OBJ_RELEASE(reply);
OBJ_DESTRUCT(&xfer);
OBJ_DESTRUCT(&buf);
return;
}
/* pack the blob */
bptr = &buf;
if (OPAL_SUCCESS != (rc = opal_dss.pack(reply, &bptr, 1, OPAL_BUFFER))) {
ORTE_ERROR_LOG(rc);
OBJ_RELEASE(reply);
OBJ_DESTRUCT(&xfer);
OBJ_DESTRUCT(&buf);
return;
}
OBJ_DESTRUCT(&buf);
PMIX_SERVER_QUEUE_SEND(peer, tag, reply);
OBJ_DESTRUCT(&xfer);
return;