1
1

Fix the "unreachable" message so it outputs the correct hostname for the remote proc. Clean up some of the PMIx handling in error corner cases.

This commit was SVN r32584.
Этот коммит содержится в:
Ralph Castain 2014-08-22 19:20:45 +00:00
родитель 97abb7c727
Коммит b1a7375192
4 изменённых файлов: 59 добавлений и 30 удалений

Просмотреть файл

@ -413,8 +413,8 @@ static int mca_bml_r2_add_procs( size_t nprocs,
(NULL != ompi_proc_local_proc->super.proc_hostname ?
ompi_proc_local_proc->super.proc_hostname : "unknown!"),
OMPI_NAME_PRINT(&(proc->super.proc_name)),
(NULL != ompi_proc_local_proc->super.proc_hostname ?
ompi_proc_local_proc->super.proc_hostname : "unknown!"),
(NULL != proc->super.proc_hostname ?
proc->super.proc_hostname : "unknown!"),
btl_names);
}
break;

Просмотреть файл

@ -806,8 +806,10 @@ static int native_get(const opal_identifier_t *id,
while (OPAL_SUCCESS == (rc = opal_dss.unpack(&cb->data, &bptr, &cnt, OPAL_BUFFER))) {
while (OPAL_SUCCESS == (rc = opal_dss.unpack(bptr, &kp, &cnt, OPAL_VALUE))) {
opal_output_verbose(2, opal_pmix_base_framework.framework_output,
"%s pmix:native retrieved %s from server",
OPAL_NAME_PRINT(OPAL_PROC_MY_NAME), kp->key);
"%s pmix:native retrieved %s (%s) from server for proc %s",
OPAL_NAME_PRINT(OPAL_PROC_MY_NAME), kp->key,
(OPAL_STRING == kp->type) ? kp->data.string : "NS",
OPAL_NAME_PRINT(*id));
if (OPAL_SUCCESS != (ret = opal_dstore.store(opal_dstore_internal, id, kp))) {
OPAL_ERROR_LOG(ret);
}
@ -984,6 +986,9 @@ static bool native_get_attr(const char *attr, opal_value_t **kv)
#endif
/* if this is the local cpuset blob, then unpack and store its contents */
if (0 == strcmp(PMIX_LOCAL_CPUSETS, kp->key)) {
opal_output_verbose(2, opal_pmix_base_framework.framework_output,
"%s received local cpusets",
OPAL_NAME_PRINT(OPAL_PROC_MY_NAME));
/* transfer the byte object for unpacking */
OBJ_CONSTRUCT(&buf, opal_buffer_t);
opal_dss.load(&buf, kp->data.bo.bytes, kp->data.bo.size);
@ -999,6 +1004,11 @@ static bool native_get_attr(const char *attr, opal_value_t **kv)
cnt = 1;
continue;
}
opal_output_verbose(2, opal_pmix_base_framework.framework_output,
"%s saving cpuset %s for local peer %s",
OPAL_NAME_PRINT(OPAL_PROC_MY_NAME),
(NULL == cpuset) ? "NULL" : cpuset,
OPAL_NAME_PRINT(id));
OBJ_CONSTRUCT(&kvn, opal_value_t);
kvn.key = strdup(OPAL_DSTORE_CPUSET);
kvn.type = OPAL_STRING;
@ -1060,7 +1070,7 @@ static bool native_get_attr(const char *attr, opal_value_t **kv)
/* if the list of local peers wasn't included, then we are done */
if (NULL == lclpeers) {
opal_output_verbose(2, opal_pmix_base_framework.framework_output,
opal_output_verbose(0, opal_pmix_base_framework.framework_output,
"%s no local peers reported",
OPAL_NAME_PRINT(OPAL_PROC_MY_NAME));
return found;

Просмотреть файл

@ -708,7 +708,7 @@ static void pmix_server_dmdx_recv(int status, orte_process_name_t* sender,
int rc, ret;
int32_t cnt;
opal_buffer_t *reply, *bptr, buf;
opal_value_t *kvp, *kvp2;
opal_value_t *kvp, *kvp2, kv, *kp;
opal_identifier_t idreq;
orte_process_name_t name;
orte_job_t *jdata;
@ -763,23 +763,6 @@ static void pmix_server_dmdx_recv(int status, orte_process_name_t* sender,
OBJ_RELEASE(reply);
return;
}
/* always make sure to pass the hostname */
OBJ_CONSTRUCT(&buf, opal_buffer_t);
if (OPAL_SUCCESS != (rc = opal_dss.pack(&buf, &proc->node->name, 1, OPAL_STRING))) {
ORTE_ERROR_LOG(rc);
OBJ_RELEASE(reply);
OBJ_DESTRUCT(&buf);
return;
}
/* pack the blob */
bptr = &buf;
if (OPAL_SUCCESS != (rc = opal_dss.pack(reply, &bptr, 1, OPAL_BUFFER))) {
ORTE_ERROR_LOG(rc);
OBJ_RELEASE(reply);
OBJ_DESTRUCT(&buf);
return;
}
OBJ_DESTRUCT(&buf);
/* pack the status */
if (found) {
ret = OPAL_SUCCESS;
@ -791,6 +774,30 @@ static void pmix_server_dmdx_recv(int status, orte_process_name_t* sender,
OBJ_RELEASE(reply);
return;
}
/* always pass the hostname */
OBJ_CONSTRUCT(&buf, opal_buffer_t);
OBJ_CONSTRUCT(&kv, opal_value_t);
kv.key = strdup(PMIX_HOSTNAME);
kv.type = OPAL_STRING;
kv.data.string = strdup(orte_process_info.nodename);
kp = &kv;
if (OPAL_SUCCESS != (rc = opal_dss.pack(&buf, &kp, 1, OPAL_VALUE))) {
ORTE_ERROR_LOG(rc);
OBJ_RELEASE(reply);
OBJ_DESTRUCT(&buf);
OBJ_DESTRUCT(&kv);
return;
}
OBJ_DESTRUCT(&kv);
/* pack the blob */
bptr = &buf;
if (OPAL_SUCCESS != (rc = opal_dss.pack(reply, &bptr, 1, OPAL_BUFFER))) {
ORTE_ERROR_LOG(rc);
OBJ_RELEASE(reply);
OBJ_DESTRUCT(&buf);
return;
}
OBJ_DESTRUCT(&buf);
/* remote blob */
if (NULL != kvp) {
opal_output_verbose(2, pmix_server_output,
@ -852,7 +859,7 @@ static void pmix_server_dmdx_resp(int status, orte_process_name_t* sender,
pmix_server_dmx_req_t *req, *nxt;
int rc, ret;
int32_t cnt;
opal_buffer_t *reply, xfer;
opal_buffer_t *reply, xfer, *bptr;
opal_identifier_t target;
opal_value_t kv;
@ -875,14 +882,30 @@ static void pmix_server_dmdx_resp(int status, orte_process_name_t* sender,
return;
}
/* unpack the hostname blob */
cnt = 1;
if (OPAL_SUCCESS != (rc = opal_dss.unpack(buffer, &bptr, &cnt, OPAL_BUFFER))) {
ORTE_ERROR_LOG(rc);
return;
}
/* prep the reply */
reply = OBJ_NEW(opal_buffer_t);
/* pack the returned status */
if (OPAL_SUCCESS != (rc = opal_dss.pack(reply, &ret, 1, OPAL_INT))) {
ORTE_ERROR_LOG(rc);
OBJ_RELEASE(reply);
OBJ_RELEASE(bptr);
return;
}
/* pack the hostname blob */
if (OPAL_SUCCESS != (rc = opal_dss.pack(reply, &bptr, 1, OPAL_BUFFER))) {
ORTE_ERROR_LOG(rc);
OBJ_RELEASE(reply);
OBJ_RELEASE(bptr);
return;
}
OBJ_RELEASE(bptr);
/* pass across any returned blobs */
opal_dss.copy_payload(reply, buffer);

Просмотреть файл

@ -478,7 +478,7 @@ static int stuff_proc_values(opal_buffer_t *reply, orte_job_t *jdata, orte_proc_
tmp = NULL;
if (orte_get_attribute(&pptr->attributes, ORTE_PROC_CPU_BITMAP, (void**)&tmp, OPAL_STRING)) {
/* add the name of the proc */
if (OPAL_SUCCESS != (rc = opal_dss.pack(&buf, (opal_identifier_t*)&name, 1, OPAL_UINT64))) {
if (OPAL_SUCCESS != (rc = opal_dss.pack(&buf, (opal_identifier_t*)&pptr->name, 1, OPAL_UINT64))) {
ORTE_ERROR_LOG(rc);
opal_argv_free(list);
return rc;
@ -924,7 +924,7 @@ static void process_message(pmix_server_peer_t *peer)
OBJ_CONSTRUCT(&kv, opal_value_t);
kv.key = strdup(PMIX_HOSTNAME);
kv.type = OPAL_STRING;
kv.data.string = strdup(proc->node->name);
kv.data.string = strdup(orte_process_info.nodename);
kp = &kv;
if (OPAL_SUCCESS != (rc = opal_dss.pack(&buf, &kp, 1, OPAL_VALUE))) {
ORTE_ERROR_LOG(rc);
@ -1025,7 +1025,6 @@ static void process_message(pmix_server_peer_t *peer)
if (OPAL_SUCCESS != (rc = opal_dss.pack(reply, &ret, 1, OPAL_INT))) {
ORTE_ERROR_LOG(rc);
OBJ_RELEASE(reply);
OBJ_DESTRUCT(&xfer);
OBJ_DESTRUCT(&buf);
return;
}
@ -1038,7 +1037,6 @@ static void process_message(pmix_server_peer_t *peer)
if (OPAL_SUCCESS != (rc = opal_dss.pack(&buf, &proc->node->name, 1, OPAL_STRING))) {
ORTE_ERROR_LOG(rc);
OBJ_RELEASE(reply);
OBJ_DESTRUCT(&xfer);
OBJ_DESTRUCT(&buf);
return;
}
@ -1047,13 +1045,11 @@ static void process_message(pmix_server_peer_t *peer)
if (OPAL_SUCCESS != (rc = opal_dss.pack(reply, &bptr, 1, OPAL_BUFFER))) {
ORTE_ERROR_LOG(rc);
OBJ_RELEASE(reply);
OBJ_DESTRUCT(&xfer);
OBJ_DESTRUCT(&buf);
return;
}
OBJ_DESTRUCT(&buf);
PMIX_SERVER_QUEUE_SEND(peer, tag, reply);
OBJ_DESTRUCT(&xfer);
return;
}
OPAL_LIST_DESTRUCT(&values);