1
1

Just because the openib BTL can't reach a process doesn't mean it is a job-ending error. If we have other methods for reaching the process (e.g., sm for a local proc), then that's okay. If there is no method for reaching a proc, then that's an error - but the BML will report that situation.

Whether or not the openib BTL supports loopback is a separate question. It may be more appropriate to make the modex PMIX_GLOBAL for cases where openib can support loopback, so that someone can run without a shared memory component. I'll leave that decision to the IB vendors.

This commit was SVN r32702.
Этот коммит содержится в:
Ralph Castain 2014-09-10 17:02:16 +00:00
родитель ea11e63f59
Коммит a7c5b77d70
3 изменённых файлов: 8 добавлений и 6 удалений

Просмотреть файл

@@ -974,7 +974,10 @@ int mca_btl_openib_add_procs(
 #endif
 if(NULL == (ib_proc = mca_btl_openib_proc_create(proc))) {
-return OPAL_ERR_OUT_OF_RESOURCE;
+/* if we don't have connection info for this process, it's
+ * okay because some other method might be able to reach it,
+ * so just mark it as unreachable by us */
+continue;
 }
 /* check if the remote proc has any ports that:

Просмотреть файл

@@ -150,7 +150,7 @@ mca_btl_openib_proc_t* mca_btl_openib_proc_create(opal_proc_t* proc)
 OPAL_MODEX_RECV(rc, &mca_btl_openib_component.super.btl_version,
 proc, &message, &msg_size);
 if (OPAL_SUCCESS != rc) {
-BTL_ERROR(("[%s:%d] opal_modex_recv failed for peer %s",
+BTL_VERBOSE(("[%s:%d] opal_modex_recv failed for peer %s",
 __FILE__, __LINE__,
 OPAL_NAME_PRINT(proc->proc_name)));
 OBJ_RELEASE(module_proc);

Просмотреть файл

@@ -199,10 +199,9 @@ typedef void (*opal_pmix_cbfunc_t)(int status, opal_value_t *kv, void *cbdata);
 #define OPAL_MODEX_RECV_STRING(r, s, p, d, sz) \
 do { \
 opal_value_t *kv; \
-if (OPAL_SUCCESS != ((r) = opal_pmix.get(&(p)->proc_name, \
-(s), &kv))) { \
-OPAL_ERROR_LOG((r)); \
-} else { \
+if (OPAL_SUCCESS == ((r) = opal_pmix.get(&(p)->proc_name, \
+(s), &kv)) && \
+NULL != kv) { \
 *(d) = kv->data.bo.bytes; \
 *(sz) = kv->data.bo.size; \
 kv->data.bo.bytes = NULL; /* protect the data */ \