The fact that the openib BTL can't reach a process does not by itself make it a job-ending error. If we have other methods for reaching the process (e.g., sm for a local proc), then that's okay. If there is no method at all for reaching a proc, then that is an error - but the BML will report that situation.
Whether or not the openib BTL supports loopback is a separate question. It may be more appropriate to send the modex with PMIX_GLOBAL scope in cases where openib can support loopback, so that someone can run without a shared memory component. I'll leave that decision to the IB vendors. This commit was SVN r32702.
Этот коммит содержится в:
родитель
ea11e63f59
Коммит
a7c5b77d70
@@ -974,7 +974,10 @@ int mca_btl_openib_add_procs(
|
||||
#endif
|
||||
|
||||
if(NULL == (ib_proc = mca_btl_openib_proc_create(proc))) {
|
||||
return OPAL_ERR_OUT_OF_RESOURCE;
|
||||
/* if we don't have connection info for this process, it's
|
||||
* okay because some other method might be able to reach it,
|
||||
* so just mark it as unreachable by us */
|
||||
continue;
|
||||
}
|
||||
|
||||
/* check if the remote proc has any ports that:
|
||||
|
@@ -150,7 +150,7 @@ mca_btl_openib_proc_t* mca_btl_openib_proc_create(opal_proc_t* proc)
|
||||
OPAL_MODEX_RECV(rc, &mca_btl_openib_component.super.btl_version,
|
||||
proc, &message, &msg_size);
|
||||
if (OPAL_SUCCESS != rc) {
|
||||
BTL_ERROR(("[%s:%d] opal_modex_recv failed for peer %s",
|
||||
BTL_VERBOSE(("[%s:%d] opal_modex_recv failed for peer %s",
|
||||
__FILE__, __LINE__,
|
||||
OPAL_NAME_PRINT(proc->proc_name)));
|
||||
OBJ_RELEASE(module_proc);
|
||||
|
@@ -199,10 +199,9 @@ typedef void (*opal_pmix_cbfunc_t)(int status, opal_value_t *kv, void *cbdata);
|
||||
#define OPAL_MODEX_RECV_STRING(r, s, p, d, sz) \
|
||||
do { \
|
||||
opal_value_t *kv; \
|
||||
if (OPAL_SUCCESS != ((r) = opal_pmix.get(&(p)->proc_name, \
|
||||
(s), &kv))) { \
|
||||
OPAL_ERROR_LOG((r)); \
|
||||
} else { \
|
||||
if (OPAL_SUCCESS == ((r) = opal_pmix.get(&(p)->proc_name, \
|
||||
(s), &kv)) && \
|
||||
NULL != kv) { \
|
||||
*(d) = kv->data.bo.bytes; \
|
||||
*(sz) = kv->data.bo.size; \
|
||||
kv->data.bo.bytes = NULL; /* protect the data */ \
|
||||
|
Загрузка…
x
Ссылка в новой задаче
Block a user