1
1

usnic: do not disqualify if a peer does not put usnic modex info

If ompi_modex_recv() fails with OPAL_ERR_DATA_VALUE_NOT_FOUND, it
simply means that the peer process did not put any usnic BTL modex
info -- it is not an error.  So have the usnic BTL simply ignore that
peer (vs. disqualifying itself / treating this like a real error).

Refs trac:4442.

This commit was SVN r31258.

The following Trac tickets were found above:
  Ticket 4442 --> https://svn.open-mpi.org/trac/ompi/ticket/4442
Этот коммит содержится в:
Jeff Squyres 2014-03-27 19:37:07 +00:00
родитель b3bb90cf2d
Коммит cdb396697c
2 изменённых файла: 31 добавление и 16 удалений

Просмотреть файл

@ -198,7 +198,11 @@ static int usnic_add_procs(struct mca_btl_base_module_t* base_module,
to reach this destination. */
usnic_proc = NULL;
rc = ompi_btl_usnic_proc_match(ompi_proc, module, &usnic_proc);
if (OMPI_SUCCESS != rc) {
if (OMPI_ERR_UNREACH == rc) {
/* If the peer doesn't have usnic modex info, then we just
skip it */
continue;
} else if (OMPI_SUCCESS != rc) {
return OMPI_ERR_OUT_OF_RESOURCE;
}

Просмотреть файл

@ -162,17 +162,23 @@ ompi_btl_usnic_proc_lookup_endpoint(ompi_btl_usnic_module_t *receiver,
/*
* Create an ompi_btl_usnic_proc_t and initialize it with modex info
* and an empty array of endpoints.
*
* Returns OMPI_ERR_UNREACH if we can't reach the peer (i.e., we can't
* find their modex data).
*/
static ompi_btl_usnic_proc_t *create_proc(ompi_proc_t *ompi_proc)
static int create_proc(ompi_proc_t *ompi_proc,
ompi_btl_usnic_proc_t **usnic_proc)
{
ompi_btl_usnic_proc_t *proc = NULL;
size_t size;
int rc;
*usnic_proc = NULL;
/* Create the proc if it doesn't already exist */
proc = OBJ_NEW(ompi_btl_usnic_proc_t);
if (NULL == proc) {
return NULL;
return OMPI_ERR_OUT_OF_RESOURCE;
}
/* Initialize number of peers */
@ -184,15 +190,22 @@ static ompi_btl_usnic_proc_t *create_proc(ompi_proc_t *ompi_proc)
ompi_proc, (void*)&proc->proc_modex,
&size);
if (OMPI_SUCCESS != rc) {
opal_show_help("help-mpi-btl-usnic.txt", "internal error during init",
/* If this proc simply doesn't have this key, then they're not
running the usnic BTL -- just ignore them. Otherwise, show an
error message. */
if (OPAL_ERR_DATA_VALUE_NOT_FOUND == rc) {
OBJ_RELEASE(proc);
return OMPI_ERR_UNREACH;
} else if (OMPI_SUCCESS != rc) {
opal_show_help("help-mpi-btl-usnic.txt",
"internal error during init",
true,
ompi_process_info.nodename,
"<none>", 0,
"ompi_modex_recv() failed", __FILE__, __LINE__,
opal_strerror(rc));
OBJ_RELEASE(proc);
return NULL;
return OMPI_ERROR;
}
if ((size % sizeof(ompi_btl_usnic_addr_t)) != 0) {
@ -210,14 +223,14 @@ static ompi_btl_usnic_proc_t *create_proc(ompi_proc_t *ompi_proc)
msg);
OBJ_RELEASE(proc);
return NULL;
return OMPI_ERR_VALUE_OUT_OF_BOUNDS;
}
proc->proc_modex_count = size / sizeof(ompi_btl_usnic_addr_t);
if (0 == proc->proc_modex_count) {
proc->proc_endpoints = NULL;
OBJ_RELEASE(proc);
return NULL;
return OMPI_ERR_UNREACH;
}
proc->proc_modex_claimed = (bool*)
@ -225,7 +238,7 @@ static ompi_btl_usnic_proc_t *create_proc(ompi_proc_t *ompi_proc)
if (NULL == proc->proc_modex_claimed) {
OMPI_ERROR_LOG(OMPI_ERR_OUT_OF_RESOURCE);
OBJ_RELEASE(proc);
return NULL;
return OMPI_ERR_OUT_OF_RESOURCE;
}
proc->proc_endpoints = (mca_btl_base_endpoint_t**)
@ -233,10 +246,11 @@ static ompi_btl_usnic_proc_t *create_proc(ompi_proc_t *ompi_proc)
if (NULL == proc->proc_endpoints) {
OMPI_ERROR_LOG(OMPI_ERR_OUT_OF_RESOURCE);
OBJ_RELEASE(proc);
return NULL;
return OMPI_ERR_OUT_OF_RESOURCE;
}
return proc;
*usnic_proc = proc;
return OMPI_SUCCESS;
}
/* Compare the addresses of the local interface corresponding to module and the
@ -705,12 +719,9 @@ int ompi_btl_usnic_proc_match(ompi_proc_t *ompi_proc,
*proc = ompi_btl_usnic_proc_lookup_ompi(ompi_proc);
if (*proc != NULL) {
OBJ_RETAIN(*proc);
return OMPI_SUCCESS;
} else {
/* If not, go make one */
*proc = create_proc(ompi_proc);
if (NULL == *proc) {
return OMPI_ERR_NOT_FOUND;
}
return create_proc(ompi_proc, proc);
}
return OMPI_SUCCESS;
}