usnic: fix bootstrap error paths
Fix previously-unfinished error paths during startup/bootstrapping. Instead of blindly continuing when an fi_* function call fails, call opal_show_help() to report the failure and skip that device. Also, check the usnic config minimums only once: they're VIC-wide and won't change on a per-device basis, so a single check suffices. Fixes CSCut19179.
Этот коммит содержится в:
родитель
0d80bfb391
Коммит
4b2cba46f4
@ -738,15 +738,29 @@ static mca_btl_base_module_t** usnic_component_init(int* num_btl_modules,
|
|||||||
|
|
||||||
ret = fi_fabric(info->fabric_attr, &fabric, NULL);
|
ret = fi_fabric(info->fabric_attr, &fabric, NULL);
|
||||||
if (0 != ret) {
|
if (0 != ret) {
|
||||||
BTL_ERROR(("fi_fabric"));
|
opal_show_help("help-mpi-btl-usnic.txt",
|
||||||
/* JMS error */
|
"libfabric API failed",
|
||||||
|
true,
|
||||||
|
opal_process_info.nodename,
|
||||||
|
info->fabric_attr->name,
|
||||||
|
"fi_fabric()", __FILE__, __LINE__,
|
||||||
|
ret,
|
||||||
|
strerror(-ret));
|
||||||
|
continue;
|
||||||
}
|
}
|
||||||
opal_memchecker_base_mem_defined(&fabric, sizeof(fabric));
|
opal_memchecker_base_mem_defined(&fabric, sizeof(fabric));
|
||||||
|
|
||||||
ret = fi_domain(fabric, info, &domain, NULL);
|
ret = fi_domain(fabric, info, &domain, NULL);
|
||||||
if (0 != ret) {
|
if (0 != ret) {
|
||||||
BTL_ERROR(("fi_domain"));
|
opal_show_help("help-mpi-btl-usnic.txt",
|
||||||
/* JMS error */
|
"libfabric API failed",
|
||||||
|
true,
|
||||||
|
opal_process_info.nodename,
|
||||||
|
info->fabric_attr->name,
|
||||||
|
"fi_domain()", __FILE__, __LINE__,
|
||||||
|
ret,
|
||||||
|
strerror(-ret));
|
||||||
|
continue;
|
||||||
}
|
}
|
||||||
opal_memchecker_base_mem_defined(&domain, sizeof(domain));
|
opal_memchecker_base_mem_defined(&domain, sizeof(domain));
|
||||||
|
|
||||||
@ -815,14 +829,21 @@ static mca_btl_base_module_t** usnic_component_init(int* num_btl_modules,
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
/* Check some usNIC configuration minimum settings */
|
/* The first time through, check some usNIC configuration
|
||||||
if (check_usnic_config(module, num_local_procs) != OPAL_SUCCESS) {
|
minimum settings with information we got back from the fi_*
|
||||||
|
probes (these are VIC-wide settings -- they don't change
|
||||||
|
for each module we create, so we only need to check
|
||||||
|
once). */
|
||||||
|
if (0 == j &&
|
||||||
|
check_usnic_config(module, num_local_procs) != OPAL_SUCCESS) {
|
||||||
opal_output_verbose(5, USNIC_OUT,
|
opal_output_verbose(5, USNIC_OUT,
|
||||||
"btl:usnic: device %s is not provisioned with enough resources -- skipping",
|
"btl:usnic: device %s is not provisioned with enough resources -- skipping",
|
||||||
info->fabric_attr->name);
|
info->fabric_attr->name);
|
||||||
fi_close(&domain->fid);
|
fi_close(&domain->fid);
|
||||||
fi_close(&fabric->fid);
|
fi_close(&fabric->fid);
|
||||||
continue;
|
|
||||||
|
mca_btl_usnic_component.num_modules = 0;
|
||||||
|
goto error;
|
||||||
}
|
}
|
||||||
|
|
||||||
/*************************************************/
|
/*************************************************/
|
||||||
|
Загрузка…
x
Ссылка в новой задаче
Block a user