From 4b2cba46f467ba49b3b0b36e4c4864ecc58b1ae6 Mon Sep 17 00:00:00 2001 From: Jeff Squyres Date: Mon, 9 Mar 2015 16:57:41 -0700 Subject: [PATCH] usnic: fix bootstrap error paths Fix previously-unfinished error paths during startup/bootstrapping. Instead of just blindly continuing on when an fi_* function call fails, report the failure via opal_show_help() and skip that device. Also, only check the usnic config minimums once. They're VIC-wide and won't change on a per-device basis -- we only need to check them once. Fixes CSCut19179. --- opal/mca/btl/usnic/btl_usnic_component.c | 35 +++++++++++++++++++----- 1 file changed, 28 insertions(+), 7 deletions(-) diff --git a/opal/mca/btl/usnic/btl_usnic_component.c b/opal/mca/btl/usnic/btl_usnic_component.c index 031b248960..5197881c45 100644 --- a/opal/mca/btl/usnic/btl_usnic_component.c +++ b/opal/mca/btl/usnic/btl_usnic_component.c @@ -738,15 +738,29 @@ static mca_btl_base_module_t** usnic_component_init(int* num_btl_modules, ret = fi_fabric(info->fabric_attr, &fabric, NULL); if (0 != ret) { - BTL_ERROR(("fi_fabric")); - /* JMS error */ + opal_show_help("help-mpi-btl-usnic.txt", + "libfabric API failed", + true, + opal_process_info.nodename, + info->fabric_attr->name, + "fi_fabric()", __FILE__, __LINE__, + ret, + strerror(-ret)); + continue; } opal_memchecker_base_mem_defined(&fabric, sizeof(fabric)); ret = fi_domain(fabric, info, &domain, NULL); if (0 != ret) { - BTL_ERROR(("fi_domain")); - /* JMS error */ + opal_show_help("help-mpi-btl-usnic.txt", + "libfabric API failed", + true, + opal_process_info.nodename, + info->fabric_attr->name, + "fi_domain()", __FILE__, __LINE__, + ret, + strerror(-ret)); + continue; } opal_memchecker_base_mem_defined(&domain, sizeof(domain)); @@ -815,14 +829,21 @@ static mca_btl_base_module_t** usnic_component_init(int* num_btl_modules, } } - /* Check some usNIC configuration minimum settings */ - if (check_usnic_config(module, num_local_procs) != OPAL_SUCCESS) { + /* The first time through, check some usNIC 
configuration + minimum settings with information we got back from the fi_* + probes (these are VIC-wide settings -- they don't change + for each module we create, so we only need to check + once). */ + if (0 == j && + check_usnic_config(module, num_local_procs) != OPAL_SUCCESS) { opal_output_verbose(5, USNIC_OUT, "btl:usnic: device %s is not provisioned with enough resources -- skipping", info->fabric_attr->name); fi_close(&domain->fid); fi_close(&fabric->fid); - continue; + + mca_btl_usnic_component.num_modules = 0; + goto error; } /*************************************************/