From dc18c324371eb9041e3276a0b8cdc5d7ad06e81d Mon Sep 17 00:00:00 2001 From: Jeff Squyres Date: Fri, 22 Apr 2016 15:55:32 -0700 Subject: [PATCH] usnic: fix resource check The math for checking the number of QPs and CQs per usNIC/VF was incorrect, allowing you to run MPI processes even when usNICs (i.e., VIC VFs) had fewer QPs and CQs than were necessary. This led to a confusing error later when fi_enable(3) failed (because we lazily create QPs). Fixing the math here ensures that we actually print a helpful error message telling the user specifically what is wrong. Signed-off-by: Jeff Squyres --- opal/mca/btl/usnic/btl_usnic_component.c | 22 ++++++++++------------ opal/mca/btl/usnic/help-mpi-btl-usnic.txt | 2 +- 2 files changed, 11 insertions(+), 13 deletions(-) diff --git a/opal/mca/btl/usnic/btl_usnic_component.c b/opal/mca/btl/usnic/btl_usnic_component.c index 418d59b298..07803ce9ee 100644 --- a/opal/mca/btl/usnic/btl_usnic_component.c +++ b/opal/mca/btl/usnic/btl_usnic_component.c @@ -337,11 +337,11 @@ static int check_usnic_config(opal_btl_usnic_module_t *module, 1. num_vfs (i.e., "usNICs") >= num_local_procs (to ensure that each MPI process will be able to have its own protection domain), and - 2. num_vfs * num_qps_per_vf >= num_local_procs * NUM_CHANNELS + 2. num_qps_per_vf >= NUM_CHANNELS (to ensure that each MPI process will be able to get the number of QPs it needs -- we know that every VF will have the same number of QPs), and - 3. num_vfs * num_cqs_per_vf >= num_local_procs * NUM_CHANNELS + 3. 
num_cqs_per_vf >= NUM_CHANNELS (to ensure that each MPI process will be able to get the number of CQs that it needs) */ if (uip->ui.v1.ui_num_vf < unlp) { @@ -350,19 +350,17 @@ static int check_usnic_config(opal_btl_usnic_module_t *module, goto error; } - if (uip->ui.v1.ui_num_vf * uip->ui.v1.ui_qp_per_vf < - unlp * USNIC_NUM_CHANNELS) { - snprintf(str, sizeof(str), "Not enough WQ/RQ (found %d, need %d)", - uip->ui.v1.ui_num_vf * uip->ui.v1.ui_qp_per_vf, - unlp * USNIC_NUM_CHANNELS); + if (uip->ui.v1.ui_qp_per_vf < USNIC_NUM_CHANNELS) { + snprintf(str, sizeof(str), "Not enough transmit/receive queues per usNIC (found %d, need %d)", + uip->ui.v1.ui_qp_per_vf, + USNIC_NUM_CHANNELS); goto error; } - if (uip->ui.v1.ui_num_vf * uip->ui.v1.ui_cq_per_vf < - unlp * USNIC_NUM_CHANNELS) { + if (uip->ui.v1.ui_cq_per_vf < USNIC_NUM_CHANNELS) { snprintf(str, sizeof(str), - "Not enough CQ per usNIC (found %d, need %d)", - uip->ui.v1.ui_num_vf * uip->ui.v1.ui_cq_per_vf, - unlp * USNIC_NUM_CHANNELS); + "Not enough completion queues per usNIC (found %d, need %d)", + uip->ui.v1.ui_cq_per_vf, + USNIC_NUM_CHANNELS); goto error; } diff --git a/opal/mca/btl/usnic/help-mpi-btl-usnic.txt b/opal/mca/btl/usnic/help-mpi-btl-usnic.txt index 055b0954b7..a10a905a06 100644 --- a/opal/mca/btl/usnic/help-mpi-btl-usnic.txt +++ b/opal/mca/btl/usnic/help-mpi-btl-usnic.txt @@ -18,7 +18,7 @@ This means that you have either not provisioned enough usNICs on this VIC, or there are not enough total receive, transmit, or completion queues on the provisioned usNICs. On each VIC in a given server, you need to provision at least as many usNICs as MPI processes on that -server. In each usNIC, you need to provision at least two each of the +server. In each usNIC, you need to provision enough of each of the following: send queues, receive queues, and completion queues. Open MPI will skip this usNIC interface in the usnic BTL, which may