From 64f61ebd0797bd265a9e1bbd9041a885500dc403 Mon Sep 17 00:00:00 2001 From: Jeff Squyres Date: Tue, 20 May 2008 21:53:42 +0000 Subject: [PATCH] Fixes trac:1285. Really. This commit has the same commit message as r18450, but without the extra bonus memory corruption that was introduced. This commit was SVN r18467. The following SVN revision numbers were found above: r18450 --> open-mpi/ompi@5295902ebec36d49c92edcd3631978d822a107cc The following Trac tickets were found above: Ticket 1285 --> https://svn.open-mpi.org/trac/ompi/ticket/1285 --- ompi/mca/btl/openib/btl_openib.h | 12 + ompi/mca/btl/openib/btl_openib_component.c | 370 ++++++++++++++++-- ompi/mca/btl/openib/btl_openib_ini.c | 41 +- ompi/mca/btl/openib/btl_openib_ini.h | 2 + ompi/mca/btl/openib/btl_openib_mca.c | 260 ++---------- ompi/mca/btl/openib/help-mpi-btl-openib.txt | 23 ++ .../btl/openib/mca-btl-openib-hca-params.ini | 1 + 7 files changed, 437 insertions(+), 272 deletions(-) diff --git a/ompi/mca/btl/openib/btl_openib.h b/ompi/mca/btl/openib/btl_openib.h index fef14d1583..06441f5a8c 100644 --- a/ompi/mca/btl/openib/btl_openib.h +++ b/ompi/mca/btl/openib/btl_openib.h @@ -110,6 +110,13 @@ struct mca_btl_openib_qp_info_t { #define BTL_OPENIB_QP_TYPE_XRC(Q) \ (BTL_OPENIB_QP_TYPE(Q) == MCA_BTL_OPENIB_XRC_QP) +typedef enum { + BTL_OPENIB_RQ_SOURCE_DEFAULT, + BTL_OPENIB_RQ_SOURCE_MCA, + BTL_OPENIB_RQ_SOURCE_HCA_INI, + BTL_OPENIB_RQ_SOURCE_HCA_MAX +} btl_openib_receive_queues_source_t; + struct mca_btl_openib_component_t { mca_btl_base_component_1_0_1_t super; /**< base BTL component */ @@ -197,6 +204,11 @@ struct mca_btl_openib_component_t { char *if_exclude; char **if_exclude_list; + /* MCA param btl_openib_receive_queues */ + char *receive_queues; + /* Whether we got a non-default value of btl_openib_receive_queues */ + btl_openib_receive_queues_source_t receive_queues_source; + /** Colon-delimited list of filenames for HCA parameters */ char *hca_params_file_names; diff --git 
a/ompi/mca/btl/openib/btl_openib_component.c b/ompi/mca/btl/openib/btl_openib_component.c index 7d4799563a..02b85f8a49 100644 --- a/ompi/mca/btl/openib/btl_openib_component.c +++ b/ompi/mca/btl/openib/btl_openib_component.c @@ -42,6 +42,7 @@ #include "opal/mca/carto/carto.h" #include "opal/mca/carto/base/base.h" #include "opal/mca/paffinity/base/base.h" +#include "opal/mca/installdirs/installdirs.h" #include "orte/mca/errmgr/errmgr.h" #include "orte/util/proc_info.h" @@ -80,6 +81,11 @@ static int btl_openib_component_close(void); static mca_btl_base_module_t **btl_openib_component_init(int*, bool, bool); static int btl_openib_component_progress(void); +/* + * Local variables + */ +static mca_btl_openib_hca_t *receive_queues_hca = NULL; + mca_btl_openib_component_t mca_btl_openib_component = { { /* First, the mca_base_component_t struct containing meta information @@ -149,6 +155,9 @@ static int btl_openib_component_close(void) ompi_btl_openib_connect_base_finalize(); ompi_btl_openib_fd_finalize(); ompi_btl_openib_ini_finalize(); + if (NULL != mca_btl_openib_component.receive_queues) { + free(mca_btl_openib_component.receive_queues); + } return OMPI_SUCCESS; } @@ -664,8 +673,6 @@ static int init_one_port(opal_list_t *btl_list, mca_btl_openib_hca_t *hca, static void hca_construct(mca_btl_openib_hca_t *hca) { - int i; - hca->ib_dev = NULL; hca->ib_dev_context = NULL; hca->ib_pd = NULL; @@ -687,13 +694,8 @@ static void hca_construct(mca_btl_openib_hca_t *hca) #if HAVE_XRC hca->xrc_fd = -1; #endif - hca->qps = (mca_btl_openib_hca_qp_t*)calloc(mca_btl_openib_component.num_qps, - sizeof(mca_btl_openib_hca_qp_t)); + hca->qps = NULL; OBJ_CONSTRUCT(&hca->hca_lock, opal_mutex_t); - for(i = 0; i < mca_btl_openib_component.num_qps; i++) { - OBJ_CONSTRUCT(&hca->qps[i].send_free, ompi_free_list_t); - OBJ_CONSTRUCT(&hca->qps[i].recv_free, ompi_free_list_t); - } OBJ_CONSTRUCT(&hca->send_free_control, ompi_free_list_t); } @@ -709,13 +711,14 @@ static void 
hca_destruct(mca_btl_openib_hca_t *hca) free(hca->eager_rdma_buffers); } OBJ_DESTRUCT(&hca->hca_lock); - for(i = 0; i < mca_btl_openib_component.num_qps; i++) { - OBJ_DESTRUCT(&hca->qps[i].send_free); - OBJ_DESTRUCT(&hca->qps[i].recv_free); - } OBJ_DESTRUCT(&hca->send_free_control); - if(hca->qps) + if (NULL != hca->qps) { + for (i = 0; i < mca_btl_openib_component.num_qps; i++) { + OBJ_DESTRUCT(&hca->qps[i].send_free); + OBJ_DESTRUCT(&hca->qps[i].recv_free); + } free(hca->qps); + } } OBJ_CLASS_INSTANCE(mca_btl_openib_hca_t, opal_object_t, hca_construct, @@ -947,6 +950,9 @@ done: return num_ports; } +/* + * Prefer values that are already in the target + */ static void merge_values(ompi_btl_openib_ini_values_t *target, ompi_btl_openib_ini_values_t *src) { @@ -959,6 +965,10 @@ static void merge_values(ompi_btl_openib_ini_values_t *target, target->use_eager_rdma = src->use_eager_rdma; target->use_eager_rdma_set = true; } + + if (NULL == target->receive_queues && NULL != src->receive_queues) { + target->receive_queues = strdup(src->receive_queues); + } } static bool inline is_credit_message(const mca_btl_openib_recv_frag_t *frag) @@ -969,6 +979,15 @@ static bool inline is_credit_message(const mca_btl_openib_recv_frag_t *frag) (MCA_BTL_OPENIB_CONTROL_CREDITS == chdr->type); } +static int32_t atoi_param(char *param, int32_t dflt) +{ + if (NULL == param || '\0' == param[0]) { + return dflt ? 
dflt : 1; + } + + return atoi(param); +} + static void init_apm_port(mca_btl_openib_hca_t *hca, int port, uint16_t lid) { int index; @@ -985,6 +1004,206 @@ static void init_apm_port(mca_btl_openib_hca_t *hca, int port, uint16_t lid) } } +static int setup_qps(void) +{ + char **queues, **params = NULL; + int num_xrc_qps = 0, num_pp_qps = 0, num_srq_qps = 0, qp = 0; + uint32_t max_qp_size, max_size_needed; + int32_t min_freelist_size = 0; + int smallest_pp_qp = 0, ret = OMPI_ERROR; + + queues = opal_argv_split(mca_btl_openib_component.receive_queues, ':'); + if (0 == opal_argv_count(queues)) { + orte_show_help("help-mpi-btl-openib.txt", + "no qps in receive_queues", true, + orte_process_info.nodename, + mca_btl_openib_component.receive_queues); + ret = OMPI_ERROR; + goto error; + } + + while (queues[qp] != NULL) { + if (0 == strncmp("P,", queues[qp], 2)) { + num_pp_qps++; + if (smallest_pp_qp > qp) { + smallest_pp_qp = qp; + } + } else if (0 == strncmp("S,", queues[qp], 2)) { + num_srq_qps++; + } else if (0 == strncmp("X,", queues[qp], 2)) { +#if HAVE_XRC + num_xrc_qps++; +#else + orte_show_help("help-mpi-btl-openib.txt", "No XRC support", true, + orte_process_info.nodename, + mca_btl_openib_component.receive_queues); + ret = OMPI_ERR_RESOURCE_UNAVAILABLE; + goto error; +#endif + } else { + orte_show_help("help-mpi-btl-openib.txt", + "invalid qp type in receive_queues", true, + orte_process_info.nodename, + mca_btl_openib_component.receive_queues, + queues[qp]); + ret = OMPI_ERR_BAD_PARAM; + goto error; + } + qp++; + } + /* Current XRC implementation can't used with other QP types - PP + and SRQ */ + if (num_xrc_qps > 0 && (num_pp_qps > 0 || num_srq_qps > 0)) { + orte_show_help("help-mpi-btl-openib.txt", "XRC with PP or SRQ", true, + orte_process_info.nodename, + mca_btl_openib_component.receive_queues); + ret = OMPI_ERR_BAD_PARAM; + goto error; + } + + /* Current XRC implementation can't used with btls_per_lid > 1 */ + if (num_xrc_qps > 0 && 
mca_btl_openib_component.btls_per_lid > 1) { + orte_show_help("help-mpi-btl-openib.txt", "XRC with BTLs per LID", + true, orte_process_info.nodename, + mca_btl_openib_component.receive_queues, num_xrc_qps); + ret = OMPI_ERR_BAD_PARAM; + goto error; + } + mca_btl_openib_component.num_pp_qps = num_pp_qps; + mca_btl_openib_component.num_srq_qps = num_srq_qps; + mca_btl_openib_component.num_xrc_qps = num_xrc_qps; + mca_btl_openib_component.num_qps = num_pp_qps + num_srq_qps + num_xrc_qps; + + mca_btl_openib_component.qp_infos = (mca_btl_openib_qp_info_t*) + malloc(sizeof(mca_btl_openib_qp_info_t) * + mca_btl_openib_component.num_qps); + + qp = 0; +#define P(N) (((N) > count) ? NULL : params[(N)]) + while (queues[qp] != NULL) { + int count; + int32_t rd_low, rd_num; + params = opal_argv_split_with_empty(queues[qp], ','); + count = opal_argv_count(params); + + if ('P' == params[0][0]) { + int32_t rd_win, rd_rsv; + if (count < 3 || count > 6) { + orte_show_help("help-mpi-btl-openib.txt", + "invalid pp qp specification", true, + orte_process_info.nodename, queues[qp]); + ret = OMPI_ERR_BAD_PARAM; + goto error; + } + mca_btl_openib_component.qp_infos[qp].type = MCA_BTL_OPENIB_PP_QP; + mca_btl_openib_component.qp_infos[qp].size = atoi_param(P(1), 0); + rd_num = atoi_param(P(2), 256); + /* by default set rd_low to be 3/4 of rd_num */ + rd_low = atoi_param(P(3), rd_num - (rd_num / 4)); + rd_win = atoi_param(P(4), (rd_num - rd_low) * 2); + rd_rsv = atoi_param(P(5), (rd_num * 2) / rd_win); + + BTL_VERBOSE(("pp: rd_num is %d rd_low is %d rd_win %d rd_rsv %d", + rd_num, rd_low, rd_win, rd_rsv)); + + /* Calculate the smallest freelist size that can be allowed */ + if (rd_num + rd_rsv > min_freelist_size) { + min_freelist_size = rd_num + rd_rsv; + } + + mca_btl_openib_component.qp_infos[qp].u.pp_qp.rd_win = rd_win; + mca_btl_openib_component.qp_infos[qp].u.pp_qp.rd_rsv = rd_rsv; + if ((rd_num - rd_low) > rd_win) { + orte_show_help("help-mpi-btl-openib.txt", "non optimal rd_win", + 
true, rd_win, rd_num - rd_low); + } + } else { + int32_t sd_max; + if (count < 3 || count > 5) { + orte_show_help("help-mpi-btl-openib.txt", + "invalid srq specification", true, + orte_process_info.nodename, queues[qp]); + ret = OMPI_ERR_BAD_PARAM; + goto error; + } + mca_btl_openib_component.qp_infos[qp].type = (params[0][0] =='X') ? + MCA_BTL_OPENIB_XRC_QP : MCA_BTL_OPENIB_SRQ_QP; + mca_btl_openib_component.qp_infos[qp].size = atoi_param(P(1), 0); + rd_num = atoi_param(P(2), 256); + /* by default set rd_low to be 3/4 of rd_num */ + rd_low = atoi_param(P(3), rd_num - (rd_num / 4)); + sd_max = atoi_param(P(4), rd_low / 4); + BTL_VERBOSE(("srq: rd_num is %d rd_low is %d sd_max is %d", + rd_num, rd_low, sd_max)); + + /* Calculate the smallest freelist size that can be allowed */ + if (rd_num > min_freelist_size) { + min_freelist_size = rd_num; + } + + mca_btl_openib_component.qp_infos[qp].u.srq_qp.sd_max = sd_max; + } + + if (rd_num <= rd_low) { + orte_show_help("help-mpi-btl-openib.txt", "rd_num must be > rd_low", + true, orte_process_info.nodename, queues[qp]); + ret = OMPI_ERR_BAD_PARAM; + goto error; + } + mca_btl_openib_component.qp_infos[qp].rd_num = rd_num; + mca_btl_openib_component.qp_infos[qp].rd_low = rd_low; + opal_argv_free(params); + qp++; + } + params = NULL; + + /* Sanity check some sizes */ + + max_qp_size = mca_btl_openib_component.qp_infos[mca_btl_openib_component.num_qps - 1].size; + max_size_needed = (mca_btl_openib_module.super.btl_eager_limit > + mca_btl_openib_module.super.btl_max_send_size) ? 
+ mca_btl_openib_module.super.btl_eager_limit : + mca_btl_openib_module.super.btl_max_send_size; + if (max_qp_size < max_size_needed) { + orte_show_help("help-mpi-btl-openib.txt", + "biggest qp size is too small", true, + orte_process_info.nodename, max_qp_size, + max_size_needed); + ret = OMPI_ERR_BAD_PARAM; + goto error; + } else if (max_qp_size > max_size_needed) { + orte_show_help("help-mpi-btl-openib.txt", + "biggest qp size is too big", true, + orte_process_info.nodename, max_qp_size, + max_size_needed); + } + + if (mca_btl_openib_component.ib_free_list_max > 0 && + min_freelist_size > mca_btl_openib_component.ib_free_list_max) { + orte_show_help("help-mpi-btl-openib.txt", "freelist too small", true, + orte_process_info.nodename, + mca_btl_openib_component.ib_free_list_max, + min_freelist_size); + ret = OMPI_ERR_BAD_PARAM; + goto error; + } + + mca_btl_openib_component.rdma_qp = mca_btl_openib_component.num_qps - 1; + mca_btl_openib_component.credits_qp = smallest_pp_qp; + + ret = OMPI_SUCCESS; +error: + if (NULL != params) { + opal_argv_free(params); + } + + if (NULL != queues) { + opal_argv_free(queues); + } + + return ret; +} + static int init_one_hca(opal_list_t *btl_list, struct ibv_device* ib_dev) { struct mca_mpool_base_resources_t mpool_resources; @@ -1023,26 +1242,12 @@ static int init_one_hca(opal_list_t *btl_list, struct ibv_device* ib_dev) /* If mca_btl_if_include/exclude were specified, get usable ports */ allowed_ports = (int*)malloc(hca->ib_dev_attr.phys_port_cnt * sizeof(int)); port_cnt = get_port_list(hca, allowed_ports); - if(0 == port_cnt) { - ret = OMPI_SUCCESS; + if (0 == port_cnt) { free(allowed_ports); - goto error; - } -#if HAVE_XRC - /* if user configured to run with XRC qp and the device don't support it - - * we should ignore this hca. 
Maybe we have other one that have XRC support - */ - if (!(hca->ib_dev_attr.device_cap_flags & IBV_DEVICE_XRC) && - mca_btl_openib_component.num_xrc_qps > 0) { - orte_show_help("help-mpi-btl-openib.txt", - "XRC on device without XRC support", true, - mca_btl_openib_component.num_xrc_qps, - ibv_get_device_name(hca->ib_dev), - orte_process_info.nodename); ret = OMPI_SUCCESS; goto error; } -#endif + /* Load in vendor/part-specific HCA parameters. Note that even if we don't find values for this vendor/part, "values" will be set indicating that it does not have good values */ @@ -1102,11 +1307,67 @@ static int init_one_hca(opal_list_t *btl_list, struct ibv_device* ib_dev) hca->mtu = mca_btl_openib_component.ib_mtu; } + /* If the user specified btl_openib_receive_queues MCA param, it + overrides all HCA INI params */ + if (BTL_OPENIB_RQ_SOURCE_MCA != + mca_btl_openib_component.receive_queues_source && + NULL != values.receive_queues) { + /* If a prior HCA's INI values set a different value for + receive_queues, this is unsupported (see + https://svn.open-mpi.org/trac/ompi/ticket/1285) */ + if (BTL_OPENIB_RQ_SOURCE_HCA_INI == + mca_btl_openib_component.receive_queues_source) { + if (0 != strcmp(values.receive_queues, + mca_btl_openib_component.receive_queues)) { + orte_show_help("help-mpi-btl-openib.txt", + "conflicting receive_queues", true, + orte_process_info.nodename, + ibv_get_device_name(hca->ib_dev), + hca->ib_dev_attr.vendor_id, + hca->ib_dev_attr.vendor_part_id, + values.receive_queues, + ibv_get_device_name(receive_queues_hca->ib_dev), + receive_queues_hca->ib_dev_attr.vendor_id, + receive_queues_hca->ib_dev_attr.vendor_part_id, + mca_btl_openib_component.receive_queues, + opal_install_dirs.pkgdatadir); + ret = OMPI_ERR_RESOURCE_BUSY; + goto error; + } + } else { + if (NULL != mca_btl_openib_component.receive_queues) { + free(mca_btl_openib_component.receive_queues); + } + receive_queues_hca = hca; + mca_btl_openib_component.receive_queues = + 
strdup(values.receive_queues); + mca_btl_openib_component.receive_queues_source = + BTL_OPENIB_RQ_SOURCE_HCA_INI; + } + } + /* If "use eager rdma" was set, then enable it on this HCA */ if (values.use_eager_rdma_set) { hca->use_eager_rdma = values.use_eager_rdma; } +#if HAVE_XRC + /* if user configured to run with XRC qp and the device doesn't + * support it - we should ignore this hca. Maybe we have another + * one that has XRC support + */ + if (!(hca->ib_dev_attr.device_cap_flags & IBV_DEVICE_XRC) && + mca_btl_openib_component.num_xrc_qps > 0) { + orte_show_help("help-mpi-btl-openib.txt", + "XRC on device without XRC support", true, + mca_btl_openib_component.num_xrc_qps, + ibv_get_device_name(hca->ib_dev), + orte_process_info.nodename); + ret = OMPI_SUCCESS; + goto error; + } +#endif + /* Allocate the protection domain for the HCA */ hca->ib_pd = ibv_alloc_pd(hca->ib_dev_context); if(NULL == hca->ib_pd){ @@ -1199,10 +1460,7 @@ static int init_one_hca(opal_list_t *btl_list, struct ibv_device* ib_dev) "apm not enough ports", true); mca_btl_openib_component.apm_ports = 0; } - ret = prepare_hca_for_use(hca); - if(OMPI_SUCCESS == ret) { - return OMPI_SUCCESS; - } + return OMPI_SUCCESS; } error: @@ -1560,10 +1818,6 @@ btl_openib_component_init(int *num_btl_modules, dev_sorted = sort_devs_by_distance(ib_devs, num_devs); - /* We must loop through all the hca id's, get their handles and - for each hca we query the number of ports on the hca and set up - a distinct btl module for each hca port */ - OBJ_CONSTRUCT(&btl_list, opal_list_t); OBJ_CONSTRUCT(&mca_btl_openib_component.ib_lock, opal_mutex_t); #if OMPI_HAVE_THREADS @@ -1585,7 +1839,8 @@ btl_openib_component_init(int *num_btl_modules, if (OMPI_SUCCESS != ret) { orte_show_help("help-mpi-btl-openib.txt", - "error in hca init", true, orte_process_info.nodename); + "error in hca init", true, orte_process_info.nodename, + ibv_get_device_name(dev_sorted[i].ib_dev)); return NULL; } @@ -1612,6 +1867,45 @@ 
btl_openib_component_init(int *num_btl_modules, return NULL; } + /* Setup the BSRQ QP's based on the final value of + mca_btl_openib_component.receive_queues. */ + setup_qps(); + + /* Loop through all the btl modules that we made and find every + base HCA that doesn't have hca->qps setup on it yet (remember + that some modules may share the same HCA, so when going through + the loop, we may hit an HCA that was already setup earlier in + the loop). */ + for (item = opal_list_get_first(&btl_list); + opal_list_get_end(&btl_list) != item; + item = opal_list_get_next(item)) { + mca_btl_base_selected_module_t *m = + (mca_btl_base_selected_module_t*) item; + mca_btl_openib_hca_t *hca = + ((mca_btl_openib_module_t*) m->btl_module)->hca; + if (NULL == hca->qps) { + + /* Setup the HCA qps info */ + hca->qps = (mca_btl_openib_hca_qp_t*) + calloc(mca_btl_openib_component.num_qps, + sizeof(mca_btl_openib_hca_qp_t)); + for (i = 0; i < mca_btl_openib_component.num_qps; i++) { + OBJ_CONSTRUCT(&hca->qps[i].send_free, ompi_free_list_t); + OBJ_CONSTRUCT(&hca->qps[i].recv_free, ompi_free_list_t); + } + + /* Do final init on HCA */ + ret = prepare_hca_for_use(hca); + if (OMPI_SUCCESS != ret) { + orte_show_help("help-mpi-btl-openib.txt", + "error in hca init", true, + orte_process_info.nodename, + ibv_get_device_name(hca->ib_dev)); + return NULL; + } + } + } + /* Allocate space for btl modules */ mca_btl_openib_component.openib_btls = malloc(sizeof(mca_btl_openib_module_t*) * diff --git a/ompi/mca/btl/openib/btl_openib_ini.c b/ompi/mca/btl/openib/btl_openib_ini.c index 5ffeab196e..dbc7362a1a 100644 --- a/ompi/mca/btl/openib/btl_openib_ini.c +++ b/ompi/mca/btl/openib/btl_openib_ini.c @@ -23,6 +23,8 @@ #include #include +#include + #include "orte/util/output.h" #include "opal/mca/base/mca_base_param.h" @@ -388,6 +390,12 @@ static int parse_line(parsed_section_values_t *sv) sv->values.use_eager_rdma_set = true; } + else if (0 == strcasecmp(key_buffer, "receive_queues")) { + /* Single value 
(already strdup'ed) */ + sv->values.receive_queues = value; + value = NULL; + } + else { /* Have no idea what this parameter is. Not an error -- just ignore it */ @@ -429,6 +437,9 @@ static void hca_values_destructor(hca_values_t *s) if (NULL != s->section_name) { free(s->section_name); } + if (NULL != s->values.receive_queues) { + free(s->values.receive_queues); + } } @@ -469,6 +480,8 @@ static void reset_values(ompi_btl_openib_ini_values_t *v) v->use_eager_rdma = 0; v->use_eager_rdma_set = false; + + v->receive_queues = NULL; } @@ -532,6 +545,10 @@ static int save_section(parsed_section_values_t *s) containing bool members by value. So do a memcpy here instead. */ memcpy(&h->values, &s->values, sizeof(s->values)); + /* Need to strdup the string, though */ + if (NULL != h->values.receive_queues) { + h->values.receive_queues = strdup(s->values.receive_queues); + } opal_list_append(&hcas, &h->super); } } @@ -586,14 +603,26 @@ static int intify_list(char *value, uint32_t **values, int *len) *values[0] = (uint32_t) intify(str); *len = 1; } else { - /* If we found a comma, loop over all the values. Be a - little clever in that we alwasy alloc enough space for - an extra value so that when we exit the loop, we don't - have to realloc again to get space for the last item. 
*/ + int newsize = 1; + + /* Count how many values there are and allocate enough space + for them */ + while (NULL != comma) { + ++newsize; + str = comma + 1; + comma = strchr(str, ','); + } + *values = malloc(sizeof(uint32_t) * newsize); + if (NULL == *values) { + return OMPI_ERR_OUT_OF_RESOURCE; + } + + /* Iterate over the values and save them */ + str = value; + comma = strchr(str, ','); do { *comma = '\0'; - *values = realloc(*values, sizeof(uint32_t) * (*len + 2)); - (*values)[*len] = (int32_t) intify(str); + (*values)[*len] = (uint32_t) intify(str); ++(*len); str = comma + 1; comma = strchr(str, ','); diff --git a/ompi/mca/btl/openib/btl_openib_ini.h b/ompi/mca/btl/openib/btl_openib_ini.h index 0cacd927dd..5f60c7ec58 100644 --- a/ompi/mca/btl/openib/btl_openib_ini.h +++ b/ompi/mca/btl/openib/btl_openib_ini.h @@ -25,6 +25,8 @@ typedef struct ompi_btl_openib_ini_values_t { uint32_t use_eager_rdma; bool use_eager_rdma_set; + + char *receive_queues; } ompi_btl_openib_ini_values_t; diff --git a/ompi/mca/btl/openib/btl_openib_mca.c b/ompi/mca/btl/openib/btl_openib_mca.c index 175a39ca48..bc6dc2e000 100644 --- a/ompi/mca/btl/openib/btl_openib_mca.c +++ b/ompi/mca/btl/openib/btl_openib_mca.c @@ -52,8 +52,6 @@ enum { REGSTR_MAX = 0x88 }; -static int mca_btl_openib_mca_setup_qps(void); - /* * utility routine for string parameter registration @@ -109,6 +107,9 @@ static inline int reg_int(const char* param_name, const char* param_desc, */ int btl_openib_register_mca_params(void) { + char default_qps[100]; + uint32_t mid_qp_size; + int i; char *msg, *str; int ival, ival2, ret, tmp; @@ -485,7 +486,33 @@ int btl_openib_register_mca_params(void) &mca_btl_openib_module.super)); /* setup all the qp stuff */ - CHECK(mca_btl_openib_mca_setup_qps()); + mid_qp_size = mca_btl_openib_module.super.btl_eager_limit / 4; + /* round mid_qp_size to smallest power of two */ + for(i = 31; i > 0; i--) { + if(!(mid_qp_size & (1< 0; i--) { - if(!(mid_qp_size & (1< qp) - smallest_pp_qp = qp; - 
} else if (0 == strncmp("S,", queues[qp], 2)) { - num_srq_qps++; - } else if (0 == strncmp("X,", queues[qp], 2)) { -#if HAVE_XRC - num_xrc_qps++; -#else - orte_show_help("help-mpi-btl-openib.txt", "No XRC support", true, - orte_process_info.nodename, str); - goto error; -#endif - } else { - orte_show_help("help-mpi-btl-openib.txt", - "invalid qp type in receive_queues", true, - orte_process_info.nodename, str, queues[qp]); - goto error; - } - qp++; - } - /* Current XRC implementation can't used with other QP types - PP and SRQ */ - if (num_xrc_qps > 0 && (num_pp_qps > 0 || num_srq_qps > 0)) { - orte_show_help("help-mpi-btl-openib.txt", "XRC with PP or SRQ", true, - orte_process_info.nodename, str); - goto error; - } - - /* Current XRC implementation can't used with btls_per_lid > 1 */ - if (num_xrc_qps > 0 && mca_btl_openib_component.btls_per_lid > 1) { - orte_show_help("help-mpi-btl-openib.txt", "XRC with BTLs per LID", true, - orte_process_info.nodename, str, num_xrc_qps); - goto error; - } - mca_btl_openib_component.num_pp_qps = num_pp_qps; - mca_btl_openib_component.num_srq_qps = num_srq_qps; - mca_btl_openib_component.num_xrc_qps = num_xrc_qps; - mca_btl_openib_component.num_qps = num_pp_qps + num_srq_qps + num_xrc_qps; - - mca_btl_openib_component.qp_infos = (mca_btl_openib_qp_info_t*) - malloc(sizeof(mca_btl_openib_qp_info_t) * - mca_btl_openib_component.num_qps); - - qp = 0; -#define P(N) (((N) > count)?NULL:params[(N)]) - while(queues[qp] != NULL) { - int i = 0, count; - int32_t rd_low, rd_num; - params = opal_argv_split_with_empty(queues[qp], ','); - count = opal_argv_count(params); - - if ('P' == params[0][0]) { - int32_t rd_win, rd_rsv; - if (count < 3 || count > 6) { - orte_show_help("help-mpi-btl-openib.txt", - "invalid pp qp specification", true, - orte_process_info.nodename, queues[qp]); - goto error; - } - mca_btl_openib_component.qp_infos[qp].type = MCA_BTL_OPENIB_PP_QP; - mca_btl_openib_component.qp_infos[qp].size = atoi_param(P(1), 0); - rd_num 
= atoi_param(P(2), 256); - /* by default set rd_low to be 3/4 of rd_num */ - rd_low = atoi_param(P(3), rd_num - (rd_num / 4)); - rd_win = atoi_param(P(4), (rd_num - rd_low) * 2); - rd_rsv = atoi_param(P(5), (rd_num * 2) / rd_win); - - BTL_VERBOSE(("pp: rd_num is %d rd_low is %d rd_win %d rd_rsv %d", - rd_num, rd_low, rd_win, rd_rsv)); - - /* Calculate the smallest freelist size that can be allowed */ - if (rd_num + rd_rsv > min_freelist_size) - min_freelist_size = rd_num + rd_rsv; - - mca_btl_openib_component.qp_infos[qp].u.pp_qp.rd_win = rd_win; - mca_btl_openib_component.qp_infos[qp].u.pp_qp.rd_rsv = rd_rsv; - if((rd_num - rd_low) > rd_win) - orte_show_help("help-mpi-btl-openib.txt", "non optimal rd_win", - true, rd_win, rd_num - rd_low); - } else { - int32_t sd_max; - if(count < 3 || count > 5) { - orte_show_help("help-mpi-btl-openib.txt", - "invalid srq specification", true, - orte_process_info.nodename, queues[qp]); - goto error; - } - mca_btl_openib_component.qp_infos[qp].type = (params[0][0] =='X') ? 
- MCA_BTL_OPENIB_XRC_QP : MCA_BTL_OPENIB_SRQ_QP; - mca_btl_openib_component.qp_infos[qp].size = atoi_param(P(1), 0); - rd_num = atoi_param(P(2), 256); - /* by default set rd_low to be 3/4 of rd_num */ - rd_low = atoi_param(P(3), rd_num - (rd_num / 4)); - sd_max = atoi_param(P(4), rd_low / 4); - BTL_VERBOSE(("srq: rd_num is %d rd_low is %d sd_max is %d", - rd_num, rd_low, sd_max)); - - /* Calculate the smallest freelist size that can be allowed */ - if (rd_num > min_freelist_size) - min_freelist_size = rd_num; - - mca_btl_openib_component.qp_infos[qp].u.srq_qp.sd_max = sd_max; - } - - if (rd_num <= rd_low) { - orte_show_help("help-mpi-btl-openib.txt", "rd_num must be > rd_low", - true, orte_process_info.nodename, queues[qp]); - goto error; - } - mca_btl_openib_component.qp_infos[qp].rd_num = rd_num; - mca_btl_openib_component.qp_infos[qp].rd_low = rd_low; - while (NULL != params[i]) { - free(params[i++]); - } - free(params); - qp++; - } - params = NULL; - - /* Sanity check some sizes */ - - max_qp_size = mca_btl_openib_component.qp_infos[mca_btl_openib_component.num_qps - 1].size; - max_size_needed = (mca_btl_openib_module.super.btl_eager_limit > - mca_btl_openib_module.super.btl_max_send_size) ? - mca_btl_openib_module.super.btl_eager_limit : - mca_btl_openib_module.super.btl_max_send_size; - if (max_qp_size < max_size_needed) { - orte_show_help("help-mpi-btl-openib.txt", - "biggest qp size is too small", true, - orte_process_info.nodename, max_qp_size, - max_size_needed); - ret = OMPI_ERROR; - goto error; - } else if (max_qp_size > max_size_needed) { - orte_show_help("help-mpi-btl-openib.txt", - "biggest qp size is too big", true, - orte_process_info.nodename, max_qp_size, - max_size_needed); - orte_output(0, "The biggest QP size is bigger than maximum send size. 
" - "This is not optimal configuration as memory will be wasted."); - } - - if (mca_btl_openib_component.ib_free_list_max > 0 && - min_freelist_size > mca_btl_openib_component.ib_free_list_max) { - orte_show_help("help-mpi-btl-openib.txt", "freelist too small", true, - orte_process_info.nodename, - mca_btl_openib_component.ib_free_list_max, - min_freelist_size); - goto error; - } - - mca_btl_openib_component.rdma_qp = mca_btl_openib_component.num_qps - 1; - mca_btl_openib_component.credits_qp = smallest_pp_qp; - /* Register any MCA params for the connect pseudo-components */ - if (OMPI_SUCCESS != ompi_btl_openib_connect_base_register()) - goto error; - - ret = OMPI_SUCCESS; -error: - if(params) { - qp = 0; - while(params[qp] != NULL) - free(params[qp++]); - free(params); - } - - if(queues) { - qp = 0; - while(queues[qp] != NULL) - free(queues[qp++]); - free(queues); + if (OMPI_SUCCESS == ret) { + ret = ompi_btl_openib_connect_base_register(); } return ret; diff --git a/ompi/mca/btl/openib/help-mpi-btl-openib.txt b/ompi/mca/btl/openib/help-mpi-btl-openib.txt index 5808e3e73c..d6b495c068 100644 --- a/ompi/mca/btl/openib/help-mpi-btl-openib.txt +++ b/ompi/mca/btl/openib/help-mpi-btl-openib.txt @@ -169,6 +169,12 @@ no active ports detected. This is most certainly not what you wanted. Check your cables and SM configuration. # [error in hca init] +WARNING: There was an error initializing an OpenFabrics NIC/HCA. + +Hostname: %s +Device: %s +# +[error in hca init] WARNING: There were errors during IB HCA initialization on host '%s'. # [default subnet prefix] @@ -448,3 +454,20 @@ Can not provide %d alternative paths with LMC bit configured to %d. [apm not enough ports] WARNING: For APM over ports ompi require at least 2 active ports and only single active port was found. Disabling APM over ports +# +[conflicting receive_queues] +Open MPI detected two different sets of OpenFabrics receives queues on +the same host (in the openib BTL). 
Open MPI currently only supports +one set of OF receive queues in an MPI job, even if you have different +types of OpenFabrics adapters on the same host. + +Host: %s +Adapter 1: %s (vendor 0x%x, part ID %d) +Queues: %s +Adapter 2: %s (vendor 0x%x, part ID %d) +Queues: %s + +Note that these receive queues values may have come from the Open MPI +adapter default settings file: + + %s/mca-btl-openib-hca-params.ini diff --git a/ompi/mca/btl/openib/mca-btl-openib-hca-params.ini b/ompi/mca/btl/openib/mca-btl-openib-hca-params.ini index 7f4a3703fb..304a5b52c8 100644 --- a/ompi/mca/btl/openib/mca-btl-openib-hca-params.ini +++ b/ompi/mca/btl/openib/mca-btl-openib-hca-params.ini @@ -139,3 +139,4 @@ vendor_id = 0x1425 vendor_part_id = 0x0020,0x0021,0x0022,0x0023,0x0024,0x0025,0x0026,0x0030,0x0031,0x0032 use_eager_rdma = 1 mtu = 2048 +receive_queues = P,65536,256,192,128