diff --git a/ompi/mca/btl/openib/btl_openib.h b/ompi/mca/btl/openib/btl_openib.h index fef14d1583..06441f5a8c 100644 --- a/ompi/mca/btl/openib/btl_openib.h +++ b/ompi/mca/btl/openib/btl_openib.h @@ -110,6 +110,13 @@ struct mca_btl_openib_qp_info_t { #define BTL_OPENIB_QP_TYPE_XRC(Q) \ (BTL_OPENIB_QP_TYPE(Q) == MCA_BTL_OPENIB_XRC_QP) +typedef enum { + BTL_OPENIB_RQ_SOURCE_DEFAULT, + BTL_OPENIB_RQ_SOURCE_MCA, + BTL_OPENIB_RQ_SOURCE_HCA_INI, + BTL_OPENIB_RQ_SOURCE_HCA_MAX +} btl_openib_receive_queues_source_t; + struct mca_btl_openib_component_t { mca_btl_base_component_1_0_1_t super; /**< base BTL component */ @@ -197,6 +204,11 @@ struct mca_btl_openib_component_t { char *if_exclude; char **if_exclude_list; + /* MCA param btl_openib_receive_queues */ + char *receive_queues; + /* Whether we got a non-default value of btl_openib_receive_queues */ + btl_openib_receive_queues_source_t receive_queues_source; + /** Colon-delimited list of filenames for HCA parameters */ char *hca_params_file_names; diff --git a/ompi/mca/btl/openib/btl_openib_component.c b/ompi/mca/btl/openib/btl_openib_component.c index 7d4799563a..02b85f8a49 100644 --- a/ompi/mca/btl/openib/btl_openib_component.c +++ b/ompi/mca/btl/openib/btl_openib_component.c @@ -42,6 +42,7 @@ #include "opal/mca/carto/carto.h" #include "opal/mca/carto/base/base.h" #include "opal/mca/paffinity/base/base.h" +#include "opal/mca/installdirs/installdirs.h" #include "orte/mca/errmgr/errmgr.h" #include "orte/util/proc_info.h" @@ -80,6 +81,11 @@ static int btl_openib_component_close(void); static mca_btl_base_module_t **btl_openib_component_init(int*, bool, bool); static int btl_openib_component_progress(void); +/* + * Local variables + */ +static mca_btl_openib_hca_t *receive_queues_hca = NULL; + mca_btl_openib_component_t mca_btl_openib_component = { { /* First, the mca_base_component_t struct containing meta information @@ -149,6 +155,9 @@ static int btl_openib_component_close(void) ompi_btl_openib_connect_base_finalize(); 
ompi_btl_openib_fd_finalize(); ompi_btl_openib_ini_finalize(); + if (NULL != mca_btl_openib_component.receive_queues) { + free(mca_btl_openib_component.receive_queues); + } return OMPI_SUCCESS; } @@ -664,8 +673,6 @@ static int init_one_port(opal_list_t *btl_list, mca_btl_openib_hca_t *hca, static void hca_construct(mca_btl_openib_hca_t *hca) { - int i; - hca->ib_dev = NULL; hca->ib_dev_context = NULL; hca->ib_pd = NULL; @@ -687,13 +694,8 @@ static void hca_construct(mca_btl_openib_hca_t *hca) #if HAVE_XRC hca->xrc_fd = -1; #endif - hca->qps = (mca_btl_openib_hca_qp_t*)calloc(mca_btl_openib_component.num_qps, - sizeof(mca_btl_openib_hca_qp_t)); + hca->qps = NULL; OBJ_CONSTRUCT(&hca->hca_lock, opal_mutex_t); - for(i = 0; i < mca_btl_openib_component.num_qps; i++) { - OBJ_CONSTRUCT(&hca->qps[i].send_free, ompi_free_list_t); - OBJ_CONSTRUCT(&hca->qps[i].recv_free, ompi_free_list_t); - } OBJ_CONSTRUCT(&hca->send_free_control, ompi_free_list_t); } @@ -709,13 +711,14 @@ static void hca_destruct(mca_btl_openib_hca_t *hca) free(hca->eager_rdma_buffers); } OBJ_DESTRUCT(&hca->hca_lock); - for(i = 0; i < mca_btl_openib_component.num_qps; i++) { - OBJ_DESTRUCT(&hca->qps[i].send_free); - OBJ_DESTRUCT(&hca->qps[i].recv_free); - } OBJ_DESTRUCT(&hca->send_free_control); - if(hca->qps) + if (NULL != hca->qps) { + for (i = 0; i < mca_btl_openib_component.num_qps; i++) { + OBJ_DESTRUCT(&hca->qps[i].send_free); + OBJ_DESTRUCT(&hca->qps[i].recv_free); + } free(hca->qps); + } } OBJ_CLASS_INSTANCE(mca_btl_openib_hca_t, opal_object_t, hca_construct, @@ -947,6 +950,9 @@ done: return num_ports; } +/* + * Prefer values that are already in the target + */ static void merge_values(ompi_btl_openib_ini_values_t *target, ompi_btl_openib_ini_values_t *src) { @@ -959,6 +965,10 @@ static void merge_values(ompi_btl_openib_ini_values_t *target, target->use_eager_rdma = src->use_eager_rdma; target->use_eager_rdma_set = true; } + + if (NULL == target->receive_queues && NULL != src->receive_queues) { + 
target->receive_queues = strdup(src->receive_queues); + } } static bool inline is_credit_message(const mca_btl_openib_recv_frag_t *frag) @@ -969,6 +979,15 @@ static bool inline is_credit_message(const mca_btl_openib_recv_frag_t *frag) (MCA_BTL_OPENIB_CONTROL_CREDITS == chdr->type); } +static int32_t atoi_param(char *param, int32_t dflt) +{ + if (NULL == param || '\0' == param[0]) { + return dflt ? dflt : 1; + } + + return atoi(param); +} + static void init_apm_port(mca_btl_openib_hca_t *hca, int port, uint16_t lid) { int index; @@ -985,6 +1004,206 @@ static void init_apm_port(mca_btl_openib_hca_t *hca, int port, uint16_t lid) } } +static int setup_qps(void) +{ + char **queues, **params = NULL; + int num_xrc_qps = 0, num_pp_qps = 0, num_srq_qps = 0, qp = 0; + uint32_t max_qp_size, max_size_needed; + int32_t min_freelist_size = 0; + int smallest_pp_qp = 0, ret = OMPI_ERROR; + + queues = opal_argv_split(mca_btl_openib_component.receive_queues, ':'); + if (0 == opal_argv_count(queues)) { + orte_show_help("help-mpi-btl-openib.txt", + "no qps in receive_queues", true, + orte_process_info.nodename, + mca_btl_openib_component.receive_queues); + ret = OMPI_ERROR; + goto error; + } + + while (queues[qp] != NULL) { + if (0 == strncmp("P,", queues[qp], 2)) { + num_pp_qps++; + if (smallest_pp_qp > qp) { + smallest_pp_qp = qp; + } + } else if (0 == strncmp("S,", queues[qp], 2)) { + num_srq_qps++; + } else if (0 == strncmp("X,", queues[qp], 2)) { +#if HAVE_XRC + num_xrc_qps++; +#else + orte_show_help("help-mpi-btl-openib.txt", "No XRC support", true, + orte_process_info.nodename, + mca_btl_openib_component.receive_queues); + ret = OMPI_ERR_RESOURCE_UNAVAILABLE; + goto error; +#endif + } else { + orte_show_help("help-mpi-btl-openib.txt", + "invalid qp type in receive_queues", true, + orte_process_info.nodename, + mca_btl_openib_component.receive_queues, + queues[qp]); + ret = OMPI_ERR_BAD_PARAM; + goto error; + } + qp++; + } + /* Current XRC implementation can't used with other QP 
types - PP + and SRQ */ + if (num_xrc_qps > 0 && (num_pp_qps > 0 || num_srq_qps > 0)) { + orte_show_help("help-mpi-btl-openib.txt", "XRC with PP or SRQ", true, + orte_process_info.nodename, + mca_btl_openib_component.receive_queues); + ret = OMPI_ERR_BAD_PARAM; + goto error; + } + + /* Current XRC implementation can't used with btls_per_lid > 1 */ + if (num_xrc_qps > 0 && mca_btl_openib_component.btls_per_lid > 1) { + orte_show_help("help-mpi-btl-openib.txt", "XRC with BTLs per LID", + true, orte_process_info.nodename, + mca_btl_openib_component.receive_queues, num_xrc_qps); + ret = OMPI_ERR_BAD_PARAM; + goto error; + } + mca_btl_openib_component.num_pp_qps = num_pp_qps; + mca_btl_openib_component.num_srq_qps = num_srq_qps; + mca_btl_openib_component.num_xrc_qps = num_xrc_qps; + mca_btl_openib_component.num_qps = num_pp_qps + num_srq_qps + num_xrc_qps; + + mca_btl_openib_component.qp_infos = (mca_btl_openib_qp_info_t*) + malloc(sizeof(mca_btl_openib_qp_info_t) * + mca_btl_openib_component.num_qps); + + qp = 0; +#define P(N) (((N) > count) ? 
NULL : params[(N)]) + while (queues[qp] != NULL) { + int count; + int32_t rd_low, rd_num; + params = opal_argv_split_with_empty(queues[qp], ','); + count = opal_argv_count(params); + + if ('P' == params[0][0]) { + int32_t rd_win, rd_rsv; + if (count < 3 || count > 6) { + orte_show_help("help-mpi-btl-openib.txt", + "invalid pp qp specification", true, + orte_process_info.nodename, queues[qp]); + ret = OMPI_ERR_BAD_PARAM; + goto error; + } + mca_btl_openib_component.qp_infos[qp].type = MCA_BTL_OPENIB_PP_QP; + mca_btl_openib_component.qp_infos[qp].size = atoi_param(P(1), 0); + rd_num = atoi_param(P(2), 256); + /* by default set rd_low to be 3/4 of rd_num */ + rd_low = atoi_param(P(3), rd_num - (rd_num / 4)); + rd_win = atoi_param(P(4), (rd_num - rd_low) * 2); + rd_rsv = atoi_param(P(5), (rd_num * 2) / rd_win); + + BTL_VERBOSE(("pp: rd_num is %d rd_low is %d rd_win %d rd_rsv %d", + rd_num, rd_low, rd_win, rd_rsv)); + + /* Calculate the smallest freelist size that can be allowed */ + if (rd_num + rd_rsv > min_freelist_size) { + min_freelist_size = rd_num + rd_rsv; + } + + mca_btl_openib_component.qp_infos[qp].u.pp_qp.rd_win = rd_win; + mca_btl_openib_component.qp_infos[qp].u.pp_qp.rd_rsv = rd_rsv; + if ((rd_num - rd_low) > rd_win) { + orte_show_help("help-mpi-btl-openib.txt", "non optimal rd_win", + true, rd_win, rd_num - rd_low); + } + } else { + int32_t sd_max; + if (count < 3 || count > 5) { + orte_show_help("help-mpi-btl-openib.txt", + "invalid srq specification", true, + orte_process_info.nodename, queues[qp]); + ret = OMPI_ERR_BAD_PARAM; + goto error; + } + mca_btl_openib_component.qp_infos[qp].type = (params[0][0] =='X') ? 
+ MCA_BTL_OPENIB_XRC_QP : MCA_BTL_OPENIB_SRQ_QP; + mca_btl_openib_component.qp_infos[qp].size = atoi_param(P(1), 0); + rd_num = atoi_param(P(2), 256); + /* by default set rd_low to be 3/4 of rd_num */ + rd_low = atoi_param(P(3), rd_num - (rd_num / 4)); + sd_max = atoi_param(P(4), rd_low / 4); + BTL_VERBOSE(("srq: rd_num is %d rd_low is %d sd_max is %d", + rd_num, rd_low, sd_max)); + + /* Calculate the smallest freelist size that can be allowed */ + if (rd_num > min_freelist_size) { + min_freelist_size = rd_num; + } + + mca_btl_openib_component.qp_infos[qp].u.srq_qp.sd_max = sd_max; + } + + if (rd_num <= rd_low) { + orte_show_help("help-mpi-btl-openib.txt", "rd_num must be > rd_low", + true, orte_process_info.nodename, queues[qp]); + ret = OMPI_ERR_BAD_PARAM; + goto error; + } + mca_btl_openib_component.qp_infos[qp].rd_num = rd_num; + mca_btl_openib_component.qp_infos[qp].rd_low = rd_low; + opal_argv_free(params); + qp++; + } + params = NULL; + + /* Sanity check some sizes */ + + max_qp_size = mca_btl_openib_component.qp_infos[mca_btl_openib_component.num_qps - 1].size; + max_size_needed = (mca_btl_openib_module.super.btl_eager_limit > + mca_btl_openib_module.super.btl_max_send_size) ? 
+ mca_btl_openib_module.super.btl_eager_limit : + mca_btl_openib_module.super.btl_max_send_size; + if (max_qp_size < max_size_needed) { + orte_show_help("help-mpi-btl-openib.txt", + "biggest qp size is too small", true, + orte_process_info.nodename, max_qp_size, + max_size_needed); + ret = OMPI_ERR_BAD_PARAM; + goto error; + } else if (max_qp_size > max_size_needed) { + orte_show_help("help-mpi-btl-openib.txt", + "biggest qp size is too big", true, + orte_process_info.nodename, max_qp_size, + max_size_needed); + } + + if (mca_btl_openib_component.ib_free_list_max > 0 && + min_freelist_size > mca_btl_openib_component.ib_free_list_max) { + orte_show_help("help-mpi-btl-openib.txt", "freelist too small", true, + orte_process_info.nodename, + mca_btl_openib_component.ib_free_list_max, + min_freelist_size); + ret = OMPI_ERR_BAD_PARAM; + goto error; + } + + mca_btl_openib_component.rdma_qp = mca_btl_openib_component.num_qps - 1; + mca_btl_openib_component.credits_qp = smallest_pp_qp; + + ret = OMPI_SUCCESS; +error: + if (NULL != params) { + opal_argv_free(params); + } + + if (NULL != queues) { + opal_argv_free(queues); + } + + return ret; +} + static int init_one_hca(opal_list_t *btl_list, struct ibv_device* ib_dev) { struct mca_mpool_base_resources_t mpool_resources; @@ -1023,26 +1242,12 @@ static int init_one_hca(opal_list_t *btl_list, struct ibv_device* ib_dev) /* If mca_btl_if_include/exclude were specified, get usable ports */ allowed_ports = (int*)malloc(hca->ib_dev_attr.phys_port_cnt * sizeof(int)); port_cnt = get_port_list(hca, allowed_ports); - if(0 == port_cnt) { - ret = OMPI_SUCCESS; + if (0 == port_cnt) { free(allowed_ports); - goto error; - } -#if HAVE_XRC - /* if user configured to run with XRC qp and the device don't support it - - * we should ignore this hca. 
Maybe we have other one that have XRC support - */ - if (!(hca->ib_dev_attr.device_cap_flags & IBV_DEVICE_XRC) && - mca_btl_openib_component.num_xrc_qps > 0) { - orte_show_help("help-mpi-btl-openib.txt", - "XRC on device without XRC support", true, - mca_btl_openib_component.num_xrc_qps, - ibv_get_device_name(hca->ib_dev), - orte_process_info.nodename); ret = OMPI_SUCCESS; goto error; } -#endif + /* Load in vendor/part-specific HCA parameters. Note that even if we don't find values for this vendor/part, "values" will be set indicating that it does not have good values */ @@ -1102,11 +1307,67 @@ static int init_one_hca(opal_list_t *btl_list, struct ibv_device* ib_dev) hca->mtu = mca_btl_openib_component.ib_mtu; } + /* If the user specified btl_openib_receive_queues MCA param, it + overrides all HCA INI params */ + if (BTL_OPENIB_RQ_SOURCE_MCA != + mca_btl_openib_component.receive_queues_source && + NULL != values.receive_queues) { + /* If a prior HCA's INI values set a different value for + receive_queues, this is unsupported (see + https://svn.open-mpi.org/trac/ompi/ticket/1285) */ + if (BTL_OPENIB_RQ_SOURCE_HCA_INI == + mca_btl_openib_component.receive_queues_source) { + if (0 != strcmp(values.receive_queues, + mca_btl_openib_component.receive_queues)) { + orte_show_help("help-mpi-btl-openib.txt", + "conflicting receive_queues", true, + orte_process_info.nodename, + ibv_get_device_name(hca->ib_dev), + hca->ib_dev_attr.vendor_id, + hca->ib_dev_attr.vendor_part_id, + values.receive_queues, + ibv_get_device_name(receive_queues_hca->ib_dev), + receive_queues_hca->ib_dev_attr.vendor_id, + receive_queues_hca->ib_dev_attr.vendor_part_id, + mca_btl_openib_component.receive_queues, + opal_install_dirs.pkgdatadir); + ret = OMPI_ERR_RESOURCE_BUSY; + goto error; + } + } else { + if (NULL != mca_btl_openib_component.receive_queues) { + free(mca_btl_openib_component.receive_queues); + } + receive_queues_hca = hca; + mca_btl_openib_component.receive_queues = + 
strdup(values.receive_queues); + mca_btl_openib_component.receive_queues_source = + BTL_OPENIB_RQ_SOURCE_HCA_INI; + } + } + /* If "use eager rdma" was set, then enable it on this HCA */ if (values.use_eager_rdma_set) { hca->use_eager_rdma = values.use_eager_rdma; } +#if HAVE_XRC + /* if user configured to run with XRC qp and the device doesn't + * support it - we should ignore this hca. Maybe we have another + * one that has XRC support + */ + if (!(hca->ib_dev_attr.device_cap_flags & IBV_DEVICE_XRC) && + mca_btl_openib_component.num_xrc_qps > 0) { + orte_show_help("help-mpi-btl-openib.txt", + "XRC on device without XRC support", true, + mca_btl_openib_component.num_xrc_qps, + ibv_get_device_name(hca->ib_dev), + orte_process_info.nodename); + ret = OMPI_SUCCESS; + goto error; + } +#endif + /* Allocate the protection domain for the HCA */ hca->ib_pd = ibv_alloc_pd(hca->ib_dev_context); if(NULL == hca->ib_pd){ @@ -1199,10 +1460,7 @@ static int init_one_hca(opal_list_t *btl_list, struct ibv_device* ib_dev) "apm not enough ports", true); mca_btl_openib_component.apm_ports = 0; } - ret = prepare_hca_for_use(hca); - if(OMPI_SUCCESS == ret) { - return OMPI_SUCCESS; - } + return OMPI_SUCCESS; } error: @@ -1560,10 +1818,6 @@ btl_openib_component_init(int *num_btl_modules, dev_sorted = sort_devs_by_distance(ib_devs, num_devs); - /* We must loop through all the hca id's, get their handles and - for each hca we query the number of ports on the hca and set up - a distinct btl module for each hca port */ - OBJ_CONSTRUCT(&btl_list, opal_list_t); OBJ_CONSTRUCT(&mca_btl_openib_component.ib_lock, opal_mutex_t); #if OMPI_HAVE_THREADS @@ -1585,7 +1839,8 @@ btl_openib_component_init(int *num_btl_modules, if (OMPI_SUCCESS != ret) { orte_show_help("help-mpi-btl-openib.txt", - "error in hca init", true, orte_process_info.nodename); + "error in hca init", true, orte_process_info.nodename, + ibv_get_device_name(dev_sorted[i].ib_dev)); return NULL; } @@ -1612,6 +1867,45 @@ 
btl_openib_component_init(int *num_btl_modules, return NULL; } + /* Setup the BSRQ QP's based on the final value of + mca_btl_openib_component.receive_queues. */ + setup_qps(); + + /* Loop through all the btl modules that we made and find every + base HCA that doesn't have hca->qps setup on it yet (remember + that some modules may share the same HCA, so when going through + the loop, we may hit an HCA that was already setup earlier in + the loop). */ + for (item = opal_list_get_first(&btl_list); + opal_list_get_end(&btl_list) != item; + item = opal_list_get_next(item)) { + mca_btl_base_selected_module_t *m = + (mca_btl_base_selected_module_t*) item; + mca_btl_openib_hca_t *hca = + ((mca_btl_openib_module_t*) m->btl_module)->hca; + if (NULL == hca->qps) { + + /* Setup the HCA qps info */ + hca->qps = (mca_btl_openib_hca_qp_t*) + calloc(mca_btl_openib_component.num_qps, + sizeof(mca_btl_openib_hca_qp_t)); + for (i = 0; i < mca_btl_openib_component.num_qps; i++) { + OBJ_CONSTRUCT(&hca->qps[i].send_free, ompi_free_list_t); + OBJ_CONSTRUCT(&hca->qps[i].recv_free, ompi_free_list_t); + } + + /* Do final init on HCA */ + ret = prepare_hca_for_use(hca); + if (OMPI_SUCCESS != ret) { + orte_show_help("help-mpi-btl-openib.txt", + "error in hca init", true, + orte_process_info.nodename, + ibv_get_device_name(hca->ib_dev)); + return NULL; + } + } + } + /* Allocate space for btl modules */ mca_btl_openib_component.openib_btls = malloc(sizeof(mca_btl_openib_module_t*) * diff --git a/ompi/mca/btl/openib/btl_openib_ini.c b/ompi/mca/btl/openib/btl_openib_ini.c index 5ffeab196e..dbc7362a1a 100644 --- a/ompi/mca/btl/openib/btl_openib_ini.c +++ b/ompi/mca/btl/openib/btl_openib_ini.c @@ -23,6 +23,8 @@ #include #include +#include + #include "orte/util/output.h" #include "opal/mca/base/mca_base_param.h" @@ -388,6 +390,12 @@ static int parse_line(parsed_section_values_t *sv) sv->values.use_eager_rdma_set = true; } + else if (0 == strcasecmp(key_buffer, "receive_queues")) { + /* Single value 
(already strdup'ed) */ + sv->values.receive_queues = value; + value = NULL; + } + else { /* Have no idea what this parameter is. Not an error -- just ignore it */ @@ -429,6 +437,9 @@ static void hca_values_destructor(hca_values_t *s) if (NULL != s->section_name) { free(s->section_name); } + if (NULL != s->values.receive_queues) { + free(s->values.receive_queues); + } } @@ -469,6 +480,8 @@ static void reset_values(ompi_btl_openib_ini_values_t *v) v->use_eager_rdma = 0; v->use_eager_rdma_set = false; + + v->receive_queues = NULL; } @@ -532,6 +545,10 @@ static int save_section(parsed_section_values_t *s) containing bool members by value. So do a memcpy here instead. */ memcpy(&h->values, &s->values, sizeof(s->values)); + /* Need to strdup the string, though */ + if (NULL != h->values.receive_queues) { + h->values.receive_queues = strdup(s->values.receive_queues); + } opal_list_append(&hcas, &h->super); } } @@ -586,14 +603,26 @@ static int intify_list(char *value, uint32_t **values, int *len) *values[0] = (uint32_t) intify(str); *len = 1; } else { - /* If we found a comma, loop over all the values. Be a - little clever in that we alwasy alloc enough space for - an extra value so that when we exit the loop, we don't - have to realloc again to get space for the last item. 
*/ + int newsize = 1; + + /* Count how many values there are and allocate enough space + for them */ + while (NULL != comma) { + ++newsize; + str = comma + 1; + comma = strchr(str, ','); + } + *values = malloc(sizeof(uint32_t) * newsize); + if (NULL == *values) { + return OMPI_ERR_OUT_OF_RESOURCE; + } + + /* Iterate over the values and save them */ + str = value; + comma = strchr(str, ','); do { *comma = '\0'; - *values = realloc(*values, sizeof(uint32_t) * (*len + 2)); - (*values)[*len] = (int32_t) intify(str); + (*values)[*len] = (uint32_t) intify(str); ++(*len); str = comma + 1; comma = strchr(str, ','); diff --git a/ompi/mca/btl/openib/btl_openib_ini.h b/ompi/mca/btl/openib/btl_openib_ini.h index 0cacd927dd..5f60c7ec58 100644 --- a/ompi/mca/btl/openib/btl_openib_ini.h +++ b/ompi/mca/btl/openib/btl_openib_ini.h @@ -25,6 +25,8 @@ typedef struct ompi_btl_openib_ini_values_t { uint32_t use_eager_rdma; bool use_eager_rdma_set; + + char *receive_queues; } ompi_btl_openib_ini_values_t; diff --git a/ompi/mca/btl/openib/btl_openib_mca.c b/ompi/mca/btl/openib/btl_openib_mca.c index 175a39ca48..bc6dc2e000 100644 --- a/ompi/mca/btl/openib/btl_openib_mca.c +++ b/ompi/mca/btl/openib/btl_openib_mca.c @@ -52,8 +52,6 @@ enum { REGSTR_MAX = 0x88 }; -static int mca_btl_openib_mca_setup_qps(void); - /* * utility routine for string parameter registration @@ -109,6 +107,9 @@ static inline int reg_int(const char* param_name, const char* param_desc, */ int btl_openib_register_mca_params(void) { + char default_qps[100]; + uint32_t mid_qp_size; + int i; char *msg, *str; int ival, ival2, ret, tmp; @@ -485,7 +486,33 @@ int btl_openib_register_mca_params(void) &mca_btl_openib_module.super)); /* setup all the qp stuff */ - CHECK(mca_btl_openib_mca_setup_qps()); + mid_qp_size = mca_btl_openib_module.super.btl_eager_limit / 4; + /* round mid_qp_size to smallest power of two */ + for(i = 31; i > 0; i--) { + if(!(mid_qp_size & (1< 0; i--) { - if(!(mid_qp_size & (1< qp) - smallest_pp_qp = qp; - 
} else if (0 == strncmp("S,", queues[qp], 2)) { - num_srq_qps++; - } else if (0 == strncmp("X,", queues[qp], 2)) { -#if HAVE_XRC - num_xrc_qps++; -#else - orte_show_help("help-mpi-btl-openib.txt", "No XRC support", true, - orte_process_info.nodename, str); - goto error; -#endif - } else { - orte_show_help("help-mpi-btl-openib.txt", - "invalid qp type in receive_queues", true, - orte_process_info.nodename, str, queues[qp]); - goto error; - } - qp++; - } - /* Current XRC implementation can't used with other QP types - PP and SRQ */ - if (num_xrc_qps > 0 && (num_pp_qps > 0 || num_srq_qps > 0)) { - orte_show_help("help-mpi-btl-openib.txt", "XRC with PP or SRQ", true, - orte_process_info.nodename, str); - goto error; - } - - /* Current XRC implementation can't used with btls_per_lid > 1 */ - if (num_xrc_qps > 0 && mca_btl_openib_component.btls_per_lid > 1) { - orte_show_help("help-mpi-btl-openib.txt", "XRC with BTLs per LID", true, - orte_process_info.nodename, str, num_xrc_qps); - goto error; - } - mca_btl_openib_component.num_pp_qps = num_pp_qps; - mca_btl_openib_component.num_srq_qps = num_srq_qps; - mca_btl_openib_component.num_xrc_qps = num_xrc_qps; - mca_btl_openib_component.num_qps = num_pp_qps + num_srq_qps + num_xrc_qps; - - mca_btl_openib_component.qp_infos = (mca_btl_openib_qp_info_t*) - malloc(sizeof(mca_btl_openib_qp_info_t) * - mca_btl_openib_component.num_qps); - - qp = 0; -#define P(N) (((N) > count)?NULL:params[(N)]) - while(queues[qp] != NULL) { - int i = 0, count; - int32_t rd_low, rd_num; - params = opal_argv_split_with_empty(queues[qp], ','); - count = opal_argv_count(params); - - if ('P' == params[0][0]) { - int32_t rd_win, rd_rsv; - if (count < 3 || count > 6) { - orte_show_help("help-mpi-btl-openib.txt", - "invalid pp qp specification", true, - orte_process_info.nodename, queues[qp]); - goto error; - } - mca_btl_openib_component.qp_infos[qp].type = MCA_BTL_OPENIB_PP_QP; - mca_btl_openib_component.qp_infos[qp].size = atoi_param(P(1), 0); - rd_num 
= atoi_param(P(2), 256); - /* by default set rd_low to be 3/4 of rd_num */ - rd_low = atoi_param(P(3), rd_num - (rd_num / 4)); - rd_win = atoi_param(P(4), (rd_num - rd_low) * 2); - rd_rsv = atoi_param(P(5), (rd_num * 2) / rd_win); - - BTL_VERBOSE(("pp: rd_num is %d rd_low is %d rd_win %d rd_rsv %d", - rd_num, rd_low, rd_win, rd_rsv)); - - /* Calculate the smallest freelist size that can be allowed */ - if (rd_num + rd_rsv > min_freelist_size) - min_freelist_size = rd_num + rd_rsv; - - mca_btl_openib_component.qp_infos[qp].u.pp_qp.rd_win = rd_win; - mca_btl_openib_component.qp_infos[qp].u.pp_qp.rd_rsv = rd_rsv; - if((rd_num - rd_low) > rd_win) - orte_show_help("help-mpi-btl-openib.txt", "non optimal rd_win", - true, rd_win, rd_num - rd_low); - } else { - int32_t sd_max; - if(count < 3 || count > 5) { - orte_show_help("help-mpi-btl-openib.txt", - "invalid srq specification", true, - orte_process_info.nodename, queues[qp]); - goto error; - } - mca_btl_openib_component.qp_infos[qp].type = (params[0][0] =='X') ? 
- MCA_BTL_OPENIB_XRC_QP : MCA_BTL_OPENIB_SRQ_QP; - mca_btl_openib_component.qp_infos[qp].size = atoi_param(P(1), 0); - rd_num = atoi_param(P(2), 256); - /* by default set rd_low to be 3/4 of rd_num */ - rd_low = atoi_param(P(3), rd_num - (rd_num / 4)); - sd_max = atoi_param(P(4), rd_low / 4); - BTL_VERBOSE(("srq: rd_num is %d rd_low is %d sd_max is %d", - rd_num, rd_low, sd_max)); - - /* Calculate the smallest freelist size that can be allowed */ - if (rd_num > min_freelist_size) - min_freelist_size = rd_num; - - mca_btl_openib_component.qp_infos[qp].u.srq_qp.sd_max = sd_max; - } - - if (rd_num <= rd_low) { - orte_show_help("help-mpi-btl-openib.txt", "rd_num must be > rd_low", - true, orte_process_info.nodename, queues[qp]); - goto error; - } - mca_btl_openib_component.qp_infos[qp].rd_num = rd_num; - mca_btl_openib_component.qp_infos[qp].rd_low = rd_low; - while (NULL != params[i]) { - free(params[i++]); - } - free(params); - qp++; - } - params = NULL; - - /* Sanity check some sizes */ - - max_qp_size = mca_btl_openib_component.qp_infos[mca_btl_openib_component.num_qps - 1].size; - max_size_needed = (mca_btl_openib_module.super.btl_eager_limit > - mca_btl_openib_module.super.btl_max_send_size) ? - mca_btl_openib_module.super.btl_eager_limit : - mca_btl_openib_module.super.btl_max_send_size; - if (max_qp_size < max_size_needed) { - orte_show_help("help-mpi-btl-openib.txt", - "biggest qp size is too small", true, - orte_process_info.nodename, max_qp_size, - max_size_needed); - ret = OMPI_ERROR; - goto error; - } else if (max_qp_size > max_size_needed) { - orte_show_help("help-mpi-btl-openib.txt", - "biggest qp size is too big", true, - orte_process_info.nodename, max_qp_size, - max_size_needed); - orte_output(0, "The biggest QP size is bigger than maximum send size. 
" - "This is not optimal configuration as memory will be wasted."); - } - - if (mca_btl_openib_component.ib_free_list_max > 0 && - min_freelist_size > mca_btl_openib_component.ib_free_list_max) { - orte_show_help("help-mpi-btl-openib.txt", "freelist too small", true, - orte_process_info.nodename, - mca_btl_openib_component.ib_free_list_max, - min_freelist_size); - goto error; - } - - mca_btl_openib_component.rdma_qp = mca_btl_openib_component.num_qps - 1; - mca_btl_openib_component.credits_qp = smallest_pp_qp; - /* Register any MCA params for the connect pseudo-components */ - if (OMPI_SUCCESS != ompi_btl_openib_connect_base_register()) - goto error; - - ret = OMPI_SUCCESS; -error: - if(params) { - qp = 0; - while(params[qp] != NULL) - free(params[qp++]); - free(params); - } - - if(queues) { - qp = 0; - while(queues[qp] != NULL) - free(queues[qp++]); - free(queues); + if (OMPI_SUCCESS == ret) { + ret = ompi_btl_openib_connect_base_register(); } return ret; diff --git a/ompi/mca/btl/openib/help-mpi-btl-openib.txt b/ompi/mca/btl/openib/help-mpi-btl-openib.txt index 5808e3e73c..d6b495c068 100644 --- a/ompi/mca/btl/openib/help-mpi-btl-openib.txt +++ b/ompi/mca/btl/openib/help-mpi-btl-openib.txt @@ -169,6 +169,12 @@ no active ports detected. This is most certainly not what you wanted. Check your cables and SM configuration. # [error in hca init] +WARNING: There was an error initializing an OpenFabrics NIC/HCA. + +Hostname: %s +Device: %s +# +[error in hca init] WARNING: There were errors during IB HCA initialization on host '%s'. # [default subnet prefix] @@ -448,3 +454,20 @@ Can not provide %d alternative paths with LMC bit configured to %d. [apm not enough ports] WARNING: For APM over ports ompi require at least 2 active ports and only single active port was found. Disabling APM over ports +# +[conflicting receive_queues] +Open MPI detected two different sets of OpenFabrics receive queues on +the same host (in the openib BTL). 
Open MPI currently only supports +one set of OpenFabrics receive queues in an MPI job, even if you have different +types of OpenFabrics adapters on the same host. + +Host: %s +Adapter 1: %s (vendor 0x%x, part ID %d) +Queues: %s +Adapter 2: %s (vendor 0x%x, part ID %d) +Queues: %s + +Note that these receive queue values may have come from the Open MPI +adapter default settings file: + + %s/mca-btl-openib-hca-params.ini diff --git a/ompi/mca/btl/openib/mca-btl-openib-hca-params.ini b/ompi/mca/btl/openib/mca-btl-openib-hca-params.ini index 7f4a3703fb..304a5b52c8 100644 --- a/ompi/mca/btl/openib/mca-btl-openib-hca-params.ini +++ b/ompi/mca/btl/openib/mca-btl-openib-hca-params.ini @@ -139,3 +139,4 @@ vendor_id = 0x1425 vendor_part_id = 0x0020,0x0021,0x0022,0x0023,0x0024,0x0025,0x0026,0x0030,0x0031,0x0032 use_eager_rdma = 1 mtu = 2048 +receive_queues = P,65536,256,192,128