Fixes trac:1285:
* allow receive_queues to be specified in the INI file
* detect when multiple different receive_queues are specified and gracefully abort

However, accomplishing these goals ran into multiple difficulties. By putting receive_queues in the INI file:

1. we may not find the value until we've already traversed multiple HCAs
2. we may find multiple different receive_queues values

But since the openib btl initializes as it discovers each HCA/port/LID (including the BSRQ data), if we find a new receive_queues value late in the discovery process, then all the BSRQ data that was previously initialized will likely be invalid. So I had to pull all the BSRQ initialization out until after the rest of the discovery / initialization process.

Additionally, note that if the user specifies the MCA parameter btl_openib_receive_queues, it trumps whatever was in the INI file. So in this case, there can never be a receive_queues conflict.

This commit does the following (Jon wrote part of this, too):

* adapt _ini.c to accept the "receive_queues" field in the file
* move 90% of _setup_qps() from _ini.c to _component.c
* move what was left of _setup_qps() into the main _register_mca_params() function
* adapt init_one_hca() to detect conflicting receive_queues values from the INI file
* after the _component.c loop calling init_one_hca():
  * call setup_qps() to parse the final receive_queues string value
  * traverse all resulting btls and initialize their HCAs (if they weren't already): set up some lists and call prepare_hca_for_use()

I tested this code on a dual-HCA system where I artificially put in differing receive_queues values in the INI file for the two different types of HCAs that I have and it all seemed to work.

This commit was SVN r18450.

The following Trac tickets were found above:
  Ticket 1285 --> https://svn.open-mpi.org/trac/ompi/ticket/1285
Этот коммит содержится в:
родитель
87d4201bdf
Коммит
5295902ebe
@ -110,6 +110,13 @@ struct mca_btl_openib_qp_info_t {
|
||||
#define BTL_OPENIB_QP_TYPE_XRC(Q) \
|
||||
(BTL_OPENIB_QP_TYPE(Q) == MCA_BTL_OPENIB_XRC_QP)
|
||||
|
||||
/*
 * Records where the current value of
 * mca_btl_openib_component.receive_queues came from.
 */
typedef enum {
    /* The value is identical to the built-in default string */
    BTL_OPENIB_RQ_SOURCE_DEFAULT = 0,
    /* The btl_openib_receive_queues MCA parameter holds a non-default
       value; this source overrides any HCA INI file value */
    BTL_OPENIB_RQ_SOURCE_MCA,
    /* The value was taken from an HCA's section of the INI file */
    BTL_OPENIB_RQ_SOURCE_HCA_INI,
    /* NOTE(review): presumably an end-of-list sentinel; not referenced
       by the code visible here -- confirm */
    BTL_OPENIB_RQ_SOURCE_HCA_MAX
} btl_openib_receive_queues_source_t;
|
||||
|
||||
struct mca_btl_openib_component_t {
|
||||
mca_btl_base_component_1_0_1_t super; /**< base BTL component */
|
||||
|
||||
@ -197,6 +204,11 @@ struct mca_btl_openib_component_t {
|
||||
char *if_exclude;
|
||||
char **if_exclude_list;
|
||||
|
||||
/* MCA param btl_openib_receive_queues */
|
||||
char *receive_queues;
|
||||
/* Whether we got a non-default value of btl_openib_receive_queues */
|
||||
btl_openib_receive_queues_source_t receive_queues_source;
|
||||
|
||||
/** Colon-delimited list of filenames for HCA parameters */
|
||||
char *hca_params_file_names;
|
||||
|
||||
|
@ -42,6 +42,7 @@
|
||||
#include "opal/mca/carto/carto.h"
|
||||
#include "opal/mca/carto/base/base.h"
|
||||
#include "opal/mca/paffinity/base/base.h"
|
||||
#include "opal/mca/installdirs/installdirs.h"
|
||||
|
||||
#include "orte/mca/errmgr/errmgr.h"
|
||||
#include "orte/util/proc_info.h"
|
||||
@ -80,6 +81,11 @@ static int btl_openib_component_close(void);
|
||||
static mca_btl_base_module_t **btl_openib_component_init(int*, bool, bool);
|
||||
static int btl_openib_component_progress(void);
|
||||
|
||||
/*
|
||||
* Local variables
|
||||
*/
|
||||
static mca_btl_openib_hca_t *receive_queues_hca = NULL;
|
||||
|
||||
mca_btl_openib_component_t mca_btl_openib_component = {
|
||||
{
|
||||
/* First, the mca_base_component_t struct containing meta information
|
||||
@ -664,8 +670,6 @@ static int init_one_port(opal_list_t *btl_list, mca_btl_openib_hca_t *hca,
|
||||
|
||||
static void hca_construct(mca_btl_openib_hca_t *hca)
|
||||
{
|
||||
int i;
|
||||
|
||||
hca->ib_dev = NULL;
|
||||
hca->ib_dev_context = NULL;
|
||||
hca->ib_pd = NULL;
|
||||
@ -687,13 +691,8 @@ static void hca_construct(mca_btl_openib_hca_t *hca)
|
||||
#if HAVE_XRC
|
||||
hca->xrc_fd = -1;
|
||||
#endif
|
||||
hca->qps = (mca_btl_openib_hca_qp_t*)calloc(mca_btl_openib_component.num_qps,
|
||||
sizeof(mca_btl_openib_hca_qp_t));
|
||||
hca->qps = NULL;
|
||||
OBJ_CONSTRUCT(&hca->hca_lock, opal_mutex_t);
|
||||
for(i = 0; i < mca_btl_openib_component.num_qps; i++) {
|
||||
OBJ_CONSTRUCT(&hca->qps[i].send_free, ompi_free_list_t);
|
||||
OBJ_CONSTRUCT(&hca->qps[i].recv_free, ompi_free_list_t);
|
||||
}
|
||||
OBJ_CONSTRUCT(&hca->send_free_control, ompi_free_list_t);
|
||||
}
|
||||
|
||||
@ -709,13 +708,14 @@ static void hca_destruct(mca_btl_openib_hca_t *hca)
|
||||
free(hca->eager_rdma_buffers);
|
||||
}
|
||||
OBJ_DESTRUCT(&hca->hca_lock);
|
||||
for(i = 0; i < mca_btl_openib_component.num_qps; i++) {
|
||||
OBJ_DESTRUCT(&hca->qps[i].send_free);
|
||||
OBJ_DESTRUCT(&hca->qps[i].recv_free);
|
||||
}
|
||||
OBJ_DESTRUCT(&hca->send_free_control);
|
||||
if(hca->qps)
|
||||
if (hca->qps) {
|
||||
for (i = 0; i < mca_btl_openib_component.num_qps; i++) {
|
||||
OBJ_DESTRUCT(&hca->qps[i].send_free);
|
||||
OBJ_DESTRUCT(&hca->qps[i].recv_free);
|
||||
}
|
||||
free(hca->qps);
|
||||
}
|
||||
}
|
||||
|
||||
OBJ_CLASS_INSTANCE(mca_btl_openib_hca_t, opal_object_t, hca_construct,
|
||||
@ -947,6 +947,9 @@ done:
|
||||
return num_ports;
|
||||
}
|
||||
|
||||
/*
|
||||
* Prefer values that are already in the target
|
||||
*/
|
||||
static void merge_values(ompi_btl_openib_ini_values_t *target,
|
||||
ompi_btl_openib_ini_values_t *src)
|
||||
{
|
||||
@ -959,6 +962,12 @@ static void merge_values(ompi_btl_openib_ini_values_t *target,
|
||||
target->use_eager_rdma = src->use_eager_rdma;
|
||||
target->use_eager_rdma_set = true;
|
||||
}
|
||||
|
||||
if (!target->receive_queues_set && src->receive_queues_set) {
|
||||
free(target->receive_queues);
|
||||
target->receive_queues = strdup(src->receive_queues);
|
||||
target->receive_queues_set = true;
|
||||
}
|
||||
}
|
||||
|
||||
static bool inline is_credit_message(const mca_btl_openib_recv_frag_t *frag)
|
||||
@ -969,6 +978,14 @@ static bool inline is_credit_message(const mca_btl_openib_recv_frag_t *frag)
|
||||
(MCA_BTL_OPENIB_CONTROL_CREDITS == chdr->type);
|
||||
}
|
||||
|
||||
/*
 * Convert one comma-delimited field of a receive_queues specification
 * to an int32_t.
 *
 * param: text of the field; NULL or "" means the field was omitted.
 * dflt:  value to use for an omitted field.  A dflt of 0 is promoted
 *        to 1, so an omitted field never produces 0.
 *
 * Returns dflt (or 1) for an omitted field; otherwise the leading
 * decimal number found in param (0 if there is none), clamped to the
 * int32_t range.  strtol() is used rather than atoi() because atoi()
 * has undefined behavior when the value cannot be represented.
 */
static int32_t atoi_param(const char *param, int32_t dflt)
{
    long value;

    if (NULL == param || '\0' == param[0]) {
        return dflt ? dflt : 1;
    }

    /* strtol() saturates at LONG_MIN/LONG_MAX on overflow; narrow that
       further to what int32_t can hold */
    value = strtol(param, NULL, 10);
    if (value > INT32_MAX) {
        return INT32_MAX;
    }
    if (value < INT32_MIN) {
        return INT32_MIN;
    }

    return (int32_t) value;
}
|
||||
|
||||
static void init_apm_port(mca_btl_openib_hca_t *hca, int port, uint16_t lid)
|
||||
{
|
||||
int index;
|
||||
@ -985,6 +1002,203 @@ static void init_apm_port(mca_btl_openib_hca_t *hca, int port, uint16_t lid)
|
||||
}
|
||||
}
|
||||
|
||||
static int setup_qps(void)
|
||||
{
|
||||
char **queues, **params = NULL;
|
||||
int num_xrc_qps = 0, num_pp_qps = 0, num_srq_qps = 0, qp = 0;
|
||||
uint32_t max_qp_size, max_size_needed;
|
||||
int32_t min_freelist_size = 0;
|
||||
int smallest_pp_qp = 0, ret = OMPI_ERROR;
|
||||
|
||||
queues = opal_argv_split(mca_btl_openib_component.receive_queues, ':');
|
||||
|
||||
if (0 == opal_argv_count(queues)) {
|
||||
orte_show_help("help-mpi-btl-openib.txt",
|
||||
"no qps in receive_queues", true,
|
||||
orte_process_info.nodename,
|
||||
mca_btl_openib_component.receive_queues);
|
||||
return OMPI_ERROR;
|
||||
}
|
||||
|
||||
while (queues[qp] != NULL) {
|
||||
if (0 == strncmp("P,", queues[qp], 2)) {
|
||||
num_pp_qps++;
|
||||
if(smallest_pp_qp > qp)
|
||||
smallest_pp_qp = qp;
|
||||
} else if (0 == strncmp("S,", queues[qp], 2)) {
|
||||
num_srq_qps++;
|
||||
} else if (0 == strncmp("X,", queues[qp], 2)) {
|
||||
#if HAVE_XRC
|
||||
num_xrc_qps++;
|
||||
#else
|
||||
orte_show_help("help-mpi-btl-openib.txt", "No XRC support", true,
|
||||
orte_process_info.nodename,
|
||||
mca_btl_openib_component.receive_queues);
|
||||
goto error;
|
||||
#endif
|
||||
} else {
|
||||
orte_show_help("help-mpi-btl-openib.txt",
|
||||
"invalid qp type in receive_queues", true,
|
||||
orte_process_info.nodename,
|
||||
mca_btl_openib_component.receive_queues,
|
||||
queues[qp]);
|
||||
goto error;
|
||||
}
|
||||
qp++;
|
||||
}
|
||||
/* Current XRC implementation can't used with other QP types - PP
|
||||
and SRQ */
|
||||
if (num_xrc_qps > 0 && (num_pp_qps > 0 || num_srq_qps > 0)) {
|
||||
orte_show_help("help-mpi-btl-openib.txt", "XRC with PP or SRQ", true,
|
||||
orte_process_info.nodename,
|
||||
mca_btl_openib_component.receive_queues);
|
||||
goto error;
|
||||
}
|
||||
|
||||
/* Current XRC implementation can't used with btls_per_lid > 1 */
|
||||
if (num_xrc_qps > 0 && mca_btl_openib_component.btls_per_lid > 1) {
|
||||
orte_show_help("help-mpi-btl-openib.txt", "XRC with BTLs per LID",
|
||||
true, orte_process_info.nodename,
|
||||
mca_btl_openib_component.receive_queues, num_xrc_qps);
|
||||
goto error;
|
||||
}
|
||||
mca_btl_openib_component.num_pp_qps = num_pp_qps;
|
||||
mca_btl_openib_component.num_srq_qps = num_srq_qps;
|
||||
mca_btl_openib_component.num_xrc_qps = num_xrc_qps;
|
||||
mca_btl_openib_component.num_qps = num_pp_qps + num_srq_qps + num_xrc_qps;
|
||||
|
||||
mca_btl_openib_component.qp_infos = (mca_btl_openib_qp_info_t*)
|
||||
malloc(sizeof(mca_btl_openib_qp_info_t) *
|
||||
mca_btl_openib_component.num_qps);
|
||||
|
||||
qp = 0;
|
||||
#define P(N) (((N) > count)?NULL:params[(N)])
|
||||
while(queues[qp] != NULL) {
|
||||
int i = 0, count;
|
||||
int32_t rd_low, rd_num;
|
||||
params = opal_argv_split_with_empty(queues[qp], ',');
|
||||
count = opal_argv_count(params);
|
||||
|
||||
if ('P' == params[0][0]) {
|
||||
int32_t rd_win, rd_rsv;
|
||||
if (count < 3 || count > 6) {
|
||||
orte_show_help("help-mpi-btl-openib.txt",
|
||||
"invalid pp qp specification", true,
|
||||
orte_process_info.nodename, queues[qp]);
|
||||
goto error;
|
||||
}
|
||||
mca_btl_openib_component.qp_infos[qp].type = MCA_BTL_OPENIB_PP_QP;
|
||||
mca_btl_openib_component.qp_infos[qp].size = atoi_param(P(1), 0);
|
||||
rd_num = atoi_param(P(2), 256);
|
||||
/* by default set rd_low to be 3/4 of rd_num */
|
||||
rd_low = atoi_param(P(3), rd_num - (rd_num / 4));
|
||||
rd_win = atoi_param(P(4), (rd_num - rd_low) * 2);
|
||||
rd_rsv = atoi_param(P(5), (rd_num * 2) / rd_win);
|
||||
|
||||
BTL_VERBOSE(("pp: rd_num is %d rd_low is %d rd_win %d rd_rsv %d",
|
||||
rd_num, rd_low, rd_win, rd_rsv));
|
||||
|
||||
/* Calculate the smallest freelist size that can be allowed */
|
||||
if (rd_num + rd_rsv > min_freelist_size)
|
||||
min_freelist_size = rd_num + rd_rsv;
|
||||
|
||||
mca_btl_openib_component.qp_infos[qp].u.pp_qp.rd_win = rd_win;
|
||||
mca_btl_openib_component.qp_infos[qp].u.pp_qp.rd_rsv = rd_rsv;
|
||||
if((rd_num - rd_low) > rd_win)
|
||||
orte_show_help("help-mpi-btl-openib.txt", "non optimal rd_win",
|
||||
true, rd_win, rd_num - rd_low);
|
||||
} else {
|
||||
int32_t sd_max;
|
||||
if(count < 3 || count > 5) {
|
||||
orte_show_help("help-mpi-btl-openib.txt",
|
||||
"invalid srq specification", true,
|
||||
orte_process_info.nodename, queues[qp]);
|
||||
goto error;
|
||||
}
|
||||
mca_btl_openib_component.qp_infos[qp].type = (params[0][0] =='X') ?
|
||||
MCA_BTL_OPENIB_XRC_QP : MCA_BTL_OPENIB_SRQ_QP;
|
||||
mca_btl_openib_component.qp_infos[qp].size = atoi_param(P(1), 0);
|
||||
rd_num = atoi_param(P(2), 256);
|
||||
/* by default set rd_low to be 3/4 of rd_num */
|
||||
rd_low = atoi_param(P(3), rd_num - (rd_num / 4));
|
||||
sd_max = atoi_param(P(4), rd_low / 4);
|
||||
BTL_VERBOSE(("srq: rd_num is %d rd_low is %d sd_max is %d",
|
||||
rd_num, rd_low, sd_max));
|
||||
|
||||
/* Calculate the smallest freelist size that can be allowed */
|
||||
if (rd_num > min_freelist_size)
|
||||
min_freelist_size = rd_num;
|
||||
|
||||
mca_btl_openib_component.qp_infos[qp].u.srq_qp.sd_max = sd_max;
|
||||
}
|
||||
|
||||
if (rd_num <= rd_low) {
|
||||
orte_show_help("help-mpi-btl-openib.txt", "rd_num must be > rd_low",
|
||||
true, orte_process_info.nodename, queues[qp]);
|
||||
goto error;
|
||||
}
|
||||
mca_btl_openib_component.qp_infos[qp].rd_num = rd_num;
|
||||
mca_btl_openib_component.qp_infos[qp].rd_low = rd_low;
|
||||
while (NULL != params[i]) {
|
||||
free(params[i++]);
|
||||
}
|
||||
free(params);
|
||||
qp++;
|
||||
}
|
||||
params = NULL;
|
||||
|
||||
/* Sanity check some sizes */
|
||||
|
||||
max_qp_size = mca_btl_openib_component.qp_infos[mca_btl_openib_component.num_qps - 1].size;
|
||||
max_size_needed = (mca_btl_openib_module.super.btl_eager_limit >
|
||||
mca_btl_openib_module.super.btl_max_send_size) ?
|
||||
mca_btl_openib_module.super.btl_eager_limit :
|
||||
mca_btl_openib_module.super.btl_max_send_size;
|
||||
if (max_qp_size < max_size_needed) {
|
||||
orte_show_help("help-mpi-btl-openib.txt",
|
||||
"biggest qp size is too small", true,
|
||||
orte_process_info.nodename, max_qp_size,
|
||||
max_size_needed);
|
||||
ret = OMPI_ERROR;
|
||||
goto error;
|
||||
} else if (max_qp_size > max_size_needed) {
|
||||
orte_show_help("help-mpi-btl-openib.txt",
|
||||
"biggest qp size is too big", true,
|
||||
orte_process_info.nodename, max_qp_size,
|
||||
max_size_needed);
|
||||
}
|
||||
|
||||
if (mca_btl_openib_component.ib_free_list_max > 0 &&
|
||||
min_freelist_size > mca_btl_openib_component.ib_free_list_max) {
|
||||
orte_show_help("help-mpi-btl-openib.txt", "freelist too small", true,
|
||||
orte_process_info.nodename,
|
||||
mca_btl_openib_component.ib_free_list_max,
|
||||
min_freelist_size);
|
||||
goto error;
|
||||
}
|
||||
|
||||
mca_btl_openib_component.rdma_qp = mca_btl_openib_component.num_qps - 1;
|
||||
mca_btl_openib_component.credits_qp = smallest_pp_qp;
|
||||
|
||||
ret = OMPI_SUCCESS;
|
||||
error:
|
||||
if(params) {
|
||||
qp = 0;
|
||||
while(params[qp] != NULL)
|
||||
free(params[qp++]);
|
||||
free(params);
|
||||
}
|
||||
|
||||
if(queues) {
|
||||
qp = 0;
|
||||
while(queues[qp] != NULL)
|
||||
free(queues[qp++]);
|
||||
free(queues);
|
||||
}
|
||||
|
||||
return ret;
|
||||
}
|
||||
|
||||
static int init_one_hca(opal_list_t *btl_list, struct ibv_device* ib_dev)
|
||||
{
|
||||
struct mca_mpool_base_resources_t mpool_resources;
|
||||
@ -1024,25 +1238,12 @@ static int init_one_hca(opal_list_t *btl_list, struct ibv_device* ib_dev)
|
||||
allowed_ports = (int*)malloc(hca->ib_dev_attr.phys_port_cnt * sizeof(int));
|
||||
port_cnt = get_port_list(hca, allowed_ports);
|
||||
if(0 == port_cnt) {
|
||||
free(allowed_ports);
|
||||
ret = OMPI_SUCCESS;
|
||||
free(allowed_ports);
|
||||
goto error;
|
||||
}
|
||||
#if HAVE_XRC
|
||||
/* if user configured to run with XRC qp and the device don't support it -
|
||||
* we should ignore this hca. Maybe we have other one that have XRC support
|
||||
*/
|
||||
if (!(hca->ib_dev_attr.device_cap_flags & IBV_DEVICE_XRC) &&
|
||||
mca_btl_openib_component.num_xrc_qps > 0) {
|
||||
orte_show_help("help-mpi-btl-openib.txt",
|
||||
"XRC on device without XRC support", true,
|
||||
mca_btl_openib_component.num_xrc_qps,
|
||||
ibv_get_device_name(hca->ib_dev),
|
||||
orte_process_info.nodename);
|
||||
ret = OMPI_SUCCESS;
|
||||
goto error;
|
||||
}
|
||||
#endif
|
||||
|
||||
/* Load in vendor/part-specific HCA parameters. Note that even if
|
||||
we don't find values for this vendor/part, "values" will be set
|
||||
indicating that it does not have good values */
|
||||
@ -1102,11 +1303,67 @@ static int init_one_hca(opal_list_t *btl_list, struct ibv_device* ib_dev)
|
||||
hca->mtu = mca_btl_openib_component.ib_mtu;
|
||||
}
|
||||
|
||||
/* If the user specified btl_openib_receive_queues MCA param, it
|
||||
overrides all HCA INI params */
|
||||
if (BTL_OPENIB_RQ_SOURCE_MCA !=
|
||||
mca_btl_openib_component.receive_queues_source &&
|
||||
values.receive_queues_set) {
|
||||
/* If a prior HCA's INI values set a different value for
|
||||
receive_queues, this is unsupported (see
|
||||
https://svn.open-mpi.org/trac/ompi/ticket/1285) */
|
||||
if (BTL_OPENIB_RQ_SOURCE_HCA_INI ==
|
||||
mca_btl_openib_component.receive_queues_source) {
|
||||
if (0 != strcmp(values.receive_queues,
|
||||
mca_btl_openib_component.receive_queues)) {
|
||||
orte_show_help("help-mpi-btl-openib.txt",
|
||||
"conflicting receive_queues", true,
|
||||
orte_process_info.nodename,
|
||||
ibv_get_device_name(hca->ib_dev),
|
||||
hca->ib_dev_attr.vendor_id,
|
||||
hca->ib_dev_attr.vendor_part_id,
|
||||
values.receive_queues,
|
||||
ibv_get_device_name(receive_queues_hca->ib_dev),
|
||||
receive_queues_hca->ib_dev_attr.vendor_id,
|
||||
receive_queues_hca->ib_dev_attr.vendor_part_id,
|
||||
mca_btl_openib_component.receive_queues,
|
||||
opal_install_dirs.pkgdatadir);
|
||||
ret = OMPI_ERR_RESOURCE_BUSY;
|
||||
goto error;
|
||||
}
|
||||
} else {
|
||||
if (NULL != mca_btl_openib_component.receive_queues) {
|
||||
free(mca_btl_openib_component.receive_queues);
|
||||
}
|
||||
receive_queues_hca = hca;
|
||||
mca_btl_openib_component.receive_queues =
|
||||
strdup(values.receive_queues);
|
||||
mca_btl_openib_component.receive_queues_source =
|
||||
BTL_OPENIB_RQ_SOURCE_HCA_INI;
|
||||
}
|
||||
}
|
||||
|
||||
/* If "use eager rdma" was set, then enable it on this HCA */
|
||||
if (values.use_eager_rdma_set) {
|
||||
hca->use_eager_rdma = values.use_eager_rdma;
|
||||
}
|
||||
|
||||
#if HAVE_XRC
|
||||
/* if user configured to run with XRC qp and the device doesn't
|
||||
* support it - we should ignore this hca. Maybe we have another
|
||||
* one that has XRC support
|
||||
*/
|
||||
if (!(hca->ib_dev_attr.device_cap_flags & IBV_DEVICE_XRC) &&
|
||||
mca_btl_openib_component.num_xrc_qps > 0) {
|
||||
orte_show_help("help-mpi-btl-openib.txt",
|
||||
"XRC on device without XRC support", true,
|
||||
mca_btl_openib_component.num_xrc_qps,
|
||||
ibv_get_device_name(hca->ib_dev),
|
||||
orte_process_info.nodename);
|
||||
ret = OMPI_SUCCESS;
|
||||
goto error;
|
||||
}
|
||||
#endif
|
||||
|
||||
/* Allocate the protection domain for the HCA */
|
||||
hca->ib_pd = ibv_alloc_pd(hca->ib_dev_context);
|
||||
if(NULL == hca->ib_pd){
|
||||
@ -1199,10 +1456,7 @@ static int init_one_hca(opal_list_t *btl_list, struct ibv_device* ib_dev)
|
||||
"apm not enough ports", true);
|
||||
mca_btl_openib_component.apm_ports = 0;
|
||||
}
|
||||
ret = prepare_hca_for_use(hca);
|
||||
if(OMPI_SUCCESS == ret) {
|
||||
return OMPI_SUCCESS;
|
||||
}
|
||||
return OMPI_SUCCESS;
|
||||
}
|
||||
|
||||
error:
|
||||
@ -1560,10 +1814,6 @@ btl_openib_component_init(int *num_btl_modules,
|
||||
|
||||
dev_sorted = sort_devs_by_distance(ib_devs, num_devs);
|
||||
|
||||
/* We must loop through all the hca id's, get their handles and
|
||||
for each hca we query the number of ports on the hca and set up
|
||||
a distinct btl module for each hca port */
|
||||
|
||||
OBJ_CONSTRUCT(&btl_list, opal_list_t);
|
||||
OBJ_CONSTRUCT(&mca_btl_openib_component.ib_lock, opal_mutex_t);
|
||||
#if OMPI_HAVE_THREADS
|
||||
@ -1579,8 +1829,9 @@ btl_openib_component_init(int *num_btl_modules,
|
||||
}
|
||||
|
||||
if (OMPI_SUCCESS !=
|
||||
(ret = init_one_hca(&btl_list, dev_sorted[i].ib_dev)))
|
||||
(ret = init_one_hca(&btl_list, dev_sorted[i].ib_dev))) {
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
if (OMPI_SUCCESS != ret) {
|
||||
@ -1612,6 +1863,42 @@ btl_openib_component_init(int *num_btl_modules,
|
||||
return NULL;
|
||||
}
|
||||
|
||||
/* Setup the BSRQ QP's based on the final value of
|
||||
mca_btl_openib_component.receive_queues. */
|
||||
setup_qps();
|
||||
|
||||
/* Loop through all the btl modules that we made and find every
|
||||
base HCA that doesn't have hca->qps setup on it yet (remember
|
||||
that some modules may share the same HCA, so when going through
|
||||
to loop, we may hit an HCA that was already setup earlier in
|
||||
the loop). */
|
||||
for (item = opal_list_get_first(&btl_list);
|
||||
opal_list_get_end(&btl_list) != item;
|
||||
item = opal_list_get_next(item)) {
|
||||
mca_btl_base_selected_module_t *m =
|
||||
(mca_btl_base_selected_module_t*) item;
|
||||
mca_btl_openib_hca_t *hca =
|
||||
((mca_btl_openib_module_t*) m->btl_module)->hca;
|
||||
if (NULL == hca->qps) {
|
||||
|
||||
/* Setup the HCA qps info */
|
||||
hca->qps = (mca_btl_openib_hca_qp_t*)
|
||||
calloc(mca_btl_openib_component.num_qps,
|
||||
sizeof(mca_btl_openib_hca_qp_t));
|
||||
for (i = 0; i < mca_btl_openib_component.num_qps; i++) {
|
||||
OBJ_CONSTRUCT(&hca->qps[i].send_free, ompi_free_list_t);
|
||||
OBJ_CONSTRUCT(&hca->qps[i].recv_free, ompi_free_list_t);
|
||||
}
|
||||
|
||||
/* Do final init on HCA */
|
||||
ret = prepare_hca_for_use(hca);
|
||||
if (OMPI_SUCCESS != ret) {
|
||||
/* JMS */
|
||||
return NULL;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/* Allocate space for btl modules */
|
||||
mca_btl_openib_component.openib_btls =
|
||||
malloc(sizeof(mca_btl_openib_module_t*) *
|
||||
|
@ -388,6 +388,11 @@ static int parse_line(parsed_section_values_t *sv)
|
||||
sv->values.use_eager_rdma_set = true;
|
||||
}
|
||||
|
||||
else if (0 == strcasecmp(key_buffer, "receive_queues")) {
|
||||
sv->values.receive_queues = strdup(value);
|
||||
sv->values.receive_queues_set = true;
|
||||
}
|
||||
|
||||
else {
|
||||
/* Have no idea what this parameter is. Not an error -- just
|
||||
ignore it */
|
||||
@ -469,6 +474,9 @@ static void reset_values(ompi_btl_openib_ini_values_t *v)
|
||||
|
||||
v->use_eager_rdma = 0;
|
||||
v->use_eager_rdma_set = false;
|
||||
|
||||
v->receive_queues = NULL;
|
||||
v->receive_queues_set = false;
|
||||
}
|
||||
|
||||
|
||||
|
@ -25,6 +25,9 @@ typedef struct ompi_btl_openib_ini_values_t {
|
||||
|
||||
uint32_t use_eager_rdma;
|
||||
bool use_eager_rdma_set;
|
||||
|
||||
char *receive_queues;
|
||||
bool receive_queues_set;
|
||||
} ompi_btl_openib_ini_values_t;
|
||||
|
||||
|
||||
|
@ -52,8 +52,6 @@ enum {
|
||||
REGSTR_MAX = 0x88
|
||||
};
|
||||
|
||||
static int mca_btl_openib_mca_setup_qps(void);
|
||||
|
||||
|
||||
/*
|
||||
* utility routine for string parameter registration
|
||||
@ -109,6 +107,9 @@ static inline int reg_int(const char* param_name, const char* param_desc,
|
||||
*/
|
||||
int btl_openib_register_mca_params(void)
|
||||
{
|
||||
char default_qps[100];
|
||||
uint32_t mid_qp_size;
|
||||
int i;
|
||||
char *msg, *str;
|
||||
int ival, ival2, ret, tmp;
|
||||
|
||||
@ -485,7 +486,33 @@ int btl_openib_register_mca_params(void)
|
||||
&mca_btl_openib_module.super));
|
||||
|
||||
/* setup all the qp stuff */
|
||||
CHECK(mca_btl_openib_mca_setup_qps());
|
||||
mid_qp_size = mca_btl_openib_module.super.btl_eager_limit / 4;
|
||||
/* round mid_qp_size to smallest power of two */
|
||||
for(i = 31; i > 0; i--) {
|
||||
if (!(mid_qp_size & (1<<i))) {
|
||||
continue;
|
||||
}
|
||||
mid_qp_size = (1<<i);
|
||||
break;
|
||||
}
|
||||
|
||||
if (mid_qp_size <= 128) {
|
||||
mid_qp_size = 1024;
|
||||
}
|
||||
|
||||
snprintf(default_qps, 100,
|
||||
"P,128,256,192,128:S,%u,256,128,32:S,%u,256,128,32:S,%u,256,128,32",
|
||||
mid_qp_size,
|
||||
(uint32_t)mca_btl_openib_module.super.btl_eager_limit,
|
||||
(uint32_t)mca_btl_openib_module.super.btl_max_send_size);
|
||||
CHECK(reg_string("receive_queues",
|
||||
"Colon-delimited, comma delimited list of receive queues: P,4096,8,6,4:P,32768,8,6,4",
|
||||
default_qps, &mca_btl_openib_component.receive_queues,
|
||||
0));
|
||||
mca_btl_openib_component.receive_queues_source =
|
||||
(0 == strcmp(default_qps,
|
||||
mca_btl_openib_component.receive_queues)) ?
|
||||
BTL_OPENIB_RQ_SOURCE_DEFAULT : BTL_OPENIB_RQ_SOURCE_MCA;
|
||||
|
||||
CHECK(reg_string("if_include",
|
||||
"Comma-delimited list of HCAs/ports to be used (e.g. \"mthca0,mthca1:2\"; empty value means to use all ports found). Mutually exclusive with btl_openib_if_exclude.",
|
||||
@ -497,232 +524,9 @@ int btl_openib_register_mca_params(void)
|
||||
NULL, &mca_btl_openib_component.if_exclude,
|
||||
0));
|
||||
|
||||
return ret;
|
||||
}
|
||||
|
||||
/*
 * Convert one comma-delimited field of a receive_queues specification
 * to an int32_t.
 *
 * param: text of the field; NULL or "" means the field was omitted.
 * dflt:  value to use for an omitted field.  A dflt of 0 is promoted
 *        to 1, so an omitted field never produces 0.
 *
 * Returns dflt (or 1) for an omitted field; otherwise the leading
 * decimal number found in param (0 if there is none), clamped to the
 * int32_t range.  strtol() is used rather than atoi() because atoi()
 * has undefined behavior when the value cannot be represented.
 */
static int32_t atoi_param(const char *param, int32_t dflt)
{
    long value;

    if (NULL == param || '\0' == param[0]) {
        return dflt ? dflt : 1;
    }

    /* strtol() saturates at LONG_MIN/LONG_MAX on overflow; narrow that
       further to what int32_t can hold */
    value = strtol(param, NULL, 10);
    if (value > INT32_MAX) {
        return INT32_MAX;
    }
    if (value < INT32_MIN) {
        return INT32_MIN;
    }

    return (int32_t) value;
}
|
||||
|
||||
static int mca_btl_openib_mca_setup_qps(void)
|
||||
{
|
||||
/* All the multi-qp stuff.. */
|
||||
char *str;
|
||||
char **queues, **params = NULL;
|
||||
int num_xrc_qps = 0, num_pp_qps = 0, num_srq_qps = 0, qp = 0;
|
||||
char default_qps[100];
|
||||
uint32_t max_qp_size, max_size_needed;
|
||||
int32_t min_freelist_size = 0;
|
||||
int smallest_pp_qp = 0, ret = OMPI_ERROR, i;
|
||||
uint32_t mid_qp_size;
|
||||
|
||||
mid_qp_size = mca_btl_openib_module.super.btl_eager_limit / 4;
|
||||
/* round mid_qp_size to smallest power of two */
|
||||
for(i = 31; i > 0; i--) {
|
||||
if(!(mid_qp_size & (1<<i)))
|
||||
continue;
|
||||
mid_qp_size = (1<<i);
|
||||
break;
|
||||
}
|
||||
|
||||
if(mid_qp_size <= 128)
|
||||
mid_qp_size = 1024;
|
||||
|
||||
snprintf(default_qps, 100,
|
||||
"P,128,256,192,128:S,%u,256,128,32:S,%u,256,128,32:S,%u,256,128,32",
|
||||
mid_qp_size,
|
||||
(uint32_t)mca_btl_openib_module.super.btl_eager_limit,
|
||||
(uint32_t)mca_btl_openib_module.super.btl_max_send_size);
|
||||
reg_string("receive_queues",
|
||||
"Colon-delimited, comma delimited list of receive queues: P,4096,8,6,4:P,32768,8,6,4",
|
||||
default_qps, &str, 0);
|
||||
queues = opal_argv_split(str, ':');
|
||||
|
||||
if (0 == opal_argv_count(queues)) {
|
||||
orte_show_help("help-mpi-btl-openib.txt",
|
||||
"no qps in receive_queues", true,
|
||||
orte_process_info.nodename, str);
|
||||
return OMPI_ERROR;
|
||||
}
|
||||
|
||||
while (queues[qp] != NULL) {
|
||||
if (0 == strncmp("P,", queues[qp], 2)) {
|
||||
num_pp_qps++;
|
||||
if(smallest_pp_qp > qp)
|
||||
smallest_pp_qp = qp;
|
||||
} else if (0 == strncmp("S,", queues[qp], 2)) {
|
||||
num_srq_qps++;
|
||||
} else if (0 == strncmp("X,", queues[qp], 2)) {
|
||||
#if HAVE_XRC
|
||||
num_xrc_qps++;
|
||||
#else
|
||||
orte_show_help("help-mpi-btl-openib.txt", "No XRC support", true,
|
||||
orte_process_info.nodename, str);
|
||||
goto error;
|
||||
#endif
|
||||
} else {
|
||||
orte_show_help("help-mpi-btl-openib.txt",
|
||||
"invalid qp type in receive_queues", true,
|
||||
orte_process_info.nodename, str, queues[qp]);
|
||||
goto error;
|
||||
}
|
||||
qp++;
|
||||
}
|
||||
/* Current XRC implementation can't used with other QP types - PP and SRQ */
|
||||
if (num_xrc_qps > 0 && (num_pp_qps > 0 || num_srq_qps > 0)) {
|
||||
orte_show_help("help-mpi-btl-openib.txt", "XRC with PP or SRQ", true,
|
||||
orte_process_info.nodename, str);
|
||||
goto error;
|
||||
}
|
||||
|
||||
/* Current XRC implementation can't used with btls_per_lid > 1 */
|
||||
if (num_xrc_qps > 0 && mca_btl_openib_component.btls_per_lid > 1) {
|
||||
orte_show_help("help-mpi-btl-openib.txt", "XRC with BTLs per LID", true,
|
||||
orte_process_info.nodename, str, num_xrc_qps);
|
||||
goto error;
|
||||
}
|
||||
mca_btl_openib_component.num_pp_qps = num_pp_qps;
|
||||
mca_btl_openib_component.num_srq_qps = num_srq_qps;
|
||||
mca_btl_openib_component.num_xrc_qps = num_xrc_qps;
|
||||
mca_btl_openib_component.num_qps = num_pp_qps + num_srq_qps + num_xrc_qps;
|
||||
|
||||
mca_btl_openib_component.qp_infos = (mca_btl_openib_qp_info_t*)
|
||||
malloc(sizeof(mca_btl_openib_qp_info_t) *
|
||||
mca_btl_openib_component.num_qps);
|
||||
|
||||
qp = 0;
|
||||
#define P(N) (((N) > count)?NULL:params[(N)])
|
||||
while(queues[qp] != NULL) {
|
||||
int i = 0, count;
|
||||
int32_t rd_low, rd_num;
|
||||
params = opal_argv_split_with_empty(queues[qp], ',');
|
||||
count = opal_argv_count(params);
|
||||
|
||||
if ('P' == params[0][0]) {
|
||||
int32_t rd_win, rd_rsv;
|
||||
if (count < 3 || count > 6) {
|
||||
orte_show_help("help-mpi-btl-openib.txt",
|
||||
"invalid pp qp specification", true,
|
||||
orte_process_info.nodename, queues[qp]);
|
||||
goto error;
|
||||
}
|
||||
mca_btl_openib_component.qp_infos[qp].type = MCA_BTL_OPENIB_PP_QP;
|
||||
mca_btl_openib_component.qp_infos[qp].size = atoi_param(P(1), 0);
|
||||
rd_num = atoi_param(P(2), 256);
|
||||
/* by default set rd_low to be 3/4 of rd_num */
|
||||
rd_low = atoi_param(P(3), rd_num - (rd_num / 4));
|
||||
rd_win = atoi_param(P(4), (rd_num - rd_low) * 2);
|
||||
rd_rsv = atoi_param(P(5), (rd_num * 2) / rd_win);
|
||||
|
||||
BTL_VERBOSE(("pp: rd_num is %d rd_low is %d rd_win %d rd_rsv %d",
|
||||
rd_num, rd_low, rd_win, rd_rsv));
|
||||
|
||||
/* Calculate the smallest freelist size that can be allowed */
|
||||
if (rd_num + rd_rsv > min_freelist_size)
|
||||
min_freelist_size = rd_num + rd_rsv;
|
||||
|
||||
mca_btl_openib_component.qp_infos[qp].u.pp_qp.rd_win = rd_win;
|
||||
mca_btl_openib_component.qp_infos[qp].u.pp_qp.rd_rsv = rd_rsv;
|
||||
if((rd_num - rd_low) > rd_win)
|
||||
orte_show_help("help-mpi-btl-openib.txt", "non optimal rd_win",
|
||||
true, rd_win, rd_num - rd_low);
|
||||
} else {
|
||||
int32_t sd_max;
|
||||
if(count < 3 || count > 5) {
|
||||
orte_show_help("help-mpi-btl-openib.txt",
|
||||
"invalid srq specification", true,
|
||||
orte_process_info.nodename, queues[qp]);
|
||||
goto error;
|
||||
}
|
||||
mca_btl_openib_component.qp_infos[qp].type = (params[0][0] =='X') ?
|
||||
MCA_BTL_OPENIB_XRC_QP : MCA_BTL_OPENIB_SRQ_QP;
|
||||
mca_btl_openib_component.qp_infos[qp].size = atoi_param(P(1), 0);
|
||||
rd_num = atoi_param(P(2), 256);
|
||||
/* by default set rd_low to be 3/4 of rd_num */
|
||||
rd_low = atoi_param(P(3), rd_num - (rd_num / 4));
|
||||
sd_max = atoi_param(P(4), rd_low / 4);
|
||||
BTL_VERBOSE(("srq: rd_num is %d rd_low is %d sd_max is %d",
|
||||
rd_num, rd_low, sd_max));
|
||||
|
||||
/* Calculate the smallest freelist size that can be allowed */
|
||||
if (rd_num > min_freelist_size)
|
||||
min_freelist_size = rd_num;
|
||||
|
||||
mca_btl_openib_component.qp_infos[qp].u.srq_qp.sd_max = sd_max;
|
||||
}
|
||||
|
||||
if (rd_num <= rd_low) {
|
||||
orte_show_help("help-mpi-btl-openib.txt", "rd_num must be > rd_low",
|
||||
true, orte_process_info.nodename, queues[qp]);
|
||||
goto error;
|
||||
}
|
||||
mca_btl_openib_component.qp_infos[qp].rd_num = rd_num;
|
||||
mca_btl_openib_component.qp_infos[qp].rd_low = rd_low;
|
||||
while (NULL != params[i]) {
|
||||
free(params[i++]);
|
||||
}
|
||||
free(params);
|
||||
qp++;
|
||||
}
|
||||
params = NULL;
|
||||
|
||||
/* Sanity check some sizes */
|
||||
|
||||
max_qp_size = mca_btl_openib_component.qp_infos[mca_btl_openib_component.num_qps - 1].size;
|
||||
max_size_needed = (mca_btl_openib_module.super.btl_eager_limit >
|
||||
mca_btl_openib_module.super.btl_max_send_size) ?
|
||||
mca_btl_openib_module.super.btl_eager_limit :
|
||||
mca_btl_openib_module.super.btl_max_send_size;
|
||||
if (max_qp_size < max_size_needed) {
|
||||
orte_show_help("help-mpi-btl-openib.txt",
|
||||
"biggest qp size is too small", true,
|
||||
orte_process_info.nodename, max_qp_size,
|
||||
max_size_needed);
|
||||
ret = OMPI_ERROR;
|
||||
goto error;
|
||||
} else if (max_qp_size > max_size_needed) {
|
||||
orte_show_help("help-mpi-btl-openib.txt",
|
||||
"biggest qp size is too big", true,
|
||||
orte_process_info.nodename, max_qp_size,
|
||||
max_size_needed);
|
||||
orte_output(0, "The biggest QP size is bigger than maximum send size. "
|
||||
"This is not optimal configuration as memory will be wasted.");
|
||||
}
|
||||
|
||||
if (mca_btl_openib_component.ib_free_list_max > 0 &&
|
||||
min_freelist_size > mca_btl_openib_component.ib_free_list_max) {
|
||||
orte_show_help("help-mpi-btl-openib.txt", "freelist too small", true,
|
||||
orte_process_info.nodename,
|
||||
mca_btl_openib_component.ib_free_list_max,
|
||||
min_freelist_size);
|
||||
goto error;
|
||||
}
|
||||
|
||||
mca_btl_openib_component.rdma_qp = mca_btl_openib_component.num_qps - 1;
|
||||
mca_btl_openib_component.credits_qp = smallest_pp_qp;
|
||||
|
||||
/* Register any MCA params for the connect pseudo-components */
|
||||
if (OMPI_SUCCESS != ompi_btl_openib_connect_base_register())
|
||||
goto error;
|
||||
|
||||
ret = OMPI_SUCCESS;
|
||||
error:
|
||||
if(params) {
|
||||
qp = 0;
|
||||
while(params[qp] != NULL)
|
||||
free(params[qp++]);
|
||||
free(params);
|
||||
}
|
||||
|
||||
if(queues) {
|
||||
qp = 0;
|
||||
while(queues[qp] != NULL)
|
||||
free(queues[qp++]);
|
||||
free(queues);
|
||||
if (OMPI_SUCCESS == ret) {
|
||||
ret = ompi_btl_openib_connect_base_register();
|
||||
}
|
||||
|
||||
return ret;
|
||||
|
@ -448,3 +448,20 @@ Can not provide %d alternative paths with LMC bit configured to %d.
|
||||
[apm not enough ports]
|
||||
WARNING: For APM over ports, Open MPI requires at least 2 active ports, but only a single
|
||||
active port was found. Disabling APM over ports
|
||||
#
|
||||
[conflicting receive_queues]
|
||||
Open MPI detected two different sets of OpenFabrics receive queues on
|
||||
the same host (in the openib BTL). Open MPI currently only supports
|
||||
one set of OF receive queues in an MPI job, even if you have different
|
||||
types of OpenFabrics adapters on the same host.
|
||||
|
||||
Host: %s
|
||||
Adapter 1: %s (vendor 0x%x, part ID %d)
|
||||
Queues: %s
|
||||
Adapter 2: %s (vendor 0x%x, part ID %d)
|
||||
Queues: %s
|
||||
|
||||
Note that these receive queues values may have come from the Open MPI
|
||||
adapter default settings file:
|
||||
|
||||
%s/mca-btl-openib-hca-params.ini
|
||||
|
@ -139,3 +139,4 @@ vendor_id = 0x1425
|
||||
vendor_part_id = 0x0020,0x0021,0x0022,0x0023,0x0024,0x0025,0x0026,0x0030,0x0031,0x0032
|
||||
use_eager_rdma = 1
|
||||
mtu = 2048
|
||||
receive_queues = P,65536,256,192,128
|
||||
|
Загрузка…
Ссылка в новой задаче
Block a user