1
1
This commit has the same commit message as r18450, but without the
extra bonus memory corruption that was introduced.

This commit was SVN r18467.

The following SVN revision numbers were found above:
  r18450 --> open-mpi/ompi@5295902ebe

The following Trac tickets were found above:
  Ticket 1285 --> https://svn.open-mpi.org/trac/ompi/ticket/1285
Этот коммит содержится в:
Jeff Squyres 2008-05-20 21:53:42 +00:00
родитель 0500420bec
Коммит 64f61ebd07
7 изменённых файлов: 437 добавлений и 272 удалений

Просмотреть файл

@ -110,6 +110,13 @@ struct mca_btl_openib_qp_info_t {
#define BTL_OPENIB_QP_TYPE_XRC(Q) \ #define BTL_OPENIB_QP_TYPE_XRC(Q) \
(BTL_OPENIB_QP_TYPE(Q) == MCA_BTL_OPENIB_XRC_QP) (BTL_OPENIB_QP_TYPE(Q) == MCA_BTL_OPENIB_XRC_QP)
/*
 * Records where the current value of btl_openib_receive_queues came from.
 */
typedef enum {
    /* The registered value is identical to the built-in default string */
    BTL_OPENIB_RQ_SOURCE_DEFAULT = 0,
    /* The registered value differs from the built-in default string */
    BTL_OPENIB_RQ_SOURCE_MCA = 1,
    /* The value was adopted from an HCA's INI values */
    BTL_OPENIB_RQ_SOURCE_HCA_INI = 2,
    /* NOTE(review): presumably an end-of-range sentinel -- confirm */
    BTL_OPENIB_RQ_SOURCE_HCA_MAX = 3
} btl_openib_receive_queues_source_t;
struct mca_btl_openib_component_t { struct mca_btl_openib_component_t {
mca_btl_base_component_1_0_1_t super; /**< base BTL component */ mca_btl_base_component_1_0_1_t super; /**< base BTL component */
@ -197,6 +204,11 @@ struct mca_btl_openib_component_t {
char *if_exclude; char *if_exclude;
char **if_exclude_list; char **if_exclude_list;
/* MCA param btl_openib_receive_queues */
char *receive_queues;
/* Whether we got a non-default value of btl_openib_receive_queues */
btl_openib_receive_queues_source_t receive_queues_source;
/** Colon-delimited list of filenames for HCA parameters */ /** Colon-delimited list of filenames for HCA parameters */
char *hca_params_file_names; char *hca_params_file_names;

Просмотреть файл

@ -42,6 +42,7 @@
#include "opal/mca/carto/carto.h" #include "opal/mca/carto/carto.h"
#include "opal/mca/carto/base/base.h" #include "opal/mca/carto/base/base.h"
#include "opal/mca/paffinity/base/base.h" #include "opal/mca/paffinity/base/base.h"
#include "opal/mca/installdirs/installdirs.h"
#include "orte/mca/errmgr/errmgr.h" #include "orte/mca/errmgr/errmgr.h"
#include "orte/util/proc_info.h" #include "orte/util/proc_info.h"
@ -80,6 +81,11 @@ static int btl_openib_component_close(void);
static mca_btl_base_module_t **btl_openib_component_init(int*, bool, bool); static mca_btl_base_module_t **btl_openib_component_init(int*, bool, bool);
static int btl_openib_component_progress(void); static int btl_openib_component_progress(void);
/*
* Local variables
*/
static mca_btl_openib_hca_t *receive_queues_hca = NULL;
mca_btl_openib_component_t mca_btl_openib_component = { mca_btl_openib_component_t mca_btl_openib_component = {
{ {
/* First, the mca_base_component_t struct containing meta information /* First, the mca_base_component_t struct containing meta information
@ -149,6 +155,9 @@ static int btl_openib_component_close(void)
ompi_btl_openib_connect_base_finalize(); ompi_btl_openib_connect_base_finalize();
ompi_btl_openib_fd_finalize(); ompi_btl_openib_fd_finalize();
ompi_btl_openib_ini_finalize(); ompi_btl_openib_ini_finalize();
if (NULL != mca_btl_openib_component.receive_queues) {
free(mca_btl_openib_component.receive_queues);
}
return OMPI_SUCCESS; return OMPI_SUCCESS;
} }
@ -664,8 +673,6 @@ static int init_one_port(opal_list_t *btl_list, mca_btl_openib_hca_t *hca,
static void hca_construct(mca_btl_openib_hca_t *hca) static void hca_construct(mca_btl_openib_hca_t *hca)
{ {
int i;
hca->ib_dev = NULL; hca->ib_dev = NULL;
hca->ib_dev_context = NULL; hca->ib_dev_context = NULL;
hca->ib_pd = NULL; hca->ib_pd = NULL;
@ -687,13 +694,8 @@ static void hca_construct(mca_btl_openib_hca_t *hca)
#if HAVE_XRC #if HAVE_XRC
hca->xrc_fd = -1; hca->xrc_fd = -1;
#endif #endif
hca->qps = (mca_btl_openib_hca_qp_t*)calloc(mca_btl_openib_component.num_qps, hca->qps = NULL;
sizeof(mca_btl_openib_hca_qp_t));
OBJ_CONSTRUCT(&hca->hca_lock, opal_mutex_t); OBJ_CONSTRUCT(&hca->hca_lock, opal_mutex_t);
for(i = 0; i < mca_btl_openib_component.num_qps; i++) {
OBJ_CONSTRUCT(&hca->qps[i].send_free, ompi_free_list_t);
OBJ_CONSTRUCT(&hca->qps[i].recv_free, ompi_free_list_t);
}
OBJ_CONSTRUCT(&hca->send_free_control, ompi_free_list_t); OBJ_CONSTRUCT(&hca->send_free_control, ompi_free_list_t);
} }
@ -709,13 +711,14 @@ static void hca_destruct(mca_btl_openib_hca_t *hca)
free(hca->eager_rdma_buffers); free(hca->eager_rdma_buffers);
} }
OBJ_DESTRUCT(&hca->hca_lock); OBJ_DESTRUCT(&hca->hca_lock);
for(i = 0; i < mca_btl_openib_component.num_qps; i++) {
OBJ_DESTRUCT(&hca->qps[i].send_free);
OBJ_DESTRUCT(&hca->qps[i].recv_free);
}
OBJ_DESTRUCT(&hca->send_free_control); OBJ_DESTRUCT(&hca->send_free_control);
if(hca->qps) if (NULL != hca->qps) {
for (i = 0; i < mca_btl_openib_component.num_qps; i++) {
OBJ_DESTRUCT(&hca->qps[i].send_free);
OBJ_DESTRUCT(&hca->qps[i].recv_free);
}
free(hca->qps); free(hca->qps);
}
} }
OBJ_CLASS_INSTANCE(mca_btl_openib_hca_t, opal_object_t, hca_construct, OBJ_CLASS_INSTANCE(mca_btl_openib_hca_t, opal_object_t, hca_construct,
@ -947,6 +950,9 @@ done:
return num_ports; return num_ports;
} }
/*
* Prefer values that are already in the target
*/
static void merge_values(ompi_btl_openib_ini_values_t *target, static void merge_values(ompi_btl_openib_ini_values_t *target,
ompi_btl_openib_ini_values_t *src) ompi_btl_openib_ini_values_t *src)
{ {
@ -959,6 +965,10 @@ static void merge_values(ompi_btl_openib_ini_values_t *target,
target->use_eager_rdma = src->use_eager_rdma; target->use_eager_rdma = src->use_eager_rdma;
target->use_eager_rdma_set = true; target->use_eager_rdma_set = true;
} }
if (NULL == target->receive_queues && NULL != src->receive_queues) {
target->receive_queues = strdup(src->receive_queues);
}
} }
static bool inline is_credit_message(const mca_btl_openib_recv_frag_t *frag) static bool inline is_credit_message(const mca_btl_openib_recv_frag_t *frag)
@ -969,6 +979,15 @@ static bool inline is_credit_message(const mca_btl_openib_recv_frag_t *frag)
(MCA_BTL_OPENIB_CONTROL_CREDITS == chdr->type); (MCA_BTL_OPENIB_CONTROL_CREDITS == chdr->type);
} }
/*
 * Convert one textual field to an integer.
 *
 * param: the field text; may be NULL or empty.
 * dflt:  value to use when the field is NULL or empty.  A default of 0
 *        is promoted to 1.
 *
 * Returns atoi(param) when the field has text, otherwise the (possibly
 * promoted) default.
 */
static int32_t atoi_param(char *param, int32_t dflt)
{
    const int has_text = (NULL != param) && ('\0' != *param);

    if (has_text) {
        return atoi(param);
    }

    /* No text supplied: fall back, never yielding 0 */
    return (0 != dflt) ? dflt : 1;
}
static void init_apm_port(mca_btl_openib_hca_t *hca, int port, uint16_t lid) static void init_apm_port(mca_btl_openib_hca_t *hca, int port, uint16_t lid)
{ {
int index; int index;
@ -985,6 +1004,206 @@ static void init_apm_port(mca_btl_openib_hca_t *hca, int port, uint16_t lid)
} }
} }
static int setup_qps(void)
{
char **queues, **params = NULL;
int num_xrc_qps = 0, num_pp_qps = 0, num_srq_qps = 0, qp = 0;
uint32_t max_qp_size, max_size_needed;
int32_t min_freelist_size = 0;
int smallest_pp_qp = 0, ret = OMPI_ERROR;
queues = opal_argv_split(mca_btl_openib_component.receive_queues, ':');
if (0 == opal_argv_count(queues)) {
orte_show_help("help-mpi-btl-openib.txt",
"no qps in receive_queues", true,
orte_process_info.nodename,
mca_btl_openib_component.receive_queues);
ret = OMPI_ERROR;
goto error;
}
while (queues[qp] != NULL) {
if (0 == strncmp("P,", queues[qp], 2)) {
num_pp_qps++;
if (smallest_pp_qp > qp) {
smallest_pp_qp = qp;
}
} else if (0 == strncmp("S,", queues[qp], 2)) {
num_srq_qps++;
} else if (0 == strncmp("X,", queues[qp], 2)) {
#if HAVE_XRC
num_xrc_qps++;
#else
orte_show_help("help-mpi-btl-openib.txt", "No XRC support", true,
orte_process_info.nodename,
mca_btl_openib_component.receive_queues);
ret = OMPI_ERR_RESOURCE_UNAVAILABLE;
goto error;
#endif
} else {
orte_show_help("help-mpi-btl-openib.txt",
"invalid qp type in receive_queues", true,
orte_process_info.nodename,
mca_btl_openib_component.receive_queues,
queues[qp]);
ret = OMPI_ERR_BAD_PARAM;
goto error;
}
qp++;
}
/* Current XRC implementation can't used with other QP types - PP
and SRQ */
if (num_xrc_qps > 0 && (num_pp_qps > 0 || num_srq_qps > 0)) {
orte_show_help("help-mpi-btl-openib.txt", "XRC with PP or SRQ", true,
orte_process_info.nodename,
mca_btl_openib_component.receive_queues);
ret = OMPI_ERR_BAD_PARAM;
goto error;
}
/* Current XRC implementation can't used with btls_per_lid > 1 */
if (num_xrc_qps > 0 && mca_btl_openib_component.btls_per_lid > 1) {
orte_show_help("help-mpi-btl-openib.txt", "XRC with BTLs per LID",
true, orte_process_info.nodename,
mca_btl_openib_component.receive_queues, num_xrc_qps);
ret = OMPI_ERR_BAD_PARAM;
goto error;
}
mca_btl_openib_component.num_pp_qps = num_pp_qps;
mca_btl_openib_component.num_srq_qps = num_srq_qps;
mca_btl_openib_component.num_xrc_qps = num_xrc_qps;
mca_btl_openib_component.num_qps = num_pp_qps + num_srq_qps + num_xrc_qps;
mca_btl_openib_component.qp_infos = (mca_btl_openib_qp_info_t*)
malloc(sizeof(mca_btl_openib_qp_info_t) *
mca_btl_openib_component.num_qps);
qp = 0;
#define P(N) (((N) > count) ? NULL : params[(N)])
while (queues[qp] != NULL) {
int count;
int32_t rd_low, rd_num;
params = opal_argv_split_with_empty(queues[qp], ',');
count = opal_argv_count(params);
if ('P' == params[0][0]) {
int32_t rd_win, rd_rsv;
if (count < 3 || count > 6) {
orte_show_help("help-mpi-btl-openib.txt",
"invalid pp qp specification", true,
orte_process_info.nodename, queues[qp]);
ret = OMPI_ERR_BAD_PARAM;
goto error;
}
mca_btl_openib_component.qp_infos[qp].type = MCA_BTL_OPENIB_PP_QP;
mca_btl_openib_component.qp_infos[qp].size = atoi_param(P(1), 0);
rd_num = atoi_param(P(2), 256);
/* by default set rd_low to be 3/4 of rd_num */
rd_low = atoi_param(P(3), rd_num - (rd_num / 4));
rd_win = atoi_param(P(4), (rd_num - rd_low) * 2);
rd_rsv = atoi_param(P(5), (rd_num * 2) / rd_win);
BTL_VERBOSE(("pp: rd_num is %d rd_low is %d rd_win %d rd_rsv %d",
rd_num, rd_low, rd_win, rd_rsv));
/* Calculate the smallest freelist size that can be allowed */
if (rd_num + rd_rsv > min_freelist_size) {
min_freelist_size = rd_num + rd_rsv;
}
mca_btl_openib_component.qp_infos[qp].u.pp_qp.rd_win = rd_win;
mca_btl_openib_component.qp_infos[qp].u.pp_qp.rd_rsv = rd_rsv;
if ((rd_num - rd_low) > rd_win) {
orte_show_help("help-mpi-btl-openib.txt", "non optimal rd_win",
true, rd_win, rd_num - rd_low);
}
} else {
int32_t sd_max;
if (count < 3 || count > 5) {
orte_show_help("help-mpi-btl-openib.txt",
"invalid srq specification", true,
orte_process_info.nodename, queues[qp]);
ret = OMPI_ERR_BAD_PARAM;
goto error;
}
mca_btl_openib_component.qp_infos[qp].type = (params[0][0] =='X') ?
MCA_BTL_OPENIB_XRC_QP : MCA_BTL_OPENIB_SRQ_QP;
mca_btl_openib_component.qp_infos[qp].size = atoi_param(P(1), 0);
rd_num = atoi_param(P(2), 256);
/* by default set rd_low to be 3/4 of rd_num */
rd_low = atoi_param(P(3), rd_num - (rd_num / 4));
sd_max = atoi_param(P(4), rd_low / 4);
BTL_VERBOSE(("srq: rd_num is %d rd_low is %d sd_max is %d",
rd_num, rd_low, sd_max));
/* Calculate the smallest freelist size that can be allowed */
if (rd_num > min_freelist_size) {
min_freelist_size = rd_num;
}
mca_btl_openib_component.qp_infos[qp].u.srq_qp.sd_max = sd_max;
}
if (rd_num <= rd_low) {
orte_show_help("help-mpi-btl-openib.txt", "rd_num must be > rd_low",
true, orte_process_info.nodename, queues[qp]);
ret = OMPI_ERR_BAD_PARAM;
goto error;
}
mca_btl_openib_component.qp_infos[qp].rd_num = rd_num;
mca_btl_openib_component.qp_infos[qp].rd_low = rd_low;
opal_argv_free(params);
qp++;
}
params = NULL;
/* Sanity check some sizes */
max_qp_size = mca_btl_openib_component.qp_infos[mca_btl_openib_component.num_qps - 1].size;
max_size_needed = (mca_btl_openib_module.super.btl_eager_limit >
mca_btl_openib_module.super.btl_max_send_size) ?
mca_btl_openib_module.super.btl_eager_limit :
mca_btl_openib_module.super.btl_max_send_size;
if (max_qp_size < max_size_needed) {
orte_show_help("help-mpi-btl-openib.txt",
"biggest qp size is too small", true,
orte_process_info.nodename, max_qp_size,
max_size_needed);
ret = OMPI_ERR_BAD_PARAM;
goto error;
} else if (max_qp_size > max_size_needed) {
orte_show_help("help-mpi-btl-openib.txt",
"biggest qp size is too big", true,
orte_process_info.nodename, max_qp_size,
max_size_needed);
}
if (mca_btl_openib_component.ib_free_list_max > 0 &&
min_freelist_size > mca_btl_openib_component.ib_free_list_max) {
orte_show_help("help-mpi-btl-openib.txt", "freelist too small", true,
orte_process_info.nodename,
mca_btl_openib_component.ib_free_list_max,
min_freelist_size);
ret = OMPI_ERR_BAD_PARAM;
goto error;
}
mca_btl_openib_component.rdma_qp = mca_btl_openib_component.num_qps - 1;
mca_btl_openib_component.credits_qp = smallest_pp_qp;
ret = OMPI_SUCCESS;
error:
if (NULL != params) {
opal_argv_free(params);
}
if (NULL != queues) {
opal_argv_free(queues);
}
return ret;
}
static int init_one_hca(opal_list_t *btl_list, struct ibv_device* ib_dev) static int init_one_hca(opal_list_t *btl_list, struct ibv_device* ib_dev)
{ {
struct mca_mpool_base_resources_t mpool_resources; struct mca_mpool_base_resources_t mpool_resources;
@ -1023,26 +1242,12 @@ static int init_one_hca(opal_list_t *btl_list, struct ibv_device* ib_dev)
/* If mca_btl_if_include/exclude were specified, get usable ports */ /* If mca_btl_if_include/exclude were specified, get usable ports */
allowed_ports = (int*)malloc(hca->ib_dev_attr.phys_port_cnt * sizeof(int)); allowed_ports = (int*)malloc(hca->ib_dev_attr.phys_port_cnt * sizeof(int));
port_cnt = get_port_list(hca, allowed_ports); port_cnt = get_port_list(hca, allowed_ports);
if(0 == port_cnt) { if (0 == port_cnt) {
ret = OMPI_SUCCESS;
free(allowed_ports); free(allowed_ports);
goto error;
}
#if HAVE_XRC
/* if user configured to run with XRC qp and the device don't support it -
* we should ignore this hca. Maybe we have other one that have XRC support
*/
if (!(hca->ib_dev_attr.device_cap_flags & IBV_DEVICE_XRC) &&
mca_btl_openib_component.num_xrc_qps > 0) {
orte_show_help("help-mpi-btl-openib.txt",
"XRC on device without XRC support", true,
mca_btl_openib_component.num_xrc_qps,
ibv_get_device_name(hca->ib_dev),
orte_process_info.nodename);
ret = OMPI_SUCCESS; ret = OMPI_SUCCESS;
goto error; goto error;
} }
#endif
/* Load in vendor/part-specific HCA parameters. Note that even if /* Load in vendor/part-specific HCA parameters. Note that even if
we don't find values for this vendor/part, "values" will be set we don't find values for this vendor/part, "values" will be set
indicating that it does not have good values */ indicating that it does not have good values */
@ -1102,11 +1307,67 @@ static int init_one_hca(opal_list_t *btl_list, struct ibv_device* ib_dev)
hca->mtu = mca_btl_openib_component.ib_mtu; hca->mtu = mca_btl_openib_component.ib_mtu;
} }
/* If the user specified btl_openib_receive_queues MCA param, it
overrides all HCA INI params */
if (BTL_OPENIB_RQ_SOURCE_MCA !=
mca_btl_openib_component.receive_queues_source &&
NULL != values.receive_queues) {
/* If a prior HCA's INI values set a different value for
receive_queues, this is unsupported (see
https://svn.open-mpi.org/trac/ompi/ticket/1285) */
if (BTL_OPENIB_RQ_SOURCE_HCA_INI ==
mca_btl_openib_component.receive_queues_source) {
if (0 != strcmp(values.receive_queues,
mca_btl_openib_component.receive_queues)) {
orte_show_help("help-mpi-btl-openib.txt",
"conflicting receive_queues", true,
orte_process_info.nodename,
ibv_get_device_name(hca->ib_dev),
hca->ib_dev_attr.vendor_id,
hca->ib_dev_attr.vendor_part_id,
values.receive_queues,
ibv_get_device_name(receive_queues_hca->ib_dev),
receive_queues_hca->ib_dev_attr.vendor_id,
receive_queues_hca->ib_dev_attr.vendor_part_id,
mca_btl_openib_component.receive_queues,
opal_install_dirs.pkgdatadir);
ret = OMPI_ERR_RESOURCE_BUSY;
goto error;
}
} else {
if (NULL != mca_btl_openib_component.receive_queues) {
free(mca_btl_openib_component.receive_queues);
}
receive_queues_hca = hca;
mca_btl_openib_component.receive_queues =
strdup(values.receive_queues);
mca_btl_openib_component.receive_queues_source =
BTL_OPENIB_RQ_SOURCE_HCA_INI;
}
}
/* If "use eager rdma" was set, then enable it on this HCA */ /* If "use eager rdma" was set, then enable it on this HCA */
if (values.use_eager_rdma_set) { if (values.use_eager_rdma_set) {
hca->use_eager_rdma = values.use_eager_rdma; hca->use_eager_rdma = values.use_eager_rdma;
} }
#if HAVE_XRC
/* if user configured to run with XRC qp and the device doesn't
* support it - we should ignore this hca. Maybe we have another
* one that has XRC support
*/
if (!(hca->ib_dev_attr.device_cap_flags & IBV_DEVICE_XRC) &&
mca_btl_openib_component.num_xrc_qps > 0) {
orte_show_help("help-mpi-btl-openib.txt",
"XRC on device without XRC support", true,
mca_btl_openib_component.num_xrc_qps,
ibv_get_device_name(hca->ib_dev),
orte_process_info.nodename);
ret = OMPI_SUCCESS;
goto error;
}
#endif
/* Allocate the protection domain for the HCA */ /* Allocate the protection domain for the HCA */
hca->ib_pd = ibv_alloc_pd(hca->ib_dev_context); hca->ib_pd = ibv_alloc_pd(hca->ib_dev_context);
if(NULL == hca->ib_pd){ if(NULL == hca->ib_pd){
@ -1199,10 +1460,7 @@ static int init_one_hca(opal_list_t *btl_list, struct ibv_device* ib_dev)
"apm not enough ports", true); "apm not enough ports", true);
mca_btl_openib_component.apm_ports = 0; mca_btl_openib_component.apm_ports = 0;
} }
ret = prepare_hca_for_use(hca); return OMPI_SUCCESS;
if(OMPI_SUCCESS == ret) {
return OMPI_SUCCESS;
}
} }
error: error:
@ -1560,10 +1818,6 @@ btl_openib_component_init(int *num_btl_modules,
dev_sorted = sort_devs_by_distance(ib_devs, num_devs); dev_sorted = sort_devs_by_distance(ib_devs, num_devs);
/* We must loop through all the hca id's, get their handles and
for each hca we query the number of ports on the hca and set up
a distinct btl module for each hca port */
OBJ_CONSTRUCT(&btl_list, opal_list_t); OBJ_CONSTRUCT(&btl_list, opal_list_t);
OBJ_CONSTRUCT(&mca_btl_openib_component.ib_lock, opal_mutex_t); OBJ_CONSTRUCT(&mca_btl_openib_component.ib_lock, opal_mutex_t);
#if OMPI_HAVE_THREADS #if OMPI_HAVE_THREADS
@ -1585,7 +1839,8 @@ btl_openib_component_init(int *num_btl_modules,
if (OMPI_SUCCESS != ret) { if (OMPI_SUCCESS != ret) {
orte_show_help("help-mpi-btl-openib.txt", orte_show_help("help-mpi-btl-openib.txt",
"error in hca init", true, orte_process_info.nodename); "error in hca init", true, orte_process_info.nodename,
ibv_get_device_name(dev_sorted[i].ib_dev));
return NULL; return NULL;
} }
@ -1612,6 +1867,45 @@ btl_openib_component_init(int *num_btl_modules,
return NULL; return NULL;
} }
/* Setup the BSRQ QP's based on the final value of
mca_btl_openib_component.receive_queues. */
setup_qps();
/* Loop through all the btl modules that we made and find every
base HCA that doesn't have hca->qps setup on it yet (remember
that some modules may share the same HCA, so when going through
to loop, we may hit an HCA that was already setup earlier in
the loop). */
for (item = opal_list_get_first(&btl_list);
opal_list_get_end(&btl_list) != item;
item = opal_list_get_next(item)) {
mca_btl_base_selected_module_t *m =
(mca_btl_base_selected_module_t*) item;
mca_btl_openib_hca_t *hca =
((mca_btl_openib_module_t*) m->btl_module)->hca;
if (NULL == hca->qps) {
/* Setup the HCA qps info */
hca->qps = (mca_btl_openib_hca_qp_t*)
calloc(mca_btl_openib_component.num_qps,
sizeof(mca_btl_openib_hca_qp_t));
for (i = 0; i < mca_btl_openib_component.num_qps; i++) {
OBJ_CONSTRUCT(&hca->qps[i].send_free, ompi_free_list_t);
OBJ_CONSTRUCT(&hca->qps[i].recv_free, ompi_free_list_t);
}
/* Do final init on HCA */
ret = prepare_hca_for_use(hca);
if (OMPI_SUCCESS != ret) {
orte_show_help("help-mpi-btl-openib.txt",
"error in hca init", true,
orte_process_info.nodename,
ibv_get_device_name(hca->ib_dev));
return NULL;
}
}
}
/* Allocate space for btl modules */ /* Allocate space for btl modules */
mca_btl_openib_component.openib_btls = mca_btl_openib_component.openib_btls =
malloc(sizeof(mca_btl_openib_module_t*) * malloc(sizeof(mca_btl_openib_module_t*) *

Просмотреть файл

@ -23,6 +23,8 @@
#include <ctype.h> #include <ctype.h>
#include <stdlib.h> #include <stdlib.h>
#include <unistd.h>
#include "orte/util/output.h" #include "orte/util/output.h"
#include "opal/mca/base/mca_base_param.h" #include "opal/mca/base/mca_base_param.h"
@ -388,6 +390,12 @@ static int parse_line(parsed_section_values_t *sv)
sv->values.use_eager_rdma_set = true; sv->values.use_eager_rdma_set = true;
} }
else if (0 == strcasecmp(key_buffer, "receive_queues")) {
/* Single value (already strdup'ed) */
sv->values.receive_queues = value;
value = NULL;
}
else { else {
/* Have no idea what this parameter is. Not an error -- just /* Have no idea what this parameter is. Not an error -- just
ignore it */ ignore it */
@ -429,6 +437,9 @@ static void hca_values_destructor(hca_values_t *s)
if (NULL != s->section_name) { if (NULL != s->section_name) {
free(s->section_name); free(s->section_name);
} }
if (NULL != s->values.receive_queues) {
free(s->values.receive_queues);
}
} }
@ -469,6 +480,8 @@ static void reset_values(ompi_btl_openib_ini_values_t *v)
v->use_eager_rdma = 0; v->use_eager_rdma = 0;
v->use_eager_rdma_set = false; v->use_eager_rdma_set = false;
v->receive_queues = NULL;
} }
@ -532,6 +545,10 @@ static int save_section(parsed_section_values_t *s)
containing bool members by value. So do a memcpy containing bool members by value. So do a memcpy
here instead. */ here instead. */
memcpy(&h->values, &s->values, sizeof(s->values)); memcpy(&h->values, &s->values, sizeof(s->values));
/* Need to strdup the string, though */
if (NULL != h->values.receive_queues) {
h->values.receive_queues = strdup(s->values.receive_queues);
}
opal_list_append(&hcas, &h->super); opal_list_append(&hcas, &h->super);
} }
} }
@ -586,14 +603,26 @@ static int intify_list(char *value, uint32_t **values, int *len)
*values[0] = (uint32_t) intify(str); *values[0] = (uint32_t) intify(str);
*len = 1; *len = 1;
} else { } else {
/* If we found a comma, loop over all the values. Be a int newsize = 1;
little clever in that we alwasy alloc enough space for
an extra value so that when we exit the loop, we don't /* Count how many values there are and allocate enough space
have to realloc again to get space for the last item. */ for them */
while (NULL != comma) {
++newsize;
str = comma + 1;
comma = strchr(str, ',');
}
*values = malloc(sizeof(uint32_t) * newsize);
if (NULL == *values) {
return OMPI_ERR_OUT_OF_RESOURCE;
}
/* Iterate over the values and save them */
str = value;
comma = strchr(str, ',');
do { do {
*comma = '\0'; *comma = '\0';
*values = realloc(*values, sizeof(uint32_t) * (*len + 2)); (*values)[*len] = (uint32_t) intify(str);
(*values)[*len] = (int32_t) intify(str);
++(*len); ++(*len);
str = comma + 1; str = comma + 1;
comma = strchr(str, ','); comma = strchr(str, ',');

Просмотреть файл

@ -25,6 +25,8 @@ typedef struct ompi_btl_openib_ini_values_t {
uint32_t use_eager_rdma; uint32_t use_eager_rdma;
bool use_eager_rdma_set; bool use_eager_rdma_set;
char *receive_queues;
} ompi_btl_openib_ini_values_t; } ompi_btl_openib_ini_values_t;

Просмотреть файл

@ -52,8 +52,6 @@ enum {
REGSTR_MAX = 0x88 REGSTR_MAX = 0x88
}; };
static int mca_btl_openib_mca_setup_qps(void);
/* /*
* utility routine for string parameter registration * utility routine for string parameter registration
@ -109,6 +107,9 @@ static inline int reg_int(const char* param_name, const char* param_desc,
*/ */
int btl_openib_register_mca_params(void) int btl_openib_register_mca_params(void)
{ {
char default_qps[100];
uint32_t mid_qp_size;
int i;
char *msg, *str; char *msg, *str;
int ival, ival2, ret, tmp; int ival, ival2, ret, tmp;
@ -485,7 +486,33 @@ int btl_openib_register_mca_params(void)
&mca_btl_openib_module.super)); &mca_btl_openib_module.super));
/* setup all the qp stuff */ /* setup all the qp stuff */
CHECK(mca_btl_openib_mca_setup_qps()); mid_qp_size = mca_btl_openib_module.super.btl_eager_limit / 4;
/* round mid_qp_size down to the nearest power of two */
for(i = 31; i > 0; i--) {
if(!(mid_qp_size & (1<<i))) {
continue;
}
mid_qp_size = (1<<i);
break;
}
if(mid_qp_size <= 128) {
mid_qp_size = 1024;
}
snprintf(default_qps, 100,
"P,128,256,192,128:S,%u,256,128,32:S,%u,256,128,32:S,%u,256,128,32",
mid_qp_size,
(uint32_t)mca_btl_openib_module.super.btl_eager_limit,
(uint32_t)mca_btl_openib_module.super.btl_max_send_size);
CHECK(reg_string("receive_queues",
"Colon-delimited, comma delimited list of receive queues: P,4096,8,6,4:P,32768,8,6,4",
default_qps, &mca_btl_openib_component.receive_queues,
0));
mca_btl_openib_component.receive_queues_source =
(0 == strcmp(default_qps,
mca_btl_openib_component.receive_queues)) ?
BTL_OPENIB_RQ_SOURCE_DEFAULT : BTL_OPENIB_RQ_SOURCE_MCA;
CHECK(reg_string("if_include", CHECK(reg_string("if_include",
"Comma-delimited list of HCAs/ports to be used (e.g. \"mthca0,mthca1:2\"; empty value means to use all ports found). Mutually exclusive with btl_openib_if_exclude.", "Comma-delimited list of HCAs/ports to be used (e.g. \"mthca0,mthca1:2\"; empty value means to use all ports found). Mutually exclusive with btl_openib_if_exclude.",
@ -497,232 +524,9 @@ int btl_openib_register_mca_params(void)
NULL, &mca_btl_openib_component.if_exclude, NULL, &mca_btl_openib_component.if_exclude,
0)); 0));
return ret;
}
/*
 * Parse an optional numeric field.
 *
 * When param is NULL or the empty string the default is returned
 * instead, except that a default of 0 is replaced by 1.  Otherwise the
 * result of atoi(param) is returned.
 */
static int32_t atoi_param(char *param, int32_t dflt)
{
    int32_t fallback = dflt;

    if (0 == fallback) {
        fallback = 1;
    }

    if (NULL != param && '\0' != param[0]) {
        return atoi(param);
    }

    return fallback;
}
static int mca_btl_openib_mca_setup_qps(void)
{
/* All the multi-qp stuff.. */
char *str;
char **queues, **params = NULL;
int num_xrc_qps = 0, num_pp_qps = 0, num_srq_qps = 0, qp = 0;
char default_qps[100];
uint32_t max_qp_size, max_size_needed;
int32_t min_freelist_size = 0;
int smallest_pp_qp = 0, ret = OMPI_ERROR, i;
uint32_t mid_qp_size;
mid_qp_size = mca_btl_openib_module.super.btl_eager_limit / 4;
/* round mid_qp_size to smallest power of two */
for(i = 31; i > 0; i--) {
if(!(mid_qp_size & (1<<i)))
continue;
mid_qp_size = (1<<i);
break;
}
if(mid_qp_size <= 128)
mid_qp_size = 1024;
snprintf(default_qps, 100,
"P,128,256,192,128:S,%u,256,128,32:S,%u,256,128,32:S,%u,256,128,32",
mid_qp_size,
(uint32_t)mca_btl_openib_module.super.btl_eager_limit,
(uint32_t)mca_btl_openib_module.super.btl_max_send_size);
reg_string("receive_queues",
"Colon-delimited, comma delimited list of receive queues: P,4096,8,6,4:P,32768,8,6,4",
default_qps, &str, 0);
queues = opal_argv_split(str, ':');
if (0 == opal_argv_count(queues)) {
orte_show_help("help-mpi-btl-openib.txt",
"no qps in receive_queues", true,
orte_process_info.nodename, str);
return OMPI_ERROR;
}
while (queues[qp] != NULL) {
if (0 == strncmp("P,", queues[qp], 2)) {
num_pp_qps++;
if(smallest_pp_qp > qp)
smallest_pp_qp = qp;
} else if (0 == strncmp("S,", queues[qp], 2)) {
num_srq_qps++;
} else if (0 == strncmp("X,", queues[qp], 2)) {
#if HAVE_XRC
num_xrc_qps++;
#else
orte_show_help("help-mpi-btl-openib.txt", "No XRC support", true,
orte_process_info.nodename, str);
goto error;
#endif
} else {
orte_show_help("help-mpi-btl-openib.txt",
"invalid qp type in receive_queues", true,
orte_process_info.nodename, str, queues[qp]);
goto error;
}
qp++;
}
/* Current XRC implementation can't used with other QP types - PP and SRQ */
if (num_xrc_qps > 0 && (num_pp_qps > 0 || num_srq_qps > 0)) {
orte_show_help("help-mpi-btl-openib.txt", "XRC with PP or SRQ", true,
orte_process_info.nodename, str);
goto error;
}
/* Current XRC implementation can't used with btls_per_lid > 1 */
if (num_xrc_qps > 0 && mca_btl_openib_component.btls_per_lid > 1) {
orte_show_help("help-mpi-btl-openib.txt", "XRC with BTLs per LID", true,
orte_process_info.nodename, str, num_xrc_qps);
goto error;
}
mca_btl_openib_component.num_pp_qps = num_pp_qps;
mca_btl_openib_component.num_srq_qps = num_srq_qps;
mca_btl_openib_component.num_xrc_qps = num_xrc_qps;
mca_btl_openib_component.num_qps = num_pp_qps + num_srq_qps + num_xrc_qps;
mca_btl_openib_component.qp_infos = (mca_btl_openib_qp_info_t*)
malloc(sizeof(mca_btl_openib_qp_info_t) *
mca_btl_openib_component.num_qps);
qp = 0;
#define P(N) (((N) > count)?NULL:params[(N)])
while(queues[qp] != NULL) {
int i = 0, count;
int32_t rd_low, rd_num;
params = opal_argv_split_with_empty(queues[qp], ',');
count = opal_argv_count(params);
if ('P' == params[0][0]) {
int32_t rd_win, rd_rsv;
if (count < 3 || count > 6) {
orte_show_help("help-mpi-btl-openib.txt",
"invalid pp qp specification", true,
orte_process_info.nodename, queues[qp]);
goto error;
}
mca_btl_openib_component.qp_infos[qp].type = MCA_BTL_OPENIB_PP_QP;
mca_btl_openib_component.qp_infos[qp].size = atoi_param(P(1), 0);
rd_num = atoi_param(P(2), 256);
/* by default set rd_low to be 3/4 of rd_num */
rd_low = atoi_param(P(3), rd_num - (rd_num / 4));
rd_win = atoi_param(P(4), (rd_num - rd_low) * 2);
rd_rsv = atoi_param(P(5), (rd_num * 2) / rd_win);
BTL_VERBOSE(("pp: rd_num is %d rd_low is %d rd_win %d rd_rsv %d",
rd_num, rd_low, rd_win, rd_rsv));
/* Calculate the smallest freelist size that can be allowed */
if (rd_num + rd_rsv > min_freelist_size)
min_freelist_size = rd_num + rd_rsv;
mca_btl_openib_component.qp_infos[qp].u.pp_qp.rd_win = rd_win;
mca_btl_openib_component.qp_infos[qp].u.pp_qp.rd_rsv = rd_rsv;
if((rd_num - rd_low) > rd_win)
orte_show_help("help-mpi-btl-openib.txt", "non optimal rd_win",
true, rd_win, rd_num - rd_low);
} else {
int32_t sd_max;
if(count < 3 || count > 5) {
orte_show_help("help-mpi-btl-openib.txt",
"invalid srq specification", true,
orte_process_info.nodename, queues[qp]);
goto error;
}
mca_btl_openib_component.qp_infos[qp].type = (params[0][0] =='X') ?
MCA_BTL_OPENIB_XRC_QP : MCA_BTL_OPENIB_SRQ_QP;
mca_btl_openib_component.qp_infos[qp].size = atoi_param(P(1), 0);
rd_num = atoi_param(P(2), 256);
/* by default set rd_low to be 3/4 of rd_num */
rd_low = atoi_param(P(3), rd_num - (rd_num / 4));
sd_max = atoi_param(P(4), rd_low / 4);
BTL_VERBOSE(("srq: rd_num is %d rd_low is %d sd_max is %d",
rd_num, rd_low, sd_max));
/* Calculate the smallest freelist size that can be allowed */
if (rd_num > min_freelist_size)
min_freelist_size = rd_num;
mca_btl_openib_component.qp_infos[qp].u.srq_qp.sd_max = sd_max;
}
if (rd_num <= rd_low) {
orte_show_help("help-mpi-btl-openib.txt", "rd_num must be > rd_low",
true, orte_process_info.nodename, queues[qp]);
goto error;
}
mca_btl_openib_component.qp_infos[qp].rd_num = rd_num;
mca_btl_openib_component.qp_infos[qp].rd_low = rd_low;
while (NULL != params[i]) {
free(params[i++]);
}
free(params);
qp++;
}
params = NULL;
/* Sanity check some sizes */
max_qp_size = mca_btl_openib_component.qp_infos[mca_btl_openib_component.num_qps - 1].size;
max_size_needed = (mca_btl_openib_module.super.btl_eager_limit >
mca_btl_openib_module.super.btl_max_send_size) ?
mca_btl_openib_module.super.btl_eager_limit :
mca_btl_openib_module.super.btl_max_send_size;
if (max_qp_size < max_size_needed) {
orte_show_help("help-mpi-btl-openib.txt",
"biggest qp size is too small", true,
orte_process_info.nodename, max_qp_size,
max_size_needed);
ret = OMPI_ERROR;
goto error;
} else if (max_qp_size > max_size_needed) {
orte_show_help("help-mpi-btl-openib.txt",
"biggest qp size is too big", true,
orte_process_info.nodename, max_qp_size,
max_size_needed);
orte_output(0, "The biggest QP size is bigger than maximum send size. "
"This is not optimal configuration as memory will be wasted.");
}
if (mca_btl_openib_component.ib_free_list_max > 0 &&
min_freelist_size > mca_btl_openib_component.ib_free_list_max) {
orte_show_help("help-mpi-btl-openib.txt", "freelist too small", true,
orte_process_info.nodename,
mca_btl_openib_component.ib_free_list_max,
min_freelist_size);
goto error;
}
mca_btl_openib_component.rdma_qp = mca_btl_openib_component.num_qps - 1;
mca_btl_openib_component.credits_qp = smallest_pp_qp;
/* Register any MCA params for the connect pseudo-components */ /* Register any MCA params for the connect pseudo-components */
if (OMPI_SUCCESS != ompi_btl_openib_connect_base_register()) if (OMPI_SUCCESS == ret) {
goto error; ret = ompi_btl_openib_connect_base_register();
ret = OMPI_SUCCESS;
error:
if(params) {
qp = 0;
while(params[qp] != NULL)
free(params[qp++]);
free(params);
}
if(queues) {
qp = 0;
while(queues[qp] != NULL)
free(queues[qp++]);
free(queues);
} }
return ret; return ret;

Просмотреть файл

@ -169,6 +169,12 @@ no active ports detected. This is most certainly not what you wanted.
Check your cables and SM configuration. Check your cables and SM configuration.
# #
[error in hca init] [error in hca init]
WARNING: There was an error initializing an OpenFabrics NIC/HCA.
Hostname: %s
Device: %s
#
[error in hca init]
WARNING: There were errors during IB HCA initialization on host '%s'. WARNING: There were errors during IB HCA initialization on host '%s'.
# #
[default subnet prefix] [default subnet prefix]
@ -448,3 +454,20 @@ Can not provide %d alternative paths with LMC bit configured to %d.
[apm not enough ports] [apm not enough ports]
WARNING: For APM over ports ompi require at least 2 active ports and only single WARNING: For APM over ports ompi require at least 2 active ports and only single
active port was found. Disabling APM over ports active port was found. Disabling APM over ports
#
[conflicting receive_queues]
Open MPI detected two different sets of OpenFabrics receive queues on
the same host (in the openib BTL). Open MPI currently only supports
one set of OF receive queues in an MPI job, even if you have different
types of OpenFabrics adapters on the same host.
Host: %s
Adapter 1: %s (vendor 0x%x, part ID %d)
Queues: %s
Adapter 2: %s (vendor 0x%x, part ID %d)
Queues: %s
Note that these receive queues values may have come from the Open MPI
adapter default settings file:
%s/mca-btl-openib-hca-params.ini

Просмотреть файл

@ -139,3 +139,4 @@ vendor_id = 0x1425
vendor_part_id = 0x0020,0x0021,0x0022,0x0023,0x0024,0x0025,0x0026,0x0030,0x0031,0x0032 vendor_part_id = 0x0020,0x0021,0x0022,0x0023,0x0024,0x0025,0x0026,0x0030,0x0031,0x0032
use_eager_rdma = 1 use_eager_rdma = 1
mtu = 2048 mtu = 2048
receive_queues = P,65536,256,192,128