1
1

Support for LMC (LID mask control) and multiple QPs per port.

This commit was SVN r10536.
Этот коммит содержится в:
Gleb Natapov 2006-06-28 07:23:08 +00:00
родитель 56a86c89b1
Коммит 704a5eb645
4 изменённых файлов: 187 добавлений и 121 удалений

Просмотреть файл

@ -702,22 +702,9 @@ int mca_btl_openib_get( mca_btl_base_module_t* btl,
*/
int mca_btl_openib_module_init(mca_btl_openib_module_t *openib_btl)
{
/* Allocate Protection Domain */
struct ibv_context *ctx;
openib_btl->poll_cq = false;
ctx = openib_btl->ib_dev_context;
openib_btl->ib_pd = ibv_alloc_pd(ctx);
if(NULL == openib_btl->ib_pd) {
BTL_ERROR(("error allocating pd for %s errno says %s\n",
ibv_get_device_name(openib_btl->ib_dev),
strerror(errno)));
return OMPI_ERROR;
}
#ifdef OMPI_MCA_BTL_OPENIB_HAVE_SRQ
if(mca_btl_openib_component.use_srq) {
@ -728,13 +715,13 @@ int mca_btl_openib_module_init(mca_btl_openib_module_t *openib_btl)
openib_btl->srd_posted_hp = 0;
openib_btl->srd_posted_lp = 0;
openib_btl->srq_hp = ibv_create_srq(openib_btl->ib_pd, &attr);
openib_btl->srq_hp = ibv_create_srq(openib_btl->hca->ib_pd, &attr);
if(NULL == openib_btl->srq_hp) {
BTL_ERROR(("error in ibv_create_srq\n"));
return OMPI_ERROR;
}
openib_btl->srq_lp = ibv_create_srq(openib_btl->ib_pd, &attr);
openib_btl->srq_lp = ibv_create_srq(openib_btl->hca->ib_pd, &attr);
if(NULL == openib_btl->srq_hp) {
BTL_ERROR(("error in ibv_create_srq\n"));
return OMPI_ERROR;
@ -749,32 +736,34 @@ int mca_btl_openib_module_init(mca_btl_openib_module_t *openib_btl)
/* Create the low and high priority queue pairs */
#if OMPI_MCA_BTL_OPENIB_IBV_CREATE_CQ_ARGS == 3
openib_btl->ib_cq_lp =
ibv_create_cq(ctx, mca_btl_openib_component.ib_cq_size, NULL);
ibv_create_cq(openib_btl->hca->ib_dev_context,
mca_btl_openib_component.ib_cq_size, NULL);
#else
openib_btl->ib_cq_lp =
ibv_create_cq(ctx, mca_btl_openib_component.ib_cq_size,
NULL, NULL, 0);
ibv_create_cq(openib_btl->hca->ib_dev_context,
mca_btl_openib_component.ib_cq_size, NULL, NULL, 0);
#endif
if(NULL == openib_btl->ib_cq_lp) {
BTL_ERROR(("error creating low priority cq for %s errno says %s\n",
ibv_get_device_name(openib_btl->ib_dev),
ibv_get_device_name(openib_btl->hca->ib_dev),
strerror(errno)));
return OMPI_ERROR;
}
#if OMPI_MCA_BTL_OPENIB_IBV_CREATE_CQ_ARGS == 3
openib_btl->ib_cq_hp =
ibv_create_cq(ctx, mca_btl_openib_component.ib_cq_size, NULL);
ibv_create_cq(openib_btl->hca->ib_dev_context,
mca_btl_openib_component.ib_cq_size, NULL);
#else
openib_btl->ib_cq_hp =
ibv_create_cq(ctx, mca_btl_openib_component.ib_cq_size,
NULL, NULL, 0);
ibv_create_cq(openib_btl->hca->ib_dev_context,
mca_btl_openib_component.ib_cq_size, NULL, NULL, 0);
#endif
if(NULL == openib_btl->ib_cq_hp) {
BTL_ERROR(("error creating high priority cq for %s errno says %s\n",
ibv_get_device_name(openib_btl->ib_dev),
ibv_get_device_name(openib_btl->hca->ib_dev),
strerror(errno)));
return OMPI_ERROR;
}

Просмотреть файл

@ -25,6 +25,7 @@
/* Standard system includes */
#include <sys/types.h>
#include <string.h>
#include <infiniband/verbs.h>
/* Open MPI includes */
#include "ompi/class/ompi_free_list.h"
@ -114,11 +115,12 @@ struct mca_btl_openib_component_t {
uint32_t ib_max_rdma_dst_ops;
uint32_t ib_service_level;
uint32_t ib_static_rate;
uint32_t ib_src_path_bits;
uint32_t use_eager_rdma;
uint32_t eager_rdma_threshold;
uint32_t eager_rdma_num;
uint32_t max_eager_rdma;
uint32_t btls_per_lid;
uint32_t max_lmc;
}; typedef struct mca_btl_openib_component_t mca_btl_openib_component_t;
@ -127,7 +129,15 @@ extern mca_btl_openib_component_t mca_btl_openib_component;
typedef mca_btl_base_recv_reg_t mca_btl_openib_recv_reg_t;
/**
 * Per-HCA state. Every BTL module created on a device points at one of
 * these through its `hca` member, so device-wide resources are held once
 * per HCA rather than once per module.
 */
typedef struct mca_btl_openib_hca_t {
    struct ibv_device *ib_dev;          /**< the IB device */
    struct ibv_context *ib_dev_context; /**< context returned by ibv_open_device() */
    struct ibv_device_attr ib_dev_attr; /**< attributes filled in by ibv_query_device() */
    struct ibv_pd *ib_pd;               /**< protection domain allocated on the device */
    mca_mpool_base_module_t *mpool;     /**< memory pool created for this HCA */
    uint8_t btls;                       /**< number of btls using this HCA */
} mca_btl_openib_hca_t;
/**
* IB PTL Interface
*/
@ -136,14 +146,14 @@ struct mca_btl_openib_module_t {
bool btl_inited;
mca_btl_openib_recv_reg_t ib_reg[256];
mca_btl_openib_port_info_t port_info; /* contains only the subnet right now */
mca_btl_openib_hca_t *hca;
uint8_t port_num; /**< ID of the PORT */
struct ibv_device *ib_dev; /* the ib device */
struct ibv_context *ib_dev_context;
struct ibv_pd *ib_pd;
struct ibv_cq *ib_cq_hp;
struct ibv_cq *ib_cq_lp;
struct ibv_port_attr* ib_port_attr;
struct ibv_port_attr ib_port_attr;
struct ibv_recv_wr* rd_desc_post;
uint16_t lid; /**< lid that is actually used (for LMC) */
uint8_t src_path_bits; /**< offset from base lid (for LMC) */
ompi_free_list_t send_free_eager; /**< free list of eager buffer descriptors */
ompi_free_list_t send_free_max; /**< free list of max buffer descriptors */

Просмотреть файл

@ -182,8 +182,6 @@ int mca_btl_openib_component_open(void)
0, (int*) &mca_btl_openib_component.ib_service_level);
mca_btl_openib_param_register_int("ib_static_rate", "IB static rate",
0, (int*) &mca_btl_openib_component.ib_static_rate);
mca_btl_openib_param_register_int("ib_src_path_bits", "IB source path bits",
0, (int*) &mca_btl_openib_component.ib_src_path_bits);
mca_btl_openib_param_register_int ("exclusivity", "BTL exclusivity",
MCA_BTL_EXCLUSIVITY_DEFAULT, (int*) &mca_btl_openib_module.super.btl_exclusivity);
mca_btl_openib_param_register_int("rd_num", "number of receive descriptors to post to a QP",
@ -211,6 +209,10 @@ int mca_btl_openib_component_open(void)
mca_btl_openib_param_register_int("eager_rdma_num", "Number of RDMA buffers for eager messages",
16, (int*)&mca_btl_openib_component.eager_rdma_num);
mca_btl_openib_component.eager_rdma_num+=1;
mca_btl_openib_param_register_int("btls_per_lid", "Number of BTLs to create for each LID",
1, (int*)&mca_btl_openib_component.btls_per_lid);
mca_btl_openib_param_register_int("max_lmc", "Maximum LIDs to use for each port (0 - all available)",
0, (int*)&mca_btl_openib_component.max_lmc);
mca_btl_openib_param_register_int ("eager_limit", "eager send limit",
(12*1024),(int*) &mca_btl_openib_module.super.btl_eager_limit);
mca_btl_openib_param_register_int ("min_send_size", "minimum send size",
@ -321,8 +323,130 @@ static void mca_btl_openib_control(
}
}
/**
 * Create the BTL modules for one port of an HCA and append them to btl_list.
 *
 * The port answers to (1 << lmc) consecutive LIDs starting at its base LID;
 * when the max_lmc component parameter is non-zero and smaller, only that
 * many LIDs are used.  For every LID used, btls_per_lid modules are created.
 * Each module is a copy of the mca_btl_openib_module template plus a private
 * copy of the port attributes.
 *
 * @param btl_list      list receiving one mca_btl_base_selected_module_t
 *                      per created module
 * @param hca           HCA the port belongs to; its btls counter is
 *                      incremented for every module created
 * @param port_num      port number stored in each module
 * @param ib_port_attr  attributes of the port (base lid, lmc, sm_lid)
 *
 * @return  1 if all modules for the port were created,
 *          0 if the ib_max_btls limit was reached (caller should stop),
 *         -1 on allocation failure.
 */
static int init_one_port(opal_list_t *btl_list, mca_btl_openib_hca_t *hca,
        uint8_t port_num, struct ibv_port_attr *ib_port_attr)
{
    uint16_t lmc, lid_offset;
    uint32_t i; /* same width as btls_per_lid so the loop always terminates */
    mca_btl_openib_module_t *openib_btl;
    mca_btl_base_selected_module_t *ib_selected;

    lmc = (1 << ib_port_attr->lmc);
    if(mca_btl_openib_component.max_lmc &&
            mca_btl_openib_component.max_lmc < lmc)
        lmc = mca_btl_openib_component.max_lmc;

    /* Iterate over the offset from the base LID rather than over the LID
     * itself: the offset is bounded by lmc, so the loop cannot spin forever
     * if base + lmc does not fit in 16 bits. */
    for(lid_offset = 0; lid_offset < lmc; lid_offset++){
        const uint16_t lid = (uint16_t)(ib_port_attr->lid + lid_offset);

        for(i = 0; i < mca_btl_openib_component.btls_per_lid; i++){
            openib_btl = malloc(sizeof(mca_btl_openib_module_t));
            if(NULL == openib_btl) {
                ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE);
                return -1;
            }

            ib_selected = OBJ_NEW(mca_btl_base_selected_module_t);
            if(NULL == ib_selected) {
                /* do not leak the module we just allocated */
                free(openib_btl);
                ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE);
                return -1;
            }

            memcpy(openib_btl, &mca_btl_openib_module,
                    sizeof(mca_btl_openib_module));
            memcpy(&openib_btl->ib_port_attr, ib_port_attr,
                    sizeof(struct ibv_port_attr));
            ib_selected->btl_module = (mca_btl_base_module_t*) openib_btl;

            openib_btl->hca = hca;
            openib_btl->port_num = (uint8_t) port_num;
            openib_btl->lid = lid;
            /* offset from the base LID, used as the source path bits */
            openib_btl->src_path_bits = lid_offset;
            /* store the sm_lid for multi-nic support */
            openib_btl->port_info.subnet = ib_port_attr->sm_lid;
            openib_btl->ib_reg[MCA_BTL_TAG_BTL].cbfunc =
                mca_btl_openib_control;
            openib_btl->ib_reg[MCA_BTL_TAG_BTL].cbdata = NULL;

            opal_list_append(btl_list, (opal_list_item_t*) ib_selected);
            /* NOTE(review): hca->btls is a uint8_t; more than 255 modules
             * on one HCA would wrap this counter -- confirm the limit is
             * enforced elsewhere. */
            hca->btls++;

            if(++mca_btl_openib_component.ib_num_btls >=
                    mca_btl_openib_component.ib_max_btls)
                return 0;
        }
    }

    return 1;
}
/**
 * Open one IB device, set up its per-HCA resources (device context, device
 * attributes, protection domain, memory pool) and create BTL modules for
 * each of its active ports via init_one_port().
 *
 * If no module ends up using the HCA, everything acquired here is released
 * again before returning.
 *
 * @param btl_list  list that receives the created modules
 * @param ib_dev    device to initialize
 *
 * @return -1 if a resource could not be set up; otherwise the result of
 *         the last init_one_port() call, or 1 if it was never called.
 */
static int init_one_hca(opal_list_t *btl_list, struct ibv_device* ib_dev)
{
    struct mca_mpool_base_resources_t pool_res;
    struct ibv_context *ctx;
    mca_btl_openib_hca_t *hca;
    uint8_t port;
    int rc = -1;

    hca = malloc(sizeof(mca_btl_openib_hca_t));
    if(!hca){
        ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE);
        return rc;
    }

    hca->ib_dev = ib_dev;
    hca->btls = 0;

    ctx = ibv_open_device(ib_dev);
    hca->ib_dev_context = ctx;
    if(!ctx){
        BTL_ERROR(("error obtaining device context for %s errno says %s\n",
                    ibv_get_device_name(ib_dev), strerror(errno)));
        goto release_hca;
    }

    if(0 != ibv_query_device(ctx, &hca->ib_dev_attr)){
        BTL_ERROR(("error obtaining device attributes for %s errno says %s\n",
                    ibv_get_device_name(ib_dev), strerror(errno)));
        goto release_device;
    }

    hca->ib_pd = ibv_alloc_pd(ctx);
    if(!hca->ib_pd){
        BTL_ERROR(("error allocating pd for %s errno says %s\n",
                    ibv_get_device_name(ib_dev), strerror(errno)));
        goto release_device;
    }

    pool_res.ib_pd = hca->ib_pd;
    hca->mpool =
        mca_mpool_base_module_create(mca_btl_openib_component.ib_mpool_name,
                hca, &pool_res);
    if(!hca->mpool){
        BTL_ERROR(("error creating IB memory pool for %s errno says %s\n",
                    ibv_get_device_name(ib_dev), strerror(errno)));
        goto release_pd;
    }

    rc = 1;

    /* Port numbers are 1-based, so the scan starts at port 1. */
    for(port = 1; port <= hca->ib_dev_attr.phys_port_cnt; port++){
        struct ibv_port_attr port_attr;

        if(0 != ibv_query_port(ctx, port, &port_attr)){
            BTL_ERROR(("error getting port attributes for device %s "
                        "port number %d errno says %s",
                        ibv_get_device_name(ib_dev), port, strerror(errno)));
            break;
        }

        /* only active ports get BTL modules */
        if(IBV_PORT_ACTIVE != port_attr.state)
            continue;

        rc = init_one_port(btl_list, hca, port, &port_attr);
        if(rc <= 0)
            break;
    }

    /* At least one module references this HCA: keep it alive. */
    if(hca->btls > 0)
        return rc;

    /* Nothing uses the HCA; release resources in reverse order of setup. */
    mca_mpool_base_module_destroy(hca->mpool);
release_pd:
    ibv_dealloc_pd(hca->ib_pd);
release_device:
    ibv_close_device(hca->ib_dev_context);
release_hca:
    free(hca);
    return rc;
}
/*
* IB component initialization:
* (1) read interface list from kernel and compare against component parameters
@ -337,16 +461,15 @@ mca_btl_base_module_t** mca_btl_openib_component_init(int *num_btl_modules,
{
struct ibv_device **ib_devs;
mca_btl_base_module_t** btls;
int i,j, length, num_devs;
struct mca_mpool_base_resources_t mpool_resources;
int i, length, num_devs;
opal_list_t btl_list;
mca_btl_openib_module_t * openib_btl;
mca_btl_base_selected_module_t* ib_selected;
opal_list_item_t* item;
#if OMPI_MCA_BTL_OPENIB_HAVE_DEVICE_LIST == 0
struct dlist *dev_list;
#endif
struct ibv_device* ib_dev;
#endif
unsigned short seedv[3];
@ -415,73 +538,23 @@ mca_btl_base_module_t** mca_btl_openib_component_init(int *num_btl_modules,
for(i = 0; i < num_devs
&& mca_btl_openib_component.ib_num_btls < mca_btl_openib_component.ib_max_btls; i++){
struct ibv_device_attr ib_dev_attr;
struct ibv_context* ib_dev_context;
ib_dev = ib_devs[i];
&& mca_btl_openib_component.ib_num_btls <
mca_btl_openib_component.ib_max_btls; i++){
if (init_one_hca(&btl_list, ib_devs[i]) <= 0)
break;
}
ib_dev_context = ibv_open_device(ib_dev);
if(!ib_dev_context) {
BTL_ERROR((" error obtaining device context for %s errno says %s\n", ibv_get_device_name(ib_dev), strerror(errno)));
return NULL;
}
if(ibv_query_device(ib_dev_context, &ib_dev_attr)){
BTL_ERROR(("error obtaining device attributes for %s errno says %s\n", ibv_get_device_name(ib_dev), strerror(errno)));
return NULL;
}
/* Note ports are 1 based hence j = 1 */
for(j = 1; j <= ib_dev_attr.phys_port_cnt; j++){
struct ibv_port_attr* ib_port_attr;
ib_port_attr = (struct ibv_port_attr*) malloc(sizeof(struct ibv_port_attr));
if(ibv_query_port(ib_dev_context, (uint8_t) j, ib_port_attr)){
BTL_ERROR(("error getting port attributes for device %s port number %d errno says %s",
ibv_get_device_name(ib_dev), j, strerror(errno)));
return NULL;
}
if( IBV_PORT_ACTIVE == ib_port_attr->state ){
openib_btl = (mca_btl_openib_module_t*) malloc(sizeof(mca_btl_openib_module_t));
memcpy(openib_btl, &mca_btl_openib_module, sizeof(mca_btl_openib_module));
ib_selected = OBJ_NEW(mca_btl_base_selected_module_t);
ib_selected->btl_module = (mca_btl_base_module_t*) openib_btl;
openib_btl->ib_dev = ib_dev;
openib_btl->ib_dev_context = ib_dev_context;
openib_btl->port_num = (uint8_t) j;
openib_btl->ib_port_attr = ib_port_attr;
openib_btl->port_info.subnet = ib_port_attr->sm_lid; /* store the sm_lid for multi-nic support */
openib_btl->ib_reg[MCA_BTL_TAG_BTL].cbfunc = mca_btl_openib_control;
openib_btl->ib_reg[MCA_BTL_TAG_BTL].cbdata = NULL;
opal_list_append(&btl_list, (opal_list_item_t*) ib_selected);
if(++mca_btl_openib_component.ib_num_btls >= mca_btl_openib_component.ib_max_btls)
break;
}
else{
free(ib_port_attr);
}
}
}
/* Allocate space for btl modules */
mca_btl_openib_component.openib_btls = (mca_btl_openib_module_t*) malloc(sizeof(mca_btl_openib_module_t) *
mca_btl_openib_component.ib_num_btls);
mca_btl_openib_component.openib_btls =
malloc(sizeof(mca_btl_openib_module_t) *
mca_btl_openib_component.ib_num_btls);
if(NULL == mca_btl_openib_component.openib_btls) {
ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE);
return NULL;
}
btls = (struct mca_btl_base_module_t**)
malloc(mca_btl_openib_component.ib_num_btls * sizeof(struct mca_btl_openib_module_t*));
btls = malloc(mca_btl_openib_component.ib_num_btls *
sizeof(struct mca_btl_openib_module_t*));
if(NULL == btls) {
ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE);
return NULL;
@ -493,15 +566,18 @@ mca_btl_base_module_t** mca_btl_openib_component_init(int *num_btl_modules,
item = opal_list_remove_first(&btl_list);
ib_selected = (mca_btl_base_selected_module_t*)item;
openib_btl = (mca_btl_openib_module_t*) ib_selected->btl_module;
memcpy(&(mca_btl_openib_component.openib_btls[i]), openib_btl , sizeof(mca_btl_openib_module_t));
memcpy(&(mca_btl_openib_component.openib_btls[i]), openib_btl,
sizeof(mca_btl_openib_module_t));
free(ib_selected);
free(openib_btl);
openib_btl = &mca_btl_openib_component.openib_btls[i];
openib_btl->rd_num = mca_btl_openib_component.rd_num + mca_btl_openib_component.rd_rsv;
openib_btl->rd_num = mca_btl_openib_component.rd_num +
mca_btl_openib_component.rd_rsv;
openib_btl->rd_low = mca_btl_openib_component.rd_low;
openib_btl->num_peers = 0;
openib_btl->sd_tokens_hp = openib_btl->sd_tokens_lp = mca_btl_openib_component.srq_sd_max;
openib_btl->sd_tokens_hp = openib_btl->sd_tokens_lp =
mca_btl_openib_component.srq_sd_max;
/* Initialize module state */
@ -518,25 +594,15 @@ mca_btl_base_module_t** mca_btl_openib_component_init(int *num_btl_modules,
if(mca_btl_openib_module_init(openib_btl) != OMPI_SUCCESS) {
#if OMPI_MCA_BTL_OPENIB_HAVE_DEVICE_LIST
ibv_free_device_list(ib_devs);
ibv_free_device_list(ib_devs);
#else
free(ib_devs);
free(ib_devs);
#endif
return NULL;
return NULL;
}
mpool_resources.ib_pd = openib_btl->ib_pd;
/* initialize the memory pool using the hca */
openib_btl->super.btl_mpool =
mca_mpool_base_module_create(mca_btl_openib_component.ib_mpool_name,
&openib_btl->super,
&mpool_resources);
if(NULL == openib_btl->super.btl_mpool) {
BTL_ERROR(("error creating vapi memory pool! aborting openib btl initialization"));
return NULL;
}
openib_btl->super.btl_mpool = openib_btl->hca->mpool;
/* Initialize pool of send fragments */
length = sizeof(mca_btl_openib_frag_t) +

Просмотреть файл

@ -333,7 +333,7 @@ static int mca_btl_openib_endpoint_send_connect_data(mca_btl_base_endpoint_t* en
return rc;
}
rc = orte_dss.pack(buffer, &endpoint->endpoint_btl->ib_port_attr->lid, 1, ORTE_UINT16);
rc = orte_dss.pack(buffer, &endpoint->endpoint_btl->lid, 1, ORTE_UINT16);
if(rc != ORTE_SUCCESS) {
ORTE_ERROR_LOG(rc);
return rc;
@ -354,7 +354,7 @@ static int mca_btl_openib_endpoint_send_connect_data(mca_btl_base_endpoint_t* en
BTL_VERBOSE(("Sending High Priority QP num = %d, Low Priority QP num = %d, LID = %d",
endpoint->lcl_qp_hp->qp_num,
endpoint->lcl_qp_lp->qp_num,
endpoint->endpoint_btl->ib_port_attr->lid));
endpoint->endpoint_btl->lid));
if(rc < 0) {
ORTE_ERROR_LOG(rc);
@ -401,7 +401,7 @@ static int mca_btl_openib_endpoint_start_connect(mca_btl_base_endpoint_t* endpoi
/* Create the High Priority Queue Pair */
if(OMPI_SUCCESS != (rc = mca_btl_openib_endpoint_create_qp(openib_btl,
openib_btl->ib_pd,
openib_btl->hca->ib_pd,
openib_btl->ib_cq_hp,
#ifdef OMPI_MCA_BTL_OPENIB_HAVE_SRQ
openib_btl->srq_hp,
@ -416,7 +416,7 @@ static int mca_btl_openib_endpoint_start_connect(mca_btl_base_endpoint_t* endpoi
/* Create the Low Priority Queue Pair */
if(OMPI_SUCCESS != (rc = mca_btl_openib_endpoint_create_qp(openib_btl,
openib_btl->ib_pd,
openib_btl->hca->ib_pd,
openib_btl->ib_cq_lp,
#ifdef OMPI_MCA_BTL_OPENIB_HAVE_SRQ
openib_btl->srq_lp,
@ -431,7 +431,7 @@ static int mca_btl_openib_endpoint_start_connect(mca_btl_base_endpoint_t* endpoi
BTL_VERBOSE(("Initialized High Priority QP num = %d, Low Priority QP num = %d, LID = %d",
endpoint->lcl_qp_hp->qp_num,
endpoint->lcl_qp_lp->qp_num,
openib_btl->ib_port_attr->lid));
openib_btl->lid));
/* Send connection info over to remote endpoint */
endpoint->endpoint_state = MCA_BTL_IB_CONNECTING;
@ -455,7 +455,7 @@ static int mca_btl_openib_endpoint_reply_start_connect(mca_btl_openib_endpoint_t
/* Create the High Priority Queue Pair */
if(OMPI_SUCCESS != (rc = mca_btl_openib_endpoint_create_qp(openib_btl,
openib_btl->ib_pd,
openib_btl->hca->ib_pd,
openib_btl->ib_cq_hp,
#ifdef OMPI_MCA_BTL_OPENIB_HAVE_SRQ
openib_btl->srq_hp,
@ -471,7 +471,7 @@ static int mca_btl_openib_endpoint_reply_start_connect(mca_btl_openib_endpoint_t
/* Create the Low Priority Queue Pair */
if(OMPI_SUCCESS != (rc = mca_btl_openib_endpoint_create_qp(openib_btl,
openib_btl->ib_pd,
openib_btl->hca->ib_pd,
openib_btl->ib_cq_lp,
#ifdef OMPI_MCA_BTL_OPENIB_HAVE_SRQ
openib_btl->srq_lp,
@ -487,7 +487,7 @@ static int mca_btl_openib_endpoint_reply_start_connect(mca_btl_openib_endpoint_t
BTL_VERBOSE(("Initialized High Priority QP num = %d, Low Priority QP num = %d, LID = %d",
endpoint->lcl_qp_hp->qp_num,
endpoint->lcl_qp_lp->qp_num,
openib_btl->ib_port_attr->lid));
openib_btl->lid));
/* Set the remote side info */
@ -654,7 +654,8 @@ static void mca_btl_openib_endpoint_recv(
port_info = ib_proc->proc_ports[i];
ib_endpoint = ib_proc->proc_endpoints[i];
if(ib_endpoint->rem_info.rem_lid &&
ib_endpoint->rem_info.rem_lid == rem_info.rem_lid) {
(ib_endpoint->rem_info.rem_lid == rem_info.rem_lid &&
ib_endpoint->rem_info.rem_qp_num_hp == rem_info.rem_qp_num_hp)) {
/* we've seen them before! */
found = true;
break;
@ -992,7 +993,7 @@ int mca_btl_openib_endpoint_qp_init_query(
attr->ah_attr.is_global = 0;
attr->ah_attr.dlid = rem_lid;
attr->ah_attr.sl = mca_btl_openib_component.ib_service_level;
attr->ah_attr.src_path_bits = mca_btl_openib_component.ib_src_path_bits;
attr->ah_attr.src_path_bits = openib_btl->src_path_bits;
attr->ah_attr.port_num = port_num;
if(ibv_modify_qp(qp, attr,