1
1

Put send completions on the low-priority CQ. Receive completions are more important.

This commit was SVN r16817.
Этот коммит содержится в:
Gleb Natapov 2007-12-02 14:46:37 +00:00
родитель b17f5b7480
Коммит a774cd98f8
6 изменённых файлов: 47 добавлений и 72 удалений

Просмотреть файл

@ -287,12 +287,10 @@ static int create_srq(mca_btl_openib_module_t *openib_btl)
openib_btl->qps[qp].u.srq_qp.rd_posted = 0; openib_btl->qps[qp].u.srq_qp.rd_posted = 0;
#if HAVE_XRC #if HAVE_XRC
if(BTL_OPENIB_QP_TYPE_XRC(qp)) { if(BTL_OPENIB_QP_TYPE_XRC(qp)) {
int prio = qp_cq_prio(qp);
openib_btl->qps[qp].u.srq_qp.srq = openib_btl->qps[qp].u.srq_qp.srq =
ibv_create_xrc_srq(openib_btl->hca->ib_pd, ibv_create_xrc_srq(openib_btl->hca->ib_pd,
openib_btl->hca->xrc_domain, openib_btl->hca->xrc_domain,
openib_btl->hca->ib_cq[prio], &attr); openib_btl->hca->ib_cq[qp_cq_prio(qp)], &attr);
openib_btl->hca->cq_users[prio]++;
} else } else
#endif #endif
{ {
@ -310,8 +308,20 @@ static int create_srq(mca_btl_openib_module_t *openib_btl)
return OMPI_SUCCESS; return OMPI_SUCCESS;
} }
static int adjust_cq(mca_btl_openib_hca_t *hca, const int cq_size, const int cq)
static int adjust_cq(mca_btl_openib_hca_t *hca, const int cq)
{ {
uint32_t cq_size = hca->cq_size[cq];
/* make sure we don't exceed the maximum CQ size and that we
* don't size the queue smaller than otherwise requested
*/
if(cq_size < mca_btl_openib_component.ib_cq_size[cq])
cq_size = mca_btl_openib_component.ib_cq_size[cq];
if(cq_size > (uint32_t)hca->ib_dev_attr.max_cq)
cq_size = hca->ib_dev_attr.max_cq;
if(NULL == hca->ib_cq[cq]) { if(NULL == hca->ib_cq[cq]) {
hca->ib_cq[cq] = ibv_create_cq_compat(hca->ib_dev_context, cq_size, hca->ib_cq[cq] = ibv_create_cq_compat(hca->ib_dev_context, cq_size,
#if OMPI_ENABLE_PROGRESS_THREADS == 1 #if OMPI_ENABLE_PROGRESS_THREADS == 1
@ -347,7 +357,7 @@ static int adjust_cq(mca_btl_openib_hca_t *hca, const int cq_size, const int cq)
#endif #endif
} }
#ifdef HAVE_IBV_RESIZE_CQ #ifdef HAVE_IBV_RESIZE_CQ
else { else if (cq_size > mca_btl_openib_component.ib_cq_size[cq]){
int rc; int rc;
rc = ibv_resize_cq(hca->ib_cq[cq], cq_size); rc = ibv_resize_cq(hca->ib_cq[cq], cq_size);
/* For ConnectX the resize CQ is not implemented and verbs returns -ENOSYS /* For ConnectX the resize CQ is not implemented and verbs returns -ENOSYS
@ -358,66 +368,37 @@ static int adjust_cq(mca_btl_openib_hca_t *hca, const int cq_size, const int cq)
} }
} }
#endif #endif
hca->cq_size[cq] = cq_size;
return OMPI_SUCCESS; return OMPI_SUCCESS;
} }
static int mca_btl_openib_size_queues( struct mca_btl_openib_module_t* openib_btl, size_t nprocs) static int mca_btl_openib_size_queues(struct mca_btl_openib_module_t* openib_btl, size_t nprocs)
{ {
uint32_t min_hp_cq_size = openib_btl->hca->cq_size[BTL_OPENIB_HP_CQ], uint32_t send_cqes, recv_cqes;
min_lp_cq_size = openib_btl->hca->cq_size[BTL_OPENIB_HP_CQ],
cq_size;
int rc = OMPI_SUCCESS, qp; int rc = OMPI_SUCCESS, qp;
mca_btl_openib_hca_t *hca = openib_btl->hca; mca_btl_openib_hca_t *hca = openib_btl->hca;
/* figure out reasonable sizes for completion queues */ /* figure out reasonable sizes for completion queues */
for(qp = 0; qp < mca_btl_openib_component.num_qps; qp++) { for(qp = 0; qp < mca_btl_openib_component.num_qps; qp++) {
if(BTL_OPENIB_QP_TYPE_SRQ(qp)) { if(BTL_OPENIB_QP_TYPE_SRQ(qp)) {
cq_size = mca_btl_openib_component.qp_infos[qp].rd_num + send_cqes = mca_btl_openib_component.qp_infos[qp].u.srq_qp.sd_max;
mca_btl_openib_component.qp_infos[qp].u.srq_qp.sd_max; recv_cqes = mca_btl_openib_component.qp_infos[qp].rd_num;
if(mca_btl_openib_component.qp_infos[qp].size <=
mca_btl_openib_component.eager_limit) {
min_hp_cq_size += cq_size;
} else {
min_lp_cq_size += cq_size;
}
} else { } else {
cq_size = (mca_btl_openib_component.qp_infos[qp].rd_num + send_cqes = (mca_btl_openib_component.qp_infos[qp].rd_num +
mca_btl_openib_component.qp_infos[qp].u.pp_qp.rd_rsv) * mca_btl_openib_component.qp_infos[qp].u.pp_qp.rd_rsv) * nprocs;
2 * nprocs; recv_cqes = send_cqes;
if(mca_btl_openib_component.qp_infos[qp].size <=
mca_btl_openib_component.eager_limit) {
min_hp_cq_size += cq_size;
} else {
min_lp_cq_size += cq_size;
}
} }
openib_btl->hca->cq_size[qp_cq_prio(qp)] += recv_cqes;
openib_btl->hca->cq_size[BTL_OPENIB_LP_CQ] += send_cqes;
} }
/* make sure we don't exceed the maximum CQ size and that we rc = adjust_cq(hca, BTL_OPENIB_HP_CQ);
* don't size the queue smaller than otherwise requested if(rc != OMPI_SUCCESS)
*/ goto out;
if(min_lp_cq_size < mca_btl_openib_component.ib_lp_cq_size)
min_lp_cq_size = mca_btl_openib_component.ib_lp_cq_size;
if(min_lp_cq_size > (uint32_t)openib_btl->hca->ib_dev_attr.max_cq)
min_lp_cq_size = openib_btl->hca->ib_dev_attr.max_cq;
if(min_hp_cq_size < mca_btl_openib_component.ib_hp_cq_size) rc = adjust_cq(hca, BTL_OPENIB_LP_CQ);
min_hp_cq_size = mca_btl_openib_component.ib_hp_cq_size; if(rc != OMPI_SUCCESS)
if(min_hp_cq_size > (uint32_t)openib_btl->hca->ib_dev_attr.max_cq) goto out;
min_hp_cq_size = openib_btl->hca->ib_dev_attr.max_cq;
if(min_hp_cq_size != hca->cq_size[BTL_OPENIB_HP_CQ]) {
rc = adjust_cq(hca, min_hp_cq_size, BTL_OPENIB_HP_CQ);
if(rc != OMPI_SUCCESS)
goto out;
}
if(min_lp_cq_size != hca->cq_size[BTL_OPENIB_LP_CQ]) {
rc = adjust_cq(hca, min_lp_cq_size, BTL_OPENIB_LP_CQ);
if(rc != OMPI_SUCCESS)
goto out;
}
if(0 == openib_btl->num_peers) if(0 == openib_btl->num_peers)
rc = create_srq(openib_btl); rc = create_srq(openib_btl);

Просмотреть файл

@ -144,8 +144,7 @@ struct mca_btl_openib_component_t {
uint32_t reg_mru_len; /**< Length of the registration cache most recently used list */ uint32_t reg_mru_len; /**< Length of the registration cache most recently used list */
uint32_t use_srq; /**< Use the Shared Receive Queue (SRQ mode) */ uint32_t use_srq; /**< Use the Shared Receive Queue (SRQ mode) */
uint32_t ib_lp_cq_size; /**< Max outstanding CQE on the CQ */ uint32_t ib_cq_size[2]; /**< Max outstanding CQE on the CQ */
uint32_t ib_hp_cq_size; /**< Max outstanding CQE on the CQ */
uint32_t ib_sg_list_size; /**< Max scatter/gather descriptor entries on the WQ */ uint32_t ib_sg_list_size; /**< Max scatter/gather descriptor entries on the WQ */
uint32_t ib_pkey_ix; /**< InfiniBand pkey index */ uint32_t ib_pkey_ix; /**< InfiniBand pkey index */
@ -253,7 +252,6 @@ struct mca_btl_openib_hca_t {
struct ibv_device_attr ib_dev_attr; struct ibv_device_attr ib_dev_attr;
struct ibv_pd *ib_pd; struct ibv_pd *ib_pd;
struct ibv_cq *ib_cq[2]; struct ibv_cq *ib_cq[2];
uint32_t cq_users[2];
uint32_t cq_size[2]; uint32_t cq_size[2];
mca_mpool_base_module_t *mpool; mca_mpool_base_module_t *mpool;
/* MTU for this HCA */ /* MTU for this HCA */

Просмотреть файл

@ -495,8 +495,6 @@ static int init_one_hca(opal_list_t *btl_list, struct ibv_device* ib_dev)
hca->btls = 0; hca->btls = 0;
hca->ib_cq[BTL_OPENIB_HP_CQ] = NULL; hca->ib_cq[BTL_OPENIB_HP_CQ] = NULL;
hca->ib_cq[BTL_OPENIB_LP_CQ] = NULL; hca->ib_cq[BTL_OPENIB_LP_CQ] = NULL;
hca->cq_users[BTL_OPENIB_HP_CQ] = 0;
hca->cq_users[BTL_OPENIB_LP_CQ] = 0;
hca->cq_size[BTL_OPENIB_HP_CQ] = 0; hca->cq_size[BTL_OPENIB_HP_CQ] = 0;
hca->cq_size[BTL_OPENIB_LP_CQ] = 0; hca->cq_size[BTL_OPENIB_LP_CQ] = 0;
OBJ_CONSTRUCT(&hca->hca_lock, opal_mutex_t); OBJ_CONSTRUCT(&hca->hca_lock, opal_mutex_t);
@ -1550,21 +1548,26 @@ static int btl_openib_module_progress(mca_btl_openib_hca_t* hca)
{ {
static char *cq_name[] = {"HP CQ", "LP CQ"}; static char *cq_name[] = {"HP CQ", "LP CQ"};
int cq, qp; int cq, qp;
int count = 0,ne = 0; int count = 0, ne = 0;
mca_btl_openib_com_frag_t* frag; mca_btl_openib_com_frag_t* frag;
mca_btl_base_descriptor_t *des; mca_btl_base_descriptor_t *des;
mca_btl_openib_endpoint_t* endpoint; mca_btl_openib_endpoint_t* endpoint;
mca_btl_openib_module_t *openib_btl = NULL; mca_btl_openib_module_t *openib_btl = NULL;
struct ibv_wc wc; struct ibv_wc wc;
for(cq = 0; cq < 2; cq++) { for(cq = 0; cq < 2;) {
if(0 == hca->cq_users[cq])
continue;
ne = ibv_poll_cq(hca->ib_cq[cq], 1, &wc); ne = ibv_poll_cq(hca->ib_cq[cq], 1, &wc);
if(0 == ne) if(0 == ne) {
/* don't check low prio cq if there was something in high prio cq */
if(count)
break;
cq++;
continue; continue;
}
if(ne < 0) if(ne < 0)
goto error; goto error;
count++;
des = (mca_btl_base_descriptor_t*)(uintptr_t)wc.wr_id; des = (mca_btl_base_descriptor_t*)(uintptr_t)wc.wr_id;
frag = to_com_frag(des); frag = to_com_frag(des);
@ -1604,8 +1607,6 @@ static int btl_openib_module_progress(mca_btl_openib_hca_t* hca)
/* new wqe or/and get token available. Try to progress pending frags */ /* new wqe or/and get token available. Try to progress pending frags */
progress_pending_frags_wqe(endpoint->qps[qp].qp); progress_pending_frags_wqe(endpoint->qps[qp].qp);
mca_btl_openib_frag_progress_pending_put_get(endpoint, qp); mca_btl_openib_frag_progress_pending_put_get(endpoint, qp);
count++;
break; break;
case IBV_WC_RECV: case IBV_WC_RECV:
if(wc.wc_flags & IBV_WC_WITH_IMM) { if(wc.wc_flags & IBV_WC_WITH_IMM) {
@ -1623,8 +1624,6 @@ static int btl_openib_module_progress(mca_btl_openib_hca_t* hca)
return 0; return 0;
} }
count++;
/* decide if it is time to setup an eager rdma channel */ /* decide if it is time to setup an eager rdma channel */
if (!endpoint->eager_rdma_local.base.pval && if (!endpoint->eager_rdma_local.base.pval &&
endpoint->use_eager_rdma && endpoint->use_eager_rdma &&
@ -1640,7 +1639,8 @@ static int btl_openib_module_progress(mca_btl_openib_hca_t* hca)
BTL_ERROR(("Unhandled work completion opcode is %d", BTL_ERROR(("Unhandled work completion opcode is %d",
wc.opcode)); wc.opcode));
if(openib_btl) if(openib_btl)
openib_btl->error_cb(&openib_btl->super, MCA_BTL_ERROR_FLAGS_FATAL); openib_btl->error_cb(&openib_btl->super,
MCA_BTL_ERROR_FLAGS_FATAL);
break; break;
} }
} }

Просмотреть файл

@ -214,8 +214,8 @@ int btl_openib_register_mca_params(void)
"queue (will automatically be set to a minimum of " "queue (will automatically be set to a minimum of "
"(2 * number_of_peers * btl_openib_rd_num))", "(2 * number_of_peers * btl_openib_rd_num))",
1000, &ival, REGINT_GE_ONE)); 1000, &ival, REGINT_GE_ONE));
mca_btl_openib_component.ib_lp_cq_size = mca_btl_openib_component.ib_cq_size[BTL_OPENIB_LP_CQ] =
mca_btl_openib_component.ib_hp_cq_size = (uint32_t) ival; mca_btl_openib_component.ib_cq_size[BTL_OPENIB_HP_CQ] = (uint32_t) ival;
CHECK(reg_int("ib_sg_list_size", "Size of IB segment list " CHECK(reg_int("ib_sg_list_size", "Size of IB segment list "
"(must be >= 1)", "(must be >= 1)",

Просмотреть файл

@ -330,14 +330,13 @@ static int qp_create_one(mca_btl_base_endpoint_t* endpoint, int qp,
struct ibv_qp *my_qp; struct ibv_qp *my_qp;
struct ibv_qp_init_attr init_attr; struct ibv_qp_init_attr init_attr;
struct ibv_qp_attr attr; struct ibv_qp_attr attr;
int prio = qp_cq_prio(qp);
memset(&init_attr, 0, sizeof(init_attr)); memset(&init_attr, 0, sizeof(init_attr));
memset(&attr, 0, sizeof(attr)); memset(&attr, 0, sizeof(attr));
init_attr.qp_type = IBV_QPT_RC; init_attr.qp_type = IBV_QPT_RC;
init_attr.send_cq = openib_btl->hca->ib_cq[prio]; init_attr.send_cq = openib_btl->hca->ib_cq[BTL_OPENIB_LP_CQ];
init_attr.recv_cq = openib_btl->hca->ib_cq[prio]; init_attr.recv_cq = openib_btl->hca->ib_cq[qp_cq_prio(qp)];
init_attr.srq = srq; init_attr.srq = srq;
init_attr.cap.max_send_sge = mca_btl_openib_component.ib_sg_list_size; init_attr.cap.max_send_sge = mca_btl_openib_component.ib_sg_list_size;
init_attr.cap.max_recv_sge = mca_btl_openib_component.ib_sg_list_size; init_attr.cap.max_recv_sge = mca_btl_openib_component.ib_sg_list_size;
@ -371,7 +370,6 @@ static int qp_create_one(mca_btl_base_endpoint_t* endpoint, int qp,
/* Setup meta data on the endpoint */ /* Setup meta data on the endpoint */
endpoint->qps[qp].qp->lcl_psn = lrand48() & 0xffffff; endpoint->qps[qp].qp->lcl_psn = lrand48() & 0xffffff;
endpoint->qps[qp].credit_frag = NULL; endpoint->qps[qp].credit_frag = NULL;
openib_btl->hca->cq_users[prio]++;
return OMPI_SUCCESS; return OMPI_SUCCESS;
} }

Просмотреть файл

@ -250,8 +250,6 @@ static int xoob_qp_create(mca_btl_base_endpoint_t* endpoint, xoob_qp_type type)
qp_init_attr.send_cq = qp_init_attr.recv_cq = openib_btl->hca->ib_cq[prio]; qp_init_attr.send_cq = qp_init_attr.recv_cq = openib_btl->hca->ib_cq[prio];
openib_btl->hca->cq_users[prio]++;
/* no need recv queue; receives are posted to srq */ /* no need recv queue; receives are posted to srq */
qp_init_attr.cap.max_recv_wr = 0; qp_init_attr.cap.max_recv_wr = 0;
qp_init_attr.cap.max_send_wr = send_wr; qp_init_attr.cap.max_send_wr = send_wr;