1
1

Fix up error handling in openib. Added a simple debug test for memory registration.

This commit was SVN r6520.
Этот коммит содержится в:
Galen Shipman 2005-07-15 15:13:19 +00:00
родитель 213be28613
Коммит b75560796c
8 изменённых файлов: 185 добавлений и 165 удалений

Просмотреть файл

@ -30,6 +30,9 @@
#include "mca/mpool/base/base.h" #include "mca/mpool/base/base.h"
#include "mca/mpool/mpool.h" #include "mca/mpool/mpool.h"
#include "mca/mpool/openib/mpool_openib.h" #include "mca/mpool/openib/mpool_openib.h"
#include <errno.h>
#include <string.h>
extern int errno;
mca_btl_openib_module_t mca_btl_openib_module = { mca_btl_openib_module_t mca_btl_openib_module = {
{ {
@ -275,11 +278,9 @@ mca_btl_base_descriptor_t* mca_btl_openib_prepare_src(
OBJ_RELEASE(openib_reg); OBJ_RELEASE(openib_reg);
openib_btl->ib_pool->mpool_register(openib_btl->ib_pool, openib_btl->ib_pool->mpool_register(openib_btl->ib_pool,
base_addr, base_addr,
new_len, new_len,
(mca_mpool_base_registration_t**) &openib_reg); (mca_mpool_base_registration_t**) &openib_reg);
rc = mca_mpool_base_insert(openib_reg->base_reg.base, rc = mca_mpool_base_insert(openib_reg->base_reg.base,
openib_reg->base_reg.bound - openib_reg->base_reg.base + 1, openib_reg->base_reg.bound - openib_reg->base_reg.base + 1,
@ -405,7 +406,6 @@ mca_btl_base_descriptor_t* mca_btl_openib_prepare_src(
frag->base.des_dst = NULL; frag->base.des_dst = NULL;
frag->base.des_dst_cnt = 0; frag->base.des_dst_cnt = 0;
frag->openib_reg = openib_reg; frag->openib_reg = openib_reg;
OBJ_RETAIN(openib_reg);
return &frag->base; return &frag->base;
} else if (max_data+reserve <= btl->btl_eager_limit) { } else if (max_data+reserve <= btl->btl_eager_limit) {
@ -720,7 +720,7 @@ int mca_btl_openib_put( mca_btl_base_module_t* btl,
if(ibv_post_send(endpoint->lcl_qp_low, if(ibv_post_send(endpoint->lcl_qp_low,
&frag->sr_desc, &frag->sr_desc,
&bad_wr)){ &bad_wr)){
opal_output(0, "%s: error posting send request\n", __func__); opal_output(0, "%s: error posting send request errno says %s\n", __func__, strerror(errno));
return OMPI_ERROR; return OMPI_ERROR;
} }
@ -792,21 +792,30 @@ int mca_btl_openib_module_init(mca_btl_openib_module_t *openib_btl)
if(NULL == openib_btl->ib_pd) { if(NULL == openib_btl->ib_pd) {
opal_output(0, "%s: error allocating pd for %s\n", __func__, ibv_get_device_name(openib_btl->ib_dev)); opal_output(0, "%s: error allocating pd for %s errno says %s\n",
__func__,
ibv_get_device_name(openib_btl->ib_dev),
strerror(errno));
return OMPI_ERROR; return OMPI_ERROR;
} }
openib_btl->ib_cq_low = ibv_create_cq(ctx, openib_btl->ib_cq_size, NULL); openib_btl->ib_cq_low = ibv_create_cq(ctx, mca_btl_openib_component.ib_cq_size, NULL);
if(NULL == openib_btl->ib_cq_low) { if(NULL == openib_btl->ib_cq_low) {
opal_output(0, "%s: error creating low priority cq for %s\n", __func__, ibv_get_device_name(openib_btl->ib_dev)); opal_output(0, "%s: error creating low priority cq for %s errno says %s\n",
__func__,
ibv_get_device_name(openib_btl->ib_dev),
strerror(errno));
return OMPI_ERROR; return OMPI_ERROR;
} }
openib_btl->ib_cq_high = ibv_create_cq(ctx, openib_btl->ib_cq_size, NULL); openib_btl->ib_cq_high = ibv_create_cq(ctx, mca_btl_openib_component.ib_cq_size, NULL);
if(NULL == openib_btl->ib_cq_high) { if(NULL == openib_btl->ib_cq_high) {
opal_output(0, "%s: error creating high priority cq for %s\n", __func__, ibv_get_device_name(openib_btl->ib_dev)); opal_output(0, "%s: error creating high priority cq for %s errno says %s\n",
__func__,
ibv_get_device_name(openib_btl->ib_dev),
strerror(errno));
return OMPI_ERROR; return OMPI_ERROR;
} }

Просмотреть файл

@ -95,6 +95,22 @@ struct mca_btl_openib_component_t {
uint32_t leave_pinned; uint32_t leave_pinned;
uint32_t reg_mru_len; uint32_t reg_mru_len;
uint32_t ib_cq_size; /**< Max outstanding CQE on the CQ */
uint32_t ib_wq_size; /**< Max outstanding WR on the WQ */
uint32_t ib_sg_list_size; /**< Max scatter/gather descriptor entries on the WQ*/
uint32_t ib_pkey_ix;
uint32_t ib_psn;
uint32_t ib_qp_ous_rd_atom;
uint32_t ib_mtu;
uint32_t ib_min_rnr_timer;
uint32_t ib_timeout;
uint32_t ib_retry_count;
uint32_t ib_rnr_retry;
uint32_t ib_max_rdma_dst_ops;
uint32_t ib_service_level;
uint32_t ib_static_rate;
uint32_t ib_src_path_bits;
}; typedef struct mca_btl_openib_component_t mca_btl_openib_component_t; }; typedef struct mca_btl_openib_component_t mca_btl_openib_component_t;
@ -147,23 +163,8 @@ struct mca_btl_openib_module_t {
/**< an array to allow posting of rr in one swoop */ /**< an array to allow posting of rr in one swoop */
size_t ib_inline_max; /**< max size of inline send*/ size_t ib_inline_max; /**< max size of inline send*/
size_t ib_pin_min; /**< min size to pin memory*/
uint32_t ib_cq_size; /**< Max outstanding CQE on the CQ */
uint32_t ib_wq_size; /**< Max outstanding WR on the WQ */
uint32_t ib_sg_list_size; /**< Max scatter/gather descriptor entries on the WQ*/
uint32_t ib_pkey_ix;
uint32_t ib_psn;
uint32_t ib_qp_ous_rd_atom;
uint32_t ib_mtu;
uint32_t ib_min_rnr_timer;
uint32_t ib_timeout;
uint32_t ib_retry_count;
uint32_t ib_rnr_retry;
uint32_t ib_max_rdma_dst_ops;
uint32_t ib_service_level;
uint32_t ib_static_rate;
uint32_t ib_src_path_bits;
}; typedef struct mca_btl_openib_module_t mca_btl_openib_module_t; }; typedef struct mca_btl_openib_module_t mca_btl_openib_module_t;

Просмотреть файл

@ -37,8 +37,10 @@
#include "mca/mpool/mvapi/mpool_mvapi.h" #include "mca/mpool/mvapi/mpool_mvapi.h"
#include <sysfs/libsysfs.h> #include <sysfs/libsysfs.h>
#include <infiniband/verbs.h> #include <infiniband/verbs.h>
#include <errno.h>
#include <string.h> /* for strerror()*/
extern int errno;
mca_btl_openib_component_t mca_btl_openib_component = { mca_btl_openib_component_t mca_btl_openib_component = {
{ {
/* First, the mca_base_component_t struct containing meta information /* First, the mca_base_component_t struct containing meta information
@ -132,6 +134,55 @@ int mca_btl_openib_component_open(void)
mca_btl_openib_component.reg_mru_len = mca_btl_openib_component.reg_mru_len =
mca_btl_openib_param_register_int("reg_mru_len", 16); mca_btl_openib_param_register_int("reg_mru_len", 16);
mca_btl_openib_component.ib_cq_size =
mca_btl_openib_param_register_int("ib_cq_size",
500);
mca_btl_openib_component.ib_wq_size =
mca_btl_openib_param_register_int("ib_wq_size",
500);
mca_btl_openib_component.ib_sg_list_size =
mca_btl_openib_param_register_int("ib_sg_list_size",
1);
mca_btl_openib_component.ib_pkey_ix =
mca_btl_openib_param_register_int("ib_pkey_ix",
0);
mca_btl_openib_component.ib_psn =
mca_btl_openib_param_register_int("ib_psn",
0);
mca_btl_openib_component.ib_qp_ous_rd_atom =
mca_btl_openib_param_register_int("ib_qp_ous_rd_atom",
1);
mca_btl_openib_component.ib_mtu =
mca_btl_openib_param_register_int("ib_mtu",
IBV_MTU_1024);
mca_btl_openib_component.ib_min_rnr_timer =
mca_btl_openib_param_register_int("ib_min_rnr_timer",
5);
mca_btl_openib_component.ib_timeout =
mca_btl_openib_param_register_int("ib_timeout",
10);
mca_btl_openib_component.ib_retry_count =
mca_btl_openib_param_register_int("ib_retry_count",
7);
mca_btl_openib_component.ib_rnr_retry =
mca_btl_openib_param_register_int("ib_rnr_retry",
7);
mca_btl_openib_component.ib_max_rdma_dst_ops =
mca_btl_openib_param_register_int("ib_max_rdma_dst_ops",
16);
mca_btl_openib_component.ib_service_level =
mca_btl_openib_param_register_int("ib_service_level",
0);
mca_btl_openib_component.ib_static_rate =
mca_btl_openib_param_register_int("ib_static_rate",
0);
mca_btl_openib_component.ib_src_path_bits =
mca_btl_openib_param_register_int("ib_src_path_bits",
0);
mca_btl_openib_module.super.btl_exclusivity = mca_btl_openib_module.super.btl_exclusivity =
mca_btl_openib_param_register_int ("exclusivity", 0); mca_btl_openib_param_register_int ("exclusivity", 0);
mca_btl_openib_module.super.btl_eager_limit = mca_btl_openib_module.super.btl_eager_limit =
@ -145,55 +196,6 @@ int mca_btl_openib_component_open(void)
mca_btl_openib_module.super.btl_max_send_size = mca_btl_openib_module.super.btl_max_send_size =
mca_btl_openib_param_register_int ("max_send_size", (128*1024)) mca_btl_openib_param_register_int ("max_send_size", (128*1024))
- sizeof(mca_btl_openib_header_t); - sizeof(mca_btl_openib_header_t);
mca_btl_openib_module.ib_pin_min =
mca_btl_openib_param_register_int("ib_pin_min", 128*1024);
mca_btl_openib_module.ib_cq_size =
mca_btl_openib_param_register_int("ib_cq_size",
40000);
mca_btl_openib_module.ib_wq_size =
mca_btl_openib_param_register_int("ib_wq_size",
10000);
mca_btl_openib_module.ib_sg_list_size =
mca_btl_openib_param_register_int("ib_sg_list_size",
1);
mca_btl_openib_module.ib_pkey_ix =
mca_btl_openib_param_register_int("ib_pkey_ix",
0);
mca_btl_openib_module.ib_psn =
mca_btl_openib_param_register_int("ib_psn",
0);
mca_btl_openib_module.ib_qp_ous_rd_atom =
mca_btl_openib_param_register_int("ib_qp_ous_rd_atom",
1);
mca_btl_openib_module.ib_mtu =
mca_btl_openib_param_register_int("ib_mtu",
IBV_MTU_1024);
mca_btl_openib_module.ib_min_rnr_timer =
mca_btl_openib_param_register_int("ib_min_rnr_timer",
5);
mca_btl_openib_module.ib_timeout =
mca_btl_openib_param_register_int("ib_timeout",
10);
mca_btl_openib_module.ib_retry_count =
mca_btl_openib_param_register_int("ib_retry_count",
7);
mca_btl_openib_module.ib_rnr_retry =
mca_btl_openib_param_register_int("ib_rnr_retry",
7);
mca_btl_openib_module.ib_max_rdma_dst_ops =
mca_btl_openib_param_register_int("ib_max_rdma_dst_ops",
16);
mca_btl_openib_module.ib_service_level =
mca_btl_openib_param_register_int("ib_service_level",
0);
mca_btl_openib_module.ib_static_rate =
mca_btl_openib_param_register_int("ib_static_rate",
0);
mca_btl_openib_module.ib_src_path_bits =
mca_btl_openib_param_register_int("ib_src_path_bits",
0);
mca_btl_openib_module.super.btl_min_rdma_size = mca_btl_openib_module.super.btl_min_rdma_size =
mca_btl_openib_param_register_int("min_rdma_size", mca_btl_openib_param_register_int("min_rdma_size",
1024*1024); 1024*1024);
@ -296,16 +298,50 @@ mca_btl_base_module_t** mca_btl_openib_component_init(int *num_btl_modules,
for(i = 0; i < num_devs; i++){ for(i = 0; i < num_devs; i++){
struct ibv_device_attr ib_dev_attr; struct ibv_device_attr ib_dev_attr;
struct ibv_context* ib_dev_context; struct ibv_context* ib_dev_context;
struct ibv_pd *my_pd;
struct ibv_mr *mr;
void* my_addr;
uint32_t my_size;
uint32_t my_indx;
uint32_t my_mult;
my_mult = 4096;
ib_dev = ib_devs[i]; ib_dev = ib_devs[i];
ib_dev_context = ibv_open_device(ib_dev); ib_dev_context = ibv_open_device(ib_dev);
if(!ib_dev_context) { if(!ib_dev_context) {
opal_output(0, "%s: error obtaining device context for %s\n", __func__, ibv_get_device_name(ib_dev)); opal_output(0, "%s: error obtaining device context for %s errno says %s\n", __func__, ibv_get_device_name(ib_dev), strerror(errno));
return NULL; return NULL;
} }
my_pd = ibv_alloc_pd(ib_dev_context);
for(my_indx = 1; my_indx <= 8192; my_indx++){
my_size = my_mult * my_indx;
my_addr = memalign(4096, my_size);
memset(my_addr, 0, my_size);
mr = ibv_reg_mr(
my_pd,
my_addr,
my_size,
IBV_ACCESS_REMOTE_WRITE
/* IBV_ACCESS_LOCAL_WRITE | IBV_ACCESS_REMOTE_WRITE */
);
if(NULL == mr){
opal_output(0, "%s: error on mr test! can't register %lu bytes, errno says %s \n", __func__, my_size, strerror(errno));
break;
}
else {
opal_output(0, "%s: successfully registerted %lu bytes", __func__, my_size);
ibv_dereg_mr(mr);
}
}
if(ibv_query_device(ib_dev_context, &ib_dev_attr)){ if(ibv_query_device(ib_dev_context, &ib_dev_attr)){
opal_output(0, "%s: error obtaining device attributes for %s\n", __func__, ibv_get_device_name(ib_dev)); opal_output(0, "%s: error obtaining device attributes for %s errno says %s\n", __func__, ibv_get_device_name(ib_dev), strerror(errno));
return NULL; return NULL;
} }
@ -316,8 +352,8 @@ mca_btl_base_module_t** mca_btl_openib_component_init(int *num_btl_modules,
struct ibv_port_attr* ib_port_attr; struct ibv_port_attr* ib_port_attr;
ib_port_attr = (struct ibv_port_attr*) malloc(sizeof(struct ibv_port_attr)); ib_port_attr = (struct ibv_port_attr*) malloc(sizeof(struct ibv_port_attr));
if(ibv_query_port(ib_dev_context, (uint8_t) j, ib_port_attr)){ if(ibv_query_port(ib_dev_context, (uint8_t) j, ib_port_attr)){
opal_output(0, "%s: error getting port attributes for device %s port number %d", opal_output(0, "%s: error getting port attributes for device %s port number %d errno says %s",
__func__, ibv_get_device_name(ib_dev), j); __func__, ibv_get_device_name(ib_dev), j, strerror(errno));
return NULL; return NULL;
} }
@ -337,6 +373,9 @@ mca_btl_base_module_t** mca_btl_openib_component_init(int *num_btl_modules,
mca_btl_openib_component.ib_num_btls ++; mca_btl_openib_component.ib_num_btls ++;
} }
else{
free(ib_port_attr);
}
} }
} }
@ -501,7 +540,7 @@ int mca_btl_openib_component_progress()
do{ do{
ne=ibv_poll_cq(openib_btl->ib_cq_high, 1, &wc ); ne=ibv_poll_cq(openib_btl->ib_cq_high, 1, &wc );
if(ne < 0 ){ if(ne < 0 ){
opal_output(0, "%s: error polling CQ with %d \n", __func__, ne); opal_output(0, "%s: error polling CQ with %d errno says %s\n", __func__, ne, strerror(errno));
return OMPI_ERROR; return OMPI_ERROR;
} }
else if(wc.status != IBV_WC_SUCCESS) { else if(wc.status != IBV_WC_SUCCESS) {
@ -562,7 +601,7 @@ int mca_btl_openib_component_progress()
ne=ibv_poll_cq(openib_btl->ib_cq_low, 1, &wc ); ne=ibv_poll_cq(openib_btl->ib_cq_low, 1, &wc );
if(ne < 0){ if(ne < 0){
opal_output(0, "%s: error polling CQ with %d \n", __func__, ne); opal_output(0, "%s: error polling CQ with %d errno says %s\n", __func__, ne, strerror(errno));
return OMPI_ERROR; return OMPI_ERROR;
} }
else if(wc.status != IBV_WC_SUCCESS) { else if(wc.status != IBV_WC_SUCCESS) {

Просмотреть файл

@ -30,6 +30,9 @@
#include "btl_openib_proc.h" #include "btl_openib_proc.h"
#include "btl_openib_frag.h" #include "btl_openib_frag.h"
#include "class/ompi_free_list.h" #include "class/ompi_free_list.h"
#include <errno.h>
#include <string.h>
extern int errno;
static void mca_btl_openib_endpoint_construct(mca_btl_base_endpoint_t* endpoint); static void mca_btl_openib_endpoint_construct(mca_btl_base_endpoint_t* endpoint);
static void mca_btl_openib_endpoint_destruct(mca_btl_base_endpoint_t* endpoint); static void mca_btl_openib_endpoint_destruct(mca_btl_base_endpoint_t* endpoint);
@ -82,7 +85,7 @@ static inline int mca_btl_openib_endpoint_post_send(mca_btl_openib_module_t* ope
if(ibv_post_send(ib_qp, if(ibv_post_send(ib_qp,
&frag->sr_desc, &frag->sr_desc,
&bad_wr)) { &bad_wr)) {
opal_output(0, "%s: error posting send request\n", __func__); opal_output(0, "%s: error posting send request errno says %s\n", __func__, strerror(errno));
return OMPI_ERROR; return OMPI_ERROR;
} }
mca_btl_openib_endpoint_post_rr(endpoint, 1); mca_btl_openib_endpoint_post_rr(endpoint, 1);
@ -686,17 +689,17 @@ int mca_btl_openib_endpoint_create_qp(
struct ibv_qp_init_attr qp_init_attr; struct ibv_qp_init_attr qp_init_attr;
qp_init_attr.send_cq = cq; qp_init_attr.send_cq = cq;
qp_init_attr.recv_cq = cq; qp_init_attr.recv_cq = cq;
qp_init_attr.cap.max_send_wr = openib_btl->ib_wq_size; qp_init_attr.cap.max_send_wr = mca_btl_openib_component.ib_wq_size;
qp_init_attr.cap.max_recv_wr = openib_btl->ib_wq_size; qp_init_attr.cap.max_recv_wr = mca_btl_openib_component.ib_wq_size;
qp_init_attr.cap.max_send_sge = openib_btl->ib_sg_list_size; qp_init_attr.cap.max_send_sge = mca_btl_openib_component.ib_sg_list_size;
qp_init_attr.cap.max_recv_sge = openib_btl->ib_sg_list_size; qp_init_attr.cap.max_recv_sge = mca_btl_openib_component.ib_sg_list_size;
qp_init_attr.qp_type = IBV_QPT_RC; qp_init_attr.qp_type = IBV_QPT_RC;
(*qp) = ibv_create_qp(pd, &qp_init_attr); (*qp) = ibv_create_qp(pd, &qp_init_attr);
if(NULL == (*qp)) { if(NULL == (*qp)) {
opal_output(0, "%s: error creating qp \n", __func__); opal_output(0, "%s: error creating qp errno says %s\n", __func__, strerror(errno));
return OMPI_ERROR; return OMPI_ERROR;
} }
@ -706,7 +709,7 @@ int mca_btl_openib_endpoint_create_qp(
{ {
qp_attr->qp_state = IBV_QPS_INIT; qp_attr->qp_state = IBV_QPS_INIT;
qp_attr->pkey_index = openib_btl->ib_pkey_ix; qp_attr->pkey_index = mca_btl_openib_component.ib_pkey_ix;
qp_attr->port_num = openib_btl->port_num; qp_attr->port_num = openib_btl->port_num;
qp_attr->qp_access_flags = 0; qp_attr->qp_access_flags = 0;
@ -715,7 +718,7 @@ int mca_btl_openib_endpoint_create_qp(
IBV_QP_PKEY_INDEX | IBV_QP_PKEY_INDEX |
IBV_QP_PORT | IBV_QP_PORT |
IBV_QP_ACCESS_FLAGS )) { IBV_QP_ACCESS_FLAGS )) {
opal_output(0, "%s: error modifying qp to INIT\n"); opal_output(0, "%s: error modifying qp to INIT errno says %s\n", __func__, strerror(errno));
return OMPI_ERROR; return OMPI_ERROR;
} }
} }
@ -737,15 +740,15 @@ int mca_btl_openib_endpoint_qp_init_query(
{ {
attr->qp_state = IBV_QPS_RTR; attr->qp_state = IBV_QPS_RTR;
attr->path_mtu = openib_btl->ib_mtu; attr->path_mtu = mca_btl_openib_component.ib_mtu;
attr->dest_qp_num = rem_qp_num; attr->dest_qp_num = rem_qp_num;
attr->rq_psn = rem_psn; attr->rq_psn = rem_psn;
attr->max_dest_rd_atomic = openib_btl->ib_max_rdma_dst_ops; attr->max_dest_rd_atomic = mca_btl_openib_component.ib_max_rdma_dst_ops;
attr->min_rnr_timer = openib_btl->ib_min_rnr_timer; attr->min_rnr_timer = mca_btl_openib_component.ib_min_rnr_timer;
attr->ah_attr.is_global = 0; attr->ah_attr.is_global = 0;
attr->ah_attr.dlid = rem_lid; attr->ah_attr.dlid = rem_lid;
attr->ah_attr.sl = openib_btl->ib_service_level; attr->ah_attr.sl = mca_btl_openib_component.ib_service_level;
attr->ah_attr.src_path_bits = openib_btl->ib_src_path_bits; attr->ah_attr.src_path_bits = mca_btl_openib_component.ib_src_path_bits;
attr->ah_attr.port_num = port_num; attr->ah_attr.port_num = port_num;
if(ibv_modify_qp(qp, attr, if(ibv_modify_qp(qp, attr,
@ -756,15 +759,15 @@ int mca_btl_openib_endpoint_qp_init_query(
IBV_QP_RQ_PSN | IBV_QP_RQ_PSN |
IBV_QP_MAX_DEST_RD_ATOMIC | IBV_QP_MAX_DEST_RD_ATOMIC |
IBV_QP_MIN_RNR_TIMER)) { IBV_QP_MIN_RNR_TIMER)) {
opal_output(0, "%s: error modifing QP to RTR\n", __func__); opal_output(0, "%s: error modifing QP to RTR errno says %s\n", __func__, strerror(errno));
return OMPI_ERROR; return OMPI_ERROR;
} }
attr->qp_state = IBV_QPS_RTS; attr->qp_state = IBV_QPS_RTS;
attr->timeout = openib_btl->ib_timeout; attr->timeout = mca_btl_openib_component.ib_timeout;
attr->retry_cnt = openib_btl->ib_retry_count; attr->retry_cnt = mca_btl_openib_component.ib_retry_count;
attr->rnr_retry = openib_btl->ib_rnr_retry; attr->rnr_retry = mca_btl_openib_component.ib_rnr_retry;
attr->sq_psn = lcl_psn; attr->sq_psn = lcl_psn;
attr->max_rd_atomic = openib_btl->ib_max_rdma_dst_ops; attr->max_rd_atomic = mca_btl_openib_component.ib_max_rdma_dst_ops;
if (ibv_modify_qp(qp, attr, if (ibv_modify_qp(qp, attr,
IBV_QP_STATE | IBV_QP_STATE |
IBV_QP_TIMEOUT | IBV_QP_TIMEOUT |
@ -772,7 +775,7 @@ int mca_btl_openib_endpoint_qp_init_query(
IBV_QP_RNR_RETRY | IBV_QP_RNR_RETRY |
IBV_QP_SQ_PSN | IBV_QP_SQ_PSN |
IBV_QP_MAX_QP_RD_ATOMIC)) { IBV_QP_MAX_QP_RD_ATOMIC)) {
opal_output(0, "%s: error modifying QP to RTS\n", __func__); opal_output(0, "%s: error modifying QP to RTS errno says %s\n", __func__, strerror(errno));
return OMPI_ERROR; return OMPI_ERROR;
} }
return OMPI_SUCCESS; return OMPI_SUCCESS;

Просмотреть файл

@ -23,6 +23,11 @@
#include "mca/btl/btl.h" #include "mca/btl/btl.h"
#include "btl_openib_frag.h" #include "btl_openib_frag.h"
#include "btl_openib.h" #include "btl_openib.h"
#include <errno.h>
#include <string.h>
extern int errno;
#if defined(c_plusplus) || defined(__cplusplus) #if defined(c_plusplus) || defined(__cplusplus)
extern "C" { extern "C" {
#endif #endif
@ -159,7 +164,7 @@ static inline int mca_btl_openib_endpoint_post_rr_sub(int cnt,
if(ibv_post_recv(qp, if(ibv_post_recv(qp,
&rr_desc_post[i], &rr_desc_post[i],
&bad_wr)) { &bad_wr)) {
opal_output(0, "%s: error posting receive\n", __func__); opal_output(0, "%s: error posting receive errno says %s\n", __func__, strerror(errno));
return OMPI_ERROR; return OMPI_ERROR;
} }

Просмотреть файл

@ -23,7 +23,6 @@
#include "class/ompi_free_list.h" #include "class/ompi_free_list.h"
#include "opal/event/event.h" #include "opal/event/event.h"
#include "mca/mpool/mpool.h" #include "mca/mpool/mpool.h"
#include "mca/allocator/allocator.h"
#if defined(c_plusplus) || defined(__cplusplus) #if defined(c_plusplus) || defined(__cplusplus)
extern "C" { extern "C" {
@ -42,7 +41,6 @@ static inline void* ALIGN_ADDR(void* addr, uint32_t cnt ) {
struct mca_mpool_openib_component_t { struct mca_mpool_openib_component_t {
mca_mpool_base_component_t super; mca_mpool_base_component_t super;
char* vapi_allocator_name;
long page_size; long page_size;
long page_size_log; long page_size_log;
}; };
@ -62,7 +60,6 @@ typedef struct mca_mpool_base_resources_t mca_mpool_base_resources_t;
struct mca_mpool_openib_module_t { struct mca_mpool_openib_module_t {
mca_mpool_base_module_t super; mca_mpool_base_module_t super;
mca_allocator_base_module_t * vapi_allocator;
struct mca_mpool_base_resources_t resources; struct mca_mpool_base_resources_t resources;
}; typedef struct mca_mpool_openib_module_t mca_mpool_openib_module_t; }; typedef struct mca_mpool_openib_module_t mca_mpool_openib_module_t;
@ -131,11 +128,6 @@ void mca_mpool_openib_free(mca_mpool_base_module_t* mpool,
void * addr, void * addr,
mca_mpool_base_registration_t* registration); mca_mpool_base_registration_t* registration);
void* mca_common_vapi_segment_alloc(
struct mca_mpool_base_module_t* module,
size_t* size,
mca_mpool_base_registration_t** registration);
#if defined(c_plusplus) || defined(__cplusplus) #if defined(c_plusplus) || defined(__cplusplus)
} }
#endif #endif

Просмотреть файл

@ -18,11 +18,11 @@
#include "opal/util/output.h" #include "opal/util/output.h"
#include "mca/base/base.h" #include "mca/base/base.h"
#include "mca/base/mca_base_param.h" #include "mca/base/mca_base_param.h"
#include "mca/allocator/base/base.h"
#include "mpool_openib.h" #include "mpool_openib.h"
#include "util/proc_info.h" #include "util/proc_info.h"
#include "util/sys_info.h" #include "util/sys_info.h"
#include <unistd.h> #include <unistd.h>
#include <malloc.h>
/* /*
* Local functions * Local functions
@ -99,7 +99,7 @@ static char* mca_mpool_openib_param_register_string(
const char* default_value) const char* default_value)
{ {
char *param_value; char *param_value;
int id = mca_base_param_register_string("mpool","vapi",param_name,NULL,default_value); int id = mca_base_param_register_string("mpool","openib",param_name,NULL,default_value);
mca_base_param_lookup_string(id, &param_value); mca_base_param_lookup_string(id, &param_value);
return param_value; return param_value;
} }
@ -110,37 +110,16 @@ static char* mca_mpool_openib_param_register_string(
*/ */
static int mca_mpool_openib_open(void) static int mca_mpool_openib_open(void)
{ {
/* register VAPI component parameters */
mca_mpool_openib_component.vapi_allocator_name =
mca_mpool_openib_param_register_string("allocator", "bucket");
/* get the page size for this architecture*/ /* get the page size for this architecture*/
mca_mpool_openib_component.page_size = sysconf(_SC_PAGESIZE); mca_mpool_openib_component.page_size = sysconf(_SC_PAGESIZE);
return OMPI_SUCCESS; return OMPI_SUCCESS;
} }
/* Allocates a segment of memory and registers with IB, user_out returns the memory handle. */
void* mca_common_vapi_segment_alloc(
struct mca_mpool_base_module_t* mpool,
size_t* size,
mca_mpool_base_registration_t** registration)
{
void* addr_malloc = (void*)malloc((*size) + mca_mpool_openib_component.page_size);
void* addr = (void*) ALIGN_ADDR(addr_malloc, mca_mpool_openib_component.page_size_log);
if(OMPI_SUCCESS != mpool->mpool_register(mpool, addr, *size, registration)) {
free(addr_malloc);
return NULL;
}
return addr;
}
/* Allocates a segment of memory and registers with IB, user_out returns the memory handle. */
static mca_mpool_base_module_t* mca_mpool_openib_init( static mca_mpool_base_module_t* mca_mpool_openib_init(
struct mca_mpool_base_resources_t* resources) struct mca_mpool_base_resources_t* resources)
{ {
mca_mpool_openib_module_t* mpool_module; mca_mpool_openib_module_t* mpool_module;
mca_allocator_base_component_t* allocator_component;
long page_size = mca_mpool_openib_component.page_size; long page_size = mca_mpool_openib_component.page_size;
mca_mpool_openib_component.page_size_log = 0; mca_mpool_openib_component.page_size_log = 0;
@ -149,32 +128,12 @@ static mca_mpool_base_module_t* mca_mpool_openib_init(
mca_mpool_openib_component.page_size_log++; mca_mpool_openib_component.page_size_log++;
} }
/* if specified allocator cannout be loaded - look for an alternative */
allocator_component = mca_allocator_component_lookup(mca_mpool_openib_component.vapi_allocator_name);
if(NULL == allocator_component) {
if(opal_list_get_size(&mca_allocator_base_components) == 0) {
mca_base_component_list_item_t* item = (mca_base_component_list_item_t*)
opal_list_get_first(&mca_allocator_base_components);
allocator_component = (mca_allocator_base_component_t*)item->cli_component;
opal_output(0, "mca_mpool_openib_init: unable to locate allocator: %s - using %s\n",
mca_mpool_openib_component.vapi_allocator_name, allocator_component->allocator_version.mca_component_name);
} else {
opal_output(0, "mca_mpool_openib_init: unable to locate allocator: %s\n",
mca_mpool_openib_component.vapi_allocator_name);
return NULL;
}
}
mpool_module = (mca_mpool_openib_module_t*)malloc(sizeof(mca_mpool_openib_module_t)); mpool_module = (mca_mpool_openib_module_t*)malloc(sizeof(mca_mpool_openib_module_t));
mca_mpool_openib_module_init(mpool_module); mca_mpool_openib_module_init(mpool_module);
mpool_module->resources = *resources; mpool_module->resources = *resources;
mpool_module->vapi_allocator =
allocator_component->allocator_init(true, mca_common_vapi_segment_alloc, NULL, &mpool_module->super);
if(NULL == mpool_module->vapi_allocator) {
opal_output(0, "mca_mpool_openib_init: unable to initialize allocator");
return NULL;
}
return &mpool_module->super; return &mpool_module->super;
} }

Просмотреть файл

@ -19,7 +19,8 @@
#include "opal/util/output.h" #include "opal/util/output.h"
#include "mca/mpool/openib/mpool_openib.h" #include "mca/mpool/openib/mpool_openib.h"
#include <infiniband/verbs.h> #include <infiniband/verbs.h>
#include <errno.h>
#include <string.h>
/* /*
* Initializes the mpool module. * Initializes the mpool module.
*/ */
@ -46,9 +47,18 @@ void* mca_mpool_openib_alloc(
mca_mpool_base_registration_t** registration) mca_mpool_base_registration_t** registration)
{ {
mca_mpool_openib_module_t* mpool_openib = (mca_mpool_openib_module_t*)mpool; mca_mpool_openib_module_t* mpool_openib = (mca_mpool_openib_module_t*)mpool;
return mpool_openib->vapi_allocator->alc_alloc(mpool_openib->vapi_allocator, size, align, registration); /* void* addr_malloc = (void*)malloc((*size) + mca_mpool_openib_component.page_size); */
} /* void* addr = (void*) ALIGN_ADDR(addr_malloc, mca_mpool_openib_component.page_size_log); */
void* addr_malloc = (void*)memalign(mca_mpool_openib_component.page_size, size);
void* addr = addr_malloc;
if(OMPI_SUCCESS != mpool->mpool_register(mpool, addr, size, registration)) {
free(addr_malloc);
return NULL;
}
return addr;
}
/* /*
* register memory * register memory
@ -57,6 +67,7 @@ int mca_mpool_openib_register(mca_mpool_base_module_t* mpool,
void *addr, void *addr,
size_t size, size_t size,
mca_mpool_base_registration_t** registration){ mca_mpool_base_registration_t** registration){
mca_mpool_openib_module_t * mpool_module = (mca_mpool_openib_module_t*) mpool; mca_mpool_openib_module_t * mpool_module = (mca_mpool_openib_module_t*) mpool;
mca_mpool_openib_registration_t * vapi_reg; mca_mpool_openib_registration_t * vapi_reg;
@ -69,12 +80,13 @@ int mca_mpool_openib_register(mca_mpool_base_module_t* mpool,
mpool_module->resources.ib_pd, mpool_module->resources.ib_pd,
addr, addr,
size, size,
IBV_ACCESS_LOCAL_WRITE | IBV_ACCESS_REMOTE_WRITE IBV_ACCESS_REMOTE_WRITE
/* IBV_ACCESS_LOCAL_WRITE | IBV_ACCESS_REMOTE_WRITE */
); );
if(NULL == vapi_reg->mr){ if(NULL == vapi_reg->mr){
opal_output(0, "%s: error registering openib memory\n", __func__); opal_output(0, "%s: error registering openib memory of size %lu errno says %s\n", __func__, size, strerror(errno));
return OMPI_ERROR; return OMPI_ERROR;
} }
@ -95,10 +107,10 @@ int mca_mpool_openib_deregister(mca_mpool_base_module_t* mpool, void *addr, size
mca_mpool_openib_registration_t * openib_reg; mca_mpool_openib_registration_t * openib_reg;
openib_reg = (mca_mpool_openib_registration_t*) registration; openib_reg = (mca_mpool_openib_registration_t*) registration;
if(! ibv_dereg_mr(openib_reg->mr)){ if(! ibv_dereg_mr(openib_reg->mr)){
opal_output(0, "%s: error unpinning openib memory\n", __func__); opal_output(0, "%s: error unpinning openib memory errno says %s\n", __func__, strerror(errno));
return OMPI_ERROR; return OMPI_ERROR;
} }
free(registration);
return OMPI_SUCCESS; return OMPI_SUCCESS;
} }