diff --git a/ompi/mca/btl/openib/btl_openib.c b/ompi/mca/btl/openib/btl_openib.c index e65ffe3f57..35f1a541bf 100644 --- a/ompi/mca/btl/openib/btl_openib.c +++ b/ompi/mca/btl/openib/btl_openib.c @@ -99,8 +99,8 @@ mca_btl_openib_module_t mca_btl_openib_module = { static int mca_btl_openib_finalize_resources(struct mca_btl_base_module_t* btl); -static void show_init_error(const char *file, int line, - const char *func, const char *dev) +void mca_btl_openib_show_init_error(const char *file, int line, + const char *func, const char *dev) { if (ENOMEM == errno) { int ret; @@ -166,15 +166,16 @@ static int adjust_cq(mca_btl_openib_device_t *device, const int cq) 0); if (NULL == device->ib_cq[cq]) { - show_init_error(__FILE__, __LINE__, "ibv_create_cq", - ibv_get_device_name(device->ib_dev)); + mca_btl_openib_show_init_error(__FILE__, __LINE__, "ibv_create_cq", + ibv_get_device_name(device->ib_dev)); return OMPI_ERROR; } #if OPAL_ENABLE_PROGRESS_THREADS == 1 if(ibv_req_notify_cq(device->ib_cq[cq], 0)) { - show_init_error(__FILE__, __LINE__, "ibv_req_notify_cq", - ibv_get_device_name(device->ib_dev)); + mca_btl_openib_show_init_error(__FILE__, __LINE__, + "ibv_req_notify_cq", + ibv_get_device_name(device->ib_dev)); return OMPI_ERROR; } @@ -236,8 +237,9 @@ static int create_srq(mca_btl_openib_module_t *openib_btl) ibv_create_srq(openib_btl->device->ib_pd, &attr); } if (NULL == openib_btl->qps[qp].u.srq_qp.srq) { - show_init_error(__FILE__, __LINE__, "ibv_create_srq", - ibv_get_device_name(openib_btl->device->ib_dev)); + mca_btl_openib_show_init_error(__FILE__, __LINE__, + "ibv_create_srq", + ibv_get_device_name(openib_btl->device->ib_dev)); return OMPI_ERROR; } } diff --git a/ompi/mca/btl/openib/btl_openib.h b/ompi/mca/btl/openib/btl_openib.h index 3925438e26..8dc6cf9421 100644 --- a/ompi/mca/btl/openib/btl_openib.h +++ b/ompi/mca/btl/openib/btl_openib.h @@ -611,6 +611,13 @@ extern void mca_btl_openib_frag_progress_pending_put_get( extern int mca_btl_openib_ft_event(int state); +/** + * Show an error during init, particularly when running out of + * registered memory. + */ +void mca_btl_openib_show_init_error(const char *file, int line, + const char *func, const char *dev); + #define BTL_OPENIB_HP_CQ 0 #define BTL_OPENIB_LP_CQ 1 diff --git a/ompi/mca/btl/openib/btl_openib_component.c b/ompi/mca/btl/openib/btl_openib_component.c index 611106e3d2..b0fdb5e770 100644 --- a/ompi/mca/btl/openib/btl_openib_component.c +++ b/ompi/mca/btl/openib/btl_openib_component.c @@ -910,7 +910,7 @@ OBJ_CLASS_INSTANCE(mca_btl_openib_device_t, opal_object_t, device_construct, static int prepare_device_for_use(mca_btl_openib_device_t *device) { mca_btl_openib_frag_init_data_t *init_data; - int qp, length; + int rc, qp, length; #if OPAL_HAVE_THREADS if(mca_btl_openib_component.use_async_event_thread) { @@ -985,16 +985,25 @@ static int prepare_device_for_use(mca_btl_openib_device_t *device) init_data->order = MCA_BTL_NO_ORDER; init_data->list = &device->send_free_control; - if(OMPI_SUCCESS != ompi_free_list_init_ex_new( - &device->send_free_control, + rc = ompi_free_list_init_ex_new(&device->send_free_control, sizeof(mca_btl_openib_send_control_frag_t), CACHE_LINE_SIZE, OBJ_CLASS(mca_btl_openib_send_control_frag_t), length, mca_btl_openib_component.buffer_alignment, mca_btl_openib_component.ib_free_list_num, -1, mca_btl_openib_component.ib_free_list_inc, device->mpool, mca_btl_openib_frag_init, - init_data)) { - return OMPI_ERROR; + init_data); + if (OMPI_SUCCESS != rc) { + /* If we're "out of memory", this usually means that we ran + out of registered memory, so show that error message */ + if (OMPI_ERR_OUT_OF_RESOURCE == rc || + OMPI_ERR_TEMP_OUT_OF_RESOURCE == rc) { + errno = ENOMEM; + mca_btl_openib_show_init_error(__FILE__, __LINE__, + "ompi_free_list_init_ex_new", + ibv_get_device_name(device->ib_dev)); + } + return rc; } /* setup all the qps */ @@ -1010,7 +1019,7 @@ static int prepare_device_for_use(mca_btl_openib_device_t *device) init_data->order = qp; init_data->list = &device->qps[qp].send_free; - if(OMPI_SUCCESS != ompi_free_list_init_ex_new(init_data->list, + rc = ompi_free_list_init_ex_new(init_data->list, sizeof(mca_btl_openib_send_frag_t), CACHE_LINE_SIZE, OBJ_CLASS(mca_btl_openib_send_frag_t), length, mca_btl_openib_component.buffer_alignment, @@ -1018,7 +1027,18 @@ static int prepare_device_for_use(mca_btl_openib_device_t *device) mca_btl_openib_component.ib_free_list_max, mca_btl_openib_component.ib_free_list_inc, device->mpool, mca_btl_openib_frag_init, - init_data)) { + init_data); + if (OMPI_SUCCESS != rc) { + /* If we're "out of memory", this usually means that we + ran out of registered memory, so show that error + message */ + if (OMPI_ERR_OUT_OF_RESOURCE == rc || + OMPI_ERR_TEMP_OUT_OF_RESOURCE == rc) { + errno = ENOMEM; + mca_btl_openib_show_init_error(__FILE__, __LINE__, + "ompi_free_list_init_ex_new", + ibv_get_device_name(device->ib_dev)); + } return OMPI_ERROR; }