diff --git a/ompi/mca/btl/openib/btl_openib_endpoint.c b/ompi/mca/btl/openib/btl_openib_endpoint.c index fa0c11299d..ee935cab8f 100644 --- a/ompi/mca/btl/openib/btl_openib_endpoint.c +++ b/ompi/mca/btl/openib/btl_openib_endpoint.c @@ -924,3 +924,41 @@ unlock_rdma_local: endpoint->eager_rdma_local.base.pval, NULL); endpoint->eager_rdma_local.frags = NULL; } + +/* + * Invoke an error on the btl associated with an endpoint. If we + * don't have an endpoint, then just use the first one on the + * component list of BTLs. If no BTL can be found at all, show a + * help message and exit(1). The void* in / void* out signature lets + * this be handed directly to ompi_btl_openib_fd_schedule(). + */ +void *mca_btl_openib_endpoint_invoke_error(void *context) +{ + mca_btl_openib_endpoint_t *endpoint = (mca_btl_openib_endpoint_t*) context; + mca_btl_openib_module_t *btl = NULL; + + if (NULL == endpoint) { + int i; + for (i = 0; i < mca_btl_openib_component.ib_num_btls; ++i) { + if (NULL != mca_btl_openib_component.openib_btls[i]) { + btl = mca_btl_openib_component.openib_btls[i]; + break; + } + } + } else { + btl = endpoint->endpoint_btl; + } + + /* If we didn't find a BTL, then just bail :-( */ + if (NULL == btl) { + orte_show_help("help-mpi-btl-openib.txt", + "cannot raise btl error", orte_process_info.nodename); + exit(1); + } + + /* Invoke the callback to the upper layer */ + btl->error_cb(&(btl->super), MCA_BTL_ERROR_FLAGS_FATAL); + + /* Will likely never get here */ + return NULL; +} diff --git a/ompi/mca/btl/openib/btl_openib_endpoint.h b/ompi/mca/btl/openib/btl_openib_endpoint.h index 4e4b0500a4..bb4761de57 100644 --- a/ompi/mca/btl/openib/btl_openib_endpoint.h +++ b/ompi/mca/btl/openib/btl_openib_endpoint.h @@ -251,6 +251,13 @@ void mca_btl_openib_endpoint_init(mca_btl_openib_module_t*, struct mca_btl_openib_proc_modex_t *remote_proc_info, ompi_btl_openib_connect_base_module_data_t *remote_cpc_data); +/* + * Invoke an error on the btl associated with an endpoint. If we + * don't have an endpoint, then just use the first one on the + * component list of BTLs. 
+ */ +void *mca_btl_openib_endpoint_invoke_error(void *endpoint); + static inline int post_recvs(mca_btl_base_endpoint_t *ep, const int qp, const int num_post) { diff --git a/ompi/mca/btl/openib/connect/btl_openib_connect_ibcm.c b/ompi/mca/btl/openib/connect/btl_openib_connect_ibcm.c index a5f8949f4a..5bdabeac28 100644 --- a/ompi/mca/btl/openib/connect/btl_openib_connect_ibcm.c +++ b/ompi/mca/btl/openib/connect/btl_openib_connect_ibcm.c @@ -525,8 +525,6 @@ static uint32_t ibcm_pid; static opal_list_t ibcm_cm_listeners; static opal_list_t ibcm_pending_requests; static opal_list_t ibcm_pending_replies; -#define MAX_LAST_RESORT_BTL 10 -mca_btl_openib_module_t *last_resort_btl[MAX_LAST_RESORT_BTL]; /******************************************************************* * Component @@ -569,7 +567,7 @@ static void ibcm_component_register(void) static int ibcm_component_query(mca_btl_openib_module_t *btl, ompi_btl_openib_connect_base_module_t **cpc) { - int rc, i; + int rc; modex_msg_t *msg; ibcm_module_t *m = NULL; opal_list_item_t *item; @@ -602,19 +600,9 @@ static int ibcm_component_query(mca_btl_openib_module_t *btl, OBJ_CONSTRUCT(&ibcm_cm_listeners, opal_list_t); OBJ_CONSTRUCT(&ibcm_pending_requests, opal_list_t); OBJ_CONSTRUCT(&ibcm_pending_replies, opal_list_t); - memset(last_resort_btl, 0, sizeof(last_resort_btl)); initialized = true; } - /* Cache this for some situations where we can't find a BTL. If - it's already full, that's fine. */ - for (i = 0; i < MAX_LAST_RESORT_BTL; ++i) { - if (NULL == last_resort_btl[i]) { - last_resort_btl[i] = NULL; - break; - } - } - /* Allocate the module struct. Use calloc so that it's safe to finalize the module if something goes wrong. 
*/ m = calloc(1, sizeof(*m) + sizeof(*msg)); @@ -1465,17 +1453,8 @@ static int ibcm_endpoint_finalize(struct mca_btl_base_endpoint_t *endpoint) static int ibcm_module_finalize(mca_btl_openib_module_t *btl, ompi_btl_openib_connect_base_module_t *cpc) { - int i; ibcm_module_t *m = (ibcm_module_t *) cpc; - /* Remove this BTL from the last_resort_btl array */ - for (i = 0; i < MAX_LAST_RESORT_BTL; ++i) { - if (btl == last_resort_btl[i]) { - last_resort_btl[i] = NULL; - break; - } - } - /* If we previously successfully initialized, then destroy everything */ if (NULL != m && NULL != m->cmh) { @@ -1636,40 +1615,6 @@ static void *callback_start_connect(void *context) return NULL; } -/* - * Callback (from main thread) when the endpoint connection has failed - */ -static void *callback_connection_failed(void *context) -{ - mca_btl_openib_endpoint_t *endpoint = (mca_btl_openib_endpoint_t*) context; - mca_btl_openib_module_t *btl = NULL; - - if (NULL != endpoint) { - btl = endpoint->endpoint_btl; - } else { - /* If we don't have/couldn't find a matching BTL to raise an - error, then raise an error on *any* BTL that we can find. 
*/ - int i; - for (i = 0; i < MAX_LAST_RESORT_BTL; ++i) { - if (NULL != last_resort_btl[i]) { - btl = last_resort_btl[i]; - } - } - } - - /* If we didn't find a BTL, then just bail :-( */ - if (NULL == btl) { - orte_show_help("help-mpi-btl-openib-cpc-base.txt", - "cannot raise btl error", orte_process_info.nodename); - exit(1); - } - - /* Invoke the callback to the upper layer */ - btl->error_cb(&(btl->super), MCA_BTL_ERROR_FLAGS_FATAL); - - return NULL; -} - /* * Passive has received a connection request from a active */ @@ -1951,7 +1896,8 @@ static int request_received(ibcm_listen_cm_id_t *cmh, /* Communicate to the upper layer that the connection on this endpoint has failed */ - ompi_btl_openib_fd_schedule(callback_connection_failed, endpoint); + ompi_btl_openib_fd_schedule(mca_btl_openib_endpoint_invoke_error, + endpoint); return rc; } @@ -2067,7 +2013,8 @@ static int reply_received(ibcm_listen_cm_id_t *cmh, struct ib_cm_event *event) error: /* Communicate to the upper layer that the connection on this endpoint has failed */ - ompi_btl_openib_fd_schedule(callback_connection_failed, endpoint); + ompi_btl_openib_fd_schedule(mca_btl_openib_endpoint_invoke_error, + endpoint); return rc; } @@ -2113,7 +2060,8 @@ static int ready_to_use_received(ibcm_listen_cm_id_t *h, error: /* Communicate to the upper layer that the connection on this endpoint has failed */ - ompi_btl_openib_fd_schedule(callback_connection_failed, endpoint); + ompi_btl_openib_fd_schedule(mca_btl_openib_endpoint_invoke_error, + endpoint); return rc; } @@ -2198,7 +2146,7 @@ static int reject_received(ibcm_listen_cm_id_t *cmh, struct ib_cm_event *event) reason)); /* Communicate to the upper layer that the connection on this endpoint has failed */ - ompi_btl_openib_fd_schedule(callback_connection_failed, NULL); + ompi_btl_openib_fd_schedule(mca_btl_openib_endpoint_invoke_error, NULL); return OMPI_ERR_NOT_FOUND; } @@ -2230,7 +2178,8 @@ static int request_error(ibcm_listen_cm_id_t *cmh, struct ib_cm_event 
*event) /* Communicate to the upper layer that the connection on this endpoint has failed */ - ompi_btl_openib_fd_schedule(callback_connection_failed, endpoint); + ompi_btl_openib_fd_schedule(mca_btl_openib_endpoint_invoke_error, + endpoint); return OMPI_SUCCESS; } @@ -2261,7 +2210,8 @@ static int reply_error(ibcm_listen_cm_id_t *cmh, struct ib_cm_event *event) /* Communicate to the upper layer that the connection on this endpoint has failed */ - ompi_btl_openib_fd_schedule(callback_connection_failed, endpoint); + ompi_btl_openib_fd_schedule(mca_btl_openib_endpoint_invoke_error, + endpoint); return OMPI_SUCCESS; } diff --git a/ompi/mca/btl/openib/connect/btl_openib_connect_oob.c b/ompi/mca/btl/openib/connect/btl_openib_connect_oob.c index 27de8e8ff4..cc14f68987 100644 --- a/ompi/mca/btl/openib/connect/btl_openib_connect_oob.c +++ b/ompi/mca/btl/openib/connect/btl_openib_connect_oob.c @@ -635,6 +635,7 @@ static void rml_recv_cb(int status, orte_process_name_t* process_name, rc = opal_dss.unpack(buffer, &message_type, &cnt, OPAL_UINT8); if (ORTE_SUCCESS != rc) { ORTE_ERROR_LOG(rc); + mca_btl_openib_endpoint_invoke_error(NULL); return; } @@ -642,6 +643,7 @@ static void rml_recv_cb(int status, orte_process_name_t* process_name, rc = opal_dss.unpack(buffer, &rem_info.rem_subnet_id, &cnt, OPAL_UINT64); if (ORTE_SUCCESS != rc) { ORTE_ERROR_LOG(rc); + mca_btl_openib_endpoint_invoke_error(NULL); return; } @@ -650,12 +652,14 @@ static void rml_recv_cb(int status, orte_process_name_t* process_name, rc = opal_dss.unpack(buffer, &lcl_qp, &cnt, OPAL_UINT32); if (ORTE_SUCCESS != rc) { ORTE_ERROR_LOG(rc); + mca_btl_openib_endpoint_invoke_error(NULL); return; } BTL_VERBOSE(("unpacking %d of %d\n", cnt, OPAL_UINT16)); rc = opal_dss.unpack(buffer, &lcl_lid, &cnt, OPAL_UINT16); if (ORTE_SUCCESS != rc) { ORTE_ERROR_LOG(rc); + mca_btl_openib_endpoint_invoke_error(NULL); return; } } @@ -673,6 +677,7 @@ static void rml_recv_cb(int status, orte_process_name_t* process_name, OPAL_UINT32); 
if (ORTE_SUCCESS != rc) { ORTE_ERROR_LOG(rc); + mca_btl_openib_endpoint_invoke_error(NULL); return; } BTL_VERBOSE(("unpacking %d of %d\n", cnt, OPAL_UINT32)); @@ -680,6 +685,7 @@ static void rml_recv_cb(int status, orte_process_name_t* process_name, OPAL_UINT32); if (ORTE_SUCCESS != rc) { ORTE_ERROR_LOG(rc); + mca_btl_openib_endpoint_invoke_error(NULL); return; } } @@ -688,18 +694,21 @@ static void rml_recv_cb(int status, orte_process_name_t* process_name, rc = opal_dss.unpack(buffer, &rem_info.rem_lid, &cnt, OPAL_UINT16); if (ORTE_SUCCESS != rc) { ORTE_ERROR_LOG(rc); + mca_btl_openib_endpoint_invoke_error(NULL); return; } BTL_VERBOSE(("unpacking %d of %d\n", cnt, OPAL_UINT32)); rc = opal_dss.unpack(buffer, &rem_info.rem_mtu, &cnt, OPAL_UINT32); if (ORTE_SUCCESS != rc) { ORTE_ERROR_LOG(rc); + mca_btl_openib_endpoint_invoke_error(NULL); return; } BTL_VERBOSE(("unpacking %d of %d\n", cnt, OPAL_UINT32)); rc = opal_dss.unpack(buffer, &rem_info.rem_index, &cnt, OPAL_UINT32); if (ORTE_SUCCESS != rc) { ORTE_ERROR_LOG(rc); + mca_btl_openib_endpoint_invoke_error(NULL); return; } } @@ -769,6 +778,7 @@ static void rml_recv_cb(int status, orte_process_name_t* process_name, if (!found) { BTL_ERROR(("can't find suitable endpoint for this peer\n")); + mca_btl_openib_endpoint_invoke_error(NULL); return; } @@ -791,6 +801,7 @@ static void rml_recv_cb(int status, orte_process_name_t* process_name, if (OMPI_SUCCESS != rc) { BTL_ERROR(("error in endpoint reply start connect")); + mca_btl_openib_endpoint_invoke_error(ib_endpoint); break; } @@ -805,6 +816,7 @@ static void rml_recv_cb(int status, orte_process_name_t* process_name, set_remote_info(ib_endpoint, &rem_info); if (OMPI_SUCCESS != (rc = qp_connect_all(ib_endpoint))) { BTL_ERROR(("endpoint connect error: %d", rc)); + mca_btl_openib_endpoint_invoke_error(ib_endpoint); break; } @@ -836,6 +848,7 @@ static void rml_recv_cb(int status, orte_process_name_t* process_name, default : BTL_ERROR(("Invalid endpoint state %d", 
endpoint_state)); + mca_btl_openib_endpoint_invoke_error(ib_endpoint); } OPAL_THREAD_UNLOCK(&ib_endpoint->endpoint_lock); break; diff --git a/ompi/mca/btl/openib/connect/btl_openib_connect_xoob.c b/ompi/mca/btl/openib/connect/btl_openib_connect_xoob.c index 6db068e005..e5a17cb7f5 100644 --- a/ompi/mca/btl/openib/connect/btl_openib_connect_xoob.c +++ b/ompi/mca/btl/openib/connect/btl_openib_connect_xoob.c @@ -786,6 +786,7 @@ static void xoob_rml_recv_cb(int status, orte_process_name_t* process_name, /* Get data. */ if ( OMPI_SUCCESS != xoob_receive_connect_data(&rem_info, &requested_lid, &message_type, buffer)) { BTL_ERROR(("Failed to read data\n")); + mca_btl_openib_endpoint_invoke_error(NULL); return; } @@ -802,18 +803,21 @@ static void xoob_rml_recv_cb(int status, orte_process_name_t* process_name, BTL_ERROR(("Got ENDPOINT_XOOB_CONNECT_REQUEST." " Failed to find endpoint with subnet %d and LID %d", rem_info.rem_subnet_id,requested_lid)); + mca_btl_openib_endpoint_invoke_error(NULL); return; } OPAL_THREAD_LOCK(&ib_endpoint->endpoint_lock); /* prepost data on receiver site */ if (OMPI_SUCCESS != mca_btl_openib_endpoint_post_recvs(ib_endpoint)) { BTL_ERROR(("Failed to post on XRC SRQs")); + mca_btl_openib_endpoint_invoke_error(NULL); return; } /* we should create qp and send the info + srq to requestor */ rc = xoob_reply_first_connect(ib_endpoint, &rem_info); if (OMPI_SUCCESS != rc) { BTL_ERROR(("error in endpoint reply start connect")); + mca_btl_openib_endpoint_invoke_error(NULL); return; } /* enable pooling for this btl */ @@ -830,17 +834,20 @@ static void xoob_rml_recv_cb(int status, orte_process_name_t* process_name, BTL_ERROR(("Got ENDPOINT_XOOB_CONNECT_XRC_REQUEST." 
" Failed to find endpoint with subnet %d and LID %d", rem_info.rem_subnet_id,requested_lid)); + mca_btl_openib_endpoint_invoke_error(NULL); return; } if (OMPI_SUCCESS == xoob_recv_qp_connect(ib_endpoint, &rem_info)) { if (OMPI_SUCCESS != mca_btl_openib_endpoint_post_recvs(ib_endpoint)) { BTL_ERROR(("Failed to post on XRC SRQs")); + mca_btl_openib_endpoint_invoke_error(ib_endpoint); return; } OPAL_THREAD_LOCK(&ib_endpoint->endpoint_lock); rc = xoob_send_connect_data(ib_endpoint, ENDPOINT_XOOB_CONNECT_XRC_RESPONSE); if (OMPI_SUCCESS != rc) { BTL_ERROR(("error in endpoint reply start connect")); + mca_btl_openib_endpoint_invoke_error(ib_endpoint); return; } OPAL_THREAD_UNLOCK(&ib_endpoint->endpoint_lock); @@ -850,6 +857,7 @@ static void xoob_rml_recv_cb(int status, orte_process_name_t* process_name, rc = xoob_send_connect_data(ib_endpoint, ENDPOINT_XOOB_CONNECT_XRC_NR_RESPONSE); if (OMPI_SUCCESS != rc) { BTL_ERROR(("error in endpoint reply start connect")); + mca_btl_openib_endpoint_invoke_error(ib_endpoint); return; } OPAL_THREAD_UNLOCK(&ib_endpoint->endpoint_lock); @@ -866,6 +874,7 @@ static void xoob_rml_recv_cb(int status, orte_process_name_t* process_name, BTL_ERROR(("Got ENDPOINT_XOOB_CONNECT_RESPONSE." " Failed to find endpoint with subnet %d and LID %d", rem_info.rem_subnet_id,rem_info.rem_lid)); + mca_btl_openib_endpoint_invoke_error(NULL); return; } OPAL_THREAD_LOCK(&ib_endpoint->endpoint_lock); @@ -880,6 +889,7 @@ static void xoob_rml_recv_cb(int status, orte_process_name_t* process_name, ib_endpoint->rem_info.rem_lid,ib_endpoint->rem_info.rem_subnet_id)); if (OMPI_SUCCESS != xoob_send_qp_connect(ib_endpoint, &rem_info)) { BTL_ERROR(("Failed to connect endpoint\n")); + mca_btl_openib_endpoint_invoke_error(NULL); return; } mca_btl_openib_endpoint_connected(ib_endpoint); @@ -895,6 +905,7 @@ static void xoob_rml_recv_cb(int status, orte_process_name_t* process_name, BTL_ERROR(("Got ENDPOINT_XOOB_CONNECT_XRC_RESPONSE." 
" Failed to find endpoint with subnet %d and LID %d", rem_info.rem_subnet_id,rem_info.rem_lid)); + mca_btl_openib_endpoint_invoke_error(NULL); return; } OPAL_THREAD_LOCK(&ib_endpoint->endpoint_lock); @@ -915,6 +926,7 @@ static void xoob_rml_recv_cb(int status, orte_process_name_t* process_name, BTL_ERROR(("Got ENDPOINT_XOOB_CONNECT_XRC_NR_RESPONSE." " Failed to find endpoint with subnet %d and LID %d", rem_info.rem_subnet_id,rem_info.rem_lid)); + mca_btl_openib_endpoint_invoke_error(NULL); return; } xoob_restart_connect(ib_endpoint);