1
1

Move some of the things I did in r18762 to the openib BTL proper (in
endpoint.c) because they are almost identical in all the CPCs.  OOB,
XOOB, and IBCM now all invoke the BTL error handler properly if
there's an error during wireup.  RDMACM still needs to be done.

This commit was SVN r18764.

The following SVN revision numbers were found above:
  r18762 --> open-mpi/ompi@3eda04578f
Этот коммит содержится в:
Jeff Squyres 2008-06-27 22:48:45 +00:00
родитель 3eda04578f
Коммит 67933e743c
5 изменённых файлов: 80 добавлений и 62 удалений

Просмотреть файл

@ -924,3 +924,39 @@ unlock_rdma_local:
endpoint->eager_rdma_local.base.pval, NULL);
endpoint->eager_rdma_local.frags = NULL;
}
/*
 * Invoke a fatal error on the BTL associated with an endpoint.
 *
 * context: an mca_btl_openib_endpoint_t pointer passed as void*, or
 *          NULL.  When NULL, the first non-NULL entry in the
 *          component's openib_btls array is used instead.
 *
 * If no usable BTL can be found (no BTL at all, or the BTL has no
 * error callback registered), a help message is shown and the
 * process exits with status 1.
 *
 * Returns NULL (only if the upper-layer error callback returns).
 */
void *mca_btl_openib_endpoint_invoke_error(void *context)
{
    mca_btl_openib_endpoint_t *endpoint = (mca_btl_openib_endpoint_t*) context;
    /* Must start out NULL: if the search below finds nothing, the
       "didn't find a BTL" check would otherwise read an
       uninitialized pointer. */
    mca_btl_openib_module_t *btl = NULL;

    if (NULL == endpoint) {
        int i;
        /* No endpoint given; fall back to the first BTL we can find */
        for (i = 0; i < mca_btl_openib_component.ib_num_btls; ++i) {
            if (NULL != mca_btl_openib_component.openib_btls[i]) {
                btl = mca_btl_openib_component.openib_btls[i];
                break;
            }
        }
    } else {
        btl = endpoint->endpoint_btl;
    }

    /* If we didn't find a BTL (or it has no error callback to
       invoke), then just bail :-( */
    /* NOTE(review): confirm the argument list below matches
       orte_show_help()'s prototype. */
    if (NULL == btl || NULL == btl->error_cb) {
        orte_show_help("help-mpi-btl-openib.txt",
                       "cannot raise btl error", orte_process_info.nodename);
        exit(1);
    }

    /* Invoke the callback to the upper layer */
    btl->error_cb(&(btl->super), MCA_BTL_ERROR_FLAGS_FATAL);

    /* Will likely never get here */
    return NULL;
}

Просмотреть файл

@ -251,6 +251,13 @@ void mca_btl_openib_endpoint_init(mca_btl_openib_module_t*,
struct mca_btl_openib_proc_modex_t *remote_proc_info,
ompi_btl_openib_connect_base_module_data_t *remote_cpc_data);
/*
 * Invoke an error on the btl associated with an endpoint.  If we
 * don't have an endpoint (i.e., NULL is passed), then just use the
 * first one on the component list of BTLs.
 *
 * The argument is taken as a void* (it is cast to an
 * mca_btl_openib_endpoint_t* internally) and the function returns
 * NULL, so that it can also be handed directly to
 * ompi_btl_openib_fd_schedule() as a callback.
 */
void *mca_btl_openib_endpoint_invoke_error(void *endpoint);
static inline int post_recvs(mca_btl_base_endpoint_t *ep, const int qp,
const int num_post)
{

Просмотреть файл

@ -525,8 +525,6 @@ static uint32_t ibcm_pid;
static opal_list_t ibcm_cm_listeners;
static opal_list_t ibcm_pending_requests;
static opal_list_t ibcm_pending_replies;
#define MAX_LAST_RESORT_BTL 10
mca_btl_openib_module_t *last_resort_btl[MAX_LAST_RESORT_BTL];
/*******************************************************************
* Component
@ -569,7 +567,7 @@ static void ibcm_component_register(void)
static int ibcm_component_query(mca_btl_openib_module_t *btl,
ompi_btl_openib_connect_base_module_t **cpc)
{
int rc, i;
int rc;
modex_msg_t *msg;
ibcm_module_t *m = NULL;
opal_list_item_t *item;
@ -602,19 +600,9 @@ static int ibcm_component_query(mca_btl_openib_module_t *btl,
OBJ_CONSTRUCT(&ibcm_cm_listeners, opal_list_t);
OBJ_CONSTRUCT(&ibcm_pending_requests, opal_list_t);
OBJ_CONSTRUCT(&ibcm_pending_replies, opal_list_t);
memset(last_resort_btl, 0, sizeof(last_resort_btl));
initialized = true;
}
/* Cache this for some situations where we can't find a BTL. If
it's already full, that's fine. */
for (i = 0; i < MAX_LAST_RESORT_BTL; ++i) {
if (NULL == last_resort_btl[i]) {
last_resort_btl[i] = NULL;
break;
}
}
/* Allocate the module struct. Use calloc so that it's safe to
finalize the module if something goes wrong. */
m = calloc(1, sizeof(*m) + sizeof(*msg));
@ -1465,17 +1453,8 @@ static int ibcm_endpoint_finalize(struct mca_btl_base_endpoint_t *endpoint)
static int ibcm_module_finalize(mca_btl_openib_module_t *btl,
ompi_btl_openib_connect_base_module_t *cpc)
{
int i;
ibcm_module_t *m = (ibcm_module_t *) cpc;
/* Remove this BTL from the last_resort_btl array */
for (i = 0; i < MAX_LAST_RESORT_BTL; ++i) {
if (btl == last_resort_btl[i]) {
last_resort_btl[i] = NULL;
break;
}
}
/* If we previously successfully initialized, then destroy
everything */
if (NULL != m && NULL != m->cmh) {
@ -1636,40 +1615,6 @@ static void *callback_start_connect(void *context)
return NULL;
}
/*
 * Callback (from main thread) run when an endpoint's connection has
 * failed.  Raises a fatal error on the endpoint's BTL; when no
 * endpoint is supplied (context is NULL), a BTL cached in
 * last_resort_btl[] is used instead.  Exits the process if no BTL
 * can be found at all.  Always returns NULL.
 */
static void *callback_connection_failed(void *context)
{
    mca_btl_openib_endpoint_t *ep = (mca_btl_openib_endpoint_t*) context;
    mca_btl_openib_module_t *target = NULL;

    if (NULL == ep) {
        /* No endpoint to go by, so raise the error on *any* BTL we
           can find: pick the highest-indexed non-NULL cached entry. */
        int slot = MAX_LAST_RESORT_BTL;
        while (slot-- > 0) {
            if (NULL != last_resort_btl[slot]) {
                target = last_resort_btl[slot];
                break;
            }
        }
    } else {
        target = ep->endpoint_btl;
    }

    /* Nothing to raise the error on; all we can do is bail :-( */
    if (NULL == target) {
        orte_show_help("help-mpi-btl-openib-cpc-base.txt",
                       "cannot raise btl error", orte_process_info.nodename);
        exit(1);
    }

    /* Hand the fatal error to the upper layer */
    target->error_cb(&(target->super), MCA_BTL_ERROR_FLAGS_FATAL);
    return NULL;
}
/*
* Passive has received a connection request from a active
*/
@ -1951,7 +1896,8 @@ static int request_received(ibcm_listen_cm_id_t *cmh,
/* Communicate to the upper layer that the connection on this
endpoint has failed */
ompi_btl_openib_fd_schedule(callback_connection_failed, endpoint);
ompi_btl_openib_fd_schedule(mca_btl_openib_endpoint_invoke_error,
endpoint);
return rc;
}
@ -2067,7 +2013,8 @@ static int reply_received(ibcm_listen_cm_id_t *cmh, struct ib_cm_event *event)
error:
/* Communicate to the upper layer that the connection on this
endpoint has failed */
ompi_btl_openib_fd_schedule(callback_connection_failed, endpoint);
ompi_btl_openib_fd_schedule(mca_btl_openib_endpoint_invoke_error,
endpoint);
return rc;
}
@ -2113,7 +2060,8 @@ static int ready_to_use_received(ibcm_listen_cm_id_t *h,
error:
/* Communicate to the upper layer that the connection on this
endpoint has failed */
ompi_btl_openib_fd_schedule(callback_connection_failed, endpoint);
ompi_btl_openib_fd_schedule(mca_btl_openib_endpoint_invoke_error,
endpoint);
return rc;
}
@ -2198,7 +2146,7 @@ static int reject_received(ibcm_listen_cm_id_t *cmh, struct ib_cm_event *event)
reason));
/* Communicate to the upper layer that the connection on this
endpoint has failed */
ompi_btl_openib_fd_schedule(callback_connection_failed, NULL);
ompi_btl_openib_fd_schedule(mca_btl_openib_endpoint_invoke_error, NULL);
return OMPI_ERR_NOT_FOUND;
}
@ -2230,7 +2178,8 @@ static int request_error(ibcm_listen_cm_id_t *cmh, struct ib_cm_event *event)
/* Communicate to the upper layer that the connection on this
endpoint has failed */
ompi_btl_openib_fd_schedule(callback_connection_failed, endpoint);
ompi_btl_openib_fd_schedule(mca_btl_openib_endpoint_invoke_error,
endpoint);
return OMPI_SUCCESS;
}
@ -2261,7 +2210,8 @@ static int reply_error(ibcm_listen_cm_id_t *cmh, struct ib_cm_event *event)
/* Communicate to the upper layer that the connection on this
endpoint has failed */
ompi_btl_openib_fd_schedule(callback_connection_failed, endpoint);
ompi_btl_openib_fd_schedule(mca_btl_openib_endpoint_invoke_error,
endpoint);
return OMPI_SUCCESS;
}

Просмотреть файл

@ -635,6 +635,7 @@ static void rml_recv_cb(int status, orte_process_name_t* process_name,
rc = opal_dss.unpack(buffer, &message_type, &cnt, OPAL_UINT8);
if (ORTE_SUCCESS != rc) {
ORTE_ERROR_LOG(rc);
mca_btl_openib_endpoint_invoke_error(NULL);
return;
}
@ -642,6 +643,7 @@ static void rml_recv_cb(int status, orte_process_name_t* process_name,
rc = opal_dss.unpack(buffer, &rem_info.rem_subnet_id, &cnt, OPAL_UINT64);
if (ORTE_SUCCESS != rc) {
ORTE_ERROR_LOG(rc);
mca_btl_openib_endpoint_invoke_error(NULL);
return;
}
@ -650,12 +652,14 @@ static void rml_recv_cb(int status, orte_process_name_t* process_name,
rc = opal_dss.unpack(buffer, &lcl_qp, &cnt, OPAL_UINT32);
if (ORTE_SUCCESS != rc) {
ORTE_ERROR_LOG(rc);
mca_btl_openib_endpoint_invoke_error(NULL);
return;
}
BTL_VERBOSE(("unpacking %d of %d\n", cnt, OPAL_UINT16));
rc = opal_dss.unpack(buffer, &lcl_lid, &cnt, OPAL_UINT16);
if (ORTE_SUCCESS != rc) {
ORTE_ERROR_LOG(rc);
mca_btl_openib_endpoint_invoke_error(NULL);
return;
}
}
@ -673,6 +677,7 @@ static void rml_recv_cb(int status, orte_process_name_t* process_name,
OPAL_UINT32);
if (ORTE_SUCCESS != rc) {
ORTE_ERROR_LOG(rc);
mca_btl_openib_endpoint_invoke_error(NULL);
return;
}
BTL_VERBOSE(("unpacking %d of %d\n", cnt, OPAL_UINT32));
@ -680,6 +685,7 @@ static void rml_recv_cb(int status, orte_process_name_t* process_name,
OPAL_UINT32);
if (ORTE_SUCCESS != rc) {
ORTE_ERROR_LOG(rc);
mca_btl_openib_endpoint_invoke_error(NULL);
return;
}
}
@ -688,18 +694,21 @@ static void rml_recv_cb(int status, orte_process_name_t* process_name,
rc = opal_dss.unpack(buffer, &rem_info.rem_lid, &cnt, OPAL_UINT16);
if (ORTE_SUCCESS != rc) {
ORTE_ERROR_LOG(rc);
mca_btl_openib_endpoint_invoke_error(NULL);
return;
}
BTL_VERBOSE(("unpacking %d of %d\n", cnt, OPAL_UINT32));
rc = opal_dss.unpack(buffer, &rem_info.rem_mtu, &cnt, OPAL_UINT32);
if (ORTE_SUCCESS != rc) {
ORTE_ERROR_LOG(rc);
mca_btl_openib_endpoint_invoke_error(NULL);
return;
}
BTL_VERBOSE(("unpacking %d of %d\n", cnt, OPAL_UINT32));
rc = opal_dss.unpack(buffer, &rem_info.rem_index, &cnt, OPAL_UINT32);
if (ORTE_SUCCESS != rc) {
ORTE_ERROR_LOG(rc);
mca_btl_openib_endpoint_invoke_error(NULL);
return;
}
}
@ -769,6 +778,7 @@ static void rml_recv_cb(int status, orte_process_name_t* process_name,
if (!found) {
BTL_ERROR(("can't find suitable endpoint for this peer\n"));
mca_btl_openib_endpoint_invoke_error(NULL);
return;
}
@ -791,6 +801,7 @@ static void rml_recv_cb(int status, orte_process_name_t* process_name,
if (OMPI_SUCCESS != rc) {
BTL_ERROR(("error in endpoint reply start connect"));
mca_btl_openib_endpoint_invoke_error(ib_endpoint);
break;
}
@ -805,6 +816,7 @@ static void rml_recv_cb(int status, orte_process_name_t* process_name,
set_remote_info(ib_endpoint, &rem_info);
if (OMPI_SUCCESS != (rc = qp_connect_all(ib_endpoint))) {
BTL_ERROR(("endpoint connect error: %d", rc));
mca_btl_openib_endpoint_invoke_error(ib_endpoint);
break;
}
@ -836,6 +848,7 @@ static void rml_recv_cb(int status, orte_process_name_t* process_name,
default :
BTL_ERROR(("Invalid endpoint state %d", endpoint_state));
mca_btl_openib_endpoint_invoke_error(ib_endpoint);
}
OPAL_THREAD_UNLOCK(&ib_endpoint->endpoint_lock);
break;

Просмотреть файл

@ -786,6 +786,7 @@ static void xoob_rml_recv_cb(int status, orte_process_name_t* process_name,
/* Get data. */
if ( OMPI_SUCCESS != xoob_receive_connect_data(&rem_info, &requested_lid, &message_type, buffer)) {
BTL_ERROR(("Failed to read data\n"));
mca_btl_openib_endpoint_invoke_error(NULL);
return;
}
@ -802,18 +803,21 @@ static void xoob_rml_recv_cb(int status, orte_process_name_t* process_name,
BTL_ERROR(("Got ENDPOINT_XOOB_CONNECT_REQUEST."
" Failed to find endpoint with subnet %d and LID %d",
rem_info.rem_subnet_id,requested_lid));
mca_btl_openib_endpoint_invoke_error(NULL);
return;
}
OPAL_THREAD_LOCK(&ib_endpoint->endpoint_lock);
/* prepost data on receiver site */
if (OMPI_SUCCESS != mca_btl_openib_endpoint_post_recvs(ib_endpoint)) {
BTL_ERROR(("Failed to post on XRC SRQs"));
mca_btl_openib_endpoint_invoke_error(NULL);
return;
}
/* we should create qp and send the info + srq to requestor */
rc = xoob_reply_first_connect(ib_endpoint, &rem_info);
if (OMPI_SUCCESS != rc) {
BTL_ERROR(("error in endpoint reply start connect"));
mca_btl_openib_endpoint_invoke_error(NULL);
return;
}
/* enable pooling for this btl */
@ -830,17 +834,20 @@ static void xoob_rml_recv_cb(int status, orte_process_name_t* process_name,
BTL_ERROR(("Got ENDPOINT_XOOB_CONNECT_XRC_REQUEST."
" Failed to find endpoint with subnet %d and LID %d",
rem_info.rem_subnet_id,requested_lid));
mca_btl_openib_endpoint_invoke_error(NULL);
return;
}
if (OMPI_SUCCESS == xoob_recv_qp_connect(ib_endpoint, &rem_info)) {
if (OMPI_SUCCESS != mca_btl_openib_endpoint_post_recvs(ib_endpoint)) {
BTL_ERROR(("Failed to post on XRC SRQs"));
mca_btl_openib_endpoint_invoke_error(ib_endpoint);
return;
}
OPAL_THREAD_LOCK(&ib_endpoint->endpoint_lock);
rc = xoob_send_connect_data(ib_endpoint, ENDPOINT_XOOB_CONNECT_XRC_RESPONSE);
if (OMPI_SUCCESS != rc) {
BTL_ERROR(("error in endpoint reply start connect"));
mca_btl_openib_endpoint_invoke_error(ib_endpoint);
return;
}
OPAL_THREAD_UNLOCK(&ib_endpoint->endpoint_lock);
@ -850,6 +857,7 @@ static void xoob_rml_recv_cb(int status, orte_process_name_t* process_name,
rc = xoob_send_connect_data(ib_endpoint, ENDPOINT_XOOB_CONNECT_XRC_NR_RESPONSE);
if (OMPI_SUCCESS != rc) {
BTL_ERROR(("error in endpoint reply start connect"));
mca_btl_openib_endpoint_invoke_error(ib_endpoint);
return;
}
OPAL_THREAD_UNLOCK(&ib_endpoint->endpoint_lock);
@ -866,6 +874,7 @@ static void xoob_rml_recv_cb(int status, orte_process_name_t* process_name,
BTL_ERROR(("Got ENDPOINT_XOOB_CONNECT_RESPONSE."
" Failed to find endpoint with subnet %d and LID %d",
rem_info.rem_subnet_id,rem_info.rem_lid));
mca_btl_openib_endpoint_invoke_error(NULL);
return;
}
OPAL_THREAD_LOCK(&ib_endpoint->endpoint_lock);
@ -880,6 +889,7 @@ static void xoob_rml_recv_cb(int status, orte_process_name_t* process_name,
ib_endpoint->rem_info.rem_lid,ib_endpoint->rem_info.rem_subnet_id));
if (OMPI_SUCCESS != xoob_send_qp_connect(ib_endpoint, &rem_info)) {
BTL_ERROR(("Failed to connect endpoint\n"));
mca_btl_openib_endpoint_invoke_error(NULL);
return;
}
mca_btl_openib_endpoint_connected(ib_endpoint);
@ -895,6 +905,7 @@ static void xoob_rml_recv_cb(int status, orte_process_name_t* process_name,
BTL_ERROR(("Got ENDPOINT_XOOB_CONNECT_XRC_RESPONSE."
" Failed to find endpoint with subnet %d and LID %d",
rem_info.rem_subnet_id,rem_info.rem_lid));
mca_btl_openib_endpoint_invoke_error(NULL);
return;
}
OPAL_THREAD_LOCK(&ib_endpoint->endpoint_lock);
@ -915,6 +926,7 @@ static void xoob_rml_recv_cb(int status, orte_process_name_t* process_name,
BTL_ERROR(("Got ENDPOINT_XOOB_CONNECT_XRC_NR_RESPONSE."
" Failed to find endpoint with subnet %d and LID %d",
rem_info.rem_subnet_id,rem_info.rem_lid));
mca_btl_openib_endpoint_invoke_error(NULL);
return;
}
xoob_restart_connect(ib_endpoint);