Fix a thread deadlock in the openib BTL connection flow (MPI-thread-enabled mode)
This commit was SVN r21793.
Этот коммит содержится в:
родитель
51b2cfe40d
Коммит
31a88b149a
@ -650,6 +650,23 @@ void mca_btl_openib_endpoint_connected(mca_btl_openib_endpoint_t *endpoint)
|
||||
event trigger. */
|
||||
opal_progress_event_users_decrement();
|
||||
|
||||
if(MCA_BTL_XRC_ENABLED) {
|
||||
while(master && !opal_list_is_empty(&endpoint->ib_addr->pending_ep)) {
|
||||
ep_item = opal_list_remove_first(&endpoint->ib_addr->pending_ep);
|
||||
ep = (mca_btl_openib_endpoint_t *)ep_item;
|
||||
if (OMPI_SUCCESS !=
|
||||
ompi_btl_openib_connect_base_start(endpoint->endpoint_local_cpc,
|
||||
ep)) {
|
||||
BTL_ERROR(("Failed to connect pending endpoint\n"));
|
||||
}
|
||||
}
|
||||
OPAL_THREAD_UNLOCK(&endpoint->ib_addr->addr_lock);
|
||||
}
|
||||
|
||||
|
||||
OPAL_THREAD_UNLOCK(&endpoint->endpoint_lock);
|
||||
/* Process pending packet on the endpoint */
|
||||
|
||||
/* While there are frags in the list, process them */
|
||||
while (!opal_list_is_empty(&(endpoint->pending_lazy_frags))) {
|
||||
frag_item = opal_list_remove_first(&(endpoint->pending_lazy_frags));
|
||||
@ -664,19 +681,6 @@ void mca_btl_openib_endpoint_connected(mca_btl_openib_endpoint_t *endpoint)
|
||||
* state then we restart them here */
|
||||
mca_btl_openib_frag_progress_pending_put_get(endpoint,
|
||||
mca_btl_openib_component.rdma_qp);
|
||||
|
||||
if(MCA_BTL_XRC_ENABLED) {
|
||||
while(master && !opal_list_is_empty(&endpoint->ib_addr->pending_ep)) {
|
||||
ep_item = opal_list_remove_first(&endpoint->ib_addr->pending_ep);
|
||||
ep = (mca_btl_openib_endpoint_t *)ep_item;
|
||||
if (OMPI_SUCCESS !=
|
||||
ompi_btl_openib_connect_base_start(endpoint->endpoint_local_cpc,
|
||||
ep)) {
|
||||
BTL_ERROR(("Failed to connect pending endpoint\n"));
|
||||
}
|
||||
}
|
||||
OPAL_THREAD_UNLOCK(&endpoint->ib_addr->addr_lock);
|
||||
}
|
||||
}
|
||||
|
||||
/*
|
||||
|
@ -810,6 +810,7 @@ static void rml_recv_cb(int status, orte_process_name_t* process_name,
|
||||
if (OMPI_SUCCESS != rc) {
|
||||
BTL_ERROR(("error in endpoint reply start connect"));
|
||||
mca_btl_openib_endpoint_invoke_error(ib_endpoint);
|
||||
OPAL_THREAD_UNLOCK(&ib_endpoint->endpoint_lock);
|
||||
break;
|
||||
}
|
||||
|
||||
@ -818,6 +819,7 @@ static void rml_recv_cb(int status, orte_process_name_t* process_name,
|
||||
RML events. Note: we increment it once per active
|
||||
connection. */
|
||||
opal_progress_event_users_increment();
|
||||
OPAL_THREAD_UNLOCK(&ib_endpoint->endpoint_lock);
|
||||
break;
|
||||
|
||||
case MCA_BTL_IB_CONNECTING :
|
||||
@ -825,6 +827,7 @@ static void rml_recv_cb(int status, orte_process_name_t* process_name,
|
||||
if (OMPI_SUCCESS != (rc = qp_connect_all(ib_endpoint))) {
|
||||
BTL_ERROR(("endpoint connect error: %d", rc));
|
||||
mca_btl_openib_endpoint_invoke_error(ib_endpoint);
|
||||
OPAL_THREAD_UNLOCK(&ib_endpoint->endpoint_lock);
|
||||
break;
|
||||
}
|
||||
|
||||
@ -833,32 +836,37 @@ static void rml_recv_cb(int status, orte_process_name_t* process_name,
|
||||
|
||||
/* Send him an ACK */
|
||||
send_connect_data(ib_endpoint, ENDPOINT_CONNECT_RESPONSE);
|
||||
OPAL_THREAD_UNLOCK(&ib_endpoint->endpoint_lock);
|
||||
} else {
|
||||
send_connect_data(ib_endpoint, ENDPOINT_CONNECT_ACK);
|
||||
/* Tell main BTL that we're done */
|
||||
mca_btl_openib_endpoint_cpc_complete(ib_endpoint);
|
||||
/* cpc_complete() unlocks the endpoint */
|
||||
}
|
||||
break;
|
||||
|
||||
case MCA_BTL_IB_WAITING_ACK:
|
||||
/* Tell main BTL that we're done */
|
||||
mca_btl_openib_endpoint_cpc_complete(ib_endpoint);
|
||||
/* cpc_complete() unlocks the endpoint */
|
||||
break;
|
||||
|
||||
case MCA_BTL_IB_CONNECT_ACK:
|
||||
send_connect_data(ib_endpoint, ENDPOINT_CONNECT_ACK);
|
||||
/* Tell main BTL that we're done */
|
||||
mca_btl_openib_endpoint_cpc_complete(ib_endpoint);
|
||||
/* cpc_complete() unlocks the endpoint */
|
||||
break;
|
||||
|
||||
case MCA_BTL_IB_CONNECTED:
|
||||
OPAL_THREAD_UNLOCK(&ib_endpoint->endpoint_lock);
|
||||
break;
|
||||
|
||||
default :
|
||||
BTL_ERROR(("Invalid endpoint state %d", endpoint_state));
|
||||
mca_btl_openib_endpoint_invoke_error(ib_endpoint);
|
||||
OPAL_THREAD_UNLOCK(&ib_endpoint->endpoint_lock);
|
||||
}
|
||||
OPAL_THREAD_UNLOCK(&ib_endpoint->endpoint_lock);
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
@ -816,6 +816,7 @@ static void xoob_rml_recv_cb(int status, orte_process_name_t* process_name,
|
||||
if (OMPI_SUCCESS != mca_btl_openib_endpoint_post_recvs(ib_endpoint)) {
|
||||
BTL_ERROR(("Failed to post on XRC SRQs"));
|
||||
mca_btl_openib_endpoint_invoke_error(NULL);
|
||||
OPAL_THREAD_UNLOCK(&ib_endpoint->endpoint_lock);
|
||||
return;
|
||||
}
|
||||
/* we should create qp and send the info + srq to requestor */
|
||||
@ -823,6 +824,7 @@ static void xoob_rml_recv_cb(int status, orte_process_name_t* process_name,
|
||||
if (OMPI_SUCCESS != rc) {
|
||||
BTL_ERROR(("error in endpoint reply start connect"));
|
||||
mca_btl_openib_endpoint_invoke_error(NULL);
|
||||
OPAL_THREAD_UNLOCK(&ib_endpoint->endpoint_lock);
|
||||
return;
|
||||
}
|
||||
/* enable polling for this btl */
|
||||
@ -853,6 +855,7 @@ static void xoob_rml_recv_cb(int status, orte_process_name_t* process_name,
|
||||
if (OMPI_SUCCESS != rc) {
|
||||
BTL_ERROR(("error in endpoint reply start connect"));
|
||||
mca_btl_openib_endpoint_invoke_error(ib_endpoint);
|
||||
OPAL_THREAD_UNLOCK(&ib_endpoint->endpoint_lock);
|
||||
return;
|
||||
}
|
||||
OPAL_THREAD_UNLOCK(&ib_endpoint->endpoint_lock);
|
||||
@ -863,6 +866,7 @@ static void xoob_rml_recv_cb(int status, orte_process_name_t* process_name,
|
||||
if (OMPI_SUCCESS != rc) {
|
||||
BTL_ERROR(("error in endpoint reply start connect"));
|
||||
mca_btl_openib_endpoint_invoke_error(ib_endpoint);
|
||||
OPAL_THREAD_UNLOCK(&ib_endpoint->endpoint_lock);
|
||||
return;
|
||||
}
|
||||
OPAL_THREAD_UNLOCK(&ib_endpoint->endpoint_lock);
|
||||
@ -897,10 +901,11 @@ static void xoob_rml_recv_cb(int status, orte_process_name_t* process_name,
|
||||
if (OMPI_SUCCESS != xoob_send_qp_connect(ib_endpoint, &rem_info)) {
|
||||
BTL_ERROR(("Failed to connect endpoint\n"));
|
||||
mca_btl_openib_endpoint_invoke_error(NULL);
|
||||
OPAL_THREAD_UNLOCK(&ib_endpoint->endpoint_lock);
|
||||
return;
|
||||
}
|
||||
mca_btl_openib_endpoint_cpc_complete(ib_endpoint);
|
||||
OPAL_THREAD_UNLOCK(&ib_endpoint->endpoint_lock);
|
||||
/* cpc_complete() unlocks the endpoint */
|
||||
break;
|
||||
case ENDPOINT_XOOB_CONNECT_XRC_RESPONSE:
|
||||
BTL_VERBOSE(("Received ENDPOINT_XOOB_CONNECT_XRC_RESPONSE: lid %d, sid %" PRIx64 "\n",
|
||||
@ -919,7 +924,7 @@ static void xoob_rml_recv_cb(int status, orte_process_name_t* process_name,
|
||||
/* we got srq numbers on our request */
|
||||
XOOB_SET_REMOTE_INFO(ib_endpoint->rem_info, rem_info);
|
||||
mca_btl_openib_endpoint_cpc_complete(ib_endpoint);
|
||||
OPAL_THREAD_UNLOCK(&ib_endpoint->endpoint_lock);
|
||||
/* cpc_complete() unlocks the endpoint */
|
||||
break;
|
||||
case ENDPOINT_XOOB_CONNECT_XRC_NR_RESPONSE:
|
||||
/* The XRC recv site already was destroyed so we need
|
||||
|
Загрузка…
x
Ссылка в новой задаче
Block a user