1
1

Fix a thread-deadlock in the connection flow of the openib BTL (MPI thread-enabled mode)

This commit was SVN r21793.
Этот коммит содержится в:
Pavel Shamis 2009-08-11 10:43:52 +00:00
родитель 51b2cfe40d
Коммит 31a88b149a
3 изменённых файлов: 33 добавлений и 16 удалений

Просмотреть файл

@ -650,6 +650,23 @@ void mca_btl_openib_endpoint_connected(mca_btl_openib_endpoint_t *endpoint)
event trigger. */
opal_progress_event_users_decrement();
if(MCA_BTL_XRC_ENABLED) {
while(master && !opal_list_is_empty(&endpoint->ib_addr->pending_ep)) {
ep_item = opal_list_remove_first(&endpoint->ib_addr->pending_ep);
ep = (mca_btl_openib_endpoint_t *)ep_item;
if (OMPI_SUCCESS !=
ompi_btl_openib_connect_base_start(endpoint->endpoint_local_cpc,
ep)) {
BTL_ERROR(("Failed to connect pending endpoint\n"));
}
}
OPAL_THREAD_UNLOCK(&endpoint->ib_addr->addr_lock);
}
OPAL_THREAD_UNLOCK(&endpoint->endpoint_lock);
/* Process pending packet on the endpoint */
/* While there are frags in the list, process them */
while (!opal_list_is_empty(&(endpoint->pending_lazy_frags))) {
frag_item = opal_list_remove_first(&(endpoint->pending_lazy_frags));
@ -664,19 +681,6 @@ void mca_btl_openib_endpoint_connected(mca_btl_openib_endpoint_t *endpoint)
* state then we restart them here */
mca_btl_openib_frag_progress_pending_put_get(endpoint,
mca_btl_openib_component.rdma_qp);
if(MCA_BTL_XRC_ENABLED) {
while(master && !opal_list_is_empty(&endpoint->ib_addr->pending_ep)) {
ep_item = opal_list_remove_first(&endpoint->ib_addr->pending_ep);
ep = (mca_btl_openib_endpoint_t *)ep_item;
if (OMPI_SUCCESS !=
ompi_btl_openib_connect_base_start(endpoint->endpoint_local_cpc,
ep)) {
BTL_ERROR(("Failed to connect pending endpoint\n"));
}
}
OPAL_THREAD_UNLOCK(&endpoint->ib_addr->addr_lock);
}
}
/*

Просмотреть файл

@ -810,6 +810,7 @@ static void rml_recv_cb(int status, orte_process_name_t* process_name,
if (OMPI_SUCCESS != rc) {
BTL_ERROR(("error in endpoint reply start connect"));
mca_btl_openib_endpoint_invoke_error(ib_endpoint);
OPAL_THREAD_UNLOCK(&ib_endpoint->endpoint_lock);
break;
}
@ -818,6 +819,7 @@ static void rml_recv_cb(int status, orte_process_name_t* process_name,
RML events. Note: we increment it once peer active
connection. */
opal_progress_event_users_increment();
OPAL_THREAD_UNLOCK(&ib_endpoint->endpoint_lock);
break;
case MCA_BTL_IB_CONNECTING :
@ -825,6 +827,7 @@ static void rml_recv_cb(int status, orte_process_name_t* process_name,
if (OMPI_SUCCESS != (rc = qp_connect_all(ib_endpoint))) {
BTL_ERROR(("endpoint connect error: %d", rc));
mca_btl_openib_endpoint_invoke_error(ib_endpoint);
OPAL_THREAD_UNLOCK(&ib_endpoint->endpoint_lock);
break;
}
@ -833,32 +836,37 @@ static void rml_recv_cb(int status, orte_process_name_t* process_name,
/* Send him an ACK */
send_connect_data(ib_endpoint, ENDPOINT_CONNECT_RESPONSE);
OPAL_THREAD_UNLOCK(&ib_endpoint->endpoint_lock);
} else {
send_connect_data(ib_endpoint, ENDPOINT_CONNECT_ACK);
/* Tell main BTL that we're done */
mca_btl_openib_endpoint_cpc_complete(ib_endpoint);
/* cpc complete unlock the endpoint */
}
break;
case MCA_BTL_IB_WAITING_ACK:
/* Tell main BTL that we're done */
mca_btl_openib_endpoint_cpc_complete(ib_endpoint);
/* cpc complete unlock the endpoint */
break;
case MCA_BTL_IB_CONNECT_ACK:
send_connect_data(ib_endpoint, ENDPOINT_CONNECT_ACK);
/* Tell main BTL that we're done */
mca_btl_openib_endpoint_cpc_complete(ib_endpoint);
/* cpc complete unlock the endpoint */
break;
case MCA_BTL_IB_CONNECTED:
OPAL_THREAD_UNLOCK(&ib_endpoint->endpoint_lock);
break;
default :
BTL_ERROR(("Invalid endpoint state %d", endpoint_state));
mca_btl_openib_endpoint_invoke_error(ib_endpoint);
OPAL_THREAD_UNLOCK(&ib_endpoint->endpoint_lock);
}
OPAL_THREAD_UNLOCK(&ib_endpoint->endpoint_lock);
break;
}
}

Просмотреть файл

@ -816,6 +816,7 @@ static void xoob_rml_recv_cb(int status, orte_process_name_t* process_name,
if (OMPI_SUCCESS != mca_btl_openib_endpoint_post_recvs(ib_endpoint)) {
BTL_ERROR(("Failed to post on XRC SRQs"));
mca_btl_openib_endpoint_invoke_error(NULL);
OPAL_THREAD_UNLOCK(&ib_endpoint->endpoint_lock);
return;
}
/* we should create qp and send the info + srq to requestor */
@ -823,6 +824,7 @@ static void xoob_rml_recv_cb(int status, orte_process_name_t* process_name,
if (OMPI_SUCCESS != rc) {
BTL_ERROR(("error in endpoint reply start connect"));
mca_btl_openib_endpoint_invoke_error(NULL);
OPAL_THREAD_UNLOCK(&ib_endpoint->endpoint_lock);
return;
}
/* enable pooling for this btl */
@ -853,6 +855,7 @@ static void xoob_rml_recv_cb(int status, orte_process_name_t* process_name,
if (OMPI_SUCCESS != rc) {
BTL_ERROR(("error in endpoint reply start connect"));
mca_btl_openib_endpoint_invoke_error(ib_endpoint);
OPAL_THREAD_UNLOCK(&ib_endpoint->endpoint_lock);
return;
}
OPAL_THREAD_UNLOCK(&ib_endpoint->endpoint_lock);
@ -863,6 +866,7 @@ static void xoob_rml_recv_cb(int status, orte_process_name_t* process_name,
if (OMPI_SUCCESS != rc) {
BTL_ERROR(("error in endpoint reply start connect"));
mca_btl_openib_endpoint_invoke_error(ib_endpoint);
OPAL_THREAD_UNLOCK(&ib_endpoint->endpoint_lock);
return;
}
OPAL_THREAD_UNLOCK(&ib_endpoint->endpoint_lock);
@ -897,10 +901,11 @@ static void xoob_rml_recv_cb(int status, orte_process_name_t* process_name,
if (OMPI_SUCCESS != xoob_send_qp_connect(ib_endpoint, &rem_info)) {
BTL_ERROR(("Failed to connect endpoint\n"));
mca_btl_openib_endpoint_invoke_error(NULL);
OPAL_THREAD_UNLOCK(&ib_endpoint->endpoint_lock);
return;
}
mca_btl_openib_endpoint_cpc_complete(ib_endpoint);
OPAL_THREAD_UNLOCK(&ib_endpoint->endpoint_lock);
/* cpc complete unlock the endpoint */
break;
case ENDPOINT_XOOB_CONNECT_XRC_RESPONSE:
BTL_VERBOSE(("Received ENDPOINT_XOOB_CONNECT_XRC_RESPONSE: lid %d, sid %" PRIx64 "\n",
@ -919,7 +924,7 @@ static void xoob_rml_recv_cb(int status, orte_process_name_t* process_name,
/* we got srq numbers on our request */
XOOB_SET_REMOTE_INFO(ib_endpoint->rem_info, rem_info);
mca_btl_openib_endpoint_cpc_complete(ib_endpoint);
OPAL_THREAD_UNLOCK(&ib_endpoint->endpoint_lock);
/* cpc complete unlock the endpoint */
break;
case ENDPOINT_XOOB_CONNECT_XRC_NR_RESPONSE:
/* The XRC recv site already was destroyed so we need