change so that we only check the connection queue when expecting a connection; create an MCA parameter that controls the frequency at which the async queue is checked
This commit was SVN r14511.
This commit is contained in:
parent 7d0f51e6b9
commit 80d984441f
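For orientation, here is a minimal standalone sketch of the throttled-polling pattern the diff below introduces: a per-module counter is bumped on every pass through progress, and the asynchronous event queue is drained only when the counter reaches an MCA-settable threshold. All names here (async_check_interval, async_skip_count, drain_async_queue, progress_once) are illustrative stand-ins, not the BTL's actual symbols.

#include <stdio.h>

/* How many entries into progress to skip between async-queue checks;
 * in the real code this comes from the "async_events" MCA parameter. */
static int async_check_interval = 100000000;

/* Per-module skip counter (btl->udapl_async_events in the real code). */
static int async_skip_count = 0;

/* Stand-in for the dat_evd_dequeue() drain loop over the async EVD. */
static int drain_async_queue(void)
{
    return 0; /* pretend the queue was empty */
}

static void progress_once(void)
{
    if (async_skip_count == async_check_interval) {
        async_skip_count = 0;      /* reset, then actually check the queue */
        (void)drain_async_queue();
    } else {
        async_skip_count++;        /* cheap common path: skip the check */
    }
}

int main(void)
{
    int i;
    for (i = 0; i < 5; i++) {
        progress_once();
    }
    printf("progress calls since last async check: %d\n", async_skip_count);
    return 0;
}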
@@ -313,6 +313,10 @@ mca_btl_udapl_init(DAT_NAME_PTR ia_name, mca_btl_udapl_module_t* btl)
     btl->udapl_eager_rdma_endpoint_count = 0;
     OBJ_CONSTRUCT(&btl->udapl_eager_rdma_lock, opal_mutex_t);
 
+    /* initialize miscellaneous variables */
+    btl->udapl_async_events = 0;
+    btl->udapl_connect_inprogress = 0;
+
     /* TODO - Set up SRQ when it is supported */
     return OMPI_SUCCESS;
 
@@ -89,7 +89,7 @@ struct mca_btl_udapl_component_t {
     int32_t udapl_eager_rdma_win;  /**< number of eager RDMA fragments
                                      recieved before returning credits to
                                      sender */
-
+    int32_t udapl_async_events;    /**< dequeue asynchronous events */
     opal_list_t udapl_procs;       /**< list of udapl proc structures */
     opal_mutex_t udapl_lock;       /**< lock for accessing module state */
     char* udapl_mpool_name;        /**< name of memory pool */
@@ -136,6 +136,9 @@ struct mca_btl_udapl_module_t {
                                     * with eager rdma
                                     * connections
                                     */
+    int32_t udapl_async_events;
+    int32_t udapl_connect_inprogress;
+
     /* module specific limits */
     int udapl_evd_qlen;
     int udapl_max_request_dtos;    /**< maximum number of outstanding consumer
@@ -592,9 +592,6 @@ int mca_btl_udapl_component_progress()
     mca_btl_udapl_module_t* btl;
     static int32_t inprogress = 0;
     DAT_EVENT event;
-#if defined(__SVR4) && defined(__sun)
-    DAT_COUNT nmore; /* used by dat_evd_wait, see comment below */
-#endif
     size_t i;
     int32_t j, rdma_ep_count;
     int count = 0;
@@ -809,17 +806,9 @@ int mca_btl_udapl_component_progress()
         }
 
         /* Check connection EVD */
-        while(DAT_SUCCESS ==
-#if defined(__SVR4) && defined(__sun)
-            /* There is a bug is Solaris udapl implementation
-             * such that dat_evd_dequeue does not dequeue
-             * DAT_CONNECTION_REQUEST_EVENT. Workaround is to use
-             * wait. This should be removed when fix available.
-             */
-                dat_evd_wait(btl->udapl_evd_conn, 0, 1, &event, &nmore)) {
-#else
-                dat_evd_dequeue(btl->udapl_evd_conn, &event)) {
-#endif
+        while((btl->udapl_connect_inprogress > 0) && (DAT_SUCCESS ==
+            dat_evd_dequeue(btl->udapl_evd_conn, &event))) {
 
             switch(event.event_number) {
                 case DAT_CONNECTION_REQUEST_EVENT:
                     /* Accept a new connection */
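The hunk above gates connection-EVD polling on an outstanding-connection count, so the queue is only touched when a connection is actually expected. A compilable sketch of that gating, assuming hypothetical stand-ins for the BTL's dat_evd_dequeue() call and for the OPAL_THREAD_ADD32() accounting shown in the endpoint hunks further down:

/* Outstanding-connection count; the real code keeps this in
 * btl->udapl_connect_inprogress and updates it with OPAL_THREAD_ADD32(). */
static int connect_inprogress = 0;

/* Stand-in for dat_evd_dequeue(btl->udapl_evd_conn, &event):
 * nonzero would mean a connection event was dequeued. */
static int dequeue_connection_event(void)
{
    return 0;
}

static void start_connect(void)
{
    connect_inprogress++;   /* bumped before a connect is initiated */
}

static void connection_established(void)
{
    connect_inprogress--;   /* dropped once the endpoint reaches CONNECTED */
}

static void poll_connection_evd(void)
{
    /* Only touch the connection EVD while a connect is actually pending. */
    while (connect_inprogress > 0 && dequeue_connection_event()) {
        /* handle DAT_CONNECTION_REQUEST_EVENT, ESTABLISHED, ... */
    }
}

int main(void)
{
    start_connect();
    poll_connection_evd();
    connection_established();
    return 0;
}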
@@ -857,22 +846,28 @@ int mca_btl_udapl_component_progress()
         }
 
         /* Check async EVD */
-        while(DAT_SUCCESS ==
-            dat_evd_dequeue(btl->udapl_evd_async, &event)) {
-
-            switch(event.event_number) {
-                case DAT_ASYNC_ERROR_EVD_OVERFLOW:
-                case DAT_ASYNC_ERROR_IA_CATASTROPHIC:
-                case DAT_ASYNC_ERROR_EP_BROKEN:
-                case DAT_ASYNC_ERROR_TIMED_OUT:
-                case DAT_ASYNC_ERROR_PROVIDER_INTERNAL_ERROR:
-                    BTL_OUTPUT(("WARNING: async event ignored : %d",
-                        event.event_number));
-                    break;
-                default:
-                    BTL_OUTPUT(("WARNING unknown async event: %d\n",
-                        event.event_number));
-            }
-        }
+        if (btl->udapl_async_events == mca_btl_udapl_component.udapl_async_events) {
+            btl->udapl_async_events = 0;
+
+            while(DAT_SUCCESS ==
+                dat_evd_dequeue(btl->udapl_evd_async, &event)) {
+
+                switch(event.event_number) {
+                    case DAT_ASYNC_ERROR_EVD_OVERFLOW:
+                    case DAT_ASYNC_ERROR_IA_CATASTROPHIC:
+                    case DAT_ASYNC_ERROR_EP_BROKEN:
+                    case DAT_ASYNC_ERROR_TIMED_OUT:
+                    case DAT_ASYNC_ERROR_PROVIDER_INTERNAL_ERROR:
+                        BTL_OUTPUT(("WARNING: async event ignored : %d",
+                            event.event_number));
+                        break;
+                    default:
+                        BTL_OUTPUT(("WARNING unknown async event: %d\n",
+                            event.event_number));
+                }
+            }
+        } else {
+            btl->udapl_async_events++;
+        }
 
         /*
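Note the semantics of the throttle above: the module counter is bumped on every pass, so the async EVD is drained on every (threshold + 1)-th entry into progress, and the counter then resets to zero. With the default threshold of 100000000 the queue is effectively checked almost never, which suits the rare error events it carries; the sketch after the commit header shows the same pattern in isolation.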
@@ -495,6 +495,8 @@ static int mca_btl_udapl_start_connect(mca_btl_base_endpoint_t* endpoint)
         return ORTE_ERR_OUT_OF_RESOURCE;
     }
 
+    OPAL_THREAD_ADD32(&(endpoint->endpoint_btl->udapl_connect_inprogress), 1);
+
     /* Pack our address information */
    rc = orte_dss.pack(buf, &addr->port, 1, ORTE_UINT64);
    if(ORTE_SUCCESS != rc) {
@@ -588,7 +590,8 @@ void mca_btl_udapl_endpoint_connect(mca_btl_udapl_endpoint_t* endpoint)
     int rc;
 
     OPAL_THREAD_LOCK(&endpoint->endpoint_lock);
+    OPAL_THREAD_ADD32(&(btl->udapl_connect_inprogress), 1);
 
     /* Nasty test to prevent deadlock and unwanted connection attempts */
     /* This right here is the whole point of using the ORTE/RML handshake */
     if((MCA_BTL_UDAPL_CONN_EAGER == endpoint->endpoint_state &&
@@ -766,6 +769,7 @@ static int mca_btl_udapl_endpoint_finish_max(mca_btl_udapl_endpoint_t* endpoint)
     int rc;
 
     endpoint->endpoint_state = MCA_BTL_UDAPL_CONNECTED;
+    OPAL_THREAD_ADD32(&(endpoint->endpoint_btl->udapl_connect_inprogress), -1);
 
     /* post eager/max recv buffers */
     mca_btl_udapl_endpoint_post_recv(endpoint,
@@ -208,6 +208,13 @@ int mca_btl_udapl_register_mca_params(void)
         &mca_btl_udapl_component.udapl_eager_rdma_guarantee,
         REGINT_GE_ZERO), tmp_rc, rc);
 
+    CHECK_PARAM_REGISTER_RETURN_VALUE(mca_btl_udapl_reg_int("async_events",
+        "The asynchronous event queue will only be "
+        "checked after entering progress this number of times.",
+        100000000,
+        &mca_btl_udapl_component.udapl_async_events,
+        REGINT_GE_ONE), tmp_rc, rc);
+
     /* register uDAPL module parameters */
     CHECK_PARAM_REGISTER_RETURN_VALUE(mca_btl_udapl_reg_int("evd_qlen",
         "The event dispatcher queue length.",
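Assuming the usual MCA naming convention (component prefix plus the registered string "async_events"), the new threshold should be tunable at launch time, e.g. mpirun -mca btl_udapl_async_events 1000 ...; the full parameter name is inferred here, not shown in the diff.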
|
Загрузка…
x
Ссылка в новой задаче
Block a user