change so that we only check the connection queue when expecting a connection; create an MCA parameter that controls the frequency at which the async queue is checked
This commit was SVN r14511.
This commit is contained in:
parent 7d0f51e6b9
commit 80d984441f
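For orientation, here is a minimal standalone sketch of the throttled-polling pattern the diff below introduces: a per-module counter is bumped on every pass through progress, and the asynchronous event queue is drained only when the counter reaches an MCA-settable threshold. All names here (async_check_interval, async_skip_count, drain_async_queue, progress_once) are illustrative stand-ins, not the BTL's actual symbols.

#include <stdio.h>

/* How many entries into progress to skip between async-queue checks;
 * in the real code this comes from the "async_events" MCA parameter. */
static int async_check_interval = 100000000;

/* Per-module skip counter (btl->udapl_async_events in the real code). */
static int async_skip_count = 0;

/* Stand-in for the dat_evd_dequeue() drain loop over the async EVD. */
static int drain_async_queue(void)
{
    return 0; /* pretend the queue was empty */
}

static void progress_once(void)
{
    if (async_skip_count == async_check_interval) {
        async_skip_count = 0;      /* reset, then actually check the queue */
        (void)drain_async_queue();
    } else {
        async_skip_count++;        /* cheap common path: skip the check */
    }
}

int main(void)
{
    int i;
    for (i = 0; i < 5; i++) {
        progress_once();
    }
    printf("progress calls since last async check: %d\n", async_skip_count);
    return 0;
}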
@@ -313,6 +313,10 @@ mca_btl_udapl_init(DAT_NAME_PTR ia_name, mca_btl_udapl_module_t* btl)
     btl->udapl_eager_rdma_endpoint_count = 0;
     OBJ_CONSTRUCT(&btl->udapl_eager_rdma_lock, opal_mutex_t);
 
+    /* initialize miscellaneous variables */
+    btl->udapl_async_events = 0;
+    btl->udapl_connect_inprogress = 0;
+
     /* TODO - Set up SRQ when it is supported */
     return OMPI_SUCCESS;
 
@@ -89,7 +89,7 @@ struct mca_btl_udapl_component_t {
     int32_t udapl_eager_rdma_win;  /**< number of eager RDMA fragments
                                      recieved before returning credits to
                                      sender */
-
+    int32_t udapl_async_events;    /**< dequeue asynchronous events */
     opal_list_t udapl_procs;       /**< list of udapl proc structures */
     opal_mutex_t udapl_lock;       /**< lock for accessing module state */
     char* udapl_mpool_name;        /**< name of memory pool */
@@ -136,6 +136,9 @@ struct mca_btl_udapl_module_t {
                                     * with eager rdma
                                     * connections
                                     */
+    int32_t udapl_async_events;
+    int32_t udapl_connect_inprogress;
+
     /* module specific limits */
     int udapl_evd_qlen;
     int udapl_max_request_dtos;    /**< maximum number of outstanding consumer
@@ -592,9 +592,6 @@ int mca_btl_udapl_component_progress()
     mca_btl_udapl_module_t* btl;
     static int32_t inprogress = 0;
     DAT_EVENT event;
-#if defined(__SVR4) && defined(__sun)
-    DAT_COUNT nmore; /* used by dat_evd_wait, see comment below */
-#endif
     size_t i;
     int32_t j, rdma_ep_count;
     int count = 0;
@@ -809,17 +806,9 @@ int mca_btl_udapl_component_progress()
         }
 
         /* Check connection EVD */
-        while(DAT_SUCCESS ==
-#if defined(__SVR4) && defined(__sun)
-            /* There is a bug is Solaris udapl implementation
-             * such that dat_evd_dequeue does not dequeue
-             * DAT_CONNECTION_REQUEST_EVENT. Workaround is to use
-             * wait. This should be removed when fix available.
-             */
-                dat_evd_wait(btl->udapl_evd_conn, 0, 1, &event, &nmore)) {
-#else
-                dat_evd_dequeue(btl->udapl_evd_conn, &event)) {
-#endif
+        while((btl->udapl_connect_inprogress > 0) && (DAT_SUCCESS ==
+            dat_evd_dequeue(btl->udapl_evd_conn, &event))) {
 
             switch(event.event_number) {
                 case DAT_CONNECTION_REQUEST_EVENT:
                     /* Accept a new connection */
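The hunk above gates connection-EVD polling on an outstanding-connection count, so the queue is only touched when a connection is actually expected. A compilable sketch of that gating, assuming hypothetical stand-ins for the BTL's dat_evd_dequeue() call and for the OPAL_THREAD_ADD32() accounting shown in the endpoint hunks further down:

/* Outstanding-connection count; the real code keeps this in
 * btl->udapl_connect_inprogress and updates it with OPAL_THREAD_ADD32(). */
static int connect_inprogress = 0;

/* Stand-in for dat_evd_dequeue(btl->udapl_evd_conn, &event):
 * nonzero would mean a connection event was dequeued. */
static int dequeue_connection_event(void)
{
    return 0;
}

static void start_connect(void)
{
    connect_inprogress++;   /* bumped before a connect is initiated */
}

static void connection_established(void)
{
    connect_inprogress--;   /* dropped once the endpoint reaches CONNECTED */
}

static void poll_connection_evd(void)
{
    /* Only touch the connection EVD while a connect is actually pending. */
    while (connect_inprogress > 0 && dequeue_connection_event()) {
        /* handle DAT_CONNECTION_REQUEST_EVENT, ESTABLISHED, ... */
    }
}

int main(void)
{
    start_connect();
    poll_connection_evd();
    connection_established();
    return 0;
}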
@@ -857,22 +846,28 @@ int mca_btl_udapl_component_progress()
         }
 
         /* Check async EVD */
-        while(DAT_SUCCESS ==
-            dat_evd_dequeue(btl->udapl_evd_async, &event)) {
-
-            switch(event.event_number) {
-                case DAT_ASYNC_ERROR_EVD_OVERFLOW:
-                case DAT_ASYNC_ERROR_IA_CATASTROPHIC:
-                case DAT_ASYNC_ERROR_EP_BROKEN:
-                case DAT_ASYNC_ERROR_TIMED_OUT:
-                case DAT_ASYNC_ERROR_PROVIDER_INTERNAL_ERROR:
-                    BTL_OUTPUT(("WARNING: async event ignored : %d",
-                        event.event_number));
-                    break;
-                default:
-                    BTL_OUTPUT(("WARNING unknown async event: %d\n",
-                        event.event_number));
-            }
-        }
+        if (btl->udapl_async_events == mca_btl_udapl_component.udapl_async_events) {
+            btl->udapl_async_events = 0;
+
+            while(DAT_SUCCESS ==
+                dat_evd_dequeue(btl->udapl_evd_async, &event)) {
+
+                switch(event.event_number) {
+                    case DAT_ASYNC_ERROR_EVD_OVERFLOW:
+                    case DAT_ASYNC_ERROR_IA_CATASTROPHIC:
+                    case DAT_ASYNC_ERROR_EP_BROKEN:
+                    case DAT_ASYNC_ERROR_TIMED_OUT:
+                    case DAT_ASYNC_ERROR_PROVIDER_INTERNAL_ERROR:
+                        BTL_OUTPUT(("WARNING: async event ignored : %d",
+                            event.event_number));
+                        break;
+                    default:
+                        BTL_OUTPUT(("WARNING unknown async event: %d\n",
+                            event.event_number));
+                }
+            }
+        } else {
+            btl->udapl_async_events++;
+        }
 
         /*
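Note the semantics of the throttle above: the module counter is bumped on every pass, so the async EVD is drained on every (threshold + 1)-th entry into progress, and the counter then resets to zero. With the default threshold of 100000000 the queue is effectively checked almost never, which suits the rare error events it carries; the sketch after the commit header shows the same pattern in isolation.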
@@ -495,6 +495,8 @@ static int mca_btl_udapl_start_connect(mca_btl_base_endpoint_t* endpoint)
         return ORTE_ERR_OUT_OF_RESOURCE;
     }
 
+    OPAL_THREAD_ADD32(&(endpoint->endpoint_btl->udapl_connect_inprogress), 1);
+
     /* Pack our address information */
    rc = orte_dss.pack(buf, &addr->port, 1, ORTE_UINT64);
    if(ORTE_SUCCESS != rc) {
@@ -588,7 +590,8 @@ void mca_btl_udapl_endpoint_connect(mca_btl_udapl_endpoint_t* endpoint)
     int rc;
 
     OPAL_THREAD_LOCK(&endpoint->endpoint_lock);
+    OPAL_THREAD_ADD32(&(btl->udapl_connect_inprogress), 1);
 
     /* Nasty test to prevent deadlock and unwanted connection attempts */
     /* This right here is the whole point of using the ORTE/RML handshake */
     if((MCA_BTL_UDAPL_CONN_EAGER == endpoint->endpoint_state &&
@@ -766,6 +769,7 @@ static int mca_btl_udapl_endpoint_finish_max(mca_btl_udapl_endpoint_t* endpoint)
     int rc;
 
     endpoint->endpoint_state = MCA_BTL_UDAPL_CONNECTED;
+    OPAL_THREAD_ADD32(&(endpoint->endpoint_btl->udapl_connect_inprogress), -1);
 
     /* post eager/max recv buffers */
     mca_btl_udapl_endpoint_post_recv(endpoint,
@@ -208,6 +208,13 @@ int mca_btl_udapl_register_mca_params(void)
         &mca_btl_udapl_component.udapl_eager_rdma_guarantee,
         REGINT_GE_ZERO), tmp_rc, rc);
 
+    CHECK_PARAM_REGISTER_RETURN_VALUE(mca_btl_udapl_reg_int("async_events",
+        "The asynchronous event queue will only be "
+        "checked after entering progress this number of times.",
+        100000000,
+        &mca_btl_udapl_component.udapl_async_events,
+        REGINT_GE_ONE), tmp_rc, rc);
+
     /* register uDAPL module parameters */
     CHECK_PARAM_REGISTER_RETURN_VALUE(mca_btl_udapl_reg_int("evd_qlen",
         "The event dispatcher queue length.",
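Assuming the usual MCA naming convention (component prefix plus the registered string "async_events"), the new threshold should be tunable at launch time, e.g. mpirun -mca btl_udapl_async_events 1000 ...; the full parameter name is inferred here, not shown in the diff.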
|
Загрузка…
x
Ссылка в новой задаче
Block a user