1
1

- corrected locking in gm btl - gm api is not thread safe

- initial support for gm progress thread
- corrected threading issue in pml
- added polling progress for a configurable number of cycles to wait in the threaded case

This commit was SVN r9188.
Этот коммит содержится в:
Tim Woodall 2006-03-02 00:39:07 +00:00
родитель 84d3055db5
Коммит 8bf6ed7a36
12 изменённых файлов: 463 добавлений и 66 удалений

Просмотреть файл

@ -33,6 +33,28 @@
#include "ompi/mca/mpool/mpool.h"
#include "ompi/proc/proc.h"
/**
* Non-locking versions of public interfaces.
*/
static int mca_btl_gm_send_nl(
struct mca_btl_base_module_t* btl,
struct mca_btl_base_endpoint_t* endpoint,
struct mca_btl_base_descriptor_t* des,
mca_btl_base_tag_t tag);
static int mca_btl_gm_get_nl(
mca_btl_base_module_t* btl,
mca_btl_base_endpoint_t* endpoint,
mca_btl_base_descriptor_t* des);
static int mca_btl_gm_put_nl(
mca_btl_base_module_t* btl,
mca_btl_base_endpoint_t* endpoint,
mca_btl_base_descriptor_t* des);
mca_btl_gm_module_t mca_btl_gm_module = {
{
&mca_btl_gm_component.super,
@ -53,9 +75,15 @@ mca_btl_gm_module_t mca_btl_gm_module = {
mca_btl_gm_free,
mca_btl_gm_prepare_src,
mca_btl_gm_prepare_dst,
#if OMPI_ENABLE_MPI_THREADS || OMPI_ENABLE_PROGRESS_THREADS
mca_btl_gm_send,
mca_btl_gm_put,
NULL /* get */
mca_btl_gm_get
#else
mca_btl_gm_send_nl,
mca_btl_gm_put_nl,
mca_btl_gm_get_nl
#endif
}
};
@ -438,7 +466,8 @@ static void mca_btl_gm_drop_callback( struct gm_port* port, void* context, gm_st
}
/**
* Callback on send completion and/or error
* Callback on send completion and/or error.
* Called with mca_btl_gm_component.gm_lock held.
*/
static void mca_btl_gm_send_callback( struct gm_port* port, void* context, gm_status_t status )
@ -461,18 +490,20 @@ static void mca_btl_gm_send_callback( struct gm_port* port, void* context, gm_st
);
/* retry the failed fragment */
mca_btl_gm_send(&btl->super, frag->endpoint, &frag->base, frag->hdr->tag);
mca_btl_gm_send_nl(&btl->super, frag->endpoint, &frag->base, frag->hdr->tag);
break;
case GM_SEND_DROPPED:
/* release the send token */
OPAL_THREAD_ADD32(&btl->gm_num_send_tokens, 1);
/* retry the dropped fragment */
mca_btl_gm_send(&btl->super, frag->endpoint, &frag->base, frag->hdr->tag);
mca_btl_gm_send_nl(&btl->super, frag->endpoint, &frag->base, frag->hdr->tag);
break;
case GM_SUCCESS:
/* call the completion callback */
OPAL_THREAD_UNLOCK(&mca_btl_gm_component.gm_lock);
frag->base.des_cbfunc(&btl->super, frag->endpoint, &frag->base, OMPI_SUCCESS);
OPAL_THREAD_LOCK(&mca_btl_gm_component.gm_lock);
/* return the send token and deque pending fragments */
MCA_BTL_GM_RETURN_TOKEN(btl);
@ -486,12 +517,78 @@ static void mca_btl_gm_send_callback( struct gm_port* port, void* context, gm_st
OPAL_THREAD_ADD32( &btl->gm_num_send_tokens, 1 );
/* call the completion callback */
OPAL_THREAD_UNLOCK(&mca_btl_gm_component.gm_lock);
frag->base.des_cbfunc(&btl->super, frag->endpoint, &frag->base, OMPI_ERROR);
OPAL_THREAD_LOCK(&mca_btl_gm_component.gm_lock);
break;
}
}
/**
* Initiate an asynchronous send. Do NOT acquire gm lock, must already be held,
* or in an unthreaded environment.
*
* @param btl (IN) BTL module
* @param endpoint (IN) BTL addressing information
* @param descriptor (IN) Description of the data to be transfered
* @param tag (IN) The tag value used to notify the peer.
*/
/**
 * Initiate an asynchronous send WITHOUT acquiring the GM lock.
 * mca_btl_gm_component.gm_lock must already be held by the caller
 * (e.g. when invoked from a GM completion callback), or the build
 * must be single-threaded.
 *
 * @param btl (IN)      BTL module
 * @param endpoint (IN) BTL addressing information
 * @param des (IN)      Description of the data to be transferred
 * @param tag (IN)      The tag value used to notify the peer.
 * @return OMPI_SUCCESS (the fragment is queued if no send token is free)
 */
static int mca_btl_gm_send_nl(
    struct mca_btl_base_module_t* btl,
    struct mca_btl_base_endpoint_t* endpoint,
    struct mca_btl_base_descriptor_t* des,
    mca_btl_base_tag_t tag)
{
    mca_btl_gm_module_t* gm_btl = (mca_btl_gm_module_t*) btl;
    mca_btl_gm_frag_t* frag = (mca_btl_gm_frag_t*)des;

    /* stash addressing info on the fragment so the completion/retry
     * callback can re-issue the operation */
    frag->btl = gm_btl;
    frag->endpoint = endpoint;
    frag->hdr->tag = tag;
    frag->type = MCA_BTL_GM_SEND;

    /* queue the descriptor if there are no send tokens
     * (NOTE: the macro returns from this function in that case) */
    MCA_BTL_GM_ACQUIRE_TOKEN_NL(gm_btl, frag);

    /* post the send descriptor: priority fragments of exactly the eager
     * size go out high-priority, everything else low-priority */
    if(frag->base.des_flags & MCA_BTL_DES_FLAGS_PRIORITY &&
       frag->size == mca_btl_gm_component.gm_eager_frag_size) {
        gm_send_with_callback(
            gm_btl->port,
            frag->hdr,
            mca_btl_gm_component.gm_eager_frag_size,
            frag->segment.seg_len + sizeof(mca_btl_base_header_t),
            GM_HIGH_PRIORITY,
            endpoint->endpoint_addr.node_id,
            endpoint->endpoint_addr.port_id,
            mca_btl_gm_send_callback,
            frag);
    } else {
        gm_send_with_callback(
            gm_btl->port,
            frag->hdr,
            mca_btl_gm_component.gm_max_frag_size,
            frag->segment.seg_len + sizeof(mca_btl_base_header_t),
            GM_LOW_PRIORITY,
            endpoint->endpoint_addr.node_id,
            endpoint->endpoint_addr.port_id,
            mca_btl_gm_send_callback,
            frag);
    }

    /* repost any deferred receive buffers; use a distinct name instead
     * of shadowing the outer 'frag' (the original declared a second
     * 'frag' here, which invites accidental misuse after the loop) */
    if(opal_list_get_size(&gm_btl->gm_repost)) {
        mca_btl_gm_frag_t* rfrag;
        while(NULL != (rfrag = (mca_btl_gm_frag_t*)opal_list_remove_first(&gm_btl->gm_repost))) {
            gm_provide_receive_buffer(gm_btl->port, rfrag->hdr, rfrag->size, rfrag->priority);
        }
    }
    return OMPI_SUCCESS;
}
/**
* Initiate an asynchronous send.
*
@ -517,6 +614,7 @@ int mca_btl_gm_send(
frag->type = MCA_BTL_GM_SEND;
/* queue the descriptor if there are no send tokens */
OPAL_THREAD_LOCK(&mca_btl_gm_component.gm_lock);
MCA_BTL_GM_ACQUIRE_TOKEN(gm_btl, frag);
/* post the send descriptor */
@ -547,19 +645,19 @@ int mca_btl_gm_send(
if(opal_list_get_size(&gm_btl->gm_repost)) {
mca_btl_gm_frag_t* frag;
OPAL_THREAD_LOCK(&gm_btl->gm_lock);
while(NULL != (frag = (mca_btl_gm_frag_t*)opal_list_remove_first(&gm_btl->gm_repost))) {
gm_provide_receive_buffer(gm_btl->port, frag->hdr, frag->size, frag->priority);
}
OPAL_THREAD_UNLOCK(&gm_btl->gm_lock);
}
OPAL_THREAD_UNLOCK(&mca_btl_gm_component.gm_lock);
return OMPI_SUCCESS;
}
/**
* Callback on put completion and/or error
* Callback on put completion and/or error.
* Called with mca_btl_gm_component.gm_lock held.
*/
static void mca_btl_gm_put_callback( struct gm_port* port, void* context, gm_status_t status )
@ -573,6 +671,7 @@ static void mca_btl_gm_put_callback( struct gm_port* port, void* context, gm_sta
case GM_SEND_TIMED_OUT:
case GM_TIMED_OUT:
/* drop all sends to this destination port */
gm_drop_sends(
btl->port,
(frag->base.des_flags & MCA_BTL_DES_FLAGS_PRIORITY) ? GM_HIGH_PRIORITY : GM_LOW_PRIORITY,
@ -583,18 +682,20 @@ static void mca_btl_gm_put_callback( struct gm_port* port, void* context, gm_sta
);
/* retry the failed fragment */
mca_btl_gm_put(&btl->super, frag->endpoint, &frag->base);
mca_btl_gm_put_nl(&btl->super, frag->endpoint, &frag->base);
break;
case GM_SEND_DROPPED:
/* release the send token */
OPAL_THREAD_ADD32(&btl->gm_num_send_tokens, 1);
/* retry the dropped fragment */
mca_btl_gm_put(&btl->super, frag->endpoint, &frag->base);
mca_btl_gm_put_nl(&btl->super, frag->endpoint, &frag->base);
break;
case GM_SUCCESS:
/* call completion callback */
OPAL_THREAD_UNLOCK(&mca_btl_gm_component.gm_lock);
frag->base.des_cbfunc(&btl->super, frag->endpoint, &frag->base, OMPI_SUCCESS);
OPAL_THREAD_LOCK(&mca_btl_gm_component.gm_lock);
/* return the send token and deque pending fragments */
MCA_BTL_GM_RETURN_TOKEN(btl);
@ -607,12 +708,55 @@ static void mca_btl_gm_put_callback( struct gm_port* port, void* context, gm_sta
OPAL_THREAD_ADD32( &btl->gm_num_send_tokens, 1 );
/* call the completion callback */
OPAL_THREAD_UNLOCK(&mca_btl_gm_component.gm_lock);
frag->base.des_cbfunc(&btl->super, frag->endpoint, &frag->base, OMPI_ERROR);
OPAL_THREAD_LOCK(&mca_btl_gm_component.gm_lock);
break;
}
}
/**
* Initiate an asynchronous put. Do not acquire lock.
*
* @param btl (IN) BTL module
* @param endpoint (IN) BTL addressing information
* @param descriptor (IN) Description of the data to be transferred
*/
static int mca_btl_gm_put_nl(
mca_btl_base_module_t* btl,
mca_btl_base_endpoint_t* endpoint,
mca_btl_base_descriptor_t* des)
{
#if OMPI_MCA_BTL_GM_HAVE_RDMA_PUT
/* Non-locking put: per the contract above, the caller must already hold
 * mca_btl_gm_component.gm_lock (or be running unthreaded). */
mca_btl_gm_module_t* gm_btl = (mca_btl_gm_module_t*) btl;
mca_btl_gm_frag_t* frag = (mca_btl_gm_frag_t*) des;
/* stash addressing info so the completion/retry callback can
 * re-issue this operation */
frag->btl = gm_btl;
frag->endpoint = endpoint;
frag->type = MCA_BTL_GM_PUT;
/* queue the descriptor if there are no send tokens
 * (NOTE: the macro returns from this function in that case;
 * it relies on the local being named gm_btl) */
MCA_BTL_GM_ACQUIRE_TOKEN_NL(gm_btl, frag);
/* post the put descriptor: write local src segment to remote dst */
gm_put(gm_btl->port,
des->des_src->seg_addr.pval,
des->des_dst->seg_addr.lval,
des->des_src->seg_len,
GM_LOW_PRIORITY,
endpoint->endpoint_addr.node_id,
endpoint->endpoint_addr.port_id,
mca_btl_gm_put_callback,
frag);
return OMPI_SUCCESS;
#else
/* RDMA put not available in this GM build */
return OMPI_ERR_NOT_IMPLEMENTED;
#endif
}
/**
* Initiate an asynchronous put.
*
@ -635,6 +779,7 @@ int mca_btl_gm_put(
frag->type = MCA_BTL_GM_PUT;
/* queue the descriptor if there are no send tokens */
OPAL_THREAD_LOCK(&mca_btl_gm_component.gm_lock);
MCA_BTL_GM_ACQUIRE_TOKEN(gm_btl, frag);
/* post the put descriptor */
@ -647,6 +792,7 @@ int mca_btl_gm_put(
endpoint->endpoint_addr.port_id,
mca_btl_gm_put_callback,
frag);
OPAL_THREAD_UNLOCK(&mca_btl_gm_component.gm_lock);
return OMPI_SUCCESS;
#else
return OMPI_ERR_NOT_IMPLEMENTED;
@ -656,7 +802,8 @@ int mca_btl_gm_put(
/**
* Callback on get completion and/or error
* Callback on get completion and/or error.
* Called with mca_btl_gm_component.gm_lock held.
*/
static void mca_btl_gm_get_callback( struct gm_port* port, void* context, gm_status_t status )
@ -680,18 +827,20 @@ static void mca_btl_gm_get_callback( struct gm_port* port, void* context, gm_sta
);
/* retry the failed fragment */
mca_btl_gm_get(&btl->super, frag->endpoint, &frag->base);
mca_btl_gm_get_nl(&btl->super, frag->endpoint, &frag->base);
break;
case GM_SEND_DROPPED:
/* release the send token */
OPAL_THREAD_ADD32(&btl->gm_num_send_tokens, 1);
/* retry the dropped fragment */
mca_btl_gm_get(&btl->super, frag->endpoint, &frag->base);
mca_btl_gm_get_nl(&btl->super, frag->endpoint, &frag->base);
break;
case GM_SUCCESS:
/* call completion callback */
OPAL_THREAD_UNLOCK(&mca_btl_gm_component.gm_lock);
frag->base.des_cbfunc(&btl->super, frag->endpoint, &frag->base, OMPI_SUCCESS);
OPAL_THREAD_LOCK(&mca_btl_gm_component.gm_lock);
/* return the send token and deque pending fragments */
MCA_BTL_GM_RETURN_TOKEN(btl);
@ -704,12 +853,55 @@ static void mca_btl_gm_get_callback( struct gm_port* port, void* context, gm_sta
OPAL_THREAD_ADD32( &btl->gm_num_send_tokens, 1 );
/* call the completion callback */
OPAL_THREAD_UNLOCK(&mca_btl_gm_component.gm_lock);
frag->base.des_cbfunc(&btl->super, frag->endpoint, &frag->base, OMPI_ERROR);
OPAL_THREAD_LOCK(&mca_btl_gm_component.gm_lock);
break;
}
}
/**
* Initiate an asynchronous get. No locking.
*
* @param btl (IN) BTL module
* @param endpoint (IN) BTL addressing information
* @param descriptor (IN) Description of the data to be transferred
*
*/
static int mca_btl_gm_get_nl(
mca_btl_base_module_t* btl,
mca_btl_base_endpoint_t* endpoint,
mca_btl_base_descriptor_t* des)
{
#if OMPI_MCA_BTL_GM_HAVE_RDMA_GET
/* Non-locking get: per the contract above, the caller must already hold
 * mca_btl_gm_component.gm_lock (or be running unthreaded). */
mca_btl_gm_module_t* gm_btl = (mca_btl_gm_module_t*) btl;
mca_btl_gm_frag_t* frag = (mca_btl_gm_frag_t*) des;
/* stash addressing info so the completion/retry callback can
 * re-issue this operation */
frag->btl = gm_btl;
frag->endpoint = endpoint;
frag->type = MCA_BTL_GM_GET;
/* queue the descriptor if there are no send tokens
 * (NOTE: the macro returns from this function in that case;
 * it relies on the local being named gm_btl) */
MCA_BTL_GM_ACQUIRE_TOKEN_NL(gm_btl, frag);
/* post the get descriptor: read remote src segment into local dst */
gm_get(gm_btl->port,
des->des_dst->seg_addr.lval,
des->des_src->seg_addr.pval,
des->des_src->seg_len,
GM_LOW_PRIORITY,
endpoint->endpoint_addr.node_id,
endpoint->endpoint_addr.port_id,
mca_btl_gm_get_callback,
frag);
return OMPI_SUCCESS;
#else
/* RDMA get not available in this GM build */
return OMPI_ERR_NOT_IMPLEMENTED;
#endif
}
/**
* Initiate an asynchronous get.
@ -734,6 +926,7 @@ int mca_btl_gm_get(
frag->type = MCA_BTL_GM_GET;
/* queue the descriptor if there are no send tokens */
OPAL_THREAD_LOCK(&mca_btl_gm_component.gm_lock);
MCA_BTL_GM_ACQUIRE_TOKEN(gm_btl, frag);
/* post get put descriptor */
@ -746,6 +939,7 @@ int mca_btl_gm_get(
endpoint->endpoint_addr.port_id,
mca_btl_gm_get_callback,
frag);
OPAL_THREAD_UNLOCK(&mca_btl_gm_component.gm_lock);
return OMPI_SUCCESS;
#else
return OMPI_ERR_NOT_IMPLEMENTED;
@ -782,7 +976,6 @@ int mca_btl_gm_finalize(struct mca_btl_base_module_t* btl)
}
#endif
OBJ_DESTRUCT(&gm_btl->gm_lock);
OBJ_DESTRUCT(&gm_btl->gm_frag_eager);
OBJ_DESTRUCT(&gm_btl->gm_frag_max);
OBJ_DESTRUCT(&gm_btl->gm_frag_user);

Просмотреть файл

@ -107,7 +107,10 @@ struct mca_btl_gm_module_t {
opal_list_t gm_pending; /**< list of pending send descriptors */
opal_list_t gm_repost; /**< list of pending fragments */
opal_list_t gm_mru_reg; /**< list of most recently used registrations */
opal_mutex_t gm_lock;
#if OMPI_ENABLE_PROGRESS_THREADS
opal_thread_t gm_thread;
#endif
};
typedef struct mca_btl_gm_module_t mca_btl_gm_module_t;
extern mca_btl_gm_module_t mca_btl_gm_module;
@ -318,20 +321,31 @@ extern mca_btl_base_descriptor_t* mca_btl_gm_prepare_dst(
* Acquire a send token - queue the fragment if none available
*/
/**
 * Acquire a send token without taking any lock (caller already holds
 * mca_btl_gm_component.gm_lock, or the build is unthreaded).  If no
 * token is available the fragment is queued on the pending list, the
 * token count is restored, and the macro RETURNS from the calling
 * function with OMPI_SUCCESS (hidden control flow - callers beware).
 *
 * Fix: use the macro parameters (parenthesized) instead of hard-coding
 * the caller's local variable name gm_btl, and drop the stray trailing
 * backslash after while (0) that continued the macro onto the next line.
 */
#define MCA_BTL_GM_ACQUIRE_TOKEN_NL(btl, frag)                                \
do {                                                                          \
    /* queue the descriptor if there are no send tokens */                    \
    if(OPAL_THREAD_ADD32(&(btl)->gm_num_send_tokens, -1) < 0) {               \
        opal_list_append(&(btl)->gm_pending, (opal_list_item_t*)(frag));      \
        OPAL_THREAD_ADD32(&(btl)->gm_num_send_tokens, 1);                     \
        return OMPI_SUCCESS;                                                  \
    }                                                                         \
} while (0)
/**
 * Acquire a send token from within a locked public entry point
 * (mca_btl_gm_component.gm_lock held).  If no token is available the
 * fragment is queued, the token count is restored, the component lock
 * is RELEASED, and the macro RETURNS from the calling function with
 * OMPI_SUCCESS (hidden control flow - callers beware).
 *
 * Fix: use the macro parameters (parenthesized) instead of hard-coding
 * gm_btl, drop the stray trailing backslash after while (0), and remove
 * the stale OPAL_THREAD_LOCK/UNLOCK(&gm_btl->gm_lock) pair left over in
 * this span - the per-module gm_lock member is deleted by this change.
 */
#define MCA_BTL_GM_ACQUIRE_TOKEN(btl, frag)                                   \
do {                                                                          \
    /* queue the descriptor if there are no send tokens */                    \
    if(OPAL_THREAD_ADD32(&(btl)->gm_num_send_tokens, -1) < 0) {               \
        opal_list_append(&(btl)->gm_pending, (opal_list_item_t*)(frag));      \
        OPAL_THREAD_ADD32(&(btl)->gm_num_send_tokens, 1);                     \
        OPAL_THREAD_UNLOCK(&mca_btl_gm_component.gm_lock);                    \
        return OMPI_SUCCESS;                                                  \
    }                                                                         \
} while (0)
/**
* Return send token and dequeue and pending fragments
* mca_btl_gm_component.gm_lock is already held.
*/
#define MCA_BTL_GM_RETURN_TOKEN(btl) \
@ -339,19 +353,17 @@ do {
OPAL_THREAD_ADD32( &btl->gm_num_send_tokens, 1 ); \
if(opal_list_get_size(&btl->gm_pending)) { \
mca_btl_gm_frag_t* frag; \
OPAL_THREAD_LOCK(&btl->gm_lock); \
frag = (mca_btl_gm_frag_t*)opal_list_remove_first(&btl->gm_pending); \
OPAL_THREAD_UNLOCK(&btl->gm_lock); \
if(NULL != frag) { \
switch(frag->type) { \
case MCA_BTL_GM_SEND: \
mca_btl_gm_send(&btl->super, frag->endpoint, &frag->base, frag->hdr->tag); \
mca_btl_gm_send_nl(&btl->super, frag->endpoint, &frag->base, frag->hdr->tag); \
break; \
case MCA_BTL_GM_PUT: \
mca_btl_gm_put(&btl->super, frag->endpoint, &frag->base); \
mca_btl_gm_put_nl(&btl->super, frag->endpoint, &frag->base); \
break; \
case MCA_BTL_GM_GET: \
mca_btl_gm_get(&btl->super, frag->endpoint, &frag->base); \
mca_btl_gm_get_nl(&btl->super, frag->endpoint, &frag->base); \
break; \
} \
} \

Просмотреть файл

@ -25,6 +25,7 @@
#include "opal/util/output.h"
#include "ompi/mca/pml/pml.h"
#include "ompi/mca/btl/btl.h"
#include "ompi/request/request.h"
#include "opal/mca/base/mca_base_param.h"
#include "orte/mca/errmgr/errmgr.h"
@ -40,6 +41,12 @@
#include "orte/util/proc_info.h"
#include "ompi/mca/pml/base/pml_base_module_exchange.h"
#if OMPI_ENABLE_PROGRESS_THREADS
static void* mca_btl_gm_progress_thread( opal_object_t* arg );
#endif
mca_btl_gm_component_t mca_btl_gm_component = {
{
/* First, the mca_base_component_t struct containing meta information
@ -214,7 +221,7 @@ mca_btl_gm_module_init (mca_btl_gm_module_t * btl)
OBJ_CONSTRUCT(&btl->gm_pending, opal_list_t);
OBJ_CONSTRUCT(&btl->gm_repost, opal_list_t);
OBJ_CONSTRUCT(&btl->gm_mru_reg, opal_list_t);
OBJ_CONSTRUCT(&btl->gm_lock, opal_mutex_t);
OBJ_CONSTRUCT(&btl->gm_thread, opal_thread_t);
/* query nic tokens */
btl->gm_num_send_tokens = gm_num_send_tokens (btl->port);
@ -304,6 +311,16 @@ mca_btl_gm_module_init (mca_btl_gm_module_t * btl)
opal_output (0, "[%s:%d] unable to allow remote memory access", __FILE__, __LINE__);
return OMPI_ERROR;
}
#if OMPI_ENABLE_PROGRESS_THREADS
/* start progress thread */
btl->gm_thread.t_run = mca_btl_gm_progress_thread;
btl->gm_thread.t_arg = btl;
if(OPAL_SUCCESS != (rc = opal_thread_start(&btl->gm_thread))) {
opal_output (0, "[%s:%d] unable to create progress thread, retval=%d", __FILE__, __LINE__, rc);
return rc;
}
#endif
return OMPI_SUCCESS;
}
@ -324,7 +341,7 @@ static int mca_btl_gm_discover( void )
char global_id[GM_MAX_HOST_NAME_LEN];
#endif /* GM_API_VERSION > 0x200 */
int rc;
for( board_no = 0; board_no < mca_btl_gm_component.gm_max_boards; board_no++ ) {
mca_btl_gm_module_t *btl;
@ -446,11 +463,14 @@ mca_btl_gm_component_init (int *num_btl_modules,
mca_btl_base_module_t **btls;
*num_btl_modules = 0;
OPAL_THREAD_LOCK(&mca_btl_gm_component.gm_lock);
/* try to initialize GM */
if( GM_SUCCESS != gm_init() ) {
opal_output( 0, "[%s:%d] error in initializing the gm library\n", __FILE__, __LINE__ );
mca_btl_gm_component.gm_num_btls = 0;
mca_btl_gm_modex_send();
OPAL_THREAD_UNLOCK(&mca_btl_gm_component.gm_lock);
return NULL;
}
@ -458,6 +478,7 @@ mca_btl_gm_component_init (int *num_btl_modules,
mca_btl_gm_component.gm_btls = malloc( mca_btl_gm_component.gm_max_btls * sizeof (mca_btl_gm_module_t *));
if (NULL == mca_btl_gm_component.gm_btls) {
opal_output( 0, "[%s:%d] out of resources.", __FILE__, __LINE__ );
OPAL_THREAD_UNLOCK(&mca_btl_gm_component.gm_lock);
return NULL;
}
@ -466,17 +487,20 @@ mca_btl_gm_component_init (int *num_btl_modules,
mca_btl_base_error_no_nics("Myrinet/GM", "NIC");
mca_btl_gm_component.gm_num_btls = 0;
mca_btl_gm_modex_send();
OPAL_THREAD_UNLOCK(&mca_btl_gm_component.gm_lock);
return NULL;
}
if (mca_btl_gm_component.gm_num_btls == 0) {
mca_btl_base_error_no_nics("Myrinet/GM", "NIC");
mca_btl_gm_component.gm_num_btls = 0;
mca_btl_gm_modex_send();
OPAL_THREAD_UNLOCK(&mca_btl_gm_component.gm_lock);
return NULL;
}
/* publish GM parameters with the MCA framework */
if (OMPI_SUCCESS != mca_btl_gm_modex_send()) {
OPAL_THREAD_UNLOCK(&mca_btl_gm_component.gm_lock);
return NULL;
}
@ -484,12 +508,14 @@ mca_btl_gm_component_init (int *num_btl_modules,
btls = (mca_btl_base_module_t**) malloc (
mca_btl_gm_component.gm_num_btls * sizeof(mca_btl_base_module_t *));
if (NULL == btls) {
OPAL_THREAD_UNLOCK(&mca_btl_gm_component.gm_lock);
return NULL;
}
memcpy(btls, mca_btl_gm_component.gm_btls,
mca_btl_gm_component.gm_num_btls * sizeof(mca_btl_gm_module_t *));
*num_btl_modules = mca_btl_gm_component.gm_num_btls;
OPAL_THREAD_UNLOCK(&mca_btl_gm_component.gm_lock);
return btls;
}
@ -511,12 +537,10 @@ int mca_btl_gm_component_progress()
return OMPI_SUCCESS;
}
OPAL_THREAD_LOCK(&mca_btl_gm_component.gm_lock);
for( i = 0; i < mca_btl_gm_component.gm_num_btls; ) {
mca_btl_gm_module_t* btl = mca_btl_gm_component.gm_btls[i];
gm_recv_event_t* event = gm_receive(btl->port);
unsigned char* buffer = (unsigned char*)gm_ntohp(event->recv.buffer);
mca_btl_gm_frag_t* frag = (mca_btl_gm_frag_t*)(buffer - sizeof(mca_btl_gm_frag_t));
mca_btl_base_header_t* hdr;
/* If there are no receive events just skip the function call */
switch(gm_ntohc(event->recv.type)) {
@ -525,12 +549,18 @@ int mca_btl_gm_component_progress()
case GM_FAST_HIGH_RECV_EVENT:
case GM_FAST_HIGH_PEER_RECV_EVENT:
{
unsigned char* buffer = (unsigned char*)gm_ntohp(event->recv.buffer);
mca_btl_gm_frag_t* frag = (mca_btl_gm_frag_t*)(buffer - sizeof(mca_btl_gm_frag_t));
mca_btl_base_header_t* hdr = (mca_btl_base_header_t *)gm_ntohp(event->recv.message);
mca_btl_base_recv_reg_t* reg;
hdr = (mca_btl_base_header_t *)gm_ntohp(event->recv.message);
frag->segment.seg_addr.pval = (hdr+1);
frag->segment.seg_len = gm_ntohl(event->recv.length) - sizeof(mca_btl_base_header_t);
reg = &btl->gm_reg[hdr->tag];
OPAL_THREAD_UNLOCK(&mca_btl_gm_component.gm_lock);
reg->cbfunc(&btl->super, hdr->tag, &frag->base, reg->cbdata);
OPAL_THREAD_LOCK(&mca_btl_gm_component.gm_lock);
MCA_BTL_GM_FRAG_POST(btl,frag);
count++;
break;
@ -540,12 +570,18 @@ int mca_btl_gm_component_progress()
case GM_HIGH_RECV_EVENT:
case GM_HIGH_PEER_RECV_EVENT:
{
unsigned char* buffer = (unsigned char*)gm_ntohp(event->recv.buffer);
mca_btl_gm_frag_t* frag = (mca_btl_gm_frag_t*)(buffer - sizeof(mca_btl_gm_frag_t));
mca_btl_base_header_t* hdr = (mca_btl_base_header_t*)buffer;
mca_btl_base_recv_reg_t* reg;
hdr = (mca_btl_base_header_t*)buffer;
frag->segment.seg_addr.pval = (hdr+1);
frag->segment.seg_len = gm_ntohl(event->recv.length) - sizeof(mca_btl_base_header_t);
reg = &btl->gm_reg[hdr->tag];
OPAL_THREAD_UNLOCK(&mca_btl_gm_component.gm_lock);
reg->cbfunc(&btl->super, hdr->tag, &frag->base, reg->cbdata);
OPAL_THREAD_LOCK(&mca_btl_gm_component.gm_lock);
MCA_BTL_GM_FRAG_POST(btl,frag);
count++;
break;
@ -558,7 +594,90 @@ int mca_btl_gm_component_progress()
break;
}
}
OPAL_THREAD_UNLOCK(&mca_btl_gm_component.gm_lock);
OPAL_THREAD_ADD32(&inprogress, -1);
return count;
}
#if OMPI_ENABLE_PROGRESS_THREADS
/**
 * Body of the per-module GM progress thread.  Blocks in GM waiting for
 * receive events and dispatches them, backing off whenever application
 * threads are inside the progress engine (opal_progress_threads()).
 * Holds mca_btl_gm_component.gm_lock around all GM API calls and drops
 * it around user callbacks.  Runs forever; terminated by cancellation.
 */
static void* mca_btl_gm_progress_thread( opal_object_t* arg )
{
opal_thread_t* thread = (opal_thread_t*)arg;
mca_btl_gm_module_t* btl = thread->t_arg;
/* run with asynchronous cancellation enabled so finalize can
 * terminate this thread at any point */
pthread_setcancelstate( PTHREAD_CANCEL_ENABLE, NULL );
pthread_setcanceltype( PTHREAD_CANCEL_ASYNCHRONOUS, NULL );
OPAL_THREAD_LOCK(&mca_btl_gm_component.gm_lock);
while(1) {
gm_recv_event_t* event;
/* dont process events while the app is in the library */
while(opal_progress_threads()) {
OPAL_THREAD_UNLOCK(&mca_btl_gm_component.gm_lock);
while(opal_progress_threads())
sched_yield();
usleep(100); /* give app a chance to re-enter library */
OPAL_THREAD_LOCK(&mca_btl_gm_component.gm_lock);
}
/* otherwise processes any pending events */
event = gm_blocking_receive_no_spin(btl->port);
switch(gm_ntohc(event->recv.type)) {
/* "fast" events: payload delivered in event->recv.message */
case GM_FAST_RECV_EVENT:
case GM_FAST_PEER_RECV_EVENT:
case GM_FAST_HIGH_RECV_EVENT:
case GM_FAST_HIGH_PEER_RECV_EVENT:
{
/* the fragment descriptor is stored immediately before the
 * registered receive buffer */
unsigned char* buffer = (unsigned char*)gm_ntohp(event->recv.buffer);
mca_btl_gm_frag_t* frag = (mca_btl_gm_frag_t*)(buffer - sizeof(mca_btl_gm_frag_t));
mca_btl_base_header_t* hdr = (mca_btl_base_header_t *)gm_ntohp(event->recv.message);
mca_btl_base_recv_reg_t* reg;
frag->segment.seg_addr.pval = (hdr+1);
frag->segment.seg_len = gm_ntohl(event->recv.length) - sizeof(mca_btl_base_header_t);
reg = &btl->gm_reg[hdr->tag];
/* drop the lock across the user callback, which may re-enter
 * the BTL and issue further (locked) GM operations */
OPAL_THREAD_UNLOCK(&mca_btl_gm_component.gm_lock);
reg->cbfunc(&btl->super, hdr->tag, &frag->base, reg->cbdata);
OPAL_THREAD_LOCK(&mca_btl_gm_component.gm_lock);
MCA_BTL_GM_FRAG_POST(btl,frag);
break;
}
/* normal events: payload is in the posted receive buffer itself */
case GM_RECV_EVENT:
case GM_PEER_RECV_EVENT:
case GM_HIGH_RECV_EVENT:
case GM_HIGH_PEER_RECV_EVENT:
{
unsigned char* buffer = (unsigned char*)gm_ntohp(event->recv.buffer);
mca_btl_gm_frag_t* frag = (mca_btl_gm_frag_t*)(buffer - sizeof(mca_btl_gm_frag_t));
mca_btl_base_header_t* hdr = (mca_btl_base_header_t*)buffer;
mca_btl_base_recv_reg_t* reg;
frag->segment.seg_addr.pval = (hdr+1);
frag->segment.seg_len = gm_ntohl(event->recv.length) - sizeof(mca_btl_base_header_t);
reg = &btl->gm_reg[hdr->tag];
/* drop the lock across the user callback (see above) */
OPAL_THREAD_UNLOCK(&mca_btl_gm_component.gm_lock);
reg->cbfunc(&btl->super, hdr->tag, &frag->base, reg->cbdata);
OPAL_THREAD_LOCK(&mca_btl_gm_component.gm_lock);
MCA_BTL_GM_FRAG_POST(btl,frag);
break;
}
case _GM_SLEEP_EVENT:
/* NOTE(review): lock is dropped here around gm_unknown but held
 * in the default branch below - confirm this asymmetry is
 * intentional (sleep events may block inside gm_unknown) */
OPAL_THREAD_UNLOCK(&mca_btl_gm_component.gm_lock);
gm_unknown(btl->port, event);
OPAL_THREAD_LOCK(&mca_btl_gm_component.gm_lock);
break;
default:
gm_unknown(btl->port, event);
break;
}
}
/* not reached: the loop above never exits; the thread terminates
 * via pthread cancellation */
OPAL_THREAD_UNLOCK(&mca_btl_gm_component.gm_lock);
return PTHREAD_CANCELED;
}
#endif

Просмотреть файл

@ -115,19 +115,17 @@ OBJ_CLASS_DECLARATION(mca_btl_gm_frag_user_t);
(opal_list_item_t*)(frag)); \
}
/* called with mca_btl_gm_component.gm_lock held */
/**
 * Return a receive fragment to GM.  Called with
 * mca_btl_gm_component.gm_lock held.  If the repost list is below its
 * threshold the fragment is deferred onto it; otherwise this fragment
 * and everything queued on the repost list are handed back to GM via
 * gm_provide_receive_buffer().
 *
 * Fix: this span interleaved the removed OPAL_THREAD_LOCK/UNLOCK
 * (&btl->gm_lock) lines with the kept body (the per-module gm_lock is
 * deleted by this change); reconstructed the coherent macro, with the
 * macro arguments parenthesized.
 */
#define MCA_BTL_GM_FRAG_POST(btl,frag)                                        \
do {                                                                          \
    if(opal_list_get_size(&(btl)->gm_repost) < (size_t)(btl)->gm_num_repost) { \
        /* below threshold: defer the repost */                               \
        opal_list_append(&(btl)->gm_repost, (opal_list_item_t*)(frag));       \
    } else {                                                                  \
        /* flush this fragment and the whole deferred list back to GM */      \
        do {                                                                  \
            gm_provide_receive_buffer((btl)->port, (frag)->hdr, (frag)->size, (frag)->priority); \
        } while (NULL != (frag = (mca_btl_gm_frag_t*)opal_list_remove_first(&(btl)->gm_repost))); \
    }                                                                         \
} while(0)

Просмотреть файл

@ -101,8 +101,8 @@ static mca_btl_gm_proc_t* mca_btl_gm_proc_lookup_ompi(ompi_proc_t* ompi_proc)
mca_btl_gm_proc_t* mca_btl_gm_proc_create(ompi_proc_t* ompi_proc)
{
mca_btl_gm_proc_t* gm_proc = NULL;
size_t size;
int rc, i;
size_t i, size;
int rc;
/* Check if we have already created a GM proc
* structure for this ompi process */

Просмотреть файл

@ -88,6 +88,11 @@ int mca_pml_ob1_recv(void *addr,
MCA_PML_OB1_RECV_REQUEST_START(recvreq);
if (recvreq->req_recv.req_base.req_ompi.req_complete == false) {
#if OMPI_ENABLE_PROGRESS_THREADS
if(opal_progress_spin(&recvreq->req_recv.req_base.req_ompi.req_complete)) {
goto finished;
}
#endif
/* give up and sleep until completion */
if (opal_using_threads()) {
opal_mutex_lock(&ompi_request_lock);
@ -104,6 +109,10 @@ int mca_pml_ob1_recv(void *addr,
}
}
#if OMPI_ENABLE_PROGRESS_THREADS
finished:
#endif
if (NULL != status) { /* return status */
*status = recvreq->req_recv.req_base.req_ompi.req_status;
}

Просмотреть файл

@ -108,6 +108,15 @@ int mca_pml_ob1_send(void *buf,
}
if (sendreq->req_send.req_base.req_ompi.req_complete == false) {
#if OMPI_ENABLE_PROGRESS_THREADS
if(opal_progress_spin(&sendreq->req_send.req_base.req_ompi.req_complete)) {
opal_mutex_lock(&ompi_request_lock);
MCA_PML_OB1_SEND_REQUEST_FREE( sendreq );
opal_mutex_unlock(&ompi_request_lock);
return OMPI_SUCCESS;
}
#endif
/* give up and sleep until completion */
if (opal_using_threads()) {
opal_mutex_lock(&ompi_request_lock);
@ -115,17 +124,16 @@ int mca_pml_ob1_send(void *buf,
while (sendreq->req_send.req_base.req_ompi.req_complete == false)
opal_condition_wait(&ompi_request_cond, &ompi_request_lock);
ompi_request_waiting--;
MCA_PML_OB1_SEND_REQUEST_FREE( sendreq );
opal_mutex_unlock(&ompi_request_lock);
} else {
ompi_request_waiting++;
while (sendreq->req_send.req_base.req_ompi.req_complete == false)
opal_condition_wait(&ompi_request_cond, &ompi_request_lock);
ompi_request_waiting--;
MCA_PML_OB1_SEND_REQUEST_FREE( sendreq );
}
}
/* return request to pool */
MCA_PML_OB1_SEND_REQUEST_FREE( sendreq );
return OMPI_SUCCESS;
}

Просмотреть файл

@ -39,6 +39,7 @@
static int mca_pml_ob1_send_request_fini(struct ompi_request_t** request)
{
mca_pml_ob1_send_request_t* sendreq = *(mca_pml_ob1_send_request_t**)(request);
OPAL_THREAD_LOCK(&ompi_request_lock);
if(sendreq->req_send.req_base.req_persistent) {
if(sendreq->req_send.req_base.req_free_called) {
MCA_PML_OB1_SEND_REQUEST_FREE(sendreq);
@ -60,12 +61,15 @@ static int mca_pml_ob1_send_request_fini(struct ompi_request_t** request)
MCA_PML_OB1_SEND_REQUEST_FREE(sendreq);
*request = MPI_REQUEST_NULL;
}
OPAL_THREAD_UNLOCK(&ompi_request_lock);
return OMPI_SUCCESS;
}
static int mca_pml_ob1_send_request_free(struct ompi_request_t** request)
{
OPAL_THREAD_LOCK(&ompi_request_lock);
MCA_PML_OB1_SEND_REQUEST_FREE( *(mca_pml_ob1_send_request_t**)request );
OPAL_THREAD_UNLOCK(&ompi_request_lock);
*request = MPI_REQUEST_NULL;
return OMPI_SUCCESS;
}

Просмотреть файл

@ -20,7 +20,43 @@
#include "ompi/constants.h"
#include "ompi/request/request.h"
int ompi_request_poll_iterations = 20000;
/**
 * Wait (blocking) for a single request to complete, copy out its
 * status, and return the request to the pool via req_fini.
 *
 * @param req_ptr (IN/OUT) request to wait on; updated by req_fini
 * @param status  (OUT)    completion status, unless MPI_STATUS_IGNORE
 * @return result of req_fini on the completed request
 */
int ompi_request_wait(
    ompi_request_t ** req_ptr,
    ompi_status_public_t * status)
{
    ompi_request_t *req = *req_ptr;
    bool need_block = (false == req->req_complete);

#if OMPI_ENABLE_PROGRESS_THREADS
    /* spin-poll first; only fall back to a blocking wait if the
     * spin budget runs out before completion */
    if (need_block && opal_progress_spin(&req->req_complete)) {
        need_block = false;
    }
#endif
    if (need_block) {
        /* give up and sleep until completion */
        OPAL_THREAD_LOCK(&ompi_request_lock);
        ompi_request_waiting++;
        while (false == req->req_complete) {
            opal_condition_wait(&ompi_request_cond, &ompi_request_lock);
        }
        ompi_request_waiting--;
        OPAL_THREAD_UNLOCK(&ompi_request_lock);
    }

    /* return status */
    if (MPI_STATUS_IGNORE != status) {
        *status = req->req_status;
    }
    /* return request to pool */
    return req->req_fini(req_ptr);
}
int ompi_request_wait_any(
size_t count,
@ -39,8 +75,8 @@ int ompi_request_wait_any(
#if OMPI_ENABLE_PROGRESS_THREADS
/* poll for completion */
opal_atomic_mb();
for (c = 0; completed < 0 && c < ompi_request_poll_iterations; c++) {
OPAL_THREAD_ADD32(&opal_progress_thread_count,1);
for (c = 0; completed < 0 && c < opal_progress_spin_count; c++) {
rptr = requests;
num_requests_null_inactive = 0;
for (i = 0; i < count; i++, rptr++) {
@ -58,9 +94,13 @@ int ompi_request_wait_any(
goto finished;
}
}
if( num_requests_null_inactive == count )
if( num_requests_null_inactive == count ) {
OPAL_THREAD_ADD32(&opal_progress_thread_count,-1);
goto finished;
}
opal_progress();
}
OPAL_THREAD_ADD32(&opal_progress_thread_count,-1);
#endif
/* give up and sleep until completion */

Просмотреть файл

@ -140,6 +140,7 @@ typedef struct ompi_request_t ompi_request_t;
OMPI_DECLSPEC extern ompi_pointer_array_t ompi_request_f_to_c_table;
OMPI_DECLSPEC extern size_t ompi_request_waiting;
OMPI_DECLSPEC extern size_t ompi_request_completed;
OMPI_DECLSPEC extern int32_t ompi_request_poll;
OMPI_DECLSPEC extern opal_mutex_t ompi_request_lock;
OMPI_DECLSPEC extern opal_condition_t ompi_request_cond;
OMPI_DECLSPEC extern ompi_request_t ompi_request_null;
@ -286,31 +287,9 @@ OMPI_DECLSPEC int ompi_request_test_all(
*
*/
static inline int ompi_request_wait(
int ompi_request_wait(
ompi_request_t ** req_ptr,
ompi_status_public_t * status)
{
ompi_request_t *req = *req_ptr;
if(req->req_complete == false) {
/* give up and sleep until completion */
OPAL_THREAD_LOCK(&ompi_request_lock);
ompi_request_waiting++;
while (req->req_complete == false) {
opal_condition_wait(&ompi_request_cond, &ompi_request_lock);
}
ompi_request_waiting--;
OPAL_THREAD_UNLOCK(&ompi_request_lock);
}
/* return status */
if (MPI_STATUS_IGNORE != status) {
*status = req->req_status;
}
/* return request to pool */
return req->req_fini(req_ptr);
}
ompi_status_public_t * status);
/**
* Wait (blocking-mode) for one of N requests to complete.

Просмотреть файл

@ -40,6 +40,11 @@ static const opal_timer_t opal_progress_default_tick_rate = 10000; /* 10ms */
static const int opal_progress_default_tick_rate = 10000; /* 10k calls to opal_progress */
#endif
volatile int32_t opal_progress_thread_count = 0;
int opal_progress_spin_count = 10000;
/*
* Local variables
*/

Просмотреть файл

@ -27,6 +27,7 @@
#if defined(c_plusplus) || defined(__cplusplus)
extern "C" {
#endif
#include "opal/threads/mutex.h"
/**
* Initialize the progress engine
@ -89,7 +90,6 @@ OMPI_DECLSPEC extern void opal_progress(void);
typedef int (*opal_progress_callback_t)(void);
/**
* Register an event to be progressed
*/
@ -112,6 +112,36 @@ OMPI_DECLSPEC int opal_progress_event_increment(void);
*/
OMPI_DECLSPEC int opal_progress_event_decrement(void);
/**
* Progress until flag is true or poll iterations completed
*/
extern volatile int32_t opal_progress_thread_count;
extern int opal_progress_spin_count;
/** True when at least one application thread is currently polling
 *  inside the progress engine (see opal_progress_spin). */
static inline bool opal_progress_threads(void)
{
    return 0 < opal_progress_thread_count;
}
/**
 * Poll the progress engine until *complete becomes true or
 * opal_progress_spin_count iterations have elapsed.  While polling,
 * opal_progress_thread_count is incremented so async progress threads
 * can back off (see opal_progress_threads()).
 *
 * @param complete (IN) completion flag to poll
 * @return true if *complete became true while spinning, false if the
 *         iteration budget was exhausted first.
 */
static inline bool opal_progress_spin(volatile bool* complete)
{
    int32_t c;
    OPAL_THREAD_ADD32(&opal_progress_thread_count,1);
    for (c = 0; c < opal_progress_spin_count; c++) {
        if (true == *complete) {
            OPAL_THREAD_ADD32(&opal_progress_thread_count,-1);
            return true;
        }
        opal_progress();
    }
    OPAL_THREAD_ADD32(&opal_progress_thread_count,-1);
    /* bug fix: the original fell off the end here with no return value -
     * undefined behavior for a non-void function (C99 6.9.1) */
    return false;
}
#if defined(c_plusplus) || defined(__cplusplus)
}
#endif