Mark a send request complete at the PML level only when we are absolutely sure
there is no more work associated with it: no outstanding completions or packets
remain, and send scheduling isn't running in another thread.

This commit was SVN r16013.
parent fe414047bd
commit 0b0f9d14aa
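
In essence, the change funnels every completion decision through one gate,
send_request_pml_complete_check() in pml_ob1_sendreq.h (shown in full in the
diff below). A condensed sketch of that gate, using the names from the diff
itself:

    /* Complete on the PML level only when: no more events are expected
     * (req_state == 0), every byte of the message has been delivered, and
     * no other thread is inside the scheduling loop (req_lock taken here). */
    bool static inline
    send_request_pml_complete_check(mca_pml_ob1_send_request_t *sendreq)
    {
        opal_atomic_rmb();
        if(sendreq->req_state == 0 &&
           sendreq->req_bytes_delivered >= sendreq->req_send.req_bytes_packed &&
           OPAL_THREAD_ADD32(&sendreq->req_lock, 1) == 1) {
            send_request_pml_complete(sendreq);
            return true;
        }
        return false;
    }

Callers that may still have fragments to send fall back to
mca_pml_ob1_send_request_schedule() when the check returns false.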
@@ -134,13 +134,11 @@ void mca_pml_ob1_recv_frag_callback( mca_btl_base_module_t* btl,
                                                      hdr->hdr_ack.hdr_send_offset,
                                                      sendreq->req_send.req_bytes_packed -
                                                      hdr->hdr_ack.hdr_send_offset);
-                if(OPAL_THREAD_ADD32(&sendreq->req_state, 1) == 2 &&
-                   sendreq->req_bytes_delivered >=
-                   sendreq->req_send.req_bytes_packed) {
-                    MCA_PML_OB1_SEND_REQUEST_PML_COMPLETE(sendreq);
-                } else {
+
+                OPAL_THREAD_ADD32(&sendreq->req_state, -1);
+
+                if(send_request_pml_complete_check(sendreq) == false)
                     mca_pml_ob1_send_request_schedule(sendreq);
-                }
 
                 break;
             }
@@ -113,9 +113,7 @@ static int mca_pml_ob1_send_request_free(struct ompi_request_t** request)
                              &(sendreq->req_send.req_base), PERUSE_SEND );
 
     if( true == sendreq->req_send.req_base.req_pml_complete ) {
-        /* don't free request if other thread running schedule */
-        if(OPAL_THREAD_ADD32(&sendreq->req_lock, 1) == 1)
-            MCA_PML_OB1_SEND_REQUEST_RETURN( sendreq );
+        MCA_PML_OB1_SEND_REQUEST_RETURN( sendreq );
     }
 
     OPAL_THREAD_UNLOCK(&ompi_request_lock);
@@ -182,7 +180,7 @@ mca_pml_ob1_match_completion_cache( struct mca_btl_base_module_t* btl,
     MCA_BML_BASE_BTL_DES_RETURN( bml_btl, descriptor );
 
     /* signal request completion */
-    MCA_PML_OB1_SEND_REQUEST_PML_COMPLETE(sendreq);
+    send_request_pml_complete(sendreq);
 
     /* check for pending requests */
     MCA_PML_OB1_PROGRESS_PENDING(bml_btl);
@@ -217,7 +215,7 @@ mca_pml_ob1_match_completion_free( struct mca_btl_base_module_t* btl,
     mca_bml_base_free( bml_btl, descriptor );
 
     /* signal request completion */
-    MCA_PML_OB1_SEND_REQUEST_PML_COMPLETE(sendreq);
+    send_request_pml_complete(sendreq);
 
     /* check for pending requests */
     MCA_PML_OB1_PROGRESS_PENDING(bml_btl);
@@ -258,32 +256,16 @@ mca_pml_ob1_rndv_completion( mca_btl_base_module_t* btl,
                                         sizeof(mca_pml_ob1_rendezvous_hdr_t),
                                         req_bytes_delivered );
 
-    OPAL_THREAD_ADD_SIZE_T(&sendreq->req_bytes_delivered,
-                           req_bytes_delivered);
+    OPAL_THREAD_ADD_SIZE_T(&sendreq->req_bytes_delivered, req_bytes_delivered);
+
     /* return the descriptor */
     mca_bml_base_free(bml_btl, descriptor);
 
     /* advance the request */
-    if(OPAL_THREAD_ADD32(&sendreq->req_state, 1) == 2 &&
-       sendreq->req_bytes_delivered >= sendreq->req_send.req_bytes_packed) {
-        if(!sendreq->req_send.req_base.req_pml_complete){
-            /* We must check that completion hasn't already occured */
-            /* for the self BTL we may choose the RDMA PUT protocol */
-            /* on the send side, in this case we send no eager data */
-            /* if, on the receiver side the data is not contiguous we */
-            /* may choose to use the copy in/out protocol */
-            /* if this occurs, the entire request can be completed in a */
-            /* single call to mca_pml_ob1_recv_request_ack */
-            /* as soon as the last fragment of the copy in/out protocol */
-            /* gets local completion. This doesn't occur in the general */
-            /* case of the copy in/out protocol because when both sender */
-            /* and receiver agree on the copy in/out protoocol we eagerly */
-            /* send data, we don't update the request with this eagerly sent */
-            /* data until here in this function, so completion could not have */
-            /* yet occurred. */
-            MCA_PML_OB1_SEND_REQUEST_PML_COMPLETE(sendreq);
-        }
-    }
+    OPAL_THREAD_ADD32(&sendreq->req_state, -1);
+
+    send_request_pml_complete_check(sendreq);
+
     /* check for pending requests */
     MCA_PML_OB1_PROGRESS_PENDING(bml_btl);
 }
@@ -306,10 +288,9 @@ mca_pml_ob1_rget_completion( mca_btl_base_module_t* btl,
     /* count bytes of user data actually delivered and check for request completion */
     MCA_PML_OB1_COMPUTE_SEGMENT_LENGTH( des->des_src, des->des_src_cnt,
                                         0, req_bytes_delivered );
-    if( OPAL_THREAD_ADD_SIZE_T( &sendreq->req_bytes_delivered, req_bytes_delivered )
-        == sendreq->req_send.req_bytes_packed ) {
-        MCA_PML_OB1_SEND_REQUEST_PML_COMPLETE(sendreq);
-    }
+    OPAL_THREAD_ADD_SIZE_T(&sendreq->req_bytes_delivered, req_bytes_delivered);
+
+    send_request_pml_complete_check(sendreq);
 
     /* release resources */
     btl->btl_free(btl,des);
@@ -368,12 +349,10 @@ mca_pml_ob1_frag_completion( mca_btl_base_module_t* btl,
     /* return the descriptor */
     mca_bml_base_free(bml_btl, descriptor);
 
-    if(OPAL_THREAD_ADD_SIZE_T(&sendreq->req_bytes_delivered,
-                              req_bytes_delivered) == sendreq->req_send.req_bytes_packed) {
-        MCA_PML_OB1_SEND_REQUEST_PML_COMPLETE(sendreq);
-    } else {
+    OPAL_THREAD_ADD_SIZE_T(&sendreq->req_bytes_delivered, req_bytes_delivered);
+
+    if(send_request_pml_complete_check(sendreq) == false)
         mca_pml_ob1_send_request_schedule(sendreq);
-    }
 
     /* check for pending requests */
     MCA_PML_OB1_PROGRESS_PENDING(bml_btl);
@@ -473,6 +452,10 @@ int mca_pml_ob1_send_request_start_buffered(
                                      MPI_BYTE,
                                      sendreq->req_send.req_bytes_packed,
                                      sendreq->req_send.req_addr );
+
+    /* wait for ack and completion */
+    sendreq->req_state = 2;
+
     /* request is complete at mpi level */
     OPAL_THREAD_LOCK(&ompi_request_lock);
     MCA_PML_OB1_SEND_REQUEST_MPI_COMPLETE(sendreq);
@@ -790,6 +773,9 @@ int mca_pml_ob1_send_request_start_rdma(
 
         /* first fragment of a long message */
         des->des_cbfunc = mca_pml_ob1_rndv_completion;
+
+        /* wait for ack and completion */
+        sendreq->req_state = 2;
     }
 
     des->des_flags |= MCA_BTL_DES_FLAGS_PRIORITY;
@@ -870,6 +856,9 @@ int mca_pml_ob1_send_request_start_rndv( mca_pml_ob1_send_request_t* sendreq,
     des->des_cbdata = sendreq;
     des->des_cbfunc = mca_pml_ob1_rndv_completion;
 
+    /* wait for ack and completion */
+    sendreq->req_state = 2;
+
     /* send */
     rc = mca_bml_base_send(bml_btl, des, MCA_BTL_TAG_PML);
     if( OPAL_UNLIKELY(OMPI_SUCCESS != rc) ) {
@@ -926,7 +915,6 @@ void mca_pml_ob1_send_request_copy_in_out( mca_pml_ob1_send_request_t *sendreq,
 int mca_pml_ob1_send_request_schedule_exclusive(
         mca_pml_ob1_send_request_t* sendreq)
 {
-
     do {
         size_t prev_bytes_remaining = 0;
         mca_pml_ob1_send_range_t *range = NULL;
@@ -1060,16 +1048,9 @@ int mca_pml_ob1_send_request_schedule_exclusive(
             }
             mca_bml.bml_progress();
         }
-        OPAL_THREAD_LOCK(&ompi_request_lock);
-        if(sendreq->req_send.req_base.req_free_called &&
-           sendreq->req_send.req_base.req_pml_complete) {
-            /* if request already completed and freed put it on a free list */
-            MCA_PML_OB1_SEND_REQUEST_RETURN( sendreq );
-            OPAL_THREAD_UNLOCK(&ompi_request_lock);
-            return MPI_SUCCESS;
-        }
-        OPAL_THREAD_UNLOCK(&ompi_request_lock);
-    } while (OPAL_THREAD_ADD32(&sendreq->req_lock,-1) > 0);
+    } while (OPAL_THREAD_ADD32(&sendreq->req_lock, -1) > 0);
+
+    send_request_pml_complete_check(sendreq);
 
     return OMPI_SUCCESS;
 }
@@ -1103,14 +1084,9 @@ static void mca_pml_ob1_put_completion( mca_btl_base_module_t* btl,
                             des->order, 0);
 
     /* check for request completion */
-    if( OPAL_THREAD_ADD_SIZE_T(&sendreq->req_bytes_delivered, frag->rdma_length)
-        >= sendreq->req_send.req_bytes_packed) {
+    OPAL_THREAD_ADD_SIZE_T(&sendreq->req_bytes_delivered, frag->rdma_length);
 
-        /* if we've got completion on rndv packet */
-        if (sendreq->req_state == 2) {
-            MCA_PML_OB1_SEND_REQUEST_PML_COMPLETE(sendreq);
-        }
-    }
+    send_request_pml_complete_check(sendreq);
 
     MCA_PML_OB1_RDMA_FRAG_RETURN(frag);
     /* return rdma descriptor - do this after queuing the fin message - as
@@ -1209,7 +1185,7 @@ void mca_pml_ob1_send_request_put( mca_pml_ob1_send_request_t* sendreq,
     size_t i, size = 0;
 
     if(hdr->hdr_common.hdr_flags & MCA_PML_OB1_HDR_TYPE_ACK) {
-        OPAL_THREAD_ADD32(&sendreq->req_state, 1);
+        OPAL_THREAD_ADD32(&sendreq->req_state, -1);
     }
 
     MCA_PML_OB1_RDMA_FRAG_ALLOC(frag, rc);
@@ -42,13 +42,8 @@ struct mca_pml_ob1_send_request_t {
     mca_pml_base_send_request_t req_send;
     mca_bml_base_endpoint_t* req_endpoint;
    ompi_ptr_t req_recv;
-#if OMPI_HAVE_THREAD_SUPPORT
-    volatile int32_t req_state;
-    volatile int32_t req_lock;
-#else
     int32_t req_state;
     int32_t req_lock;
-#endif
     bool req_throttle_sends;
     size_t req_pipeline_depth;
     size_t req_bytes_delivered;
@@ -158,46 +153,78 @@ do {
                              &(sendreq->req_send.req_base), PERUSE_SEND); \
 } while(0)
 
+/*
+ * Release resources associated with a request
+ */
+
+#define MCA_PML_OB1_SEND_REQUEST_RETURN(sendreq)                    \
+    do {                                                            \
+        /* Let the base handle the reference counts */              \
+        MCA_PML_BASE_SEND_REQUEST_FINI((&(sendreq)->req_send));     \
+        OMPI_FREE_LIST_RETURN( &mca_pml_base_send_requests,         \
+                               (ompi_free_list_item_t*)sendreq);    \
+    } while(0)
+
+
 /*
  * The PML has completed a send request. Note that this request
  * may have been orphaned by the user or have already completed
  * at the MPI level.
- * This macro will never be called directly from the upper level, as it should
- * only be an internal call to the PML.
+ * This function will never be called directly from the upper level, as it
+ * should only be an internal call to the PML.
  *
  */
+void static inline
+send_request_pml_complete(mca_pml_ob1_send_request_t *sendreq)
+{
+    assert(false == sendreq->req_send.req_base.req_pml_complete);
+
+    if(sendreq->req_send.req_bytes_packed > 0) {
+        PERUSE_TRACE_COMM_EVENT( PERUSE_COMM_REQ_XFER_END,
+                                 &(sendreq->req_send.req_base), PERUSE_SEND);
+    }
+
+    /* return mpool resources */
+    mca_pml_ob1_free_rdma_resources(sendreq);
+
+    if (sendreq->req_send.req_send_mode == MCA_PML_BASE_SEND_BUFFERED &&
+        sendreq->req_send.req_addr != sendreq->req_send.req_base.req_addr) {
+        mca_pml_base_bsend_request_fini((ompi_request_t*)sendreq);
+    }
+
+    OPAL_THREAD_LOCK(&ompi_request_lock);
+    if(false == sendreq->req_send.req_base.req_ompi.req_complete) {
+        /* Should only be called for long messages (maybe synchronous) */
+        MCA_PML_OB1_SEND_REQUEST_MPI_COMPLETE(sendreq);
+    }
+    sendreq->req_send.req_base.req_pml_complete = true;
+
+    if(sendreq->req_send.req_base.req_free_called) {
+        MCA_PML_OB1_SEND_REQUEST_RETURN(sendreq);
+    }
+    OPAL_THREAD_UNLOCK(&ompi_request_lock);
+}
+
+/* returns true if request was completed on PML level */
+bool static inline
+send_request_pml_complete_check(mca_pml_ob1_send_request_t *sendreq)
+{
+    opal_atomic_rmb();
+    /* if no more events are expected for the request and the whole message is
+     * already sent and send fragment scheduling isn't running in another
+     * thread then complete the request on PML level. From now on, if user
+     * called free on this request, the request structure can be reused for
+     * another request or if the request is persistent it can be restarted */
+    if(sendreq->req_state == 0 &&
+       sendreq->req_bytes_delivered >= sendreq->req_send.req_bytes_packed
+       && OPAL_THREAD_ADD32(&sendreq->req_lock, 1) == 1) {
+        send_request_pml_complete(sendreq);
+        return true;
+    }
+
+    return false;
+}
+
-#define MCA_PML_OB1_SEND_REQUEST_PML_COMPLETE(sendreq)                       \
-do {                                                                         \
-    assert( false == sendreq->req_send.req_base.req_pml_complete );          \
-                                                                             \
-    if( sendreq->req_send.req_bytes_packed > 0 ) {                           \
-        PERUSE_TRACE_COMM_EVENT( PERUSE_COMM_REQ_XFER_END,                   \
-                                 &(sendreq->req_send.req_base),              \
-                                 PERUSE_SEND );                              \
-    }                                                                        \
-                                                                             \
-    /* return mpool resources */                                             \
-    mca_pml_ob1_free_rdma_resources(sendreq);                                \
-                                                                             \
-    if (sendreq->req_send.req_send_mode == MCA_PML_BASE_SEND_BUFFERED &&     \
-        sendreq->req_send.req_addr != sendreq->req_send.req_base.req_addr) { \
-        mca_pml_base_bsend_request_fini((ompi_request_t*)sendreq);           \
-    }                                                                        \
-                                                                             \
-    OPAL_THREAD_LOCK(&ompi_request_lock);                                    \
-    if( false == sendreq->req_send.req_base.req_ompi.req_complete ) {        \
-        /* Should only be called for long messages (maybe synchronous) */    \
-        MCA_PML_OB1_SEND_REQUEST_MPI_COMPLETE(sendreq);                      \
-    }                                                                        \
-    sendreq->req_send.req_base.req_pml_complete = true;                      \
-                                                                             \
-    if( sendreq->req_send.req_base.req_free_called ) {                       \
-        /* don't free request if other thread running schedule */            \
-        if(OPAL_THREAD_ADD32(&sendreq->req_lock, 1) == 1)                    \
-            MCA_PML_OB1_SEND_REQUEST_RETURN( sendreq );                      \
-    }                                                                        \
-    OPAL_THREAD_UNLOCK(&ompi_request_lock);                                  \
-} while (0)
 
 /**
  * Schedule additional fragments
@@ -219,18 +246,6 @@ static inline void mca_pml_ob1_send_request_schedule(
         mca_pml_ob1_send_request_schedule_exclusive(sendreq);
 }
 
-/*
- * Release resources associated with a request
- */
-
-#define MCA_PML_OB1_SEND_REQUEST_RETURN(sendreq)                    \
-    do {                                                            \
-        /* Let the base handle the reference counts */              \
-        MCA_PML_BASE_SEND_REQUEST_FINI((&(sendreq)->req_send));     \
-        OMPI_FREE_LIST_RETURN( &mca_pml_base_send_requests,         \
-                               (ompi_free_list_item_t*)sendreq);    \
-    } while(0)
-
 /**
  * Start the specified request
  */
@@ -337,7 +352,7 @@ mca_pml_ob1_send_request_start( mca_pml_ob1_send_request_t* sendreq )
 
     sendreq->req_endpoint = endpoint;
     sendreq->req_state = 0;
-    sendreq->req_lock=0 ;
+    sendreq->req_lock = 0;
     sendreq->req_pipeline_depth = 0;
     sendreq->req_bytes_delivered = 0;
     sendreq->req_pending = MCA_PML_OB1_SEND_PENDING_NONE;