From e968ddfe641fec7e3a350e0cd38c4e581cc314b7 Mon Sep 17 00:00:00 2001
From: Nathan Hjelm <hjelmn@cs.unm.edu>
Date: Thu, 2 Jun 2016 18:22:40 -0600
Subject: [PATCH] start bug fixes (#1729)

* mpi/start: fix bugs in cm and ob1 start functions

There were several problems with the implementation of start in Open
MPI:

 - There are no checks whatsoever on the state of the request(s)
   provided to MPI_Start/MPI_Start_all. It is erroneous to provide an
   active request to either of these calls. Since we are already
   looping over the provided requests there is little overhead in
   verifying that the request can be started.

 - Both ob1 and cm were always throwing away the request on the
   initial call to start and start_all with a particular
   request. Subsequent calls would see that the request was
   pml_complete and reuse it. This introduced a leak as the initial
   request was never freed. Since the only pml request that can
   be mpi complete but not pml complete is a buffered send the
   code to reallocate the request has been moved. To detect that
   a request is indeed mpi complete but not pml complete isend_init
   in both cm and ob1 now marks the new request as pml complete.

 - If a new request was needed the callbacks on the original request
   were not copied over to the new request. This can cause osc/pt2pt
   to hang as the incoming message callback is never called.

Signed-off-by: Nathan Hjelm <hjelmn@lanl.gov>

* osc/pt2pt: add request for gc after starting a new request

Starting a new receive may cause a recursive call into the pt2pt
frag receive function. If this happens and the prior request is
on the garbage collection list it could cause problems. This commit
moves the gc insert until after the new request has been posted.

Signed-off-by: Nathan Hjelm <hjelmn@me.com>
---
 ompi/mca/osc/pt2pt/osc_pt2pt_data_move.c |  11 ++-
 ompi/mca/osc/pt2pt/osc_pt2pt_module.c    |   5 +-
 ompi/mca/pml/cm/pml_cm.h                 |   6 ++
 ompi/mca/pml/cm/pml_cm_start.c           | 108 ++++++++--------------
 ompi/mca/pml/ob1/pml_ob1_isend.c         |   6 ++
 ompi/mca/pml/ob1/pml_ob1_start.c         | 113 +++++++++--------------
 ompi/mpi/c/startall.c                    |   1 +
 7 files changed, 107 insertions(+), 143 deletions(-)

diff --git a/ompi/mca/osc/pt2pt/osc_pt2pt_data_move.c b/ompi/mca/osc/pt2pt/osc_pt2pt_data_move.c
index ec2e9e36ee..67365eb3f5 100644
--- a/ompi/mca/osc/pt2pt/osc_pt2pt_data_move.c
+++ b/ompi/mca/osc/pt2pt/osc_pt2pt_data_move.c
@@ -1661,9 +1661,10 @@ static int ompi_osc_pt2pt_callback (ompi_request_t *request)
 
     osc_pt2pt_gc_clean (module);
 
+    ompi_osc_pt2pt_frag_start_receive (module);
+
     /* put this request on the garbage colletion list */
     osc_pt2pt_gc_add_request (module, request);
-    ompi_osc_pt2pt_frag_start_receive (module);
 
     OPAL_OUTPUT_VERBOSE((50, ompi_osc_base_framework.framework_output,
                          "finished posting receive request"));
@@ -1673,6 +1674,7 @@ static int ompi_osc_pt2pt_callback (ompi_request_t *request)
 
 int ompi_osc_pt2pt_frag_start_receive (ompi_osc_pt2pt_module_t *module)
 {
+    module->frag_request = MPI_REQUEST_NULL;
     return ompi_osc_pt2pt_irecv_w_cb (module->incoming_buffer, mca_osc_pt2pt_component.buffer_size + sizeof (ompi_osc_pt2pt_frag_header_t),
                                      MPI_BYTE, OMPI_ANY_SOURCE, OSC_PT2PT_FRAG_TAG, module->comm, &module->frag_request,
                                      ompi_osc_pt2pt_callback, module);
@@ -1732,11 +1734,14 @@ int ompi_osc_pt2pt_irecv_w_cb (void *ptr, int count, ompi_datatype_t *datatype,
 
     request->req_complete_cb = cb;
     request->req_complete_cb_data = ctx;
-    if (request_out) {
+
+    ret = MCA_PML_CALL(start(1, &request));
+    if (request_out && MPI_REQUEST_NULL != request) {
         *request_out = request;
     }
 
-    ret = MCA_PML_CALL(start(1, &request));
+    OPAL_OUTPUT_VERBOSE((50, ompi_osc_base_framework.framework_output,
+                         "osc pt2pt: pml start returned %d. state: %d", ret, request->req_state));
 
     return ret;
 }
diff --git a/ompi/mca/osc/pt2pt/osc_pt2pt_module.c b/ompi/mca/osc/pt2pt/osc_pt2pt_module.c
index 3d8460b7a6..56c0d4bc0e 100644
--- a/ompi/mca/osc/pt2pt/osc_pt2pt_module.c
+++ b/ompi/mca/osc/pt2pt/osc_pt2pt_module.c
@@ -8,7 +8,7 @@
  *                         University of Stuttgart.  All rights reserved.
  * Copyright (c) 2004-2005 The Regents of the University of California.
  *                         All rights reserved.
- * Copyright (c) 2007-2015 Los Alamos National Security, LLC.  All rights
+ * Copyright (c) 2007-2016 Los Alamos National Security, LLC.  All rights
  *                         reserved.
  * Copyright (c) 2012-2013 Sandia National Laboratories.  All rights reserved.
  * Copyright (c) 2015      Research Organization for Information Science
@@ -95,11 +95,12 @@ int ompi_osc_pt2pt_free(ompi_win_t *win)
 
     if (NULL != module->epoch_outgoing_frag_count) free(module->epoch_outgoing_frag_count);
 
-    if (NULL != module->frag_request) {
+    if (NULL != module->frag_request && MPI_REQUEST_NULL != module->frag_request) {
         module->frag_request->req_complete_cb = NULL;
         ompi_request_cancel (module->frag_request);
         ompi_request_free (&module->frag_request);
     }
+
     if (NULL != module->comm) {
         ompi_comm_free(&module->comm);
     }
diff --git a/ompi/mca/pml/cm/pml_cm.h b/ompi/mca/pml/cm/pml_cm.h
index db04c7a77f..ba055c474e 100644
--- a/ompi/mca/pml/cm/pml_cm.h
+++ b/ompi/mca/pml/cm/pml_cm.h
@@ -232,6 +232,12 @@ mca_pml_cm_isend_init(const void* buf,
     MCA_PML_CM_HVY_SEND_REQUEST_INIT(sendreq, ompi_proc, comm, tag, dst,
                                      datatype, sendmode, true, false, buf, count);
 
+    /* Work around a leak in start by marking this request as complete. The
+     * problem occured because we do not have a way to differentiate an
+     * inital request and an incomplete pml request in start. This line
+     * allows us to detect this state. */
+    sendreq->req_send.req_base.req_pml_complete = true;
+
     *request = (ompi_request_t*) sendreq;
 
     return OMPI_SUCCESS;
diff --git a/ompi/mca/pml/cm/pml_cm_start.c b/ompi/mca/pml/cm/pml_cm_start.c
index f07c7cf059..a3084090ea 100644
--- a/ompi/mca/pml/cm/pml_cm_start.c
+++ b/ompi/mca/pml/cm/pml_cm_start.c
@@ -1,3 +1,4 @@
+/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */
 /*
  * Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
  *                         University Research and Technology
@@ -10,6 +11,8 @@
  * Copyright (c) 2004-2006 The Regents of the University of California.
  *                         All rights reserved.
  * Copyright (c) 2006      Cisco Systems, Inc.  All rights reserved.
+ * Copyright (c) 2016      Los Alamos National Security, LLC. All rights
+ *                         reserved.
  * $COPYRIGHT$
  *
  * Additional copyrights may follow
@@ -32,78 +35,14 @@ int
 mca_pml_cm_start(size_t count, ompi_request_t** requests)
 {
     int rc;
-    size_t i;
-    for (i = 0 ; i < count ; i++) {
-        mca_pml_cm_request_t *pml_request =
-            (mca_pml_cm_request_t*)requests[i];
-        if (OMPI_REQUEST_PML != requests[i]->req_type) {
+
+    for (size_t i = 0 ; i < count ; i++) {
+        mca_pml_cm_request_t *pml_request = (mca_pml_cm_request_t*)requests[i];
+        if (OMPI_REQUEST_PML != requests[i]->req_type || NULL == pml_request) {
             continue;
         }
-        if (NULL == pml_request) {
-            continue;
-        }
-        /* If the persistent request is currebtly active - obtain the
-         * request lock and verify the status is incomplete. if the
-         * pml layer has not completed the request - mark the request
-         * as free called - so that it will be freed when the request
-         * completes - and create a new request.
-         */
-        switch (pml_request->req_ompi.req_state) {
-        case OMPI_REQUEST_INACTIVE:
-            if (pml_request->req_pml_complete == true)
-                break;
 
-        case OMPI_REQUEST_ACTIVE: {
-            /* otherwise fall through */
-            ompi_request_t *request;
-
-            OPAL_THREAD_LOCK(&ompi_request_lock);
-            if (pml_request->req_pml_complete == false) {
-                /* free request after it completes */
-                pml_request->req_free_called = true;
-            } else {
-                /* can reuse the existing request */
-                OPAL_THREAD_UNLOCK(&ompi_request_lock);
-                break;
-            }
-
-            /* allocate a new request */
-            switch (pml_request->req_pml_type) {
-            case MCA_PML_CM_REQUEST_SEND_HEAVY: {
-                mca_pml_cm_hvy_send_request_t* sendreq = (mca_pml_cm_hvy_send_request_t*) pml_request;
-                rc = mca_pml_cm_isend_init( sendreq->req_addr,
-                                            sendreq->req_count,
-                                            sendreq->req_send.req_base.req_datatype,
-                                            sendreq->req_peer,
-                                            sendreq->req_tag,
-                                            sendreq->req_send.req_send_mode,
-                                            sendreq->req_send.req_base.req_comm,
-                                            &request );
-                break;
-            }
-            case MCA_PML_CM_REQUEST_RECV_HEAVY: {
-                mca_pml_cm_hvy_recv_request_t* recvreq = (mca_pml_cm_hvy_recv_request_t*) pml_request;
-                rc = mca_pml_cm_irecv_init( recvreq->req_addr,
-                                            recvreq->req_count,
-                                            recvreq->req_base.req_datatype,
-                                            recvreq->req_peer,
-                                            recvreq->req_tag,
-                                            recvreq->req_base.req_comm,
-                                            &request );
-                break;
-            }
-            default:
-                rc = OMPI_ERR_REQUEST;
-                break;
-            }
-            OPAL_THREAD_UNLOCK(&ompi_request_lock);
-            if(OMPI_SUCCESS != rc)
-                return rc;
-            pml_request = (mca_pml_cm_request_t*)request;
-            requests[i] = request;
-            break;
-        }
-        default:
+        if (OMPI_REQUEST_ACTIVE == pml_request->req_ompi.req_state) {
             return OMPI_ERR_REQUEST;
         }
 
@@ -113,6 +52,37 @@ mca_pml_cm_start(size_t count, ompi_request_t** requests)
             {
                 mca_pml_cm_hvy_send_request_t* sendreq =
                     (mca_pml_cm_hvy_send_request_t*)pml_request;
+                if (!sendreq->req_send.req_base.req_pml_complete) {
+                    ompi_request_t *request;
+
+                    /* buffered sends can be mpi complete and pml incomplete. to support this
+                     * case we need to allocate a new request. */
+                    rc = mca_pml_cm_isend_init (sendreq->req_addr,
+                                                sendreq->req_count,
+                                                sendreq->req_send.req_base.req_datatype,
+                                                sendreq->req_peer,
+                                                sendreq->req_tag,
+                                                sendreq->req_send.req_send_mode,
+                                                sendreq->req_send.req_base.req_comm,
+                                                &request);
+                    if (OPAL_UNLIKELY(OMPI_SUCCESS != rc)) {
+                        return rc;
+                    }
+
+                    /* copy the callback and callback data to the new requests */
+                    request->req_complete_cb = pml_request->req_ompi.req_complete_cb;
+                    request->req_complete_cb_data = pml_request->req_ompi.req_complete_cb_data;
+
+                    /* ensure the old request gets released */
+                    pml_request->req_free_called = true;
+
+                    sendreq = (mca_pml_cm_hvy_send_request_t *) request;
+                    requests[i] = request;
+                }
+
+                /* reset the completion flag */
+                sendreq->req_send.req_base.req_pml_complete = false;
+
                 MCA_PML_CM_HVY_SEND_REQUEST_START(sendreq, rc);
                 if(rc != OMPI_SUCCESS)
                     return rc;
diff --git a/ompi/mca/pml/ob1/pml_ob1_isend.c b/ompi/mca/pml/ob1/pml_ob1_isend.c
index 723e213f13..90edc34e18 100644
--- a/ompi/mca/pml/ob1/pml_ob1_isend.c
+++ b/ompi/mca/pml/ob1/pml_ob1_isend.c
@@ -63,6 +63,12 @@ int mca_pml_ob1_isend_init(const void *buf,
                              &(sendreq)->req_send.req_base,
                              PERUSE_SEND);
 
+    /* Work around a leak in start by marking this request as complete. The
+     * problem occured because we do not have a way to differentiate an
+     * inital request and an incomplete pml request in start. This line
+     * allows us to detect this state. */
+    sendreq->req_send.req_base.req_pml_complete = true;
+
     *request = (ompi_request_t *) sendreq;
     return OMPI_SUCCESS;
 }
diff --git a/ompi/mca/pml/ob1/pml_ob1_start.c b/ompi/mca/pml/ob1/pml_ob1_start.c
index fb8a37bb4a..172be7980b 100644
--- a/ompi/mca/pml/ob1/pml_ob1_start.c
+++ b/ompi/mca/pml/ob1/pml_ob1_start.c
@@ -1,3 +1,4 @@
+/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */
 /*
  * Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
  *                         University Research and Technology
@@ -11,6 +12,8 @@
  *                         All rights reserved.
  * Copyright (c) 2006      Cisco Systems, Inc.  All rights reserved.
  * Copyright (c) 2010      Oracle and/or its affiliates.  All rights reserved.
+ * Copyright (c) 2016      Los Alamos National Security, LLC. All rights
+ *                         reserved.
  * $COPYRIGHT$
  *
  * Additional copyrights may follow
@@ -29,84 +32,25 @@
 int mca_pml_ob1_start(size_t count, ompi_request_t** requests)
 {
     int rc;
-    size_t i;
-    bool reuse_old_request = true;
 
-    for(i=0; i<count; i++) {
+    for (size_t i = 0 ; i < count ; ++i) {
         mca_pml_base_request_t *pml_request = (mca_pml_base_request_t*)requests[i];
-        if(NULL == pml_request) {
-            continue;
-        }
-        if (OMPI_REQUEST_PML != requests[i]->req_type) {
+        if (NULL == pml_request || OMPI_REQUEST_PML != requests[i]->req_type) {
             continue;
         }
 
-        /* If the persistent request is currently active - obtain the
-         * request lock and verify the status is incomplete. if the
-         * pml layer has not completed the request - mark the request
-         * as free called - so that it will be freed when the request
+        /* If the persistent request is currently active - verify the status
+         * is incomplete. if the pml layer has not completed the request - mark
+         * the request as free called - so that it will be freed when the request
          * completes - and create a new request.
          */
 
 #if OPAL_ENABLE_MULTI_THREADS
         opal_atomic_rmb();
 #endif
-        reuse_old_request = true;
-        switch(pml_request->req_ompi.req_state) {
-            case OMPI_REQUEST_INACTIVE:
-                if(pml_request->req_pml_complete == true)
-                    break;
-                /* otherwise fall through */
-            case OMPI_REQUEST_ACTIVE: {
 
-                ompi_request_t *request;
-                if (pml_request->req_pml_complete == false) {
-                    /* free request after it completes */
-                    pml_request->req_free_called = true;
-                } else {
-                    /* can reuse the existing request */
-                    break;
-                }
-
-                reuse_old_request = false;
-                /* allocate a new request */
-                switch(pml_request->req_type) {
-                    case MCA_PML_REQUEST_SEND: {
-                         mca_pml_base_send_mode_t sendmode =
-                             ((mca_pml_base_send_request_t*)pml_request)->req_send_mode;
-                         rc = mca_pml_ob1_isend_init(
-                              pml_request->req_addr,
-                              pml_request->req_count,
-                              pml_request->req_datatype,
-                              pml_request->req_peer,
-                              pml_request->req_tag,
-                              sendmode,
-                              pml_request->req_comm,
-                              &request);
-                         break;
-                    }
-                    case MCA_PML_REQUEST_RECV:
-                         rc = mca_pml_ob1_irecv_init(
-                              pml_request->req_addr,
-                              pml_request->req_count,
-                              pml_request->req_datatype,
-                              pml_request->req_peer,
-                              pml_request->req_tag,
-                              pml_request->req_comm,
-                              &request);
-                         break;
-                    default:
-                         rc = OMPI_ERR_REQUEST;
-                         break;
-                }
-                if(OMPI_SUCCESS != rc)
-                    return rc;
-                pml_request = (mca_pml_base_request_t*)request;
-                requests[i] = request;
-                break;
-            }
-            default:
-                return OMPI_ERR_REQUEST;
+        if (OMPI_REQUEST_ACTIVE == pml_request->req_ompi.req_state) {
+            return OMPI_ERR_REQUEST;
         }
 
         /* start the request */
@@ -119,15 +63,46 @@ int mca_pml_ob1_start(size_t count, ompi_request_t** requests)
                                     pml_request->req_addr, pml_request->req_count,
                                     pml_request->req_datatype);
                 );
-                if( reuse_old_request && (sendreq->req_send.req_bytes_packed != 0) ) {
+
+                if (!pml_request->req_pml_complete) {
+                    ompi_request_t *request;
+
+                    /* buffered sends can be mpi complete and pml incomplete. to support this
+                     * case we need to allocate a new request. */
+                    rc = mca_pml_ob1_isend_init (pml_request->req_addr,
+                                                 pml_request->req_count,
+                                                 pml_request->req_datatype,
+                                                 pml_request->req_peer,
+                                                 pml_request->req_tag,
+                                                 sendreq->req_send.req_send_mode,
+                                                 pml_request->req_comm,
+                                                 &request);
+                    if (OPAL_UNLIKELY(OMPI_SUCCESS != rc)) {
+                        return rc;
+                    }
+
+                    /* copy the callback and callback data to the new requests */
+                    request->req_complete_cb = pml_request->req_ompi.req_complete_cb;
+                    request->req_complete_cb_data = pml_request->req_ompi.req_complete_cb_data;
+
+                    /* ensure the old request gets released */
+                    pml_request->req_free_called = true;
+
+                    sendreq = (mca_pml_ob1_send_request_t *) request;
+                    requests[i] = request;
+                } else if (sendreq->req_send.req_bytes_packed != 0) {
                     size_t offset = 0;
                     /**
                      * Reset the convertor in case we're dealing with the original
                      * request, which when completed do not reset the convertor.
                      */
-                    opal_convertor_set_position( &sendreq->req_send.req_base.req_convertor,
-                                                 &offset );
+                    opal_convertor_set_position (&sendreq->req_send.req_base.req_convertor,
+                                                 &offset);
                 }
+
+                /* reset the completion flag */
+                pml_request->req_pml_complete = false;
+
                 MCA_PML_OB1_SEND_REQUEST_START(sendreq, rc);
                 if(rc != OMPI_SUCCESS)
                     return rc;
diff --git a/ompi/mpi/c/startall.c b/ompi/mpi/c/startall.c
index 38b5644451..e1718e5e4d 100644
--- a/ompi/mpi/c/startall.c
+++ b/ompi/mpi/c/startall.c
@@ -88,6 +88,7 @@ int MPI_Startall(int count, MPI_Request requests[])
             }
         }
     }
+
     ret = MCA_PML_CALL(start(count, requests));
 
     OPAL_CR_EXIT_LIBRARY();