pml/yalla: fix yalla performance regression

The regression was introduced in PR https://github.com/open-mpi/ompi/pull/1228, in particular in commit 041a6a9f53033a12d1cbf5c1af36cb16c7cdcc36. The original solution used a "flexible array member" called "mxm_base" to "fall through" to the "mxm" send/recv member that is located in the outer structure. After changing the number of elements in "mxm_base" from 0 to 1, we are actually allocating 2 mxm_req_base_t elements, which leads to increased overall size and harms cache performance. It also breaks the "mca_pml_yalla_check_request_state" function.
2016-07-01 18:02:13 +03:00 · 2016-07-01 18:02:13 +03:00 · 7d96f12fec
--- a/ompi/mca/pml/yalla/pml_yalla.c
+++ b/ompi/mca/pml/yalla/pml_yalla.c
@ -688,6 +688,7 @@ int mca_pml_yalla_start(size_t count, ompi_request_t** requests)

    for (i = 0; i < count; ++i) {
        req = (mca_pml_yalla_base_request_t *)requests[i];
+        sreq = (mca_pml_yalla_send_request_t *)req;

        if ((req == NULL) || (OMPI_REQUEST_PML != req->ompi.req_type)) {
            /* Skip irrelevant requests */
@ -696,10 +697,9 @@ int mca_pml_yalla_start(size_t count, ompi_request_t** requests)

        PML_YALLA_ASSERT(req->ompi.req_state != OMPI_REQUEST_INVALID);
        PML_YALLA_RESET_OMPI_REQ(&req->ompi, OMPI_REQUEST_ACTIVE);
-        PML_YALLA_RESET_PML_REQ(req);
+        PML_YALLA_RESET_PML_REQ(req, PML_YALLA_MXM_REQBASE(sreq));

        if (req->flags & MCA_PML_YALLA_REQUEST_FLAG_SEND) {
-            sreq = (mca_pml_yalla_send_request_t *)req;
            if (req->flags & MCA_PML_YALLA_REQUEST_FLAG_BSEND) {
                PML_YALLA_VERBOSE(8, "start bsend request %p", (void *)sreq);
                rc = mca_pml_yalla_bsend(&sreq->mxm);
--- a/ompi/mca/pml/yalla/pml_yalla_request.c
+++ b/ompi/mca/pml/yalla/pml_yalla_request.c
@ -32,10 +32,10 @@ static inline void mca_pml_yalla_request_release(mca_pml_yalla_base_request_t *r
 }

 static inline int
-mca_pml_yalla_check_request_state(mca_pml_yalla_base_request_t *req)
+mca_pml_yalla_check_request_state(mca_pml_yalla_base_request_t *req, mxm_req_base_t *mxm_base)
 {
-    if (req->mxm_base->state != MXM_REQ_COMPLETED) {
-         PML_YALLA_VERBOSE(8, "request %p free called before completed", (void *)req);
+    if ( mxm_base->state != MXM_REQ_COMPLETED) {
+         PML_YALLA_VERBOSE(8, "request %p free called before completed", (void*)req);
         req->flags |= MCA_PML_YALLA_REQUEST_FLAG_FREE_CALLED;
         return 0;
    }
@ -45,11 +45,12 @@ mca_pml_yalla_check_request_state(mca_pml_yalla_base_request_t *req)

 static int mca_pml_yalla_send_request_free(ompi_request_t **request)
 {
-    mca_pml_yalla_base_request_t *req = (mca_pml_yalla_base_request_t*)(*request);
+    mca_pml_yalla_send_request_t *sreq = (mca_pml_yalla_send_request_t*)(*request);
+    mca_pml_yalla_base_request_t *req = (mca_pml_yalla_base_request_t*)sreq;

    PML_YALLA_VERBOSE(9, "free send request *%p=%p", (void *)request, (void *)*request);

-    if (mca_pml_yalla_check_request_state(req)) {
+    if (mca_pml_yalla_check_request_state(req, PML_YALLA_MXM_REQBASE(sreq))) {
        mca_pml_yalla_request_release(req, &ompi_pml_yalla.send_reqs);
    }

@ -84,11 +85,12 @@ static int mca_pml_yalla_send_request_cancel(ompi_request_t *request, int flag)

 static int mca_pml_yalla_recv_request_free(ompi_request_t **request)
 {
-    mca_pml_yalla_base_request_t *req = (mca_pml_yalla_base_request_t*)(*request);
+    mca_pml_yalla_recv_request_t *rreq = (mca_pml_yalla_recv_request_t*)(*request);
+    mca_pml_yalla_base_request_t *req = (mca_pml_yalla_base_request_t*)rreq;

    PML_YALLA_VERBOSE(9, "free receive request *%p=%p", (void *)request, (void *)*request);

-    if (mca_pml_yalla_check_request_state(req)) {
+    if (mca_pml_yalla_check_request_state(req, PML_YALLA_MXM_REQBASE(rreq))) {
        mca_pml_yalla_request_release(req, &ompi_pml_yalla.recv_reqs);
    }

--- a/ompi/mca/pml/yalla/pml_yalla_request.h
+++ b/ompi/mca/pml/yalla/pml_yalla_request.h
@ -25,15 +25,6 @@ struct pml_yalla_base_request {
    ompi_request_t               ompi;
    mca_pml_yalla_convertor_t    *convertor;
    int                          flags;
-    /* overlaps with base of send/recv
-     * In ISO C90, you would have to give contents a length of 1,
-     * which means either you waste space or complicate the argument to malloc.
-     * Note:
-     *  - 1 was the portable way to go, though it was rather strange
-     *  - 0 was better at indicating intent, but not legal as far as
-     *  the Standard was concerned and supported as an extension by some compilers (including gcc)
-     */
-    mxm_req_base_t               mxm_base[1];
 };

 struct pml_yalla_send_request {
@ -58,6 +49,8 @@ OBJ_CLASS_DECLARATION(mca_pml_yalla_recv_request_t);

 void mca_pml_yalla_init_reqs(void);

+#define PML_YALLA_MXM_REQBASE( x ) ( &((x)->mxm.base) )
+
 #define PML_YALLA_RESET_OMPI_REQ(_ompi_req, _state) \
    { \
        (_ompi_req)->req_state = _state; \
@ -72,9 +65,9 @@ void mca_pml_yalla_init_reqs(void);
        OBJ_RETAIN(_comm); \
    }

-#define PML_YALLA_RESET_PML_REQ(_pml_req) \
+#define PML_YALLA_RESET_PML_REQ(_pml_req, mxm_base) \
    { \
-        (_pml_req)->mxm_base[0].state = MXM_REQ_NEW; \
+        mxm_base->state = MXM_REQ_NEW; \
        PML_YALLA_RESET_PML_REQ_DATA(_pml_req); \
    }