diff --git a/ompi/mca/mtl/portals4/mtl_portals4_component.c b/ompi/mca/mtl/portals4/mtl_portals4_component.c index 771bf6d72d..7db67d797a 100644 --- a/ompi/mca/mtl/portals4/mtl_portals4_component.c +++ b/ompi/mca/mtl/portals4/mtl_portals4_component.c @@ -471,6 +471,7 @@ ompi_mtl_portals4_progress(void) case PTL_EVENT_AUTO_FREE: case PTL_EVENT_AUTO_UNLINK: case PTL_EVENT_SEARCH: + case PTL_EVENT_LINK: if (NULL != ev.user_ptr) { ptl_request = ev.user_ptr; ret = ptl_request->event_callback(&ev, ptl_request); @@ -497,14 +498,13 @@ ompi_mtl_portals4_progress(void) #endif break; - case PTL_EVENT_LINK: case PTL_EVENT_GET_OVERFLOW: case PTL_EVENT_FETCH_ATOMIC: case PTL_EVENT_FETCH_ATOMIC_OVERFLOW: case PTL_EVENT_ATOMIC: case PTL_EVENT_ATOMIC_OVERFLOW: - opal_output_verbose(1, ompi_mtl_base_output, - "Unexpected event of type %d", ev.type); + opal_output(ompi_mtl_base_output, + "Unexpected event of type %d", ev.type); } } else if (PTL_EQ_EMPTY == ret) { break; diff --git a/ompi/mca/mtl/portals4/mtl_portals4_recv.c b/ompi/mca/mtl/portals4/mtl_portals4_recv.c index 54dda3115c..5378014306 100644 --- a/ompi/mca/mtl/portals4/mtl_portals4_recv.c +++ b/ompi/mca/mtl/portals4/mtl_portals4_recv.c @@ -44,6 +44,10 @@ ompi_mtl_portals4_recv_progress(ptl_event_t *ev, (ompi_mtl_portals4_recv_request_t*) ptl_base_request; size_t msg_length = 0; + /* as soon as we've seen any event associated with a request, it's + started */ + ptl_request->req_started = true; + switch (ev->type) { case PTL_EVENT_PUT: OPAL_OUTPUT_VERBOSE((50, ompi_mtl_base_output, "Recv %d (0x%lx) got put event", @@ -295,6 +299,9 @@ ompi_mtl_portals4_recv_progress(ptl_event_t *ev, break; + case PTL_EVENT_LINK: + break; + default: opal_output_verbose(1, ompi_mtl_base_output, "Unhandled receive callback with event type %d", @@ -358,6 +365,7 @@ ompi_mtl_portals4_irecv(struct mca_mtl_base_module_t* mtl, ptl_request->convertor = convertor; ptl_request->delivery_ptr = start; ptl_request->delivery_len = length; + ptl_request->req_started = false; ptl_request->super.super.ompi_req->req_status.MPI_ERROR = OMPI_SUCCESS; OPAL_OUTPUT_VERBOSE((50, ompi_mtl_base_output, @@ -374,8 +382,10 @@ ompi_mtl_portals4_irecv(struct mca_mtl_base_module_t* mtl, me.options = PTL_ME_OP_PUT | PTL_ME_USE_ONCE | - PTL_ME_EVENT_LINK_DISABLE | /* BWB: FIX ME */ PTL_ME_EVENT_UNLINK_DISABLE; + if (length <= ompi_mtl_portals4.eager_limit) { + me.options |= PTL_ME_EVENT_LINK_DISABLE; + } me.match_id = remote_proc; me.match_bits = match_bits; me.ignore_bits = ignore_bits; @@ -394,6 +404,16 @@ ompi_mtl_portals4_irecv(struct mca_mtl_base_module_t* mtl, return ompi_mtl_portals4_get_error(ret); } + /* if a long message, spin until we either have a comm event or a + link event, guaranteeing progress for long unexpected + messages. */ + if (length > ompi_mtl_portals4.eager_limit) { + while (true != ptl_request->req_started) { + ompi_mtl_portals4_progress(); + opal_atomic_rmb(); + } + } + return OMPI_SUCCESS; } diff --git a/ompi/mca/mtl/portals4/mtl_portals4_request.h b/ompi/mca/mtl/portals4/mtl_portals4_request.h index 359ddefac7..1287fbdb87 100644 --- a/ompi/mca/mtl/portals4/mtl_portals4_request.h +++ b/ompi/mca/mtl/portals4/mtl_portals4_request.h @@ -68,6 +68,7 @@ struct ompi_mtl_portals4_recv_request_t { struct opal_convertor_t *convertor; void *delivery_ptr; size_t delivery_len; + volatile bool req_started; #if OPAL_ENABLE_DEBUG int opcount; ptl_hdr_data_t hdr_data;