diff --git a/ompi/mca/mtl/portals4/mtl_portals4.c b/ompi/mca/mtl/portals4/mtl_portals4.c index 41a9a6d665..449653ca6c 100644 --- a/ompi/mca/mtl/portals4/mtl_portals4.c +++ b/ompi/mca/mtl/portals4/mtl_portals4.c @@ -178,6 +178,7 @@ portals4_init_interface(void) me.ignore_bits = MTL_PORTALS4_CONTEXT_MASK | MTL_PORTALS4_SOURCE_MASK | MTL_PORTALS4_TAG_MASK; + ret = PtlMEAppend(ompi_mtl_portals4.ni_h, ompi_mtl_portals4.recv_idx, &me, diff --git a/ompi/mca/mtl/portals4/mtl_portals4.h b/ompi/mca/mtl/portals4/mtl_portals4.h index 2cde423a31..731e60188b 100644 --- a/ompi/mca/mtl/portals4/mtl_portals4.h +++ b/ompi/mca/mtl/portals4/mtl_portals4.h @@ -202,18 +202,26 @@ extern mca_mtl_portals4_module_t ompi_mtl_portals4; ((int)((match_bits & MTL_PORTALS4_SOURCE_MASK) >> 24)) +/* hda_data bit manipulation + * + * 0 1234567 01234567 01234567 0123 4567 01234567 01234567 01234567 01234567 + * | | | + * ^| | context id | message tag + * || | | + * +---- is_sync + */ + #define MTL_PORTALS4_SYNC_MSG 0x8000000000000000ULL -#define MTL_PORTALS4_SET_HDR_DATA(hdr_data, opcount, length, sync) \ +#define MTL_PORTALS4_SET_HDR_DATA(hdr_data, tag, contextid, sync) \ { \ hdr_data = (sync) ? 1 : 0; \ - hdr_data = (hdr_data << 15); \ - hdr_data |= opcount & 0x7FFFULL; \ - hdr_data = (hdr_data << 48); \ - hdr_data |= (length & 0xFFFFFFFFFFFFULL); \ + hdr_data = (hdr_data << 39); \ + hdr_data |= contextid; \ + hdr_data = (hdr_data << 24); \ + hdr_data |= (MTL_PORTALS4_TAG_MASK & tag); \ } -#define MTL_PORTALS4_GET_LENGTH(hdr_data) ((size_t)(hdr_data & 0xFFFFFFFFFFFFULL)) #define MTL_PORTALS4_IS_SYNC_MSG(hdr_data) \ (0 != (MTL_PORTALS4_SYNC_MSG & hdr_data)) diff --git a/ompi/mca/mtl/portals4/mtl_portals4_probe.c b/ompi/mca/mtl/portals4/mtl_portals4_probe.c index fbeda2124e..a87f72087e 100644 --- a/ompi/mca/mtl/portals4/mtl_portals4_probe.c +++ b/ompi/mca/mtl/portals4/mtl_portals4_probe.c @@ -41,7 +41,7 @@ completion_fn(ptl_event_t *ev, ompi_mtl_portals4_base_request_t *ptl_base_reques ptl_request->status.MPI_SOURCE = MTL_PORTALS4_GET_SOURCE(ev->match_bits); ptl_request->status.MPI_TAG = MTL_PORTALS4_GET_TAG(ev->match_bits); ptl_request->status.MPI_ERROR = MPI_SUCCESS; - ptl_request->status._ucount = MTL_PORTALS4_GET_LENGTH(ev->hdr_data); + ptl_request->status._ucount += ev->mlength; if (ev->type != PTL_EVENT_SEARCH) { ptl_request->message = ompi_mtl_portals4_message_alloc(ev); } diff --git a/ompi/mca/mtl/portals4/mtl_portals4_recv.c b/ompi/mca/mtl/portals4/mtl_portals4_recv.c index 448372917d..b40edef041 100644 --- a/ompi/mca/mtl/portals4/mtl_portals4_recv.c +++ b/ompi/mca/mtl/portals4/mtl_portals4_recv.c @@ -34,6 +34,40 @@ #include "mtl_portals4_recv_short.h" #include "mtl_portals4_message.h" +static int +triggered_read_msg(void *start, ptl_size_t length, ptl_process_t target, + ptl_match_bits_t match_bits, ptl_size_t remote_offset, + ompi_mtl_portals4_recv_request_t *request) +{ + int ret; + + ret = PtlCTAlloc(ompi_mtl_portals4.ni_h, &request->ct_h); + if (OPAL_UNLIKELY(PTL_OK != ret)) { + opal_output_verbose(1, ompi_mtl_base_framework.framework_output, + "%s:%d: PtlGet failed: %d", + __FILE__, __LINE__, ret); + return OMPI_ERR_OUT_OF_RESOURCE; + } + + ret = PtlTriggeredGet(ompi_mtl_portals4.send_md_h, + (ptl_size_t) start, + length, + target, + ompi_mtl_portals4.read_idx, + match_bits, + remote_offset, + request, + request->ct_h, 1); + if (OPAL_UNLIKELY(PTL_OK != ret)) { + PtlCTFree(request->ct_h); + opal_output_verbose(1, ompi_mtl_base_framework.framework_output, + "%s:%d: PtlTriggeredGet failed: %d", + __FILE__, __LINE__, ret); + return OMPI_ERR_OUT_OF_RESOURCE; + } + return OMPI_SUCCESS; +} + static int read_msg(void *start, ptl_size_t length, ptl_process_t target, ptl_match_bits_t match_bits, ptl_size_t remote_offset, @@ -94,59 +128,61 @@ ompi_mtl_portals4_recv_progress(ptl_event_t *ev, goto callback_error; } - ptl_request->me_h = PTL_INVALID_HANDLE; + if (!ptl_request->is_triggered) { + ptl_request->me_h = PTL_INVALID_HANDLE; - msg_length = MTL_PORTALS4_GET_LENGTH(ev->hdr_data); - ptl_request->super.super.ompi_req->req_status.MPI_SOURCE = - MTL_PORTALS4_GET_SOURCE(ev->match_bits); - ptl_request->super.super.ompi_req->req_status.MPI_TAG = - MTL_PORTALS4_GET_TAG(ev->match_bits); - if (OPAL_UNLIKELY(msg_length > ptl_request->delivery_len)) { - opal_output_verbose(1, ompi_mtl_base_framework.framework_output, - "truncate expected: %ld %ld", - msg_length, ptl_request->delivery_len); - ptl_request->super.super.ompi_req->req_status.MPI_ERROR = MPI_ERR_TRUNCATE; - } + msg_length = ev->mlength; + ptl_request->super.super.ompi_req->req_status.MPI_SOURCE = + MTL_PORTALS4_GET_SOURCE(ev->match_bits); + ptl_request->super.super.ompi_req->req_status.MPI_TAG = + MTL_PORTALS4_GET_TAG(ev->match_bits); + if (OPAL_UNLIKELY(msg_length > ptl_request->delivery_len)) { + opal_output_verbose(1, ompi_mtl_base_framework.framework_output, + "truncate expected: %ld %ld", + msg_length, ptl_request->delivery_len); + ptl_request->super.super.ompi_req->req_status.MPI_ERROR = MPI_ERR_TRUNCATE; + } #if OPAL_ENABLE_DEBUG - ptl_request->hdr_data = ev->hdr_data; + ptl_request->hdr_data = ev->hdr_data; #endif - if (!MTL_PORTALS4_IS_SHORT_MSG(ev->match_bits) && ompi_mtl_portals4.protocol == rndv) { - /* If it's not a short message and we're doing rndv, we + if (!MTL_PORTALS4_IS_SHORT_MSG(ev->match_bits) && ompi_mtl_portals4.protocol == rndv) { + /* If it's not a short message and we're doing rndv, we only have the first part of the message. Issue the get to pull the second part of the message. */ - ret = read_msg((char*) ptl_request->delivery_ptr + ompi_mtl_portals4.eager_limit, - ((msg_length > ptl_request->delivery_len) ? - ptl_request->delivery_len : msg_length) - ompi_mtl_portals4.eager_limit, - ev->initiator, - ev->hdr_data, - ompi_mtl_portals4.eager_limit, - ptl_request); - if (OPAL_UNLIKELY(OMPI_SUCCESS != ret)) { - if (NULL != ptl_request->buffer_ptr) free(ptl_request->buffer_ptr); - goto callback_error; - } + ret = read_msg((char*) ptl_request->delivery_ptr + ompi_mtl_portals4.eager_limit, + ((msg_length > ptl_request->delivery_len) ? + ptl_request->delivery_len : msg_length) - ompi_mtl_portals4.eager_limit, + ev->initiator, + ev->hdr_data, + ompi_mtl_portals4.eager_limit, + ptl_request); + if (OPAL_UNLIKELY(OMPI_SUCCESS != ret)) { + if (NULL != ptl_request->buffer_ptr) free(ptl_request->buffer_ptr); + goto callback_error; + } - } else { - /* If we're either using the eager protocol or were a + } else { + /* If we're either using the eager protocol or were a short message, all data has been received, so complete the message. */ - ret = ompi_mtl_datatype_unpack(ptl_request->convertor, - ev->start, - ev->mlength); - if (OPAL_UNLIKELY(OMPI_SUCCESS != ret)) { - opal_output_verbose(1, ompi_mtl_base_framework.framework_output, - "%s:%d: ompi_mtl_datatype_unpack failed: %d", - __FILE__, __LINE__, ret); - ptl_request->super.super.ompi_req->req_status.MPI_ERROR = ret; - } - ptl_request->super.super.ompi_req->req_status._ucount = ev->mlength; + ret = ompi_mtl_datatype_unpack(ptl_request->convertor, + ev->start, + ev->mlength); + if (OPAL_UNLIKELY(OMPI_SUCCESS != ret)) { + opal_output_verbose(1, ompi_mtl_base_framework.framework_output, + "%s:%d: ompi_mtl_datatype_unpack failed: %d", + __FILE__, __LINE__, ret); + ptl_request->super.super.ompi_req->req_status.MPI_ERROR = ret; + } + ptl_request->super.super.ompi_req->req_status._ucount = ev->mlength; - OPAL_OUTPUT_VERBOSE((50, ompi_mtl_base_framework.framework_output, - "Recv %lu (0x%lx) completed, expected", - ptl_request->opcount, ptl_request->hdr_data)); - ptl_request->super.super.completion_callback(&ptl_request->super.super); + OPAL_OUTPUT_VERBOSE((50, ompi_mtl_base_framework.framework_output, + "Recv %lu (0x%lx) completed, expected", + ptl_request->opcount, ptl_request->hdr_data)); + ptl_request->super.super.completion_callback(&ptl_request->super.super); + } } break; @@ -162,8 +198,11 @@ ompi_mtl_portals4_recv_progress(ptl_event_t *ev, goto callback_error; } + if (ptl_request->is_triggered) + PtlCTFree(ptl_request->ct_h); + /* set the received length in the status, now that we know - excatly how much data was sent. */ + exactly how much data was sent. */ ptl_request->super.super.ompi_req->req_status._ucount = ev->mlength; if (ompi_mtl_portals4.protocol == rndv) { ptl_request->super.super.ompi_req->req_status._ucount += @@ -207,95 +246,96 @@ ompi_mtl_portals4_recv_progress(ptl_event_t *ev, goto callback_error; } - ptl_request->me_h = PTL_INVALID_HANDLE; + if (!ptl_request->is_triggered) { + ptl_request->me_h = PTL_INVALID_HANDLE; - msg_length = MTL_PORTALS4_GET_LENGTH(ev->hdr_data); - ptl_request->super.super.ompi_req->req_status.MPI_SOURCE = - MTL_PORTALS4_GET_SOURCE(ev->match_bits); - ptl_request->super.super.ompi_req->req_status.MPI_TAG = - MTL_PORTALS4_GET_TAG(ev->match_bits); - if (OPAL_UNLIKELY(msg_length > ptl_request->delivery_len)) { - opal_output_verbose(1, ompi_mtl_base_framework.framework_output, - "truncate unexpected: %ld %ld %d", - msg_length, ptl_request->delivery_len, - MTL_PORTALS4_IS_SHORT_MSG(ev->match_bits)); - ptl_request->super.super.ompi_req->req_status.MPI_ERROR = MPI_ERR_TRUNCATE; - } + msg_length = ev->mlength; + ptl_request->super.super.ompi_req->req_status.MPI_SOURCE = + MTL_PORTALS4_GET_SOURCE(ev->match_bits); + ptl_request->super.super.ompi_req->req_status.MPI_TAG = + MTL_PORTALS4_GET_TAG(ev->match_bits); + if (OPAL_UNLIKELY(msg_length > ptl_request->delivery_len)) { + opal_output_verbose(1, ompi_mtl_base_framework.framework_output, + "truncate unexpected: %ld %ld %d", + msg_length, ptl_request->delivery_len, + MTL_PORTALS4_IS_SHORT_MSG(ev->match_bits)); + ptl_request->super.super.ompi_req->req_status.MPI_ERROR = MPI_ERR_TRUNCATE; + } #if OPAL_ENABLE_DEBUG - ptl_request->hdr_data = ev->hdr_data; + ptl_request->hdr_data = ev->hdr_data; #endif - /* overflow case. Short messages have the buffer stashed + /* overflow case. Short messages have the buffer stashed somewhere. Long messages left in buffer at the source */ - if (MTL_PORTALS4_IS_SHORT_MSG(ev->match_bits)) { - ptl_request->super.super.ompi_req->req_status._ucount = ev->mlength; - if (ev->mlength > 0) { - struct iovec iov; - uint32_t iov_count = 1; - size_t max_data; - iov.iov_base = (char*) ev->start; - iov.iov_len = ev->mlength; - max_data = iov.iov_len; + if (MTL_PORTALS4_IS_SHORT_MSG(ev->match_bits)) { + ptl_request->super.super.ompi_req->req_status._ucount = ev->mlength; + if (ev->mlength > 0) { + struct iovec iov; + uint32_t iov_count = 1; + size_t max_data; + iov.iov_base = (char*) ev->start; + iov.iov_len = ev->mlength; + max_data = iov.iov_len; - ret = opal_convertor_unpack(ptl_request->convertor, - &iov, &iov_count, - &max_data ); - if (NULL != ptl_request->buffer_ptr) free(ptl_request->buffer_ptr); - if (OPAL_UNLIKELY(ret < 0)) { - opal_output_verbose(1, ompi_mtl_base_framework.framework_output, - "%s:%d: opal_convertor_unpack failed: %d", - __FILE__, __LINE__, ret); - goto callback_error; + ret = opal_convertor_unpack(ptl_request->convertor, + &iov, &iov_count, + &max_data ); + if (NULL != ptl_request->buffer_ptr) free(ptl_request->buffer_ptr); + if (OPAL_UNLIKELY(ret < 0)) { + opal_output_verbose(1, ompi_mtl_base_framework.framework_output, + "%s:%d: opal_convertor_unpack failed: %d", + __FILE__, __LINE__, ret); + goto callback_error; + } } - } - /* if it's a sync, send the ack */ - if (MTL_PORTALS4_IS_SYNC_MSG(ev->hdr_data)) { + /* if it's a sync, send the ack */ + if (MTL_PORTALS4_IS_SYNC_MSG(ev->hdr_data)) { + OPAL_OUTPUT_VERBOSE((50, ompi_mtl_base_framework.framework_output, + "Recv %lu (0x%lx) sending sync ack", + ptl_request->opcount, ptl_request->hdr_data)); + ret = PtlPut(ompi_mtl_portals4.zero_md_h, + 0, + 0, + PTL_NO_ACK_REQ, + ev->initiator, + ompi_mtl_portals4.read_idx, + ev->hdr_data, + 0, + NULL, + 0); + if (OPAL_UNLIKELY(PTL_OK != ret)) { + opal_output_verbose(1, ompi_mtl_base_framework.framework_output, + "%s:%d: PtlPut failed: %d", + __FILE__, __LINE__, ret); + goto callback_error; + } + } + OPAL_OUTPUT_VERBOSE((50, ompi_mtl_base_framework.framework_output, - "Recv %lu (0x%lx) sending sync ack", - ptl_request->opcount, ptl_request->hdr_data)); - ret = PtlPut(ompi_mtl_portals4.zero_md_h, - 0, - 0, - PTL_NO_ACK_REQ, - ev->initiator, - ompi_mtl_portals4.read_idx, - ev->hdr_data, - 0, - NULL, - 0); - if (OPAL_UNLIKELY(PTL_OK != ret)) { - opal_output_verbose(1, ompi_mtl_base_framework.framework_output, - "%s:%d: PtlPut failed: %d", - __FILE__, __LINE__, ret); + "Recv %lu (0x%lx) completed, unexpected short (0x%lx)", + ptl_request->opcount, ptl_request->hdr_data, (long) ev->start)); + ptl_request->super.super.completion_callback(&ptl_request->super.super); + + } else { + if (ev->mlength > 0) { + /* if rndv or triggered, copy the eager part to the right place */ + memcpy(ptl_request->delivery_ptr, ev->start, ev->mlength); + } + + ret = read_msg((char*) ptl_request->delivery_ptr + ev->mlength, + ((msg_length > ptl_request->delivery_len) ? + ptl_request->delivery_len : msg_length) - ev->mlength, + ev->initiator, + ev->hdr_data, + ev->mlength, + ptl_request); + if (OPAL_UNLIKELY(OMPI_SUCCESS != ret)) { + if (NULL != ptl_request->buffer_ptr) free(ptl_request->buffer_ptr); goto callback_error; } } - - OPAL_OUTPUT_VERBOSE((50, ompi_mtl_base_framework.framework_output, - "Recv %lu (0x%lx) completed, unexpected short (0x%lx)", - ptl_request->opcount, ptl_request->hdr_data, (long) ev->start)); - ptl_request->super.super.completion_callback(&ptl_request->super.super); - - } else { - if (ev->mlength > 0) { - /* if rndv or triggered, copy the eager part to the right place */ - memcpy(ptl_request->delivery_ptr, ev->start, ev->mlength); - } - - ret = read_msg((char*) ptl_request->delivery_ptr + ev->mlength, - ((msg_length > ptl_request->delivery_len) ? - ptl_request->delivery_len : msg_length) - ev->mlength, - ev->initiator, - ev->hdr_data, - ev->mlength, - ptl_request); - if (OPAL_UNLIKELY(OMPI_SUCCESS != ret)) { - if (NULL != ptl_request->buffer_ptr) free(ptl_request->buffer_ptr); - goto callback_error; - } } - break; case PTL_EVENT_LINK: @@ -327,6 +367,7 @@ ompi_mtl_portals4_irecv(struct mca_mtl_base_module_t* mtl, mca_mtl_request_t *mtl_request) { ptl_match_bits_t match_bits, ignore_bits; + ptl_hdr_data_t hdr_data; int ret = OMPI_SUCCESS; ptl_process_t remote_proc; ompi_mtl_portals4_recv_request_t *ptl_request = @@ -353,11 +394,28 @@ ompi_mtl_portals4_irecv(struct mca_mtl_base_module_t* mtl, MTL_PORTALS4_SET_RECV_BITS(match_bits, ignore_bits, comm->c_contextid, src, tag); + MTL_PORTALS4_SET_HDR_DATA(hdr_data, tag, comm->c_contextid, 0); + ret = ompi_mtl_datatype_recv_buf(convertor, &start, &length, &free_after); if (OPAL_UNLIKELY(OMPI_SUCCESS != ret)) { return ret; } + ptl_request->is_triggered = + ((ompi_mtl_portals4.protocol == eager) || + (ompi_mtl_portals4.eager_limit >= length) || + (MPI_ANY_SOURCE == src) || + (MPI_ANY_TAG == tag)) ? false : true; + + if (ptl_request->is_triggered) { + ret = triggered_read_msg(ptl_request->delivery_ptr + ompi_mtl_portals4.eager_limit, + ptl_request->delivery_len - ompi_mtl_portals4.eager_limit, + remote_proc, + hdr_data, + ompi_mtl_portals4.eager_limit, + ptl_request); + } + ptl_request->super.type = portals4_req_recv; ptl_request->super.event_callback = ompi_mtl_portals4_recv_progress; #if OPAL_ENABLE_DEBUG @@ -372,20 +430,26 @@ ompi_mtl_portals4_irecv(struct mca_mtl_base_module_t* mtl, ptl_request->super.super.ompi_req->req_status.MPI_ERROR = OMPI_SUCCESS; OPAL_OUTPUT_VERBOSE((50, ompi_mtl_base_framework.framework_output, - "Recv %lu from %x,%x of length %ld (0x%lx, 0x%lx, 0x%lx)\n", + "Recv %lu from %x,%x of length %ld (0x%lx, 0x%lx, 0x%lx, 0x%lx)\n", ptl_request->opcount, remote_proc.phys.nid, remote_proc.phys.pid, - (int64_t)length, match_bits, ignore_bits, (unsigned long) ptl_request)); + (int64_t)length, match_bits, ignore_bits, hdr_data, (unsigned long) ptl_request)); me.start = start; me.length = length; - me.ct_handle = PTL_CT_NONE; + if (ptl_request->is_triggered) + me.ct_handle = ptl_request->ct_h; + else + me.ct_handle = PTL_CT_NONE; me.min_free = 0; me.uid = ompi_mtl_portals4.uid; me.options = PTL_ME_OP_PUT | PTL_ME_USE_ONCE | PTL_ME_EVENT_UNLINK_DISABLE; + if (ptl_request->is_triggered) + me.options |= PTL_ME_EVENT_CT_COMM | PTL_ME_EVENT_CT_OVERFLOW; + if (length <= ompi_mtl_portals4.eager_limit) { me.options |= PTL_ME_EVENT_LINK_DISABLE; } diff --git a/ompi/mca/mtl/portals4/mtl_portals4_request.h b/ompi/mca/mtl/portals4/mtl_portals4_request.h index f76846115c..a615fb5b3b 100644 --- a/ompi/mca/mtl/portals4/mtl_portals4_request.h +++ b/ompi/mca/mtl/portals4/mtl_portals4_request.h @@ -69,6 +69,8 @@ struct ompi_mtl_portals4_recv_request_t { ompi_mtl_portals4_base_request_t super; void *buffer_ptr; ptl_handle_me_t me_h; + ptl_handle_ct_t ct_h; + bool is_triggered; struct opal_convertor_t *convertor; void *delivery_ptr; size_t delivery_len; diff --git a/ompi/mca/mtl/portals4/mtl_portals4_send.c b/ompi/mca/mtl/portals4/mtl_portals4_send.c index 4ee2e77532..647d3fad96 100644 --- a/ompi/mca/mtl/portals4/mtl_portals4_send.c +++ b/ompi/mca/mtl/portals4/mtl_portals4_send.c @@ -189,8 +189,7 @@ ompi_mtl_portals4_short_isend(mca_pml_base_send_mode_t mode, MTL_PORTALS4_SET_SEND_BITS(match_bits, contextid, localrank, tag, MTL_PORTALS4_SHORT_MSG); - MTL_PORTALS4_SET_HDR_DATA(hdr_data, ptl_request->opcount, length, - (MCA_PML_BASE_SEND_SYNCHRONOUS == mode) ? 1 : 0); + MTL_PORTALS4_SET_HDR_DATA(hdr_data, tag, contextid, (MCA_PML_BASE_SEND_SYNCHRONOUS == mode) ? 1 : 0); if (MCA_PML_BASE_SEND_SYNCHRONOUS == mode) { me.start = NULL; @@ -274,7 +273,7 @@ ompi_mtl_portals4_long_isend(void *start, size_t length, int contextid, int tag, MTL_PORTALS4_SET_SEND_BITS(match_bits, contextid, localrank, tag, MTL_PORTALS4_LONG_MSG); - MTL_PORTALS4_SET_HDR_DATA(hdr_data, ptl_request->opcount, length, 0); + MTL_PORTALS4_SET_HDR_DATA(hdr_data, tag, contextid, 0); me.start = start; me.length = length;