1
1

MTL portals4 : improve the rendez-vous protocol using PtlTriggeredGet operation

Этот коммит содержится в:
Francois WELLENREITER 2015-11-09 17:04:26 +01:00
родитель 9e5ade4e8b
Коммит 9126ea5e82
6 изменённых файлов: 206 добавлений и 132 удалений

Просмотреть файл

@ -178,6 +178,7 @@ portals4_init_interface(void)
me.ignore_bits = MTL_PORTALS4_CONTEXT_MASK |
MTL_PORTALS4_SOURCE_MASK |
MTL_PORTALS4_TAG_MASK;
ret = PtlMEAppend(ompi_mtl_portals4.ni_h,
ompi_mtl_portals4.recv_idx,
&me,

Просмотреть файл

@ -202,18 +202,26 @@ extern mca_mtl_portals4_module_t ompi_mtl_portals4;
((int)((match_bits & MTL_PORTALS4_SOURCE_MASK) >> 24))
/* hda_data bit manipulation
*
* 0 1234567 01234567 01234567 0123 4567 01234567 01234567 01234567 01234567
* | | |
* ^| | context id | message tag
* || | |
* +---- is_sync
*/
#define MTL_PORTALS4_SYNC_MSG 0x8000000000000000ULL
#define MTL_PORTALS4_SET_HDR_DATA(hdr_data, opcount, length, sync) \
#define MTL_PORTALS4_SET_HDR_DATA(hdr_data, tag, contextid, sync) \
{ \
hdr_data = (sync) ? 1 : 0; \
hdr_data = (hdr_data << 15); \
hdr_data |= opcount & 0x7FFFULL; \
hdr_data = (hdr_data << 48); \
hdr_data |= (length & 0xFFFFFFFFFFFFULL); \
hdr_data = (hdr_data << 39); \
hdr_data |= contextid; \
hdr_data = (hdr_data << 24); \
hdr_data |= (MTL_PORTALS4_TAG_MASK & tag); \
}
#define MTL_PORTALS4_GET_LENGTH(hdr_data) ((size_t)(hdr_data & 0xFFFFFFFFFFFFULL))
#define MTL_PORTALS4_IS_SYNC_MSG(hdr_data) \
(0 != (MTL_PORTALS4_SYNC_MSG & hdr_data))

Просмотреть файл

@ -41,7 +41,7 @@ completion_fn(ptl_event_t *ev, ompi_mtl_portals4_base_request_t *ptl_base_reques
ptl_request->status.MPI_SOURCE = MTL_PORTALS4_GET_SOURCE(ev->match_bits);
ptl_request->status.MPI_TAG = MTL_PORTALS4_GET_TAG(ev->match_bits);
ptl_request->status.MPI_ERROR = MPI_SUCCESS;
ptl_request->status._ucount = MTL_PORTALS4_GET_LENGTH(ev->hdr_data);
ptl_request->status._ucount += ev->mlength;
if (ev->type != PTL_EVENT_SEARCH) {
ptl_request->message = ompi_mtl_portals4_message_alloc(ev);
}

Просмотреть файл

@ -34,6 +34,40 @@
#include "mtl_portals4_recv_short.h"
#include "mtl_portals4_message.h"
static int
triggered_read_msg(void *start, ptl_size_t length, ptl_process_t target,
ptl_match_bits_t match_bits, ptl_size_t remote_offset,
ompi_mtl_portals4_recv_request_t *request)
{
int ret;
ret = PtlCTAlloc(ompi_mtl_portals4.ni_h, &request->ct_h);
if (OPAL_UNLIKELY(PTL_OK != ret)) {
opal_output_verbose(1, ompi_mtl_base_framework.framework_output,
"%s:%d: PtlGet failed: %d",
__FILE__, __LINE__, ret);
return OMPI_ERR_OUT_OF_RESOURCE;
}
ret = PtlTriggeredGet(ompi_mtl_portals4.send_md_h,
(ptl_size_t) start,
length,
target,
ompi_mtl_portals4.read_idx,
match_bits,
remote_offset,
request,
request->ct_h, 1);
if (OPAL_UNLIKELY(PTL_OK != ret)) {
PtlCTFree(request->ct_h);
opal_output_verbose(1, ompi_mtl_base_framework.framework_output,
"%s:%d: PtlTriggeredGet failed: %d",
__FILE__, __LINE__, ret);
return OMPI_ERR_OUT_OF_RESOURCE;
}
return OMPI_SUCCESS;
}
static int
read_msg(void *start, ptl_size_t length, ptl_process_t target,
ptl_match_bits_t match_bits, ptl_size_t remote_offset,
@ -94,59 +128,61 @@ ompi_mtl_portals4_recv_progress(ptl_event_t *ev,
goto callback_error;
}
ptl_request->me_h = PTL_INVALID_HANDLE;
if (!ptl_request->is_triggered) {
ptl_request->me_h = PTL_INVALID_HANDLE;
msg_length = MTL_PORTALS4_GET_LENGTH(ev->hdr_data);
ptl_request->super.super.ompi_req->req_status.MPI_SOURCE =
MTL_PORTALS4_GET_SOURCE(ev->match_bits);
ptl_request->super.super.ompi_req->req_status.MPI_TAG =
MTL_PORTALS4_GET_TAG(ev->match_bits);
if (OPAL_UNLIKELY(msg_length > ptl_request->delivery_len)) {
opal_output_verbose(1, ompi_mtl_base_framework.framework_output,
"truncate expected: %ld %ld",
msg_length, ptl_request->delivery_len);
ptl_request->super.super.ompi_req->req_status.MPI_ERROR = MPI_ERR_TRUNCATE;
}
msg_length = ev->mlength;
ptl_request->super.super.ompi_req->req_status.MPI_SOURCE =
MTL_PORTALS4_GET_SOURCE(ev->match_bits);
ptl_request->super.super.ompi_req->req_status.MPI_TAG =
MTL_PORTALS4_GET_TAG(ev->match_bits);
if (OPAL_UNLIKELY(msg_length > ptl_request->delivery_len)) {
opal_output_verbose(1, ompi_mtl_base_framework.framework_output,
"truncate expected: %ld %ld",
msg_length, ptl_request->delivery_len);
ptl_request->super.super.ompi_req->req_status.MPI_ERROR = MPI_ERR_TRUNCATE;
}
#if OPAL_ENABLE_DEBUG
ptl_request->hdr_data = ev->hdr_data;
ptl_request->hdr_data = ev->hdr_data;
#endif
if (!MTL_PORTALS4_IS_SHORT_MSG(ev->match_bits) && ompi_mtl_portals4.protocol == rndv) {
/* If it's not a short message and we're doing rndv, we
if (!MTL_PORTALS4_IS_SHORT_MSG(ev->match_bits) && ompi_mtl_portals4.protocol == rndv) {
/* If it's not a short message and we're doing rndv, we
only have the first part of the message. Issue the get
to pull the second part of the message. */
ret = read_msg((char*) ptl_request->delivery_ptr + ompi_mtl_portals4.eager_limit,
((msg_length > ptl_request->delivery_len) ?
ptl_request->delivery_len : msg_length) - ompi_mtl_portals4.eager_limit,
ev->initiator,
ev->hdr_data,
ompi_mtl_portals4.eager_limit,
ptl_request);
if (OPAL_UNLIKELY(OMPI_SUCCESS != ret)) {
if (NULL != ptl_request->buffer_ptr) free(ptl_request->buffer_ptr);
goto callback_error;
}
ret = read_msg((char*) ptl_request->delivery_ptr + ompi_mtl_portals4.eager_limit,
((msg_length > ptl_request->delivery_len) ?
ptl_request->delivery_len : msg_length) - ompi_mtl_portals4.eager_limit,
ev->initiator,
ev->hdr_data,
ompi_mtl_portals4.eager_limit,
ptl_request);
if (OPAL_UNLIKELY(OMPI_SUCCESS != ret)) {
if (NULL != ptl_request->buffer_ptr) free(ptl_request->buffer_ptr);
goto callback_error;
}
} else {
/* If we're either using the eager protocol or were a
} else {
/* If we're either using the eager protocol or were a
short message, all data has been received, so complete
the message. */
ret = ompi_mtl_datatype_unpack(ptl_request->convertor,
ev->start,
ev->mlength);
if (OPAL_UNLIKELY(OMPI_SUCCESS != ret)) {
opal_output_verbose(1, ompi_mtl_base_framework.framework_output,
"%s:%d: ompi_mtl_datatype_unpack failed: %d",
__FILE__, __LINE__, ret);
ptl_request->super.super.ompi_req->req_status.MPI_ERROR = ret;
}
ptl_request->super.super.ompi_req->req_status._ucount = ev->mlength;
ret = ompi_mtl_datatype_unpack(ptl_request->convertor,
ev->start,
ev->mlength);
if (OPAL_UNLIKELY(OMPI_SUCCESS != ret)) {
opal_output_verbose(1, ompi_mtl_base_framework.framework_output,
"%s:%d: ompi_mtl_datatype_unpack failed: %d",
__FILE__, __LINE__, ret);
ptl_request->super.super.ompi_req->req_status.MPI_ERROR = ret;
}
ptl_request->super.super.ompi_req->req_status._ucount = ev->mlength;
OPAL_OUTPUT_VERBOSE((50, ompi_mtl_base_framework.framework_output,
"Recv %lu (0x%lx) completed, expected",
ptl_request->opcount, ptl_request->hdr_data));
ptl_request->super.super.completion_callback(&ptl_request->super.super);
OPAL_OUTPUT_VERBOSE((50, ompi_mtl_base_framework.framework_output,
"Recv %lu (0x%lx) completed, expected",
ptl_request->opcount, ptl_request->hdr_data));
ptl_request->super.super.completion_callback(&ptl_request->super.super);
}
}
break;
@ -162,8 +198,11 @@ ompi_mtl_portals4_recv_progress(ptl_event_t *ev,
goto callback_error;
}
if (ptl_request->is_triggered)
PtlCTFree(ptl_request->ct_h);
/* set the received length in the status, now that we know
excatly how much data was sent. */
exactly how much data was sent. */
ptl_request->super.super.ompi_req->req_status._ucount = ev->mlength;
if (ompi_mtl_portals4.protocol == rndv) {
ptl_request->super.super.ompi_req->req_status._ucount +=
@ -207,95 +246,96 @@ ompi_mtl_portals4_recv_progress(ptl_event_t *ev,
goto callback_error;
}
ptl_request->me_h = PTL_INVALID_HANDLE;
if (!ptl_request->is_triggered) {
ptl_request->me_h = PTL_INVALID_HANDLE;
msg_length = MTL_PORTALS4_GET_LENGTH(ev->hdr_data);
ptl_request->super.super.ompi_req->req_status.MPI_SOURCE =
MTL_PORTALS4_GET_SOURCE(ev->match_bits);
ptl_request->super.super.ompi_req->req_status.MPI_TAG =
MTL_PORTALS4_GET_TAG(ev->match_bits);
if (OPAL_UNLIKELY(msg_length > ptl_request->delivery_len)) {
opal_output_verbose(1, ompi_mtl_base_framework.framework_output,
"truncate unexpected: %ld %ld %d",
msg_length, ptl_request->delivery_len,
MTL_PORTALS4_IS_SHORT_MSG(ev->match_bits));
ptl_request->super.super.ompi_req->req_status.MPI_ERROR = MPI_ERR_TRUNCATE;
}
msg_length = ev->mlength;
ptl_request->super.super.ompi_req->req_status.MPI_SOURCE =
MTL_PORTALS4_GET_SOURCE(ev->match_bits);
ptl_request->super.super.ompi_req->req_status.MPI_TAG =
MTL_PORTALS4_GET_TAG(ev->match_bits);
if (OPAL_UNLIKELY(msg_length > ptl_request->delivery_len)) {
opal_output_verbose(1, ompi_mtl_base_framework.framework_output,
"truncate unexpected: %ld %ld %d",
msg_length, ptl_request->delivery_len,
MTL_PORTALS4_IS_SHORT_MSG(ev->match_bits));
ptl_request->super.super.ompi_req->req_status.MPI_ERROR = MPI_ERR_TRUNCATE;
}
#if OPAL_ENABLE_DEBUG
ptl_request->hdr_data = ev->hdr_data;
ptl_request->hdr_data = ev->hdr_data;
#endif
/* overflow case. Short messages have the buffer stashed
/* overflow case. Short messages have the buffer stashed
somewhere. Long messages left in buffer at the source */
if (MTL_PORTALS4_IS_SHORT_MSG(ev->match_bits)) {
ptl_request->super.super.ompi_req->req_status._ucount = ev->mlength;
if (ev->mlength > 0) {
struct iovec iov;
uint32_t iov_count = 1;
size_t max_data;
iov.iov_base = (char*) ev->start;
iov.iov_len = ev->mlength;
max_data = iov.iov_len;
if (MTL_PORTALS4_IS_SHORT_MSG(ev->match_bits)) {
ptl_request->super.super.ompi_req->req_status._ucount = ev->mlength;
if (ev->mlength > 0) {
struct iovec iov;
uint32_t iov_count = 1;
size_t max_data;
iov.iov_base = (char*) ev->start;
iov.iov_len = ev->mlength;
max_data = iov.iov_len;
ret = opal_convertor_unpack(ptl_request->convertor,
&iov, &iov_count,
&max_data );
if (NULL != ptl_request->buffer_ptr) free(ptl_request->buffer_ptr);
if (OPAL_UNLIKELY(ret < 0)) {
opal_output_verbose(1, ompi_mtl_base_framework.framework_output,
"%s:%d: opal_convertor_unpack failed: %d",
__FILE__, __LINE__, ret);
goto callback_error;
ret = opal_convertor_unpack(ptl_request->convertor,
&iov, &iov_count,
&max_data );
if (NULL != ptl_request->buffer_ptr) free(ptl_request->buffer_ptr);
if (OPAL_UNLIKELY(ret < 0)) {
opal_output_verbose(1, ompi_mtl_base_framework.framework_output,
"%s:%d: opal_convertor_unpack failed: %d",
__FILE__, __LINE__, ret);
goto callback_error;
}
}
}
/* if it's a sync, send the ack */
if (MTL_PORTALS4_IS_SYNC_MSG(ev->hdr_data)) {
/* if it's a sync, send the ack */
if (MTL_PORTALS4_IS_SYNC_MSG(ev->hdr_data)) {
OPAL_OUTPUT_VERBOSE((50, ompi_mtl_base_framework.framework_output,
"Recv %lu (0x%lx) sending sync ack",
ptl_request->opcount, ptl_request->hdr_data));
ret = PtlPut(ompi_mtl_portals4.zero_md_h,
0,
0,
PTL_NO_ACK_REQ,
ev->initiator,
ompi_mtl_portals4.read_idx,
ev->hdr_data,
0,
NULL,
0);
if (OPAL_UNLIKELY(PTL_OK != ret)) {
opal_output_verbose(1, ompi_mtl_base_framework.framework_output,
"%s:%d: PtlPut failed: %d",
__FILE__, __LINE__, ret);
goto callback_error;
}
}
OPAL_OUTPUT_VERBOSE((50, ompi_mtl_base_framework.framework_output,
"Recv %lu (0x%lx) sending sync ack",
ptl_request->opcount, ptl_request->hdr_data));
ret = PtlPut(ompi_mtl_portals4.zero_md_h,
0,
0,
PTL_NO_ACK_REQ,
ev->initiator,
ompi_mtl_portals4.read_idx,
ev->hdr_data,
0,
NULL,
0);
if (OPAL_UNLIKELY(PTL_OK != ret)) {
opal_output_verbose(1, ompi_mtl_base_framework.framework_output,
"%s:%d: PtlPut failed: %d",
__FILE__, __LINE__, ret);
"Recv %lu (0x%lx) completed, unexpected short (0x%lx)",
ptl_request->opcount, ptl_request->hdr_data, (long) ev->start));
ptl_request->super.super.completion_callback(&ptl_request->super.super);
} else {
if (ev->mlength > 0) {
/* if rndv or triggered, copy the eager part to the right place */
memcpy(ptl_request->delivery_ptr, ev->start, ev->mlength);
}
ret = read_msg((char*) ptl_request->delivery_ptr + ev->mlength,
((msg_length > ptl_request->delivery_len) ?
ptl_request->delivery_len : msg_length) - ev->mlength,
ev->initiator,
ev->hdr_data,
ev->mlength,
ptl_request);
if (OPAL_UNLIKELY(OMPI_SUCCESS != ret)) {
if (NULL != ptl_request->buffer_ptr) free(ptl_request->buffer_ptr);
goto callback_error;
}
}
OPAL_OUTPUT_VERBOSE((50, ompi_mtl_base_framework.framework_output,
"Recv %lu (0x%lx) completed, unexpected short (0x%lx)",
ptl_request->opcount, ptl_request->hdr_data, (long) ev->start));
ptl_request->super.super.completion_callback(&ptl_request->super.super);
} else {
if (ev->mlength > 0) {
/* if rndv or triggered, copy the eager part to the right place */
memcpy(ptl_request->delivery_ptr, ev->start, ev->mlength);
}
ret = read_msg((char*) ptl_request->delivery_ptr + ev->mlength,
((msg_length > ptl_request->delivery_len) ?
ptl_request->delivery_len : msg_length) - ev->mlength,
ev->initiator,
ev->hdr_data,
ev->mlength,
ptl_request);
if (OPAL_UNLIKELY(OMPI_SUCCESS != ret)) {
if (NULL != ptl_request->buffer_ptr) free(ptl_request->buffer_ptr);
goto callback_error;
}
}
break;
case PTL_EVENT_LINK:
@ -327,6 +367,7 @@ ompi_mtl_portals4_irecv(struct mca_mtl_base_module_t* mtl,
mca_mtl_request_t *mtl_request)
{
ptl_match_bits_t match_bits, ignore_bits;
ptl_hdr_data_t hdr_data;
int ret = OMPI_SUCCESS;
ptl_process_t remote_proc;
ompi_mtl_portals4_recv_request_t *ptl_request =
@ -353,11 +394,28 @@ ompi_mtl_portals4_irecv(struct mca_mtl_base_module_t* mtl,
MTL_PORTALS4_SET_RECV_BITS(match_bits, ignore_bits, comm->c_contextid,
src, tag);
MTL_PORTALS4_SET_HDR_DATA(hdr_data, tag, comm->c_contextid, 0);
ret = ompi_mtl_datatype_recv_buf(convertor, &start, &length, &free_after);
if (OPAL_UNLIKELY(OMPI_SUCCESS != ret)) {
return ret;
}
ptl_request->is_triggered =
((ompi_mtl_portals4.protocol == eager) ||
(ompi_mtl_portals4.eager_limit >= length) ||
(MPI_ANY_SOURCE == src) ||
(MPI_ANY_TAG == tag)) ? false : true;
if (ptl_request->is_triggered) {
ret = triggered_read_msg(ptl_request->delivery_ptr + ompi_mtl_portals4.eager_limit,
ptl_request->delivery_len - ompi_mtl_portals4.eager_limit,
remote_proc,
hdr_data,
ompi_mtl_portals4.eager_limit,
ptl_request);
}
ptl_request->super.type = portals4_req_recv;
ptl_request->super.event_callback = ompi_mtl_portals4_recv_progress;
#if OPAL_ENABLE_DEBUG
@ -372,20 +430,26 @@ ompi_mtl_portals4_irecv(struct mca_mtl_base_module_t* mtl,
ptl_request->super.super.ompi_req->req_status.MPI_ERROR = OMPI_SUCCESS;
OPAL_OUTPUT_VERBOSE((50, ompi_mtl_base_framework.framework_output,
"Recv %lu from %x,%x of length %ld (0x%lx, 0x%lx, 0x%lx)\n",
"Recv %lu from %x,%x of length %ld (0x%lx, 0x%lx, 0x%lx, 0x%lx)\n",
ptl_request->opcount,
remote_proc.phys.nid, remote_proc.phys.pid,
(int64_t)length, match_bits, ignore_bits, (unsigned long) ptl_request));
(int64_t)length, match_bits, ignore_bits, hdr_data, (unsigned long) ptl_request));
me.start = start;
me.length = length;
me.ct_handle = PTL_CT_NONE;
if (ptl_request->is_triggered)
me.ct_handle = ptl_request->ct_h;
else
me.ct_handle = PTL_CT_NONE;
me.min_free = 0;
me.uid = ompi_mtl_portals4.uid;
me.options =
PTL_ME_OP_PUT |
PTL_ME_USE_ONCE |
PTL_ME_EVENT_UNLINK_DISABLE;
if (ptl_request->is_triggered)
me.options |= PTL_ME_EVENT_CT_COMM | PTL_ME_EVENT_CT_OVERFLOW;
if (length <= ompi_mtl_portals4.eager_limit) {
me.options |= PTL_ME_EVENT_LINK_DISABLE;
}

Просмотреть файл

@ -69,6 +69,8 @@ struct ompi_mtl_portals4_recv_request_t {
ompi_mtl_portals4_base_request_t super;
void *buffer_ptr;
ptl_handle_me_t me_h;
ptl_handle_ct_t ct_h;
bool is_triggered;
struct opal_convertor_t *convertor;
void *delivery_ptr;
size_t delivery_len;

Просмотреть файл

@ -189,8 +189,7 @@ ompi_mtl_portals4_short_isend(mca_pml_base_send_mode_t mode,
MTL_PORTALS4_SET_SEND_BITS(match_bits, contextid, localrank, tag,
MTL_PORTALS4_SHORT_MSG);
MTL_PORTALS4_SET_HDR_DATA(hdr_data, ptl_request->opcount, length,
(MCA_PML_BASE_SEND_SYNCHRONOUS == mode) ? 1 : 0);
MTL_PORTALS4_SET_HDR_DATA(hdr_data, tag, contextid, (MCA_PML_BASE_SEND_SYNCHRONOUS == mode) ? 1 : 0);
if (MCA_PML_BASE_SEND_SYNCHRONOUS == mode) {
me.start = NULL;
@ -274,7 +273,7 @@ ompi_mtl_portals4_long_isend(void *start, size_t length, int contextid, int tag,
MTL_PORTALS4_SET_SEND_BITS(match_bits, contextid, localrank, tag,
MTL_PORTALS4_LONG_MSG);
MTL_PORTALS4_SET_HDR_DATA(hdr_data, ptl_request->opcount, length, 0);
MTL_PORTALS4_SET_HDR_DATA(hdr_data, tag, contextid, 0);
me.start = start;
me.length = length;