1
1

* Leverage hdr_data and opcount to improve debugging

* Clean up handling of short synchronous messages

This commit was SVN r25208.
Этот коммит содержится в:
Brian Barrett 2011-09-28 21:18:47 +00:00
родитель 71d8300607
Коммит bb9e73232a
7 изменённых файлов: 105 добавлений и 122 удалений

Просмотреть файл

@ -166,7 +166,7 @@ ompi_mtl_portals4_progress(void)
while (true) {
ret = PtlEQGet(ompi_mtl_portals4.eq_h, &ev);
if (PTL_OK == ret) {
OPAL_OUTPUT_VERBOSE((50, ompi_mtl_base_output,
OPAL_OUTPUT_VERBOSE((60, ompi_mtl_base_output,
"Found event of type %d\n", ev.type));
switch (ev.type) {
case PTL_EVENT_GET:
@ -234,77 +234,6 @@ ompi_mtl_portals4_progress(void)
"Error returned from PtlEQGet: %d", ret);
abort();
}
ret = PtlEQGet(ompi_mtl_portals4.tmp_eq_h, &ev);
if (PTL_OK == ret) {
OPAL_OUTPUT_VERBOSE((50, ompi_mtl_base_output,
"Found event of type %d\n", ev.type));
switch (ev.type) {
case PTL_EVENT_GET:
case PTL_EVENT_PUT:
case PTL_EVENT_PUT_OVERFLOW:
case PTL_EVENT_ATOMIC:
case PTL_EVENT_ATOMIC_OVERFLOW:
if (NULL != ev.user_ptr) {
ptl_request = ev.user_ptr;
ret = ptl_request->event_callback(&ev, ptl_request);
if (OMPI_SUCCESS != ret) {
opal_output(ompi_mtl_base_output,
"Error returned from target event callback: %d", ret);
abort();
}
}
break;
case PTL_EVENT_REPLY:
case PTL_EVENT_SEND:
case PTL_EVENT_ACK:
if (NULL != ev.user_ptr) {
ptl_request = ev.user_ptr;
ret = ptl_request->event_callback(&ev, ptl_request);
if (OMPI_SUCCESS != ret) {
opal_output(ompi_mtl_base_output,
"Error returned from initiator event callback: %d", ret);
abort();
}
}
break;
case PTL_EVENT_PT_DISABLED:
/* do stuff - flow control */
opal_output(ompi_mtl_base_output, "Unhandled read flow control event.");
abort();
break;
case PTL_EVENT_AUTO_UNLINK:
break;
case PTL_EVENT_AUTO_FREE:
if (OMPI_SUCCESS != (ret = ompi_mtl_portals4_recv_short_block_repost(&ev))) {
opal_output(ompi_mtl_base_output,
"Error returned from PTL_EVENT_FREE callback: %d", ret);
abort();
}
break;
case PTL_EVENT_SEARCH:
if (NULL != ev.user_ptr) {
ptl_request = ev.user_ptr;
ret = ptl_request->event_callback(&ev, ptl_request);
if (OMPI_SUCCESS != ret) {
opal_output(ompi_mtl_base_output,
"Error returned from target event callback: %d", ret);
abort();
}
}
break;
default:
opal_output(ompi_mtl_base_output,
"Unknown event type %d (error: %d)", (int)ev.type, ret);
abort();
}
} else if (PTL_EQ_EMPTY == ret) {
break;
} else {
opal_output(ompi_mtl_base_output,
"Error returned from PtlEQGet: %d", ret);
abort();
}
}
return count;

Просмотреть файл

@ -48,7 +48,6 @@ struct mca_mtl_portals4_module_t {
/* global handles */
ptl_handle_ni_t ni_h;
ptl_handle_eq_t eq_h;
ptl_handle_eq_t tmp_eq_h;
/* for zero-length sends and acks */
ptl_handle_md_t zero_md_h;
@ -58,8 +57,11 @@ struct mca_mtl_portals4_module_t {
opal_list_t recv_short_blocks;
/* number of send-side operations started */
/* number of operations started */
uint32_t opcount;
#if OPAL_ENABLE_DEBUG
uint32_t recv_opcount;
#endif
enum { eager, rndv } protocol;
};
@ -91,9 +93,8 @@ extern mca_mtl_portals4_module_t ompi_mtl_portals4;
#define MTL_PORTALS4_TAG_IGNR 0x000000007FFFFFFFULL
#define MTL_PORTALS4_SHORT_MSG 0x1000000000000000ULL
#define MTL_PORTALS4_SHORT_SYNC_MSG 0x2000000000000000ULL
#define MTL_PORTALS4_LONG_MSG 0x4000000000000000ULL
#define MTL_PORTALS4_READY_MSG 0x8000000000000000ULL
#define MTL_PORTALS4_LONG_MSG 0x2000000000000000ULL
#define MTL_PORTALS4_READY_MSG 0x4000000000000000ULL
/* send posting */
#define MTL_PORTALS4_SET_SEND_BITS(match_bits, contextid, source, tag, type) \
@ -135,22 +136,27 @@ extern mca_mtl_portals4_module_t ompi_mtl_portals4;
(0 != (MTL_PORTALS4_LONG_MSG & match_bits))
/* Non-zero when the MTL_PORTALS4_READY_MSG protocol bit is set in
 * match_bits.  The argument is parenthesized so that an expression such
 * as (a | b) is masked as a whole instead of binding to '&' first. */
#define MTL_PORTALS4_IS_READY_MSG(match_bits) \
    (0 != (MTL_PORTALS4_READY_MSG & (match_bits)))
#define MTL_PORTALS4_IS_SYNC_MSG(match_bits) \
(0 != (MTL_PORTALS4_SHORT_SYNC_MSG & match_bits))
/* Extract the tag field: match_bits masked with MTL_PORTALS4_TAG_MASK,
 * converted to int.  The argument is parenthesized so compound
 * expressions are masked as a whole. */
#define MTL_PORTALS4_GET_TAG(match_bits) \
    ((int)((match_bits) & MTL_PORTALS4_TAG_MASK))
/* Extract the source field: the bits selected by
 * MTL_PORTALS4_SOURCE_MASK, shifted down by 32 and converted to int. */
#define MTL_PORTALS4_GET_SOURCE(match_bits) \
    ((int)(((match_bits) & MTL_PORTALS4_SOURCE_MASK) >> 32))
#define MTL_PORTALS4_SET_HDR_DATA(hdr_data, opcount, length) \
{ \
hdr_data = opcount & 0xFFFFULL; \
hdr_data = (hdr_data << 48); \
hdr_data |= (length & 0xFFFFFFFFFFFFULL); \
/* Bit 63 of hdr_data: set by the sender when the message is a
 * synchronous send (see MTL_PORTALS4_SET_HDR_DATA). */
#define MTL_PORTALS4_SYNC_MSG 0x8000000000000000ULL

/*
 * Pack the 64-bit header data word:
 *   bit  63     - sync flag (1 when 'sync' is non-zero, else 0)
 *   bits 48-62  - low 15 bits of 'opcount'
 *   bits  0-47  - low 48 bits of 'length'
 *
 * 'hdr_data' must be a modifiable lvalue; it is evaluated several times,
 * so it must not have side effects.  Every argument is parenthesized so
 * that compound expressions (e.g. "a | b") are masked as a whole, and the
 * body is wrapped in do { } while (0) so the macro behaves as a single
 * statement (safe in an unbraced if/else).  Invoke it with a trailing ';'.
 */
#define MTL_PORTALS4_SET_HDR_DATA(hdr_data, opcount, length, sync)  \
    do {                                                            \
        (hdr_data) = (sync) ? 1 : 0;                                \
        (hdr_data) = ((hdr_data) << 15);                            \
        (hdr_data) |= ((opcount) & 0x7FFFULL);                      \
        (hdr_data) = ((hdr_data) << 48);                            \
        (hdr_data) |= ((length) & 0xFFFFFFFFFFFFULL);               \
    } while (0)

/* Recover the message length stored in the low 48 bits of hdr_data. */
#define MTL_PORTALS4_GET_LENGTH(hdr_data) \
    ((size_t)((hdr_data) & 0xFFFFFFFFFFFFULL))

/* Non-zero when the sync flag (bit 63) is set in hdr_data. */
#define MTL_PORTALS4_IS_SYNC_MSG(hdr_data) \
    (0 != (MTL_PORTALS4_SYNC_MSG & (hdr_data)))
/* MTL interface functions */
extern int ompi_mtl_portals4_finalize(struct mca_mtl_base_module_t *mtl);

Просмотреть файл

@ -200,6 +200,10 @@ ompi_mtl_portals4_component_init(bool enable_progress_threads,
goto error;
}
OPAL_OUTPUT_VERBOSE((50, ompi_mtl_base_output,
"My nid,pid = %x,%x",
id.phys.nid, id.phys.pid));
/* create event queue */
ret = PtlEQAlloc(ompi_mtl_portals4.ni_h,
ompi_mtl_portals4.queue_size,
@ -211,16 +215,6 @@ ompi_mtl_portals4_component_init(bool enable_progress_threads,
goto error;
}
ret = PtlEQAlloc(ompi_mtl_portals4.ni_h,
ompi_mtl_portals4.queue_size,
&ompi_mtl_portals4.tmp_eq_h);
if (PTL_OK != ret) {
opal_output(ompi_mtl_base_output,
"%s:%d: PtlEQAlloc failed: %d\n",
__FILE__, __LINE__, ret);
goto error;
}
/* Create portal table entries */
ret = PtlPTAlloc(ompi_mtl_portals4.ni_h,
PTL_PT_FLOWCTRL,
@ -235,7 +229,7 @@ ompi_mtl_portals4_component_init(bool enable_progress_threads,
}
ret = PtlPTAlloc(ompi_mtl_portals4.ni_h,
PTL_PT_FLOWCTRL,
ompi_mtl_portals4.tmp_eq_h,
ompi_mtl_portals4.eq_h,
REQ_READ_TABLE_ID,
&ompi_mtl_portals4.read_idx);
if (PTL_OK != ret) {
@ -296,6 +290,9 @@ ompi_mtl_portals4_component_init(bool enable_progress_threads,
}
ompi_mtl_portals4.opcount = 0;
#if OPAL_ENABLE_DEBUG
ompi_mtl_portals4.recv_opcount = 0;
#endif
/* activate progress callback */
ret = opal_progress_register(ompi_mtl_portals4_progress);

Просмотреть файл

@ -110,5 +110,5 @@ ompi_mtl_portals4_iprobe(struct mca_mtl_base_module_t* mtl,
*status = request.status;
}
return OMPI_ERR_NOT_IMPLEMENTED;
return OMPI_SUCCESS;
}

Просмотреть файл

@ -44,6 +44,9 @@ ompi_mtl_portals4_recv_progress(ptl_event_t *ev,
switch (ev->type) {
case PTL_EVENT_PUT:
OPAL_OUTPUT_VERBOSE((50, ompi_mtl_base_output, "Recv %d (0x%lx) got put event",
ptl_request->opcount, ev->hdr_data));
if (ev->ni_fail_type != PTL_NI_OK) {
opal_output(ompi_mtl_base_output,
"%s:%d: PTL_EVENT_PUT with ni_fail_type: %d",
@ -57,9 +60,15 @@ ompi_mtl_portals4_recv_progress(ptl_event_t *ev,
ptl_request->super.super.ompi_req->req_status.MPI_TAG =
MTL_PORTALS4_GET_TAG(ev->match_bits);
if (msg_length > ptl_request->delivery_len) {
opal_output(ompi_mtl_base_output, "truncate: %d %d",
msg_length, ptl_request->delivery_len);
ptl_request->super.super.ompi_req->req_status.MPI_ERROR = MPI_ERR_TRUNCATE;
}
#if OPAL_ENABLE_DEBUG
ptl_request->hdr_data = ev->hdr_data;
#endif
if (!MTL_PORTALS4_IS_SHORT_MSG(ev->match_bits) && ompi_mtl_portals4.protocol == rndv) {
ptl_md_t md;
@ -111,12 +120,16 @@ ompi_mtl_portals4_recv_progress(ptl_event_t *ev,
}
ptl_request->super.super.ompi_req->req_status._ucount = ev->mlength;
OPAL_OUTPUT_VERBOSE((50, ompi_mtl_base_output, "recv completed"));
OPAL_OUTPUT_VERBOSE((50, ompi_mtl_base_output, "Recv %d (0x%lx) completed, expected",
ptl_request->opcount, ptl_request->hdr_data));
ptl_request->super.super.completion_callback(&ptl_request->super.super);
}
break;
case PTL_EVENT_REPLY:
OPAL_OUTPUT_VERBOSE((50, ompi_mtl_base_output, "Recv %d (0x%lx) got reply event",
ptl_request->opcount, ptl_request->hdr_data));
if (ev->ni_fail_type != PTL_NI_OK) {
opal_output(ompi_mtl_base_output,
"%s:%d: PTL_EVENT_REPLY with ni_fail_type: %d",
@ -144,11 +157,15 @@ ompi_mtl_portals4_recv_progress(ptl_event_t *ev,
}
PtlMDRelease(ptl_request->md_h);
OPAL_OUTPUT_VERBOSE((50, ompi_mtl_base_output, "recv completed"));
OPAL_OUTPUT_VERBOSE((50, ompi_mtl_base_output, "Recv %d (0x%lx) completed, reply",
ptl_request->opcount, ptl_request->hdr_data));
ptl_request->super.super.completion_callback(&ptl_request->super.super);
break;
case PTL_EVENT_PUT_OVERFLOW:
OPAL_OUTPUT_VERBOSE((50, ompi_mtl_base_output, "Recv %d (0x%lx) got put_overflow event",
ptl_request->opcount, ev->hdr_data));
if (ev->ni_fail_type != PTL_NI_OK) {
opal_output(ompi_mtl_base_output,
"%s:%d: PTL_EVENT_PUT_OVERFLOW with ni_fail_type: %d",
@ -162,9 +179,15 @@ ompi_mtl_portals4_recv_progress(ptl_event_t *ev,
ptl_request->super.super.ompi_req->req_status.MPI_TAG =
MTL_PORTALS4_GET_TAG(ev->match_bits);
if (msg_length > ptl_request->delivery_len) {
opal_output(ompi_mtl_base_output, "truncate: %d %d",
msg_length, ptl_request->delivery_len);
ptl_request->super.super.ompi_req->req_status.MPI_ERROR = MPI_ERR_TRUNCATE;
}
#if OPAL_ENABLE_DEBUG
ptl_request->hdr_data = ev->hdr_data;
#endif
/* overflow case. Short messages have the buffer stashed
somewhere. Long messages left in buffer at the source */
if (MTL_PORTALS4_IS_SHORT_MSG(ev->match_bits)) {
@ -188,9 +211,10 @@ ompi_mtl_portals4_recv_progress(ptl_event_t *ev,
goto callback_error;
}
}
/* if it's a sync, send the ack */
if (MTL_PORTALS4_IS_SYNC_MSG(ev->match_bits)) {
if (MTL_PORTALS4_IS_SYNC_MSG(ev->hdr_data)) {
OPAL_OUTPUT_VERBOSE((50, ompi_mtl_base_output, "Recv %d (0x%lx) sending sync ack",
ptl_request->opcount, ptl_request->hdr_data));
ret = PtlPut(ompi_mtl_portals4.zero_md_h,
0,
0,
@ -209,7 +233,8 @@ ompi_mtl_portals4_recv_progress(ptl_event_t *ev,
}
}
OPAL_OUTPUT_VERBOSE((50, ompi_mtl_base_output, "recv completed"));
OPAL_OUTPUT_VERBOSE((50, ompi_mtl_base_output, "Recv %d (0x%lx) completed, unexpected short",
ptl_request->opcount, ptl_request->hdr_data));
ptl_request->super.super.completion_callback(&ptl_request->super.super);
} else {
@ -233,6 +258,8 @@ ompi_mtl_portals4_recv_progress(ptl_event_t *ev,
goto callback_error;
}
OPAL_OUTPUT_VERBOSE((50, ompi_mtl_base_output, "Recv %d (0x%lx) getting long data",
ptl_request->opcount, ptl_request->hdr_data));
ret = PtlGet(ptl_request->md_h,
0,
md.length,
@ -309,6 +336,10 @@ ompi_mtl_portals4_irecv(struct mca_mtl_base_module_t* mtl,
return ret;
}
#if OPAL_ENABLE_DEBUG
ptl_request->opcount = ++ompi_mtl_portals4.recv_opcount;
ptl_request->hdr_data = 0;
#endif
ptl_request->super.event_callback = ompi_mtl_portals4_recv_progress;
ptl_request->buffer_ptr = (free_after) ? start : NULL;
ptl_request->convertor = convertor;
@ -317,7 +348,8 @@ ompi_mtl_portals4_irecv(struct mca_mtl_base_module_t* mtl,
ptl_request->super.super.ompi_req->req_status.MPI_ERROR = OMPI_SUCCESS;
OPAL_OUTPUT_VERBOSE((50, ompi_mtl_base_output,
"Recv from %x,%x of length %d\n",
"Recv %d from %x,%x of length %d\n",
ptl_request->opcount,
remote_proc.phys.nid, remote_proc.phys.pid,
(int)length));

Просмотреть файл

@ -37,6 +37,7 @@ struct ompi_mtl_portals4_send_request_t {
ptl_handle_md_t md_h;
ptl_handle_me_t me_h;
volatile int event_count;
int opcount;
};
typedef struct ompi_mtl_portals4_send_request_t ompi_mtl_portals4_send_request_t;
@ -46,10 +47,13 @@ struct ompi_mtl_portals4_recv_request_t {
void *buffer_ptr;
ptl_handle_md_t md_h;
ptl_handle_me_t me_h;
ptl_handle_ct_t ct_h;
struct opal_convertor_t *convertor;
void *delivery_ptr;
size_t delivery_len;
#if OPAL_ENABLE_DEBUG
int opcount;
ptl_hdr_data_t hdr_data;
#endif
};
typedef struct ompi_mtl_portals4_recv_request_t ompi_mtl_portals4_recv_request_t;

Просмотреть файл

@ -30,24 +30,28 @@
static int
ompi_mtl_portals4_callback(ptl_event_t *ev, struct ompi_mtl_portals4_base_request_t* ptl_base_request)
ompi_mtl_portals4_send_callback(ptl_event_t *ev, struct ompi_mtl_portals4_base_request_t* ptl_base_request)
{
int ret;
ompi_mtl_portals4_send_request_t* ptl_request =
(ompi_mtl_portals4_send_request_t*) ptl_base_request;
assert(ev->type == PTL_EVENT_SEND || ev->type == PTL_EVENT_ACK || ev->type == PTL_EVENT_GET);
assert(NULL != ptl_request->super.super.ompi_req);
if (ev->ni_fail_type != PTL_NI_OK) {
opal_output_verbose(1, ompi_mtl_base_output,
"%s:%d: long send callback ni_fail_type: %d",
__FILE__, __LINE__, ev->ni_fail_type);
opal_output(ompi_mtl_base_output,
"%s:%d: send callback ni_fail_type: %d",
__FILE__, __LINE__, ev->ni_fail_type);
ptl_request->super.super.ompi_req->req_status.MPI_ERROR = OMPI_ERROR;
ptl_request->super.super.completion_callback(&ptl_request->super.super);
abort();
return OMPI_ERROR;
}
OPAL_OUTPUT_VERBOSE((50, ompi_mtl_base_output,
"Send %d got event of type %d",
ptl_request->opcount, ev->type));
/* we only receive an ack if the message was received into an
expected message. Otherwise, we don't get an ack, but mark
completion when the message was pulled (long message). A short
@ -59,11 +63,12 @@ ompi_mtl_portals4_callback(ptl_event_t *ev, struct ompi_mtl_portals4_base_reques
ret = PtlMDRelease(ptl_request->md_h);
if (PTL_OK != ret) {
opal_output_verbose(1, ompi_mtl_base_output,
"%s:%d: long send callback PtlMDRelease returned %d",
"%s:%d: send callback PtlMDRelease returned %d",
__FILE__, __LINE__, ret);
ptl_request->super.super.ompi_req->req_status.MPI_ERROR = OMPI_ERROR;
}
OPAL_OUTPUT_VERBOSE((50, ompi_mtl_base_output, "send completed"));
OPAL_OUTPUT_VERBOSE((50, ompi_mtl_base_output, "Send %d completed",
ptl_request->opcount));
ptl_request->super.super.completion_callback(&ptl_request->super.super);
}
@ -72,7 +77,7 @@ ompi_mtl_portals4_callback(ptl_event_t *ev, struct ompi_mtl_portals4_base_reques
ret = PtlMEUnlink(ptl_request->me_h);
if (PTL_OK != ret) {
opal_output_verbose(1, ompi_mtl_base_output,
"%s:%d: long send callback PtlMDUnlink returned %d",
"%s:%d: send callback PtlMDUnlink returned %d",
__FILE__, __LINE__, ret);
}
}
@ -93,13 +98,13 @@ ompi_mtl_portals4_short_isend(mca_pml_base_send_mode_t mode,
ptl_hdr_data_t hdr_data;
ptl_md_t md;
ptl_request->super.event_callback = ompi_mtl_portals4_callback;
ptl_request->super.event_callback = ompi_mtl_portals4_send_callback;
ptl_request->event_count = 1;
mode_bits = (MCA_PML_BASE_SEND_READY != mode) ? MTL_PORTALS4_SHORT_MSG : MTL_PORTALS4_READY_MSG;
MTL_PORTALS4_SET_SEND_BITS(match_bits, contextid, localrank, tag, mode_bits);
MTL_PORTALS4_SET_HDR_DATA(hdr_data, ompi_mtl_portals4.opcount, length);
MTL_PORTALS4_SET_HDR_DATA(hdr_data, ptl_request->opcount, length, 0);
md.start = start;
md.length = length;
@ -117,6 +122,10 @@ ompi_mtl_portals4_short_isend(mca_pml_base_send_mode_t mode,
return ompi_mtl_portals4_get_error(ret);
}
OPAL_OUTPUT_VERBOSE((50, ompi_mtl_base_output,
"Send %d short send with hdr_data 0x%lx",
ptl_request->opcount, hdr_data));
ret = PtlPut(ptl_request->md_h,
0,
length,
@ -151,12 +160,12 @@ ompi_mtl_portals4_sync_isend(void *start, int length, int contextid, int tag,
ptl_me_t me;
ptl_hdr_data_t hdr_data;
ptl_request->super.event_callback = ompi_mtl_portals4_callback;
ptl_request->super.event_callback = ompi_mtl_portals4_send_callback;
MTL_PORTALS4_SET_SEND_BITS(match_bits, contextid, localrank, tag,
MTL_PORTALS4_SHORT_SYNC_MSG);
MTL_PORTALS4_SHORT_MSG);
MTL_PORTALS4_SET_HDR_DATA(hdr_data, ompi_mtl_portals4.opcount, length);
MTL_PORTALS4_SET_HDR_DATA(hdr_data, ptl_request->opcount, length, 1);
md.start = start;
md.length = length;
@ -198,7 +207,10 @@ ompi_mtl_portals4_sync_isend(void *start, int length, int contextid, int tag,
return ompi_mtl_portals4_get_error(ret);
}
printf("sync send started\n"); fflush(NULL);
OPAL_OUTPUT_VERBOSE((50, ompi_mtl_base_output,
"Send %d short sync send with hdr_data 0x%lx",
ptl_request->opcount, hdr_data));
ret = PtlPut(ptl_request->md_h,
0,
length,
@ -235,11 +247,11 @@ ompi_mtl_portals4_long_isend(void *start, int length, int contextid, int tag,
ptl_me_t me;
ptl_hdr_data_t hdr_data;
ptl_request->super.event_callback = ompi_mtl_portals4_callback;
ptl_request->super.event_callback = ompi_mtl_portals4_send_callback;
MTL_PORTALS4_SET_SEND_BITS(match_bits, contextid, localrank, tag, MTL_PORTALS4_LONG_MSG);
MTL_PORTALS4_SET_HDR_DATA(hdr_data, ompi_mtl_portals4.opcount, length);
MTL_PORTALS4_SET_HDR_DATA(hdr_data, ptl_request->opcount, length, 0);
md.start = start;
md.length = length;
@ -281,6 +293,10 @@ ompi_mtl_portals4_long_isend(void *start, int length, int contextid, int tag,
return ompi_mtl_portals4_get_error(ret);
}
OPAL_OUTPUT_VERBOSE((50, ompi_mtl_base_output,
"Send %d long send with hdr_data 0x%lx",
ptl_request->opcount, hdr_data));
if (ompi_mtl_portals4.protocol == rndv) {
ret = PtlPut(ptl_request->md_h,
0,
@ -340,15 +356,14 @@ ompi_mtl_portals4_isend(struct mca_mtl_base_module_t* mtl,
ret = ompi_mtl_datatype_pack(convertor, &start, &length, &free_after);
if (OMPI_SUCCESS != ret) return ret;
ptl_request->opcount = ++ompi_mtl_portals4.opcount;
ptl_request->buffer_ptr = (free_after) ? start : NULL;
ptl_request->event_count = 0;
ptl_request->super.super.ompi_req->req_status.MPI_ERROR = OMPI_SUCCESS;
ompi_mtl_portals4.opcount++;
OPAL_OUTPUT_VERBOSE((50, ompi_mtl_base_output,
"Send %d to %x,%x of length %d\n",
ompi_mtl_portals4.opcount,
ptl_request->opcount,
endpoint->ptl_proc.phys.nid, endpoint->ptl_proc.phys.pid,
(int)length));