1
1

- modified convertor copy_and_prepare routines to accept an addition

flag, new flags to be included when convertor is initialized
- modified pml/btl module defs and added stub functions for diagnostic
  output routines to dump state of queues / endpoints
- updates to data reliability pml

This commit was SVN r9329.
Этот коммит содержится в:
Tim Woodall 2006-03-17 18:46:48 +00:00
родитель 712468dbef
Коммит bd870519fd
40 изменённых файлов: 374 добавлений и 271 удалений

Просмотреть файл

@ -62,7 +62,7 @@ static void ompi_free_list_destruct(ompi_free_list_t* fl)
{
opal_list_item_t *item;
#if OMPI_ENABLE_DEBUG
#if 0 && OMPI_ENABLE_DEBUG
if(opal_list_get_size(&fl->super) != fl->fl_num_allocated) {
opal_output(0, "ompi_free_list: %d allocated %d returned: %s:%d\n",
fl->fl_num_allocated, opal_list_get_size(&fl->super),

Просмотреть файл

@ -87,7 +87,7 @@ inline int32_t ompi_convertor_pack( ompi_convertor_t* pConv,
struct iovec* iov, uint32_t* out_size,
size_t* max_data, int32_t* freeAfter )
{
pConv->checksum = OMPI_CSUM_ZERO;
pConv->checksum = OPAL_CSUM_ZERO;
pConv->csum_ui1 = 0;
pConv->csum_ui2 = 0;
@ -111,7 +111,7 @@ inline int32_t ompi_convertor_unpack( ompi_convertor_t* pConv,
struct iovec* iov, uint32_t* out_size,
size_t* max_data, int32_t* freeAfter )
{
pConv->checksum = OMPI_CSUM_ZERO;
pConv->checksum = OPAL_CSUM_ZERO;
pConv->csum_ui1 = 0;
pConv->csum_ui2 = 0;

Просмотреть файл

@ -210,11 +210,12 @@ ompi_convertor_copy_and_prepare_for_send( const ompi_convertor_t* pSrcConv,
const struct ompi_datatype_t* datatype,
int32_t count,
const void* pUserBuf,
int32_t flags,
ompi_convertor_t* convertor )
{
convertor->remoteArch = pSrcConv->remoteArch;
convertor->pFunctions = pSrcConv->pFunctions;
convertor->flags = pSrcConv->flags & ~CONVERTOR_STATE_MASK;
convertor->flags = (pSrcConv->flags | flags) & ~CONVERTOR_STATE_MASK;
return ompi_convertor_prepare_for_send( convertor, datatype, count, pUserBuf );
}
@ -232,11 +233,12 @@ ompi_convertor_copy_and_prepare_for_recv( const ompi_convertor_t* pSrcConv,
const struct ompi_datatype_t* datatype,
int32_t count,
const void* pUserBuf,
int32_t flags,
ompi_convertor_t* convertor )
{
convertor->remoteArch = pSrcConv->remoteArch;
convertor->pFunctions = pSrcConv->pFunctions;
convertor->flags = pSrcConv->flags & ~CONVERTOR_STATE_MASK;
convertor->flags = (pSrcConv->flags | flags) & ~CONVERTOR_STATE_MASK;
return ompi_convertor_prepare_for_recv( convertor, datatype, count, pUserBuf );
}

Просмотреть файл

@ -15,78 +15,28 @@
#ifndef DATATYPE_CHECKSUM_H_HAS_BEEN_INCLUDED
#define DATATYPE_CHECKSUM_H_HAS_BEEN_INCLUDED
#define OMPI_CSUM_ZERO 0
#define OMPI_REQUIRE_DATA_VALIDATION 0
#if OMPI_REQUIRE_DATA_VALIDATION
#include "ompi/datatype/datatype_memcpy.h"
#include "opal/util/crc.h"
#if defined(CHECKSUM)
#define MEMCPY_CSUM( DST, SRC, BLENGTH, CONVERTOR ) \
do { \
/*opal_output( 0, "memcpy dest = %p src = %p length = %d\n", (void*)(DST), (void*)(SRC), (int)(BLENGTH) ); */\
(CONVERTOR)->checksum += opal_bcopy_uicsum_partial( (SRC), (DST), (BLENGTH), (BLENGTH), &(CONVERTOR)->csum_ui1, &(CONVERTOR)->csum_ui2 ); \
(CONVERTOR)->checksum += OPAL_CSUM_BCOPY_PARTIAL( (SRC), (DST), (BLENGTH), (BLENGTH), &(CONVERTOR)->csum_ui1, &(CONVERTOR)->csum_ui2 ); \
} while (0)
#define COMPUTE_CSUM( SRC, BLENGTH, CONVERTOR ) \
do { \
/*opal_output( 0, "memcpy dest = %p src = %p length = %d\n", (void*)(DST), (void*)(SRC), (int)(BLENGTH) ); */\
(CONVERTOR)->checksum += opal_uicsum_partial( (SRC), (BLENGTH), &(CONVERTOR)->csum_ui1, &(CONVERTOR)->csum_ui2 ); \
(CONVERTOR)->checksum += OPAL_CSUM_PARTIAL( (SRC), (BLENGTH), &(CONVERTOR)->csum_ui1, &(CONVERTOR)->csum_ui2 ); \
} while (0)
#define OMPI_CSUM( SRC, BLENGTH ) \
opal_uicsum( (SRC), (BLENGTH) )
#else
#include "ompi/datatype/datatype_memcpy.h"
#define MEMCPY_CSUM( DST, SRC, BLENGTH, CONVERTOR ) \
MEMCPY( (DST), (SRC), (BLENGTH) )
#define COMPUTE_CSUM( SRC, BLENGTH, CONVERTOR )
#define OMPI_CSUM( SRC, BLENGTH ) OMPI_CSUM_ZERO
#endif
/* ADLER_NMAX is the largest n such that 255n(n+1)/2 + (n+1)(BASE-1) <= 2^32-1 */
#define ADLER_NMAX 5551
#define MOD_ADLER 65521
#if OMPI_REQUIRE_DATA_VALIDATION
#define DO1(buf,i) {_a += buf[i]; _b += _a;}
#define DO2(buf,i) DO1(buf,i); DO1(buf,i+1);
#define DO4(buf,i) DO2(buf,i); DO2(buf,i+2);
#define DO8(buf,i) DO4(buf,i); DO4(buf,i+4);
#define DO16(buf) DO8(buf,0); DO8(buf,8);
#define COMPUTE_SPECIFIC_CHECKSUM( DATA, LENGTH, ADLER32) \
do { \
uint8_t *_data = (DATA); /* Pointer to the data to be summed */ \
size_t _len = (LENGTH); /* Length in bytes */ \
uint32_t _a = (ADLER32) & 0xffff, \
_b = ((ADLER32) >> 16) & 0xffff; \
\
while( _len > 0 ) { \
unsigned _tlen = _len > ADLER_NMAX ? ADLER_NMAX : _len; \
_len -= _tlen; \
while( _tlen >= 16 ) { \
DO16(_data); \
_data += 16; \
_tlen -= 16; \
} \
if( 0 != _tlen ) do { \
_a += *_data++; _b += _a; \
} while( --_tlen > 0 ); \
_a = _a % MOD_ADLER; \
_b = _b % MOD_ADLER; \
} \
(ADLER32) = _b << 16 | _a; \
} while(0)
#else
#define COMPUTE_SPECIFIC_CHECKSUM( DATA, LENGTH, ADLER32 )
#endif /* OMPI_REQUIRE_DATA_VALIDATION */
#endif /* DATATYPE_CHECKSUM_H_HAS_BEEN_INCLUDED */

Просмотреть файл

@ -35,6 +35,8 @@
#include "ompi/class/ompi_free_list.h"
#include "ompi/mca/pml/pml.h"
#define OMPI_ENABLE_DEBUG_RELIABILITY 0
/*
* BML types
*/
@ -236,7 +238,60 @@ static inline void mca_bml_base_free(mca_bml_base_btl_t* bml_btl, mca_btl_base_d
*/
}
static inline int mca_bml_base_send(mca_bml_base_btl_t* bml_btl, mca_btl_base_descriptor_t* des, mca_btl_base_tag_t tag) {
#if OMPI_ENABLE_DEBUG_RELIABILITY
struct mca_bml_base_context_t {
size_t index;
mca_btl_base_completion_fn_t cbfunc;
void* cbdata;
};
typedef struct mca_bml_base_context_t mca_bml_base_context_t;
static inline void mca_bml_base_completion(
struct mca_btl_base_module_t*,
struct mca_btl_base_endpoint_t*,
struct mca_btl_base_descriptor_t*,
int status)
{
mca_bml_base_context_t* ctx = des->des_cbdata;
/* restore original state */
((unsigned char*)des->des_src[0].seg_addr.pval)[ctx->index] ^= ~0;
des->des_cbdata = ctx->cbdata;
des->des_cbfunc = ctx->cbfunc;
free(ctx);
/* invoke original callback */
des->des_cbfunc(btl,ep,des,status);
}
static inline int mca_bml_base_send(
mca_bml_base_btl_t* bml_btl,
mca_btl_base_descriptor_t* des,
mca_btl_base_tag_t tag)
{
static int count;
if(count <= 0) {
count = ((1000.0 * rand())/(RAND_MAX+1.0));
if(count % 2) {
/* local completion - network "drops" packet */
des->des_cbfunc(bml_btl->btl, bml_btl->btl_endpoint, des, OMPI_SUCCESS);
return OMPI_SUCCESS;
} else {
/* corrupt data */
mca_bml_base_context_t* ctx = malloc(sizeof(mca_bml_base_context_t));
if(NULL != ctx) {
ctx->index = (des->des_src[0].seg_len * rand() * 1.0) / (RAND_MAX + 1.0);
ctx->cbfunc = des->des_cbfunc;
ctx->cbdata = des->des_cbdata;
((unsigned char*)des->des_src[0].seg_addr.pval)[ctx->index] ^= ~0;
des->des_cbdata = ctx;
des->des_cbfunc = mca_bml_base_completion;
}
}
}
count--;
des->des_context = (void*) bml_btl;
return bml_btl->btl_send(
bml_btl->btl,
@ -245,6 +300,23 @@ static inline int mca_bml_base_send(mca_bml_base_btl_t* bml_btl, mca_btl_base_de
tag);
}
#else
static inline int mca_bml_base_send(
mca_bml_base_btl_t* bml_btl,
mca_btl_base_descriptor_t* des,
mca_btl_base_tag_t tag)
{
des->des_context = (void*) bml_btl;
return bml_btl->btl_send(
bml_btl->btl,
bml_btl->btl_endpoint,
des,
tag);
}
#endif
static inline int mca_bml_base_put(mca_bml_base_btl_t* bml_btl, mca_btl_base_descriptor_t* des) {
des->des_context = (void*) bml_btl;
return bml_btl->btl_put(

Просмотреть файл

@ -57,7 +57,10 @@ OBJ_CLASS_DECLARATION(mca_btl_base_selected_module_t);
OMPI_DECLSPEC int mca_btl_base_open(void);
OMPI_DECLSPEC int mca_btl_base_select(bool enable_progress_threads, bool enable_mpi_threads);
OMPI_DECLSPEC int mca_btl_base_close(void);
OMPI_DECLSPEC void mca_btl_base_dump(mca_btl_base_module_t*);
OMPI_DECLSPEC void mca_btl_base_dump(
struct mca_btl_base_module_t*,
struct mca_btl_base_endpoint_t*,
int verbose);

Просмотреть файл

@ -66,6 +66,9 @@ void mca_btl_base_error_no_nics(const char* transport,
}
void mca_btl_base_dump(mca_btl_base_module_t* btl)
void mca_btl_base_dump(
struct mca_btl_base_module_t* btl,
struct mca_btl_base_endpoint_t* endpoint,
int verbose)
{
}

Просмотреть файл

@ -489,7 +489,9 @@ typedef int (*mca_btl_base_module_get_fn_t)(
*/
typedef void (*mca_btl_base_module_dump_fn_t)(
struct mca_btl_base_module_t* btl
struct mca_btl_base_module_t* btl,
struct mca_btl_base_endpoint_t* endpoint,
int verbose
);

Просмотреть файл

@ -853,7 +853,10 @@ int mca_btl_mvapi_module_init(mca_btl_mvapi_module_t *mvapi_btl)
* Dump state of btl/queues
*/
void mca_btl_mvapi_dump(mca_btl_base_module_t* btl)
void mca_btl_mvapi_dump(
struct mca_btl_base_module_t* btl,
struct mca_btl_base_endpoint_t* endpoint,
int verbose)
{
mca_btl_mvapi_module_t* mvapi_btl = (mca_btl_mvapi_module_t*)btl;

Просмотреть файл

@ -488,6 +488,17 @@ extern void mca_btl_mvapi_send_frag_return(
);
/*
* Dump state of btl/queues
*/
extern void mca_btl_mvapi_dump(
struct mca_btl_base_module_t* btl,
struct mca_btl_base_endpoint_t* endpoint,
int verbose
);
int mca_btl_mvapi_module_init(mca_btl_mvapi_module_t* mvapi_btl);

Просмотреть файл

@ -100,6 +100,7 @@ int mca_coll_sm_bcast_intra(void *buff, int count,
datatype,
count,
buff,
0,
&convertor))) {
return ret;
}
@ -158,6 +159,7 @@ int mca_coll_sm_bcast_intra(void *buff, int count,
datatype,
count,
buff,
0,
&convertor))) {
return ret;
}

Просмотреть файл

@ -227,6 +227,7 @@ static int reduce_inorder(void *sbuf, void* rbuf, int count,
dtype,
segment_ddt_count,
reduce_temp_buffer,
0,
&convertor))) {
free(free_buffer);
return ret;
@ -426,6 +427,7 @@ static int reduce_inorder(void *sbuf, void* rbuf, int count,
dtype,
count,
sbuf,
0,
&convertor))) {
return ret;
}

Просмотреть файл

@ -508,6 +508,7 @@ ompi_osc_pt2pt_sendreq_recv_put(ompi_osc_pt2pt_module_t *module,
datatype,
header->hdr_target_count,
target,
0,
&convertor);
iov.iov_len = header->hdr_msg_length;
iov.iov_base = inbuf;

Просмотреть файл

@ -61,6 +61,7 @@ ompi_osc_pt2pt_process_op(ompi_osc_pt2pt_module_t *module,
datatype,
header->hdr_target_count,
target_buffer,
0,
&convertor);
/* short circuit the reduction operation MPI_REPLACE - it just

Просмотреть файл

@ -104,6 +104,7 @@ ompi_osc_pt2pt_replyreq_init_target(ompi_osc_pt2pt_replyreq_t *replyreq,
target_dt,
target_count,
target_addr,
0,
&(replyreq->rep_target_convertor));
ompi_convertor_get_packed_size(&replyreq->rep_target_convertor,
&replyreq->rep_target_bytes_packed);

Просмотреть файл

@ -123,6 +123,7 @@ ompi_osc_pt2pt_sendreq_init_origin(ompi_osc_pt2pt_sendreq_t *sendreq,
origin_dt,
origin_count,
origin_addr,
0,
&(sendreq->req_origin_convertor));
ompi_convertor_get_packed_size(&sendreq->req_origin_convertor,
&sendreq->req_origin_bytes_packed);
@ -131,6 +132,7 @@ ompi_osc_pt2pt_sendreq_init_origin(ompi_osc_pt2pt_sendreq_t *sendreq,
origin_dt,
origin_count,
origin_addr,
0,
&(sendreq->req_origin_convertor));
ompi_convertor_get_packed_size(&sendreq->req_origin_convertor,
&sendreq->req_origin_bytes_packed);

Просмотреть файл

@ -105,6 +105,7 @@ typedef struct mca_pml_base_send_request_t mca_pml_base_send_request_t;
(request)->req_base.req_datatype, \
(request)->req_base.req_count, \
(request)->req_base.req_addr, \
0, \
&(request)->req_convertor ); \
ompi_convertor_get_packed_size( &(request)->req_convertor, \
&((request)->req_bytes_packed) );\

Просмотреть файл

@ -52,6 +52,7 @@ mca_pml_dr_t mca_pml_dr = {
mca_pml_dr_iprobe,
mca_pml_dr_probe,
mca_pml_dr_start,
mca_pml_dr_dump,
32768,
INT_MAX
}
@ -173,3 +174,9 @@ int mca_pml_dr_component_fini(void)
return OMPI_SUCCESS;
}
void mca_pml_dr_dump(
struct ompi_communicator_t* comm,
int verbose)
{
}

Просмотреть файл

@ -25,8 +25,9 @@
#include "ompi_config.h"
#include "opal/threads/threads.h"
#include "opal/threads/condition.h"
#include "ompi/class/ompi_free_list.h"
#include "opal/util/cmd_line.h"
#include "opal/util/crc.h"
#include "ompi/class/ompi_free_list.h"
#include "ompi/request/request.h"
#include "ompi/mca/pml/pml.h"
#include "ompi/mca/pml/base/pml_base_request.h"
@ -34,7 +35,6 @@
#include "ompi/mca/pml/base/pml_base_sendreq.h"
#include "ompi/mca/btl/btl.h"
#include "ompi/datatype/datatype.h"
#include "ompi/datatype/datatype_internal.h"
#if defined(c_plusplus) || defined(__cplusplus)
extern "C" {
@ -211,6 +211,11 @@ extern int mca_pml_dr_recv(
ompi_status_public_t* status
);
extern void mca_pml_dr_dump(
struct ompi_communicator_t* comm,
int verbose
);
extern int mca_pml_dr_progress(void);
extern int mca_pml_dr_start(

Просмотреть файл

@ -97,16 +97,16 @@ int mca_pml_dr_component_open(void)
mca_pml_dr_param_register_int("timer_wdog_usec", 0);
mca_pml_dr.timer_wdog_multiplier =
mca_pml_dr_param_register_int("timer_wdog_multiplier", 2);
mca_pml_dr.timer_wdog_multiplier =
mca_pml_dr.timer_wdog_max_count =
mca_pml_dr_param_register_int("timer_wdog_max_count", 10);
mca_pml_dr.timer_ack_sec =
mca_pml_dr_param_register_int("timer_ack_sec", 1);
mca_pml_dr_param_register_int("timer_ack_sec", 10);
mca_pml_dr.timer_ack_usec =
mca_pml_dr_param_register_int("timer_ack_usec", 0);
mca_pml_dr.timer_ack_multiplier =
mca_pml_dr_param_register_int("timer_ack_multiplier", 2);
mca_pml_dr.timer_wdog_multiplier =
mca_pml_dr.timer_ack_max_count =
mca_pml_dr_param_register_int("timer_ack_max_count", 10);
OBJ_CONSTRUCT(&mca_pml_dr.lock, opal_mutex_t);

Просмотреть файл

@ -28,8 +28,6 @@
#include "ompi/constants.h"
#include "ompi/communicator/communicator.h"
#include "ompi/datatype/datatype.h"
#include "ompi/datatype/datatype_internal.h"
#include "ompi/datatype/datatype_checksum.h"
#include "ompi/mca/pml/pml.h"
#include "pml_dr.h"
#include "pml_dr_comm.h"
@ -39,6 +37,21 @@
#include "pml_dr_hdr.h"
#define MCA_PML_DR_HDR_VALIDATE(hdr, type) \
do { \
uint16_t csum = opal_csum(hdr, sizeof(type)); \
if(hdr->hdr_common.hdr_csum != csum) { \
opal_output(0, "%s:%d: invalid header checksum: 0x%04x != 0x%04x\n", \
__FILE__, __LINE__, hdr->hdr_common.hdr_csum, csum); \
return; \
} \
if(hdr->hdr_common.hdr_dst != (ompi_comm_lookup(hdr->hdr_common.hdr_ctx))->c_my_rank ) { \
opal_output(0, "%s:%d: misdelivered packet [rank %d -> rank %d]\n", \
__FILE__, __LINE__, hdr->hdr_common.hdr_src, hdr->hdr_common.hdr_dst); \
return; \
} \
} while (0)
OBJ_CLASS_INSTANCE(
mca_pml_dr_buffer_t,
@ -81,8 +94,6 @@ void mca_pml_dr_recv_frag_callback(
{
mca_btl_base_segment_t* segments = des->des_dst;
mca_pml_dr_hdr_t* hdr = (mca_pml_dr_hdr_t*)segments->seg_addr.pval;
uint32_t csum = 1;
size_t hdr_size;
bool duplicate = false;
if(segments->seg_len < sizeof(mca_pml_dr_common_hdr_t)) {
return;
@ -91,75 +102,35 @@ void mca_pml_dr_recv_frag_callback(
switch(hdr->hdr_common.hdr_type) {
case MCA_PML_DR_HDR_TYPE_MATCH:
{
if(hdr->hdr_common.hdr_csum != (uint16_t) opal_csum(hdr, sizeof(mca_pml_dr_match_hdr_t))) {
assert(0);
return;
}
if(hdr->hdr_common.hdr_dst != (ompi_comm_lookup(hdr->hdr_common.hdr_ctx))->c_my_rank ) {
assert(0);
return;
}
MCA_PML_DR_HDR_VALIDATE(hdr, mca_pml_dr_match_hdr_t);
MCA_PML_DR_RECV_FRAG_CHECK_DUP(hdr, duplicate);
if(duplicate) {
assert(0);
return;
} else {
if(false == duplicate) {
mca_pml_dr_recv_frag_match(btl, &hdr->hdr_match, segments,des->des_dst_cnt);
} else {
OPAL_OUTPUT((0, "%s:%d: dropping duplicate fragment\n"));
}
break;
}
case MCA_PML_DR_HDR_TYPE_MATCH_ACK:
{
if(hdr->hdr_common.hdr_csum != (uint16_t) opal_csum(hdr, sizeof(mca_pml_dr_ack_hdr_t))) {
assert(0);
return;
}
if(hdr->hdr_common.hdr_dst != (ompi_comm_lookup(hdr->hdr_common.hdr_ctx))->c_my_rank ) {
assert(0);
return;
}
MCA_PML_DR_HDR_VALIDATE(hdr, mca_pml_dr_ack_hdr_t);
mca_pml_dr_send_request_match_ack(btl, &hdr->hdr_ack);
break;
}
case MCA_PML_DR_HDR_TYPE_RNDV:
{
if(hdr->hdr_common.hdr_csum != (uint16_t) opal_csum(hdr, sizeof(mca_pml_dr_rendezvous_hdr_t))) {
assert(0);
return;
}
if(hdr->hdr_common.hdr_dst != (ompi_comm_lookup(hdr->hdr_common.hdr_ctx))->c_my_rank ) {
assert(0);
return;
}
hdr_size = mca_pml_dr_hdr_size(hdr->hdr_common.hdr_type);
csum = OMPI_CSUM((void*)((unsigned char*)segments->seg_addr.pval + hdr_size),
segments->seg_len - hdr_size);
if(csum != hdr->hdr_match.hdr_csum) {
/* drop it on the floor */
assert(0);
return;
}
MCA_PML_DR_HDR_VALIDATE(hdr, mca_pml_dr_rendezvous_hdr_t);
MCA_PML_DR_RECV_FRAG_CHECK_DUP(hdr, duplicate);
if(duplicate) {
assert(0);
return;
} else {
if(false == duplicate) {
mca_pml_dr_recv_frag_match(btl, &hdr->hdr_match, segments,des->des_dst_cnt);
} else {
OPAL_OUTPUT((0, "%s:%d: dropping duplicate fragment\n"));
}
break;
}
case MCA_PML_DR_HDR_TYPE_RNDV_ACK:
{
if(hdr->hdr_common.hdr_csum != (uint16_t) opal_csum(hdr, sizeof(mca_pml_dr_ack_hdr_t))) {
assert(0);
return;
}
if(hdr->hdr_common.hdr_dst != (ompi_comm_lookup(hdr->hdr_common.hdr_ctx))->c_my_rank ) {
assert(0);
return;
}
MCA_PML_DR_HDR_VALIDATE(hdr, mca_pml_dr_ack_hdr_t);
mca_pml_dr_send_request_rndv_ack(btl, &hdr->hdr_ack);
break;
}
@ -168,14 +139,7 @@ void mca_pml_dr_recv_frag_callback(
mca_pml_dr_recv_request_t* recvreq;
mca_pml_dr_comm_proc_t* comm_proc;
if(hdr->hdr_common.hdr_csum != (uint16_t) opal_csum(hdr, sizeof(mca_pml_dr_frag_hdr_t))) {
assert(0);
return;
}
if(hdr->hdr_common.hdr_dst != (ompi_comm_lookup(hdr->hdr_common.hdr_ctx))->c_my_rank ) {
assert(0);
return;
}
MCA_PML_DR_HDR_VALIDATE(hdr, mca_pml_dr_frag_hdr_t);
recvreq = hdr->hdr_frag.hdr_dst_ptr.pval;
comm_proc = recvreq->req_recv.req_base.req_comm->c_pml_comm->procs +
recvreq->req_recv.req_base.req_ompi.req_status.MPI_SOURCE;
@ -186,8 +150,6 @@ void mca_pml_dr_recv_frag_callback(
&hdr->hdr_common,
hdr->hdr_frag.hdr_src_ptr,
~(uint64_t) 0);
assert(0);
return;
} else {
mca_pml_dr_recv_request_progress(recvreq,btl,segments,des->des_dst_cnt);
}
@ -196,14 +158,7 @@ void mca_pml_dr_recv_frag_callback(
}
case MCA_PML_DR_HDR_TYPE_FRAG_ACK:
{
if(hdr->hdr_common.hdr_csum != (uint16_t) opal_csum(hdr, sizeof(mca_pml_dr_ack_hdr_t))) {
assert(0);
return;
}
if(hdr->hdr_common.hdr_dst != (ompi_comm_lookup(hdr->hdr_common.hdr_ctx))->c_my_rank ) {
assert(0);
return;
}
MCA_PML_DR_HDR_VALIDATE(hdr, mca_pml_dr_ack_hdr_t);
mca_pml_dr_send_request_frag_ack(btl, &hdr->hdr_ack);
break;
}
@ -516,8 +471,8 @@ bool mca_pml_dr_recv_frag_match(
opal_list_t additional_matches;
ompi_proc_t* ompi_proc;
int rc;
uint32_t csum = 1;
size_t hdr_size;
uint32_t csum = OPAL_CSUM_ZERO;
/* communicator pointer */
comm_ptr=ompi_comm_lookup(hdr->hdr_common.hdr_ctx);
comm=(mca_pml_dr_comm_t *)comm_ptr->c_pml_comm;
@ -592,25 +547,23 @@ rematch:
/* if no match found, verify csum, if pass place on unexpected queue */
mca_pml_dr_recv_frag_t* frag;
/* nack immediately if need be */
hdr_size = mca_pml_dr_hdr_size(hdr->hdr_common.hdr_type);
csum = OMPI_CSUM((void*)((unsigned char*)segments->seg_addr.pval + hdr_size),
segments->seg_len - hdr_size);
if(csum != hdr->hdr_csum) {
mca_pml_dr_recv_frag_send_ack(ompi_proc,
&hdr->hdr_common,
hdr->hdr_src_ptr,
0);
assert(0);
return false;
}
MCA_PML_DR_RECV_FRAG_ALLOC(frag, rc);
if(OMPI_SUCCESS != rc) {
OPAL_THREAD_UNLOCK(&comm->matching_lock);
return rc;
}
MCA_PML_DR_RECV_FRAG_INIT(frag,proc->ompi_proc,hdr,segments,num_segments,btl);
MCA_PML_DR_RECV_FRAG_INIT(frag,proc->ompi_proc,hdr,segments,num_segments,btl,csum);
if(csum != hdr->hdr_csum) {
mca_pml_dr_recv_frag_send_ack(ompi_proc,
&hdr->hdr_common,
hdr->hdr_src_ptr,
0);
opal_output(0, "%s:%d: corrupted data 0x%08x != 0x%08x\n",
__FILE__, __LINE__, csum, hdr->hdr_csum);
MCA_PML_DR_RECV_FRAG_RETURN(frag);
OPAL_THREAD_UNLOCK(&comm->matching_lock);
return false;
}
opal_list_append( &proc->unexpected_frags, (opal_list_item_t *)frag );
}
@ -631,25 +584,23 @@ rematch:
*/
mca_pml_dr_recv_frag_t* frag;
hdr_size = mca_pml_dr_hdr_size(hdr->hdr_common.hdr_type);
csum = OMPI_CSUM((void*)((unsigned char*)segments->seg_addr.pval + hdr_size),
segments->seg_len - hdr_size);
if(csum != hdr->hdr_csum) {
mca_pml_dr_recv_frag_send_ack(ompi_proc,
&hdr->hdr_common,
hdr->hdr_src_ptr,
0);
assert(0);
return false;
}
MCA_PML_DR_RECV_FRAG_ALLOC(frag, rc);
if(OMPI_SUCCESS != rc) {
OPAL_THREAD_UNLOCK(&comm->matching_lock);
return rc;
}
MCA_PML_DR_RECV_FRAG_INIT(frag,proc->ompi_proc,hdr,segments,num_segments,btl);
MCA_PML_DR_RECV_FRAG_INIT(frag,proc->ompi_proc,hdr,segments,num_segments,btl,csum);
if(csum != hdr->hdr_csum) {
mca_pml_dr_recv_frag_send_ack(ompi_proc,
&hdr->hdr_common,
hdr->hdr_src_ptr,
0);
opal_output(0, "%s:%d: corrupted data 0x%08x != 0x%08x\n",
__FILE__, __LINE__, csum, hdr->hdr_csum);
MCA_PML_DR_RECV_FRAG_RETURN(frag);
OPAL_THREAD_UNLOCK(&comm->matching_lock);
return false;
}
opal_list_append(&proc->frags_cant_match, (opal_list_item_t *)frag);
}
OPAL_THREAD_UNLOCK(&comm->matching_lock);

Просмотреть файл

@ -88,9 +88,11 @@ do { \
} while(0)
#define MCA_PML_DR_RECV_FRAG_INIT(frag,oproc,hdr,segs,cnt,btl) \
#define MCA_PML_DR_RECV_FRAG_INIT(frag,oproc,hdr,segs,cnt,btl,csum) \
do { \
size_t i; \
uint32_t ui1 = 0; \
uint32_t ui2 = 0; \
mca_btl_base_segment_t* macro_segments = frag->segments; \
mca_pml_dr_buffer_t** buffers = frag->buffers; \
\
@ -110,9 +112,13 @@ do { \
buffers[i] = buff; \
macro_segments[i].seg_addr.pval = buff->addr; \
macro_segments[i].seg_len = segs[i].seg_len; \
memcpy(buff->addr, \
csum += OPAL_CSUM_BCOPY_PARTIAL( \
segs[i].seg_addr.pval, \
segs[i].seg_len); \
buff->addr, \
segs[i].seg_len, \
segs[i].seg_len, \
&ui1, \
&ui2); \
} \
} while(0)

Просмотреть файл

@ -30,6 +30,32 @@
#include "ompi/mca/bml/base/base.h"
#include "orte/mca/errmgr/errmgr.h"
#define MCA_PML_DR_RECV_REQUEST_ACK(recvreq,hdr,csum) \
if(csum != hdr->hdr_match.hdr_csum) { \
/* failed the csum, put the request back on the list for \
* matching later on retransmission \
*/ \
if(recvreq->req_recv.req_base.req_peer == OMPI_ANY_SOURCE) { \
mca_pml_dr_recv_request_match_wild(recvreq); \
} else { \
mca_pml_dr_recv_request_match_specific(recvreq); \
} \
mca_pml_dr_recv_frag_send_ack(comm_proc->ompi_proc, \
&hdr->hdr_common, \
hdr->hdr_match.hdr_src_ptr, \
0); \
opal_output(0, "%s:%d: [rank %d -> rank %d] " \
"data checksum failed 0x%08x != 0x%08x\n", \
__FILE__, __LINE__, \
hdr->hdr_common.hdr_src, hdr->hdr_common.hdr_dst, \
csum, hdr->hdr_match.hdr_csum); \
bytes_received = bytes_delivered = 0; \
} else { \
mca_pml_dr_recv_request_ack(recvreq, &hdr->hdr_match, 1); \
}
static mca_pml_dr_recv_frag_t* mca_pml_dr_recv_request_match_specific_proc(
mca_pml_dr_recv_request_t* request, mca_pml_dr_comm_proc_t* proc);
@ -288,23 +314,7 @@ static void mca_pml_dr_recv_request_vfrag_ack(
bytes_received,
bytes_delivered,
csum);
if(csum != hdr->hdr_match.hdr_csum) {
/* failed the csum, put the request
back on the list for matching later
at retransmission */
if(recvreq->req_recv.req_base.req_peer == OMPI_ANY_SOURCE) {
mca_pml_dr_recv_request_match_wild(recvreq);
} else {
mca_pml_dr_recv_request_match_specific(recvreq);
}
mca_pml_dr_recv_frag_send_ack(comm_proc->ompi_proc,
&hdr->hdr_common,
hdr->hdr_match.hdr_src_ptr,
0);
assert(0);
return;
}
mca_pml_dr_recv_request_ack(recvreq, &hdr->hdr_match, 1);
MCA_PML_DR_RECV_REQUEST_ACK(recvreq,hdr,csum);
break;
case MCA_PML_DR_HDR_TYPE_RNDV:
@ -322,23 +332,7 @@ static void mca_pml_dr_recv_request_vfrag_ack(
bytes_received,
bytes_delivered,
csum);
if(csum != hdr->hdr_match.hdr_csum) {
/* failed the csum, put the request
back on the list for matching later
at retransmission */
if(recvreq->req_recv.req_base.req_peer == OMPI_ANY_SOURCE) {
mca_pml_dr_recv_request_match_wild(recvreq);
} else {
mca_pml_dr_recv_request_match_specific(recvreq);
}
mca_pml_dr_recv_frag_send_ack(comm_proc->ompi_proc,
&hdr->hdr_common,
hdr->hdr_match.hdr_src_ptr,
0);
assert(0);
return;
}
mca_pml_dr_recv_request_ack(recvreq, &hdr->hdr_match, 1);
MCA_PML_DR_RECV_REQUEST_ACK(recvreq,hdr,csum);
break;
case MCA_PML_DR_HDR_TYPE_FRAG:

Просмотреть файл

@ -24,7 +24,6 @@
#include "ompi_config.h"
#include "ompi/mca/mpool/base/base.h"
#include "ompi/mca/pml/base/pml_base_recvreq.h"
#include "ompi/datatype/datatype_checksum.h"
#include "pml_dr.h"
#include "pml_dr_proc.h"
@ -230,6 +229,7 @@ do {
(request)->req_recv.req_base.req_datatype, \
(request)->req_recv.req_base.req_count, \
(request)->req_recv.req_base.req_addr, \
CONVERTOR_WITH_CHECKSUM, \
&(request)->req_recv.req_convertor ); \
} else { \
(request)->req_proc = NULL; \
@ -281,7 +281,7 @@ do {
csum = request->req_recv.req_convertor.checksum; \
} else { \
bytes_delivered = 0; \
csum = OMPI_CSUM_ZERO; \
csum = OPAL_CSUM_ZERO; \
} \
} while (0)

Просмотреть файл

@ -429,7 +429,7 @@ int mca_pml_dr_send_request_start_copy(
hdr->hdr_common.hdr_src = sendreq->req_send.req_base.req_comm->c_my_rank;
hdr->hdr_match.hdr_tag = sendreq->req_send.req_base.req_tag;
hdr->hdr_match.hdr_seq = sendreq->req_send.req_base.req_sequence;
hdr->hdr_match.hdr_csum = size > 0 ? sendreq->req_send.req_convertor.checksum : OMPI_CSUM_ZERO;
hdr->hdr_match.hdr_csum = size > 0 ? sendreq->req_send.req_convertor.checksum : OPAL_CSUM_ZERO;
hdr->hdr_match.hdr_src_ptr.pval = &sendreq->req_vfrag0;
hdr->hdr_common.hdr_vid = sendreq->req_vfrag0.vf_id;
hdr->hdr_common.hdr_csum = opal_csum(hdr, sizeof(mca_pml_dr_match_hdr_t));
@ -499,7 +499,7 @@ int mca_pml_dr_send_request_start_prepare(
hdr->hdr_common.hdr_src = sendreq->req_send.req_base.req_comm->c_my_rank;
hdr->hdr_match.hdr_tag = sendreq->req_send.req_base.req_tag;
hdr->hdr_match.hdr_seq = sendreq->req_send.req_base.req_sequence;
hdr->hdr_match.hdr_csum = size > 0 ? sendreq->req_send.req_convertor.checksum : OMPI_CSUM_ZERO;
hdr->hdr_match.hdr_csum = size > 0 ? sendreq->req_send.req_convertor.checksum : OPAL_CSUM_ZERO;
hdr->hdr_match.hdr_src_ptr.pval = &sendreq->req_vfrag0;
hdr->hdr_common.hdr_vid = sendreq->req_vfrag0.vf_id;
hdr->hdr_common.hdr_csum = opal_csum(hdr, sizeof(mca_pml_dr_match_hdr_t));
@ -573,7 +573,7 @@ int mca_pml_dr_send_request_start_rndv(
hdr->hdr_match.hdr_tag = sendreq->req_send.req_base.req_tag;
hdr->hdr_match.hdr_seq = sendreq->req_send.req_base.req_sequence;
hdr->hdr_match.hdr_src_ptr.pval = &sendreq->req_vfrag0;
hdr->hdr_match.hdr_csum = size > 0 ? sendreq->req_send.req_convertor.checksum : OMPI_CSUM_ZERO;
hdr->hdr_match.hdr_csum = size > 0 ? sendreq->req_send.req_convertor.checksum : OPAL_CSUM_ZERO;
hdr->hdr_common.hdr_vid = sendreq->req_vfrag0.vf_id;
hdr->hdr_rndv.hdr_msg_length = sendreq->req_send.req_bytes_packed;
hdr->hdr_common.hdr_csum = opal_csum(hdr, sizeof(mca_pml_dr_rendezvous_hdr_t));

Просмотреть файл

@ -88,25 +88,51 @@ OBJ_CLASS_DECLARATION(mca_pml_dr_send_request_t);
#define MCA_PML_DR_SEND_REQUEST_INIT( \
sendreq, \
buf, \
addr, \
count, \
datatype, \
dst, \
peer, \
tag, \
comm, \
sendmode, \
persistent) \
{ \
MCA_PML_BASE_SEND_REQUEST_INIT(&sendreq->req_send, \
buf, \
count, \
datatype, \
dst, \
tag, \
comm, \
sendmode, \
persistent); \
}
do { \
/* increment reference counts */ \
OBJ_RETAIN(comm); \
OBJ_RETAIN(datatype); \
\
OMPI_REQUEST_INIT(&(sendreq)->req_send.req_base.req_ompi, persistent); \
(sendreq)->req_send.req_addr = addr; \
(sendreq)->req_send.req_count = count; \
(sendreq)->req_send.req_datatype = datatype; \
(sendreq)->req_send.req_send_mode = sendmode; \
(sendreq)->req_send.req_base.req_addr = addr; \
(sendreq)->req_send.req_base.req_count = count; \
(sendreq)->req_send.req_base.req_datatype = datatype; \
(sendreq)->req_send.req_base.req_peer = (int32_t)peer; \
(sendreq)->req_send.req_base.req_tag = (int32_t)tag; \
(sendreq)->req_send.req_base.req_comm = comm; \
(sendreq)->req_send.req_base.req_pml_complete = (persistent ? true : false); \
(sendreq)->req_send.req_base.req_free_called = false; \
(sendreq)->req_send.req_base.req_ompi.req_status._cancelled = 0; \
\
/* initialize datatype convertor for this request */ \
if(count > 0) { \
/* We will create a convertor specialized for the */ \
/* remote architecture and prepared with the datatype. */ \
ompi_convertor_copy_and_prepare_for_send( \
(sendreq)->req_send.req_base.req_proc->proc_convertor, \
(sendreq)->req_send.req_base.req_datatype, \
(sendreq)->req_send.req_base.req_count, \
(sendreq)->req_send.req_base.req_addr, \
CONVERTOR_WITH_CHECKSUM, \
&(sendreq)->req_send.req_convertor ); \
ompi_convertor_get_packed_size(&(sendreq)->req_send.req_convertor, \
&((sendreq)->req_send.req_bytes_packed) ); \
} else { \
(sendreq)->req_send.req_bytes_packed = 0; \
} \
} while(0)
/**

Просмотреть файл

@ -90,7 +90,7 @@ void mca_pml_dr_vfrag_ack_timeout(int fd, short event, void* data) {
OPAL_THREAD_LOCK(&ompi_request_lock);
vfrag->vf_send_cnt++;
if(vfrag->vf_send_cnt > mca_pml_dr.timer_ack_max_count) {
opal_output(0, "ack retry count exceeded! %s:%d FATAL", __FILE__, __LINE__);
opal_output(0, "%s:%d: maximum ack retry count exceeded: FATAL", __FILE__, __LINE__);
orte_errmgr.abort();
}
vfrag->vf_idx = 1;

Просмотреть файл

@ -53,6 +53,7 @@ mca_pml_ob1_t mca_pml_ob1 = {
mca_pml_ob1_iprobe,
mca_pml_ob1_probe,
mca_pml_ob1_start,
mca_pml_ob1_dump,
32768,
INT_MAX
}
@ -176,3 +177,29 @@ int mca_pml_ob1_component_fini(void)
return OMPI_SUCCESS;
}
/*
* diagnostics
*/
void mca_pml_ob1_dump(struct ompi_communicator_t* comm, int verbose)
{
struct mca_pml_comm_t* pml_comm = comm->c_pml_comm;
size_t i;
/* iterate through all procs on communicator */
for(i=0; i<pml_comm->num_procs; i++) {
mca_pml_ob1_comm_proc_t* proc = &pml_comm->procs[i];
mca_bml_base_endpoint_t* ep = (mca_bml_base_endpoint_t*)proc->proc_ompi->proc_pml;
size_t n;
opal_output(0, "[Rank %d]\n", i);
/* dump all receive queues */
/* dump all btls */
for(n=0; n<ep->btl_eager.arr_size; n++) {
mca_bml_base_btl_t* bml_btl = &ep->btl_eager.bml_btls[i];
bml_btl->btl->btl_dump(bml_btl->btl, bml_btl->btl_endpoint, verbose);
}
}
}

Просмотреть файл

@ -200,6 +200,11 @@ extern int mca_pml_ob1_recv(
ompi_status_public_t* status
);
extern void mca_pml_ob1_dump(
struct ompi_communicator_t* comm,
int verbose
);
extern int mca_pml_ob1_progress(void);
extern int mca_pml_ob1_start(

Просмотреть файл

@ -213,6 +213,7 @@ do {
(request)->req_recv.req_base.req_datatype, \
(request)->req_recv.req_base.req_count, \
(request)->req_recv.req_base.req_addr, \
0, \
&(request)->req_recv.req_convertor ); \
ompi_convertor_get_unpacked_size( &(request)->req_recv.req_convertor, \
&(request)->req_bytes_delivered ); \

Просмотреть файл

@ -829,6 +829,7 @@ int mca_pml_ob1_send_request_schedule(mca_pml_ob1_send_request_t* sendreq)
sendreq->req_send.req_base.req_datatype,
sendreq->req_send.req_base.req_count,
sendreq->req_send.req_base.req_addr,
0,
&convertor);
ompi_convertor_set_position(&convertor, &position);
OBJ_DESTRUCT( &convertor );

Просмотреть файл

@ -443,6 +443,18 @@ typedef int (*mca_pml_base_module_null_fn_t)(
struct ompi_request_t** request
);
/**
* Diagnostics function.
*
* @param request (IN) Communicator
* @return OMPI_SUCCESS or failure status.
*
*/
typedef int (*mca_pml_base_module_dump_fn_t)(
struct ompi_communicator_t* comm,
int verbose
);
/**
* PML instance.
@ -469,6 +481,9 @@ struct mca_pml_base_module_1_0_0_t {
mca_pml_base_module_probe_fn_t pml_probe;
mca_pml_base_module_start_fn_t pml_start;
/* diagnostics */
mca_pml_base_module_dump_fn_t pml_dump;
/* maximum constant sizes */
int pml_max_contextid;
int pml_max_tag;

Просмотреть файл

@ -62,7 +62,7 @@ int MPI_Pack(void *inbuf, int incount, MPI_Datatype datatype,
OBJ_CONSTRUCT( &local_convertor, ompi_convertor_t );
/* the resulting convertor will be set to the position ZERO */
ompi_convertor_copy_and_prepare_for_send( ompi_mpi_local_convertor, datatype,
incount, inbuf, &local_convertor );
incount, inbuf, 0, &local_convertor );
/* Check for truncation */
ompi_convertor_get_packed_size( &local_convertor, &size );

Просмотреть файл

@ -58,7 +58,7 @@ int MPI_Pack_external(char *datarep, void *inbuf, int incount,
/* The resulting convertor will be set to the position zero */
ompi_convertor_copy_and_prepare_for_send( ompi_mpi_external32_convertor,
datatype, incount, inbuf, &local_convertor );
datatype, incount, inbuf, 0, &local_convertor );
/* Check for truncation */
ompi_convertor_get_packed_size( &local_convertor, &size );

Просмотреть файл

@ -49,7 +49,7 @@ int MPI_Pack_external_size(char *datarep, int incount,
}
/* the resulting convertor will be set to the position ZERO */
ompi_convertor_copy_and_prepare_for_send( ompi_mpi_external32_convertor,
datatype, incount, NULL, &local_convertor );
datatype, incount, NULL, 0, &local_convertor );
ompi_convertor_get_packed_size( &local_convertor, &length );
*size = (MPI_Aint)length;

Просмотреть файл

@ -52,7 +52,7 @@ int MPI_Pack_size(int incount, MPI_Datatype datatype, MPI_Comm comm,
OBJ_CONSTRUCT( &local_convertor, ompi_convertor_t );
/* the resulting convertor will be set to the position ZERO */
ompi_convertor_copy_and_prepare_for_send( ompi_mpi_local_convertor, datatype, incount, NULL, &local_convertor );
ompi_convertor_copy_and_prepare_for_send( ompi_mpi_local_convertor, datatype, incount, NULL, 0, &local_convertor );
ompi_convertor_get_packed_size( &local_convertor, &length );
*size = (int)length;

Просмотреть файл

@ -81,7 +81,7 @@ int MPI_Sendrecv_replace(void * buf, int count, MPI_Datatype datatype,
/* initialize convertor to unpack recv buffer */
OBJ_CONSTRUCT(&convertor, ompi_convertor_t);
ompi_convertor_copy_and_prepare_for_recv( proc->proc_convertor, datatype,
count, buf, &convertor );
count, buf, 0, &convertor );
/* setup a buffer for recv */
ompi_convertor_get_packed_size( &convertor, &packed_size );

Просмотреть файл

@ -65,7 +65,7 @@ int MPI_Unpack(void *inbuf, int insize, int *position,
if( insize > 0 ) {
OBJ_CONSTRUCT( &local_convertor, ompi_convertor_t );
/* the resulting convertor will be set the the position ZERO */
ompi_convertor_copy_and_prepare_for_recv( ompi_mpi_local_convertor, datatype, outcount, outbuf, &local_convertor );
ompi_convertor_copy_and_prepare_for_recv( ompi_mpi_local_convertor, datatype, outcount, outbuf, 0, &local_convertor );
/* Check for truncation */
ompi_convertor_get_packed_size( &local_convertor, &size );

Просмотреть файл

@ -55,7 +55,7 @@ int MPI_Unpack_external (char *datarep, void *inbuf, MPI_Aint insize,
/* the resulting convertor will be set to the position ZERO */
ompi_convertor_copy_and_prepare_for_send( ompi_mpi_external32_convertor,
datatype, outcount, NULL, &local_convertor );
datatype, outcount, NULL, 0, &local_convertor );
/* Check for truncation */
ompi_convertor_get_packed_size( &local_convertor, &size );

Просмотреть файл

@ -22,6 +22,15 @@
#define CRC_POLYNOMIAL ((unsigned int)0x04c11db7)
#define CRC_INITIAL_REGISTER ((unsigned int)0xffffffff)
#define OPAL_CSUM( SRC, LEN ) opal_uicsum( SRC, LEN )
#define OPAL_CSUM_PARTIAL( SRC, LEN, UI1, UI2 ) \
opal_uicsum_partial( SRC, LEN, UI1, UI2 )
#define OPAL_CSUM_BCOPY_PARTIAL( SRC, DST, LEN1, LEN2, UI1, UI2 ) \
opal_bcopy_uicsum_partial( SRC, DST, LEN1, LEN2, UI1, UI2 )
#define OPAL_CSUM_ZERO 0
unsigned long
opal_bcopy_csum_partial(
const void * source,