diff --git a/ompi/class/ompi_free_list.c b/ompi/class/ompi_free_list.c index 12bacdeb9c..7f3af85034 100644 --- a/ompi/class/ompi_free_list.c +++ b/ompi/class/ompi_free_list.c @@ -62,7 +62,7 @@ static void ompi_free_list_destruct(ompi_free_list_t* fl) { opal_list_item_t *item; -#if OMPI_ENABLE_DEBUG +#if 0 && OMPI_ENABLE_DEBUG if(opal_list_get_size(&fl->super) != fl->fl_num_allocated) { opal_output(0, "ompi_free_list: %d allocated %d returned: %s:%d\n", fl->fl_num_allocated, opal_list_get_size(&fl->super), diff --git a/ompi/datatype/convertor.c b/ompi/datatype/convertor.c index 06e794d444..3a6f54f6e7 100644 --- a/ompi/datatype/convertor.c +++ b/ompi/datatype/convertor.c @@ -87,7 +87,7 @@ inline int32_t ompi_convertor_pack( ompi_convertor_t* pConv, struct iovec* iov, uint32_t* out_size, size_t* max_data, int32_t* freeAfter ) { - pConv->checksum = OMPI_CSUM_ZERO; + pConv->checksum = OPAL_CSUM_ZERO; pConv->csum_ui1 = 0; pConv->csum_ui2 = 0; @@ -111,7 +111,7 @@ inline int32_t ompi_convertor_unpack( ompi_convertor_t* pConv, struct iovec* iov, uint32_t* out_size, size_t* max_data, int32_t* freeAfter ) { - pConv->checksum = OMPI_CSUM_ZERO; + pConv->checksum = OPAL_CSUM_ZERO; pConv->csum_ui1 = 0; pConv->csum_ui2 = 0; diff --git a/ompi/datatype/convertor.h b/ompi/datatype/convertor.h index f5d9a13af8..36bb0da713 100644 --- a/ompi/datatype/convertor.h +++ b/ompi/datatype/convertor.h @@ -210,11 +210,12 @@ ompi_convertor_copy_and_prepare_for_send( const ompi_convertor_t* pSrcConv, const struct ompi_datatype_t* datatype, int32_t count, const void* pUserBuf, + int32_t flags, ompi_convertor_t* convertor ) { convertor->remoteArch = pSrcConv->remoteArch; convertor->pFunctions = pSrcConv->pFunctions; - convertor->flags = pSrcConv->flags & ~CONVERTOR_STATE_MASK; + convertor->flags = (pSrcConv->flags | flags) & ~CONVERTOR_STATE_MASK; return ompi_convertor_prepare_for_send( convertor, datatype, count, pUserBuf ); } @@ -232,11 +233,12 @@ ompi_convertor_copy_and_prepare_for_recv( const ompi_convertor_t* pSrcConv, const struct ompi_datatype_t* datatype, int32_t count, const void* pUserBuf, + int32_t flags, ompi_convertor_t* convertor ) { convertor->remoteArch = pSrcConv->remoteArch; convertor->pFunctions = pSrcConv->pFunctions; - convertor->flags = pSrcConv->flags & ~CONVERTOR_STATE_MASK; + convertor->flags = (pSrcConv->flags | flags) & ~CONVERTOR_STATE_MASK; return ompi_convertor_prepare_for_recv( convertor, datatype, count, pUserBuf ); } diff --git a/ompi/datatype/datatype_checksum.h b/ompi/datatype/datatype_checksum.h index a5ac836c8d..529eaf5de3 100644 --- a/ompi/datatype/datatype_checksum.h +++ b/ompi/datatype/datatype_checksum.h @@ -15,78 +15,28 @@ #ifndef DATATYPE_CHECKSUM_H_HAS_BEEN_INCLUDED #define DATATYPE_CHECKSUM_H_HAS_BEEN_INCLUDED -#define OMPI_CSUM_ZERO 0 -#define OMPI_REQUIRE_DATA_VALIDATION 0 - -#if OMPI_REQUIRE_DATA_VALIDATION +#include "ompi/datatype/datatype_memcpy.h" #include "opal/util/crc.h" +#if defined(CHECKSUM) + #define MEMCPY_CSUM( DST, SRC, BLENGTH, CONVERTOR ) \ do { \ - /*opal_output( 0, "memcpy dest = %p src = %p length = %d\n", (void*)(DST), (void*)(SRC), (int)(BLENGTH) ); */\ - (CONVERTOR)->checksum += opal_bcopy_uicsum_partial( (SRC), (DST), (BLENGTH), (BLENGTH), &(CONVERTOR)->csum_ui1, &(CONVERTOR)->csum_ui2 ); \ + (CONVERTOR)->checksum += OPAL_CSUM_BCOPY_PARTIAL( (SRC), (DST), (BLENGTH), (BLENGTH), &(CONVERTOR)->csum_ui1, &(CONVERTOR)->csum_ui2 ); \ } while (0) #define COMPUTE_CSUM( SRC, BLENGTH, CONVERTOR ) \ do { \ - /*opal_output( 0, "memcpy dest = %p src = %p length = %d\n", (void*)(DST), (void*)(SRC), (int)(BLENGTH) ); */\ - (CONVERTOR)->checksum += opal_uicsum_partial( (SRC), (BLENGTH), &(CONVERTOR)->csum_ui1, &(CONVERTOR)->csum_ui2 ); \ + (CONVERTOR)->checksum += OPAL_CSUM_PARTIAL( (SRC), (BLENGTH), &(CONVERTOR)->csum_ui1, &(CONVERTOR)->csum_ui2 ); \ } while (0) -#define OMPI_CSUM( SRC, BLENGTH ) \ - opal_uicsum( (SRC), (BLENGTH) ) - #else -#include "ompi/datatype/datatype_memcpy.h" - #define MEMCPY_CSUM( DST, SRC, BLENGTH, CONVERTOR ) \ MEMCPY( (DST), (SRC), (BLENGTH) ) #define COMPUTE_CSUM( SRC, BLENGTH, CONVERTOR ) -#define OMPI_CSUM( SRC, BLENGTH ) OMPI_CSUM_ZERO - #endif - -/* ADLER_NMAX is the largest n such that 255n(n+1)/2 + (n+1)(BASE-1) <= 2^32-1 */ -#define ADLER_NMAX 5551 -#define MOD_ADLER 65521 - -#if OMPI_REQUIRE_DATA_VALIDATION - -#define DO1(buf,i) {_a += buf[i]; _b += _a;} -#define DO2(buf,i) DO1(buf,i); DO1(buf,i+1); -#define DO4(buf,i) DO2(buf,i); DO2(buf,i+2); -#define DO8(buf,i) DO4(buf,i); DO4(buf,i+4); -#define DO16(buf) DO8(buf,0); DO8(buf,8); - -#define COMPUTE_SPECIFIC_CHECKSUM( DATA, LENGTH, ADLER32) \ -do { \ - uint8_t *_data = (DATA); /* Pointer to the data to be summed */ \ - size_t _len = (LENGTH); /* Length in bytes */ \ - uint32_t _a = (ADLER32) & 0xffff, \ - _b = ((ADLER32) >> 16) & 0xffff; \ -\ - while( _len > 0 ) { \ - unsigned _tlen = _len > ADLER_NMAX ? ADLER_NMAX : _len; \ - _len -= _tlen; \ - while( _tlen >= 16 ) { \ - DO16(_data); \ - _data += 16; \ - _tlen -= 16; \ - } \ - if( 0 != _tlen ) do { \ - _a += *_data++; _b += _a; \ - } while( --_tlen > 0 ); \ - _a = _a % MOD_ADLER; \ - _b = _b % MOD_ADLER; \ - } \ - (ADLER32) = _b << 16 | _a; \ -} while(0) -#else -#define COMPUTE_SPECIFIC_CHECKSUM( DATA, LENGTH, ADLER32 ) -#endif /* OMPI_REQUIRE_DATA_VALIDATION */ - #endif /* DATATYPE_CHECKSUM_H_HAS_BEEN_INCLUDED */ diff --git a/ompi/mca/bml/bml.h b/ompi/mca/bml/bml.h index 5398bb4802..df6aab1ad1 100644 --- a/ompi/mca/bml/bml.h +++ b/ompi/mca/bml/bml.h @@ -35,6 +35,8 @@ #include "ompi/class/ompi_free_list.h" #include "ompi/mca/pml/pml.h" +#define OMPI_ENABLE_DEBUG_RELIABILITY 0 + /* * BML types */ @@ -236,7 +238,60 @@ static inline void mca_bml_base_free(mca_bml_base_btl_t* bml_btl, mca_btl_base_d */ } -static inline int mca_bml_base_send(mca_bml_base_btl_t* bml_btl, mca_btl_base_descriptor_t* des, mca_btl_base_tag_t tag) { +#if OMPI_ENABLE_DEBUG_RELIABILITY + + +struct mca_bml_base_context_t { + size_t index; + mca_btl_base_completion_fn_t cbfunc; + void* cbdata; +}; +typedef struct mca_bml_base_context_t mca_bml_base_context_t; + + +static inline void mca_bml_base_completion( + struct mca_btl_base_module_t*, + struct mca_btl_base_endpoint_t*, + struct mca_btl_base_descriptor_t*, + int status) +{ + mca_bml_base_context_t* ctx = des->des_cbdata; + /* restore original state */ + ((unsigned char*)des->des_src[0].seg_addr.pval)[ctx->index] ^= ~0; + des->des_cbdata = ctx->cbdata; + des->des_cbfunc = ctx->cbfunc; + free(ctx); + /* invoke original callback */ + des->des_cbfunc(btl,ep,des,status); +} + + +static inline int mca_bml_base_send( + mca_bml_base_btl_t* bml_btl, + mca_btl_base_descriptor_t* des, + mca_btl_base_tag_t tag) +{ + static int count; + if(count <= 0) { + count = ((1000.0 * rand())/(RAND_MAX+1.0)); + if(count % 2) { + /* local completion - network "drops" packet */ + des->des_cbfunc(bml_btl->btl, bml_btl->btl_endpoint, des, OMPI_SUCCESS); + return OMPI_SUCCESS; + } else { + /* corrupt data */ + mca_bml_base_context_t* ctx = malloc(sizeof(mca_bml_base_context_t)); + if(NULL != ctx) { + ctx->index = (des->des_src[0].seg_len * rand() * 1.0) / (RAND_MAX + 1.0); + ctx->cbfunc = des->des_cbfunc; + ctx->cbdata = des->des_cbdata; + ((unsigned char*)des->des_src[0].seg_addr.pval)[ctx->index] ^= ~0; + des->des_cbdata = ctx; + des->des_cbfunc = mca_bml_base_completion; + } + } + } + count--; des->des_context = (void*) bml_btl; return bml_btl->btl_send( bml_btl->btl, @@ -245,6 +300,23 @@ static inline int mca_bml_base_send(mca_bml_base_btl_t* bml_btl, mca_btl_base_de tag); } +#else + +static inline int mca_bml_base_send( + mca_bml_base_btl_t* bml_btl, + mca_btl_base_descriptor_t* des, + mca_btl_base_tag_t tag) +{ + des->des_context = (void*) bml_btl; + return bml_btl->btl_send( + bml_btl->btl, + bml_btl->btl_endpoint, + des, + tag); +} + +#endif + static inline int mca_bml_base_put(mca_bml_base_btl_t* bml_btl, mca_btl_base_descriptor_t* des) { des->des_context = (void*) bml_btl; return bml_btl->btl_put( diff --git a/ompi/mca/btl/base/base.h b/ompi/mca/btl/base/base.h index f0c66d4ff7..ece4880701 100644 --- a/ompi/mca/btl/base/base.h +++ b/ompi/mca/btl/base/base.h @@ -57,7 +57,10 @@ OBJ_CLASS_DECLARATION(mca_btl_base_selected_module_t); OMPI_DECLSPEC int mca_btl_base_open(void); OMPI_DECLSPEC int mca_btl_base_select(bool enable_progress_threads, bool enable_mpi_threads); OMPI_DECLSPEC int mca_btl_base_close(void); -OMPI_DECLSPEC void mca_btl_base_dump(mca_btl_base_module_t*); +OMPI_DECLSPEC void mca_btl_base_dump( + struct mca_btl_base_module_t*, + struct mca_btl_base_endpoint_t*, + int verbose); diff --git a/ompi/mca/btl/base/btl_base_error.c b/ompi/mca/btl/base/btl_base_error.c index cfaeadbab7..50b7b910e4 100644 --- a/ompi/mca/btl/base/btl_base_error.c +++ b/ompi/mca/btl/base/btl_base_error.c @@ -66,6 +66,9 @@ void mca_btl_base_error_no_nics(const char* transport, } -void mca_btl_base_dump(mca_btl_base_module_t* btl) +void mca_btl_base_dump( + struct mca_btl_base_module_t* btl, + struct mca_btl_base_endpoint_t* endpoint, + int verbose) { } diff --git a/ompi/mca/btl/btl.h b/ompi/mca/btl/btl.h index d3f9aebba4..f4483296dc 100644 --- a/ompi/mca/btl/btl.h +++ b/ompi/mca/btl/btl.h @@ -489,7 +489,9 @@ typedef int (*mca_btl_base_module_get_fn_t)( */ typedef void (*mca_btl_base_module_dump_fn_t)( - struct mca_btl_base_module_t* btl + struct mca_btl_base_module_t* btl, + struct mca_btl_base_endpoint_t* endpoint, + int verbose ); diff --git a/ompi/mca/btl/mvapi/btl_mvapi.c b/ompi/mca/btl/mvapi/btl_mvapi.c index 99e56c96dc..a5a036247a 100644 --- a/ompi/mca/btl/mvapi/btl_mvapi.c +++ b/ompi/mca/btl/mvapi/btl_mvapi.c @@ -853,7 +853,10 @@ int mca_btl_mvapi_module_init(mca_btl_mvapi_module_t *mvapi_btl) * Dump state of btl/queues */ -void mca_btl_mvapi_dump(mca_btl_base_module_t* btl) +void mca_btl_mvapi_dump( + struct mca_btl_base_module_t* btl, + struct mca_btl_base_endpoint_t* endpoint, + int verbose) { mca_btl_mvapi_module_t* mvapi_btl = (mca_btl_mvapi_module_t*)btl; diff --git a/ompi/mca/btl/mvapi/btl_mvapi.h b/ompi/mca/btl/mvapi/btl_mvapi.h index fc41369739..317320b833 100644 --- a/ompi/mca/btl/mvapi/btl_mvapi.h +++ b/ompi/mca/btl/mvapi/btl_mvapi.h @@ -488,6 +488,17 @@ extern void mca_btl_mvapi_send_frag_return( ); +/* + * Dump state of btl/queues + */ + +extern void mca_btl_mvapi_dump( + struct mca_btl_base_module_t* btl, + struct mca_btl_base_endpoint_t* endpoint, + int verbose +); + + int mca_btl_mvapi_module_init(mca_btl_mvapi_module_t* mvapi_btl); diff --git a/ompi/mca/coll/sm/coll_sm_bcast.c b/ompi/mca/coll/sm/coll_sm_bcast.c index 49dad1d4ae..1ce9057432 100644 --- a/ompi/mca/coll/sm/coll_sm_bcast.c +++ b/ompi/mca/coll/sm/coll_sm_bcast.c @@ -100,6 +100,7 @@ int mca_coll_sm_bcast_intra(void *buff, int count, datatype, count, buff, + 0, &convertor))) { return ret; } @@ -158,6 +159,7 @@ int mca_coll_sm_bcast_intra(void *buff, int count, datatype, count, buff, + 0, &convertor))) { return ret; } diff --git a/ompi/mca/coll/sm/coll_sm_reduce.c b/ompi/mca/coll/sm/coll_sm_reduce.c index 17f790ec90..9e928bcb43 100644 --- a/ompi/mca/coll/sm/coll_sm_reduce.c +++ b/ompi/mca/coll/sm/coll_sm_reduce.c @@ -227,6 +227,7 @@ static int reduce_inorder(void *sbuf, void* rbuf, int count, dtype, segment_ddt_count, reduce_temp_buffer, + 0, &convertor))) { free(free_buffer); return ret; @@ -426,6 +427,7 @@ static int reduce_inorder(void *sbuf, void* rbuf, int count, dtype, count, sbuf, + 0, &convertor))) { return ret; } diff --git a/ompi/mca/osc/pt2pt/osc_pt2pt_data_move.c b/ompi/mca/osc/pt2pt/osc_pt2pt_data_move.c index 469a50bf3c..49e9c208c5 100644 --- a/ompi/mca/osc/pt2pt/osc_pt2pt_data_move.c +++ b/ompi/mca/osc/pt2pt/osc_pt2pt_data_move.c @@ -508,6 +508,7 @@ ompi_osc_pt2pt_sendreq_recv_put(ompi_osc_pt2pt_module_t *module, datatype, header->hdr_target_count, target, + 0, &convertor); iov.iov_len = header->hdr_msg_length; iov.iov_base = inbuf; diff --git a/ompi/mca/osc/pt2pt/osc_pt2pt_obj_convert.c b/ompi/mca/osc/pt2pt/osc_pt2pt_obj_convert.c index 285a1ff2bc..00f7f64b0d 100644 --- a/ompi/mca/osc/pt2pt/osc_pt2pt_obj_convert.c +++ b/ompi/mca/osc/pt2pt/osc_pt2pt_obj_convert.c @@ -61,6 +61,7 @@ ompi_osc_pt2pt_process_op(ompi_osc_pt2pt_module_t *module, datatype, header->hdr_target_count, target_buffer, + 0, &convertor); /* short circuit the reduction operation MPI_REPLACE - it just diff --git a/ompi/mca/osc/pt2pt/osc_pt2pt_replyreq.h b/ompi/mca/osc/pt2pt/osc_pt2pt_replyreq.h index ada6c2e410..7dea075062 100644 --- a/ompi/mca/osc/pt2pt/osc_pt2pt_replyreq.h +++ b/ompi/mca/osc/pt2pt/osc_pt2pt_replyreq.h @@ -104,6 +104,7 @@ ompi_osc_pt2pt_replyreq_init_target(ompi_osc_pt2pt_replyreq_t *replyreq, target_dt, target_count, target_addr, + 0, &(replyreq->rep_target_convertor)); ompi_convertor_get_packed_size(&replyreq->rep_target_convertor, &replyreq->rep_target_bytes_packed); diff --git a/ompi/mca/osc/pt2pt/osc_pt2pt_sendreq.h b/ompi/mca/osc/pt2pt/osc_pt2pt_sendreq.h index 1b801138e7..aada619d54 100644 --- a/ompi/mca/osc/pt2pt/osc_pt2pt_sendreq.h +++ b/ompi/mca/osc/pt2pt/osc_pt2pt_sendreq.h @@ -123,6 +123,7 @@ ompi_osc_pt2pt_sendreq_init_origin(ompi_osc_pt2pt_sendreq_t *sendreq, origin_dt, origin_count, origin_addr, + 0, &(sendreq->req_origin_convertor)); ompi_convertor_get_packed_size(&sendreq->req_origin_convertor, &sendreq->req_origin_bytes_packed); @@ -131,6 +132,7 @@ ompi_osc_pt2pt_sendreq_init_origin(ompi_osc_pt2pt_sendreq_t *sendreq, origin_dt, origin_count, origin_addr, + 0, &(sendreq->req_origin_convertor)); ompi_convertor_get_packed_size(&sendreq->req_origin_convertor, &sendreq->req_origin_bytes_packed); diff --git a/ompi/mca/pml/base/pml_base_sendreq.h b/ompi/mca/pml/base/pml_base_sendreq.h index d4b072573d..c6916bf67e 100644 --- a/ompi/mca/pml/base/pml_base_sendreq.h +++ b/ompi/mca/pml/base/pml_base_sendreq.h @@ -105,6 +105,7 @@ typedef struct mca_pml_base_send_request_t mca_pml_base_send_request_t; (request)->req_base.req_datatype, \ (request)->req_base.req_count, \ (request)->req_base.req_addr, \ + 0, \ &(request)->req_convertor ); \ ompi_convertor_get_packed_size( &(request)->req_convertor, \ &((request)->req_bytes_packed) );\ diff --git a/ompi/mca/pml/dr/pml_dr.c b/ompi/mca/pml/dr/pml_dr.c index a22e5fe9f8..a62d90feb4 100644 --- a/ompi/mca/pml/dr/pml_dr.c +++ b/ompi/mca/pml/dr/pml_dr.c @@ -52,6 +52,7 @@ mca_pml_dr_t mca_pml_dr = { mca_pml_dr_iprobe, mca_pml_dr_probe, mca_pml_dr_start, + mca_pml_dr_dump, 32768, INT_MAX } @@ -173,3 +174,9 @@ int mca_pml_dr_component_fini(void) return OMPI_SUCCESS; } +void mca_pml_dr_dump( + struct ompi_communicator_t* comm, + int verbose) +{ +} + diff --git a/ompi/mca/pml/dr/pml_dr.h b/ompi/mca/pml/dr/pml_dr.h index c691ccd534..8b2c365142 100644 --- a/ompi/mca/pml/dr/pml_dr.h +++ b/ompi/mca/pml/dr/pml_dr.h @@ -25,8 +25,9 @@ #include "ompi_config.h" #include "opal/threads/threads.h" #include "opal/threads/condition.h" -#include "ompi/class/ompi_free_list.h" #include "opal/util/cmd_line.h" +#include "opal/util/crc.h" +#include "ompi/class/ompi_free_list.h" #include "ompi/request/request.h" #include "ompi/mca/pml/pml.h" #include "ompi/mca/pml/base/pml_base_request.h" @@ -34,7 +35,6 @@ #include "ompi/mca/pml/base/pml_base_sendreq.h" #include "ompi/mca/btl/btl.h" #include "ompi/datatype/datatype.h" -#include "ompi/datatype/datatype_internal.h" #if defined(c_plusplus) || defined(__cplusplus) extern "C" { @@ -211,6 +211,11 @@ extern int mca_pml_dr_recv( ompi_status_public_t* status ); +extern void mca_pml_dr_dump( + struct ompi_communicator_t* comm, + int verbose +); + extern int mca_pml_dr_progress(void); extern int mca_pml_dr_start( diff --git a/ompi/mca/pml/dr/pml_dr_component.c b/ompi/mca/pml/dr/pml_dr_component.c index e88b6fd41a..676cdafac7 100644 --- a/ompi/mca/pml/dr/pml_dr_component.c +++ b/ompi/mca/pml/dr/pml_dr_component.c @@ -97,16 +97,16 @@ int mca_pml_dr_component_open(void) mca_pml_dr_param_register_int("timer_wdog_usec", 0); mca_pml_dr.timer_wdog_multiplier = mca_pml_dr_param_register_int("timer_wdog_multiplier", 2); - mca_pml_dr.timer_wdog_multiplier = + mca_pml_dr.timer_wdog_max_count = mca_pml_dr_param_register_int("timer_wdog_max_count", 10); mca_pml_dr.timer_ack_sec = - mca_pml_dr_param_register_int("timer_ack_sec", 1); + mca_pml_dr_param_register_int("timer_ack_sec", 10); mca_pml_dr.timer_ack_usec = mca_pml_dr_param_register_int("timer_ack_usec", 0); mca_pml_dr.timer_ack_multiplier = mca_pml_dr_param_register_int("timer_ack_multiplier", 2); - mca_pml_dr.timer_wdog_multiplier = + mca_pml_dr.timer_ack_max_count = mca_pml_dr_param_register_int("timer_ack_max_count", 10); OBJ_CONSTRUCT(&mca_pml_dr.lock, opal_mutex_t); diff --git a/ompi/mca/pml/dr/pml_dr_recvfrag.c b/ompi/mca/pml/dr/pml_dr_recvfrag.c index 34f9ef123d..5be59ba4a5 100644 --- a/ompi/mca/pml/dr/pml_dr_recvfrag.c +++ b/ompi/mca/pml/dr/pml_dr_recvfrag.c @@ -28,8 +28,6 @@ #include "ompi/constants.h" #include "ompi/communicator/communicator.h" #include "ompi/datatype/datatype.h" -#include "ompi/datatype/datatype_internal.h" -#include "ompi/datatype/datatype_checksum.h" #include "ompi/mca/pml/pml.h" #include "pml_dr.h" #include "pml_dr_comm.h" @@ -39,6 +37,21 @@ #include "pml_dr_hdr.h" +#define MCA_PML_DR_HDR_VALIDATE(hdr, type) \ +do { \ + uint16_t csum = opal_csum(hdr, sizeof(type)); \ + if(hdr->hdr_common.hdr_csum != csum) { \ + opal_output(0, "%s:%d: invalid header checksum: 0x%04x != 0x%04x\n", \ + __FILE__, __LINE__, hdr->hdr_common.hdr_csum, csum); \ + return; \ + } \ + if(hdr->hdr_common.hdr_dst != (ompi_comm_lookup(hdr->hdr_common.hdr_ctx))->c_my_rank ) { \ + opal_output(0, "%s:%d: misdelivered packet [rank %d -> rank %d]\n", \ + __FILE__, __LINE__, hdr->hdr_common.hdr_src, hdr->hdr_common.hdr_dst); \ + return; \ + } \ +} while (0) + OBJ_CLASS_INSTANCE( mca_pml_dr_buffer_t, @@ -81,8 +94,6 @@ void mca_pml_dr_recv_frag_callback( { mca_btl_base_segment_t* segments = des->des_dst; mca_pml_dr_hdr_t* hdr = (mca_pml_dr_hdr_t*)segments->seg_addr.pval; - uint32_t csum = 1; - size_t hdr_size; bool duplicate = false; if(segments->seg_len < sizeof(mca_pml_dr_common_hdr_t)) { return; @@ -91,75 +102,35 @@ void mca_pml_dr_recv_frag_callback( switch(hdr->hdr_common.hdr_type) { case MCA_PML_DR_HDR_TYPE_MATCH: { - if(hdr->hdr_common.hdr_csum != (uint16_t) opal_csum(hdr, sizeof(mca_pml_dr_match_hdr_t))) { - assert(0); - return; - } - if(hdr->hdr_common.hdr_dst != (ompi_comm_lookup(hdr->hdr_common.hdr_ctx))->c_my_rank ) { - assert(0); - return; - } - + MCA_PML_DR_HDR_VALIDATE(hdr, mca_pml_dr_match_hdr_t); MCA_PML_DR_RECV_FRAG_CHECK_DUP(hdr, duplicate); - if(duplicate) { - assert(0); - return; - } else { + if(false == duplicate) { mca_pml_dr_recv_frag_match(btl, &hdr->hdr_match, segments,des->des_dst_cnt); + } else { + OPAL_OUTPUT((0, "%s:%d: dropping duplicate fragment\n")); } break; } case MCA_PML_DR_HDR_TYPE_MATCH_ACK: { - if(hdr->hdr_common.hdr_csum != (uint16_t) opal_csum(hdr, sizeof(mca_pml_dr_ack_hdr_t))) { - assert(0); - return; - } - if(hdr->hdr_common.hdr_dst != (ompi_comm_lookup(hdr->hdr_common.hdr_ctx))->c_my_rank ) { - assert(0); - return; - } + MCA_PML_DR_HDR_VALIDATE(hdr, mca_pml_dr_ack_hdr_t); mca_pml_dr_send_request_match_ack(btl, &hdr->hdr_ack); break; } case MCA_PML_DR_HDR_TYPE_RNDV: { - if(hdr->hdr_common.hdr_csum != (uint16_t) opal_csum(hdr, sizeof(mca_pml_dr_rendezvous_hdr_t))) { - assert(0); - return; - } - if(hdr->hdr_common.hdr_dst != (ompi_comm_lookup(hdr->hdr_common.hdr_ctx))->c_my_rank ) { - assert(0); - return; - } - hdr_size = mca_pml_dr_hdr_size(hdr->hdr_common.hdr_type); - csum = OMPI_CSUM((void*)((unsigned char*)segments->seg_addr.pval + hdr_size), - segments->seg_len - hdr_size); - if(csum != hdr->hdr_match.hdr_csum) { - /* drop it on the floor */ - assert(0); - return; - } - + MCA_PML_DR_HDR_VALIDATE(hdr, mca_pml_dr_rendezvous_hdr_t); MCA_PML_DR_RECV_FRAG_CHECK_DUP(hdr, duplicate); - if(duplicate) { - assert(0); - return; - } else { + if(false == duplicate) { mca_pml_dr_recv_frag_match(btl, &hdr->hdr_match, segments,des->des_dst_cnt); + } else { + OPAL_OUTPUT((0, "%s:%d: dropping duplicate fragment\n")); } break; } case MCA_PML_DR_HDR_TYPE_RNDV_ACK: { - if(hdr->hdr_common.hdr_csum != (uint16_t) opal_csum(hdr, sizeof(mca_pml_dr_ack_hdr_t))) { - assert(0); - return; - } - if(hdr->hdr_common.hdr_dst != (ompi_comm_lookup(hdr->hdr_common.hdr_ctx))->c_my_rank ) { - assert(0); - return; - } + MCA_PML_DR_HDR_VALIDATE(hdr, mca_pml_dr_ack_hdr_t); mca_pml_dr_send_request_rndv_ack(btl, &hdr->hdr_ack); break; } @@ -168,14 +139,7 @@ void mca_pml_dr_recv_frag_callback( mca_pml_dr_recv_request_t* recvreq; mca_pml_dr_comm_proc_t* comm_proc; - if(hdr->hdr_common.hdr_csum != (uint16_t) opal_csum(hdr, sizeof(mca_pml_dr_frag_hdr_t))) { - assert(0); - return; - } - if(hdr->hdr_common.hdr_dst != (ompi_comm_lookup(hdr->hdr_common.hdr_ctx))->c_my_rank ) { - assert(0); - return; - } + MCA_PML_DR_HDR_VALIDATE(hdr, mca_pml_dr_frag_hdr_t); recvreq = hdr->hdr_frag.hdr_dst_ptr.pval; comm_proc = recvreq->req_recv.req_base.req_comm->c_pml_comm->procs + recvreq->req_recv.req_base.req_ompi.req_status.MPI_SOURCE; @@ -186,8 +150,6 @@ void mca_pml_dr_recv_frag_callback( &hdr->hdr_common, hdr->hdr_frag.hdr_src_ptr, ~(uint64_t) 0); - assert(0); - return; } else { mca_pml_dr_recv_request_progress(recvreq,btl,segments,des->des_dst_cnt); } @@ -196,14 +158,7 @@ void mca_pml_dr_recv_frag_callback( } case MCA_PML_DR_HDR_TYPE_FRAG_ACK: { - if(hdr->hdr_common.hdr_csum != (uint16_t) opal_csum(hdr, sizeof(mca_pml_dr_ack_hdr_t))) { - assert(0); - return; - } - if(hdr->hdr_common.hdr_dst != (ompi_comm_lookup(hdr->hdr_common.hdr_ctx))->c_my_rank ) { - assert(0); - return; - } + MCA_PML_DR_HDR_VALIDATE(hdr, mca_pml_dr_ack_hdr_t); mca_pml_dr_send_request_frag_ack(btl, &hdr->hdr_ack); break; } @@ -516,8 +471,8 @@ bool mca_pml_dr_recv_frag_match( opal_list_t additional_matches; ompi_proc_t* ompi_proc; int rc; - uint32_t csum = 1; - size_t hdr_size; + uint32_t csum = OPAL_CSUM_ZERO; + /* communicator pointer */ comm_ptr=ompi_comm_lookup(hdr->hdr_common.hdr_ctx); comm=(mca_pml_dr_comm_t *)comm_ptr->c_pml_comm; @@ -592,25 +547,23 @@ rematch: /* if no match found, verify csum, if pass place on unexpected queue */ mca_pml_dr_recv_frag_t* frag; - /* nack immediately if need be */ - hdr_size = mca_pml_dr_hdr_size(hdr->hdr_common.hdr_type); - csum = OMPI_CSUM((void*)((unsigned char*)segments->seg_addr.pval + hdr_size), - segments->seg_len - hdr_size); - if(csum != hdr->hdr_csum) { - mca_pml_dr_recv_frag_send_ack(ompi_proc, - &hdr->hdr_common, - hdr->hdr_src_ptr, - 0); - assert(0); - return false; - } - MCA_PML_DR_RECV_FRAG_ALLOC(frag, rc); if(OMPI_SUCCESS != rc) { OPAL_THREAD_UNLOCK(&comm->matching_lock); return rc; } - MCA_PML_DR_RECV_FRAG_INIT(frag,proc->ompi_proc,hdr,segments,num_segments,btl); + MCA_PML_DR_RECV_FRAG_INIT(frag,proc->ompi_proc,hdr,segments,num_segments,btl,csum); + if(csum != hdr->hdr_csum) { + mca_pml_dr_recv_frag_send_ack(ompi_proc, + &hdr->hdr_common, + hdr->hdr_src_ptr, + 0); + opal_output(0, "%s:%d: corrupted data 0x%08x != 0x%08x\n", + __FILE__, __LINE__, csum, hdr->hdr_csum); + MCA_PML_DR_RECV_FRAG_RETURN(frag); + OPAL_THREAD_UNLOCK(&comm->matching_lock); + return false; + } opal_list_append( &proc->unexpected_frags, (opal_list_item_t *)frag ); } @@ -631,25 +584,23 @@ rematch: */ mca_pml_dr_recv_frag_t* frag; - hdr_size = mca_pml_dr_hdr_size(hdr->hdr_common.hdr_type); - csum = OMPI_CSUM((void*)((unsigned char*)segments->seg_addr.pval + hdr_size), - segments->seg_len - hdr_size); - if(csum != hdr->hdr_csum) { - mca_pml_dr_recv_frag_send_ack(ompi_proc, - &hdr->hdr_common, - hdr->hdr_src_ptr, - 0); - assert(0); - return false; - } - - MCA_PML_DR_RECV_FRAG_ALLOC(frag, rc); if(OMPI_SUCCESS != rc) { OPAL_THREAD_UNLOCK(&comm->matching_lock); return rc; } - MCA_PML_DR_RECV_FRAG_INIT(frag,proc->ompi_proc,hdr,segments,num_segments,btl); + MCA_PML_DR_RECV_FRAG_INIT(frag,proc->ompi_proc,hdr,segments,num_segments,btl,csum); + if(csum != hdr->hdr_csum) { + mca_pml_dr_recv_frag_send_ack(ompi_proc, + &hdr->hdr_common, + hdr->hdr_src_ptr, + 0); + opal_output(0, "%s:%d: corrupted data 0x%08x != 0x%08x\n", + __FILE__, __LINE__, csum, hdr->hdr_csum); + MCA_PML_DR_RECV_FRAG_RETURN(frag); + OPAL_THREAD_UNLOCK(&comm->matching_lock); + return false; + } opal_list_append(&proc->frags_cant_match, (opal_list_item_t *)frag); } OPAL_THREAD_UNLOCK(&comm->matching_lock); diff --git a/ompi/mca/pml/dr/pml_dr_recvfrag.h b/ompi/mca/pml/dr/pml_dr_recvfrag.h index 753f0ee10c..93a31d958d 100644 --- a/ompi/mca/pml/dr/pml_dr_recvfrag.h +++ b/ompi/mca/pml/dr/pml_dr_recvfrag.h @@ -80,40 +80,46 @@ do { \ } \ } while (0) -#define MCA_PML_DR_RECV_FRAG_ALLOC(frag,rc) \ -do { \ - opal_list_item_t* item; \ - OMPI_FREE_LIST_WAIT(&mca_pml_dr.recv_frags, item, rc); \ - frag = (mca_pml_dr_recv_frag_t*)item; \ +#define MCA_PML_DR_RECV_FRAG_ALLOC(frag,rc) \ +do { \ + opal_list_item_t* item; \ + OMPI_FREE_LIST_WAIT(&mca_pml_dr.recv_frags, item, rc); \ + frag = (mca_pml_dr_recv_frag_t*)item; \ } while(0) -#define MCA_PML_DR_RECV_FRAG_INIT(frag,oproc,hdr,segs,cnt,btl) \ -do { \ - size_t i; \ - mca_btl_base_segment_t* macro_segments = frag->segments; \ - mca_pml_dr_buffer_t** buffers = frag->buffers; \ - \ - /* init recv_frag */ \ - frag->btl = btl; \ - frag->hdr = *(mca_pml_dr_hdr_t*)hdr; \ - frag->num_segments = cnt; \ - frag->csum = 0; \ - frag->proc = oproc; \ - \ - /* copy over data */ \ - for(i=0; iaddr; \ - macro_segments[i].seg_len = segs[i].seg_len; \ - memcpy(buff->addr, \ - segs[i].seg_addr.pval, \ - segs[i].seg_len); \ - } \ +#define MCA_PML_DR_RECV_FRAG_INIT(frag,oproc,hdr,segs,cnt,btl,csum) \ +do { \ + size_t i; \ + uint32_t ui1 = 0; \ + uint32_t ui2 = 0; \ + mca_btl_base_segment_t* macro_segments = frag->segments; \ + mca_pml_dr_buffer_t** buffers = frag->buffers; \ + \ + /* init recv_frag */ \ + frag->btl = btl; \ + frag->hdr = *(mca_pml_dr_hdr_t*)hdr; \ + frag->num_segments = cnt; \ + frag->csum = 0; \ + frag->proc = oproc; \ + \ + /* copy over data */ \ + for(i=0; iaddr; \ + macro_segments[i].seg_len = segs[i].seg_len; \ + csum += OPAL_CSUM_BCOPY_PARTIAL( \ + segs[i].seg_addr.pval, \ + buff->addr, \ + segs[i].seg_len, \ + segs[i].seg_len, \ + &ui1, \ + &ui2); \ + } \ } while(0) diff --git a/ompi/mca/pml/dr/pml_dr_recvreq.c b/ompi/mca/pml/dr/pml_dr_recvreq.c index 7959b3f54d..2367394dec 100644 --- a/ompi/mca/pml/dr/pml_dr_recvreq.c +++ b/ompi/mca/pml/dr/pml_dr_recvreq.c @@ -30,6 +30,32 @@ #include "ompi/mca/bml/base/base.h" #include "orte/mca/errmgr/errmgr.h" + +#define MCA_PML_DR_RECV_REQUEST_ACK(recvreq,hdr,csum) \ +if(csum != hdr->hdr_match.hdr_csum) { \ + /* failed the csum, put the request back on the list for \ + * matching later on retransmission \ + */ \ + if(recvreq->req_recv.req_base.req_peer == OMPI_ANY_SOURCE) { \ + mca_pml_dr_recv_request_match_wild(recvreq); \ + } else { \ + mca_pml_dr_recv_request_match_specific(recvreq); \ + } \ + mca_pml_dr_recv_frag_send_ack(comm_proc->ompi_proc, \ + &hdr->hdr_common, \ + hdr->hdr_match.hdr_src_ptr, \ + 0); \ + opal_output(0, "%s:%d: [rank %d -> rank %d] " \ + "data checksum failed 0x%08x != 0x%08x\n", \ + __FILE__, __LINE__, \ + hdr->hdr_common.hdr_src, hdr->hdr_common.hdr_dst, \ + csum, hdr->hdr_match.hdr_csum); \ + bytes_received = bytes_delivered = 0; \ +} else { \ + mca_pml_dr_recv_request_ack(recvreq, &hdr->hdr_match, 1); \ +} + + static mca_pml_dr_recv_frag_t* mca_pml_dr_recv_request_match_specific_proc( mca_pml_dr_recv_request_t* request, mca_pml_dr_comm_proc_t* proc); @@ -288,23 +314,7 @@ static void mca_pml_dr_recv_request_vfrag_ack( bytes_received, bytes_delivered, csum); - if(csum != hdr->hdr_match.hdr_csum) { - /* failed the csum, put the request - back on the list for matching later - at retransmission */ - if(recvreq->req_recv.req_base.req_peer == OMPI_ANY_SOURCE) { - mca_pml_dr_recv_request_match_wild(recvreq); - } else { - mca_pml_dr_recv_request_match_specific(recvreq); - } - mca_pml_dr_recv_frag_send_ack(comm_proc->ompi_proc, - &hdr->hdr_common, - hdr->hdr_match.hdr_src_ptr, - 0); - assert(0); - return; - } - mca_pml_dr_recv_request_ack(recvreq, &hdr->hdr_match, 1); + MCA_PML_DR_RECV_REQUEST_ACK(recvreq,hdr,csum); break; case MCA_PML_DR_HDR_TYPE_RNDV: @@ -322,23 +332,7 @@ static void mca_pml_dr_recv_request_vfrag_ack( bytes_received, bytes_delivered, csum); - if(csum != hdr->hdr_match.hdr_csum) { - /* failed the csum, put the request - back on the list for matching later - at retransmission */ - if(recvreq->req_recv.req_base.req_peer == OMPI_ANY_SOURCE) { - mca_pml_dr_recv_request_match_wild(recvreq); - } else { - mca_pml_dr_recv_request_match_specific(recvreq); - } - mca_pml_dr_recv_frag_send_ack(comm_proc->ompi_proc, - &hdr->hdr_common, - hdr->hdr_match.hdr_src_ptr, - 0); - assert(0); - return; - } - mca_pml_dr_recv_request_ack(recvreq, &hdr->hdr_match, 1); + MCA_PML_DR_RECV_REQUEST_ACK(recvreq,hdr,csum); break; case MCA_PML_DR_HDR_TYPE_FRAG: diff --git a/ompi/mca/pml/dr/pml_dr_recvreq.h b/ompi/mca/pml/dr/pml_dr_recvreq.h index e2c3307841..4ac5957acb 100644 --- a/ompi/mca/pml/dr/pml_dr_recvreq.h +++ b/ompi/mca/pml/dr/pml_dr_recvreq.h @@ -24,7 +24,6 @@ #include "ompi_config.h" #include "ompi/mca/mpool/base/base.h" #include "ompi/mca/pml/base/pml_base_recvreq.h" -#include "ompi/datatype/datatype_checksum.h" #include "pml_dr.h" #include "pml_dr_proc.h" @@ -230,6 +229,7 @@ do { (request)->req_recv.req_base.req_datatype, \ (request)->req_recv.req_base.req_count, \ (request)->req_recv.req_base.req_addr, \ + CONVERTOR_WITH_CHECKSUM, \ &(request)->req_recv.req_convertor ); \ } else { \ (request)->req_proc = NULL; \ @@ -281,7 +281,7 @@ do { csum = request->req_recv.req_convertor.checksum; \ } else { \ bytes_delivered = 0; \ - csum = OMPI_CSUM_ZERO; \ + csum = OPAL_CSUM_ZERO; \ } \ } while (0) diff --git a/ompi/mca/pml/dr/pml_dr_sendreq.c b/ompi/mca/pml/dr/pml_dr_sendreq.c index 25aba03b40..e692ea57d9 100644 --- a/ompi/mca/pml/dr/pml_dr_sendreq.c +++ b/ompi/mca/pml/dr/pml_dr_sendreq.c @@ -429,7 +429,7 @@ int mca_pml_dr_send_request_start_copy( hdr->hdr_common.hdr_src = sendreq->req_send.req_base.req_comm->c_my_rank; hdr->hdr_match.hdr_tag = sendreq->req_send.req_base.req_tag; hdr->hdr_match.hdr_seq = sendreq->req_send.req_base.req_sequence; - hdr->hdr_match.hdr_csum = size > 0 ? sendreq->req_send.req_convertor.checksum : OMPI_CSUM_ZERO; + hdr->hdr_match.hdr_csum = size > 0 ? sendreq->req_send.req_convertor.checksum : OPAL_CSUM_ZERO; hdr->hdr_match.hdr_src_ptr.pval = &sendreq->req_vfrag0; hdr->hdr_common.hdr_vid = sendreq->req_vfrag0.vf_id; hdr->hdr_common.hdr_csum = opal_csum(hdr, sizeof(mca_pml_dr_match_hdr_t)); @@ -499,7 +499,7 @@ int mca_pml_dr_send_request_start_prepare( hdr->hdr_common.hdr_src = sendreq->req_send.req_base.req_comm->c_my_rank; hdr->hdr_match.hdr_tag = sendreq->req_send.req_base.req_tag; hdr->hdr_match.hdr_seq = sendreq->req_send.req_base.req_sequence; - hdr->hdr_match.hdr_csum = size > 0 ? sendreq->req_send.req_convertor.checksum : OMPI_CSUM_ZERO; + hdr->hdr_match.hdr_csum = size > 0 ? sendreq->req_send.req_convertor.checksum : OPAL_CSUM_ZERO; hdr->hdr_match.hdr_src_ptr.pval = &sendreq->req_vfrag0; hdr->hdr_common.hdr_vid = sendreq->req_vfrag0.vf_id; hdr->hdr_common.hdr_csum = opal_csum(hdr, sizeof(mca_pml_dr_match_hdr_t)); @@ -573,7 +573,7 @@ int mca_pml_dr_send_request_start_rndv( hdr->hdr_match.hdr_tag = sendreq->req_send.req_base.req_tag; hdr->hdr_match.hdr_seq = sendreq->req_send.req_base.req_sequence; hdr->hdr_match.hdr_src_ptr.pval = &sendreq->req_vfrag0; - hdr->hdr_match.hdr_csum = size > 0 ? sendreq->req_send.req_convertor.checksum : OMPI_CSUM_ZERO; + hdr->hdr_match.hdr_csum = size > 0 ? sendreq->req_send.req_convertor.checksum : OPAL_CSUM_ZERO; hdr->hdr_common.hdr_vid = sendreq->req_vfrag0.vf_id; hdr->hdr_rndv.hdr_msg_length = sendreq->req_send.req_bytes_packed; hdr->hdr_common.hdr_csum = opal_csum(hdr, sizeof(mca_pml_dr_rendezvous_hdr_t)); diff --git a/ompi/mca/pml/dr/pml_dr_sendreq.h b/ompi/mca/pml/dr/pml_dr_sendreq.h index 9c63bb678d..7635c0d826 100644 --- a/ompi/mca/pml/dr/pml_dr_sendreq.h +++ b/ompi/mca/pml/dr/pml_dr_sendreq.h @@ -86,27 +86,53 @@ OBJ_CLASS_DECLARATION(mca_pml_dr_send_request_t); } -#define MCA_PML_DR_SEND_REQUEST_INIT( \ - sendreq, \ - buf, \ - count, \ - datatype, \ - dst, \ - tag, \ - comm, \ - sendmode, \ - persistent) \ -{ \ - MCA_PML_BASE_SEND_REQUEST_INIT(&sendreq->req_send, \ - buf, \ - count, \ - datatype, \ - dst, \ - tag, \ - comm, \ - sendmode, \ - persistent); \ -} +#define MCA_PML_DR_SEND_REQUEST_INIT( \ + sendreq, \ + addr, \ + count, \ + datatype, \ + peer, \ + tag, \ + comm, \ + sendmode, \ + persistent) \ +do { \ + /* increment reference counts */ \ + OBJ_RETAIN(comm); \ + OBJ_RETAIN(datatype); \ + \ + OMPI_REQUEST_INIT(&(sendreq)->req_send.req_base.req_ompi, persistent); \ + (sendreq)->req_send.req_addr = addr; \ + (sendreq)->req_send.req_count = count; \ + (sendreq)->req_send.req_datatype = datatype; \ + (sendreq)->req_send.req_send_mode = sendmode; \ + (sendreq)->req_send.req_base.req_addr = addr; \ + (sendreq)->req_send.req_base.req_count = count; \ + (sendreq)->req_send.req_base.req_datatype = datatype; \ + (sendreq)->req_send.req_base.req_peer = (int32_t)peer; \ + (sendreq)->req_send.req_base.req_tag = (int32_t)tag; \ + (sendreq)->req_send.req_base.req_comm = comm; \ + (sendreq)->req_send.req_base.req_pml_complete = (persistent ? true : false); \ + (sendreq)->req_send.req_base.req_free_called = false; \ + (sendreq)->req_send.req_base.req_ompi.req_status._cancelled = 0; \ + \ + /* initialize datatype convertor for this request */ \ + if(count > 0) { \ + /* We will create a convertor specialized for the */ \ + /* remote architecture and prepared with the datatype. */ \ + ompi_convertor_copy_and_prepare_for_send( \ + (sendreq)->req_send.req_base.req_proc->proc_convertor, \ + (sendreq)->req_send.req_base.req_datatype, \ + (sendreq)->req_send.req_base.req_count, \ + (sendreq)->req_send.req_base.req_addr, \ + CONVERTOR_WITH_CHECKSUM, \ + &(sendreq)->req_send.req_convertor ); \ + ompi_convertor_get_packed_size(&(sendreq)->req_send.req_convertor, \ + &((sendreq)->req_send.req_bytes_packed) ); \ + } else { \ + (sendreq)->req_send.req_bytes_packed = 0; \ + } \ +} while(0) /** diff --git a/ompi/mca/pml/dr/pml_dr_vfrag.c b/ompi/mca/pml/dr/pml_dr_vfrag.c index 0abcfcdeac..eae66a755c 100644 --- a/ompi/mca/pml/dr/pml_dr_vfrag.c +++ b/ompi/mca/pml/dr/pml_dr_vfrag.c @@ -90,7 +90,7 @@ void mca_pml_dr_vfrag_ack_timeout(int fd, short event, void* data) { OPAL_THREAD_LOCK(&ompi_request_lock); vfrag->vf_send_cnt++; if(vfrag->vf_send_cnt > mca_pml_dr.timer_ack_max_count) { - opal_output(0, "ack retry count exceeded! %s:%d FATAL", __FILE__, __LINE__); + opal_output(0, "%s:%d: maximum ack retry count exceeded: FATAL", __FILE__, __LINE__); orte_errmgr.abort(); } vfrag->vf_idx = 1; diff --git a/ompi/mca/pml/ob1/pml_ob1.c b/ompi/mca/pml/ob1/pml_ob1.c index a6ee68acb4..0f83ca682d 100644 --- a/ompi/mca/pml/ob1/pml_ob1.c +++ b/ompi/mca/pml/ob1/pml_ob1.c @@ -53,6 +53,7 @@ mca_pml_ob1_t mca_pml_ob1 = { mca_pml_ob1_iprobe, mca_pml_ob1_probe, mca_pml_ob1_start, + mca_pml_ob1_dump, 32768, INT_MAX } @@ -176,3 +177,29 @@ int mca_pml_ob1_component_fini(void) return OMPI_SUCCESS; } +/* + * diagnostics + */ + +void mca_pml_ob1_dump(struct ompi_communicator_t* comm, int verbose) +{ + struct mca_pml_comm_t* pml_comm = comm->c_pml_comm; + size_t i; + + /* iterate through all procs on communicator */ + for(i=0; inum_procs; i++) { + mca_pml_ob1_comm_proc_t* proc = &pml_comm->procs[i]; + mca_bml_base_endpoint_t* ep = (mca_bml_base_endpoint_t*)proc->proc_ompi->proc_pml; + size_t n; + + opal_output(0, "[Rank %d]\n", i); + /* dump all receive queues */ + + /* dump all btls */ + for(n=0; nbtl_eager.arr_size; n++) { + mca_bml_base_btl_t* bml_btl = &ep->btl_eager.bml_btls[i]; + bml_btl->btl->btl_dump(bml_btl->btl, bml_btl->btl_endpoint, verbose); + } + } +} + diff --git a/ompi/mca/pml/ob1/pml_ob1.h b/ompi/mca/pml/ob1/pml_ob1.h index 4db28a7fbb..3788c3bef5 100644 --- a/ompi/mca/pml/ob1/pml_ob1.h +++ b/ompi/mca/pml/ob1/pml_ob1.h @@ -200,6 +200,11 @@ extern int mca_pml_ob1_recv( ompi_status_public_t* status ); +extern void mca_pml_ob1_dump( + struct ompi_communicator_t* comm, + int verbose +); + extern int mca_pml_ob1_progress(void); extern int mca_pml_ob1_start( diff --git a/ompi/mca/pml/ob1/pml_ob1_recvreq.h b/ompi/mca/pml/ob1/pml_ob1_recvreq.h index c7b8ab28cf..599a0fcf9d 100644 --- a/ompi/mca/pml/ob1/pml_ob1_recvreq.h +++ b/ompi/mca/pml/ob1/pml_ob1_recvreq.h @@ -213,6 +213,7 @@ do { (request)->req_recv.req_base.req_datatype, \ (request)->req_recv.req_base.req_count, \ (request)->req_recv.req_base.req_addr, \ + 0, \ &(request)->req_recv.req_convertor ); \ ompi_convertor_get_unpacked_size( &(request)->req_recv.req_convertor, \ &(request)->req_bytes_delivered ); \ diff --git a/ompi/mca/pml/ob1/pml_ob1_sendreq.c b/ompi/mca/pml/ob1/pml_ob1_sendreq.c index c83a8cf96f..65b396ef45 100644 --- a/ompi/mca/pml/ob1/pml_ob1_sendreq.c +++ b/ompi/mca/pml/ob1/pml_ob1_sendreq.c @@ -829,6 +829,7 @@ int mca_pml_ob1_send_request_schedule(mca_pml_ob1_send_request_t* sendreq) sendreq->req_send.req_base.req_datatype, sendreq->req_send.req_base.req_count, sendreq->req_send.req_base.req_addr, + 0, &convertor); ompi_convertor_set_position(&convertor, &position); OBJ_DESTRUCT( &convertor ); diff --git a/ompi/mca/pml/pml.h b/ompi/mca/pml/pml.h index b3eb76ad60..84d56b85de 100644 --- a/ompi/mca/pml/pml.h +++ b/ompi/mca/pml/pml.h @@ -443,6 +443,18 @@ typedef int (*mca_pml_base_module_null_fn_t)( struct ompi_request_t** request ); +/** + * Diagnostics function. + * + * @param request (IN) Communicator + * @return OMPI_SUCCESS or failure status. + * + */ +typedef int (*mca_pml_base_module_dump_fn_t)( + struct ompi_communicator_t* comm, + int verbose +); + /** * PML instance. @@ -469,6 +481,9 @@ struct mca_pml_base_module_1_0_0_t { mca_pml_base_module_probe_fn_t pml_probe; mca_pml_base_module_start_fn_t pml_start; + /* diagnostics */ + mca_pml_base_module_dump_fn_t pml_dump; + /* maximum constant sizes */ int pml_max_contextid; int pml_max_tag; diff --git a/ompi/mpi/c/pack.c b/ompi/mpi/c/pack.c index 5a9520d8ea..ab00e08688 100644 --- a/ompi/mpi/c/pack.c +++ b/ompi/mpi/c/pack.c @@ -62,7 +62,7 @@ int MPI_Pack(void *inbuf, int incount, MPI_Datatype datatype, OBJ_CONSTRUCT( &local_convertor, ompi_convertor_t ); /* the resulting convertor will be set to the position ZERO */ ompi_convertor_copy_and_prepare_for_send( ompi_mpi_local_convertor, datatype, - incount, inbuf, &local_convertor ); + incount, inbuf, 0, &local_convertor ); /* Check for truncation */ ompi_convertor_get_packed_size( &local_convertor, &size ); diff --git a/ompi/mpi/c/pack_external.c b/ompi/mpi/c/pack_external.c index 2d1fd866dc..01054ab004 100644 --- a/ompi/mpi/c/pack_external.c +++ b/ompi/mpi/c/pack_external.c @@ -58,7 +58,7 @@ int MPI_Pack_external(char *datarep, void *inbuf, int incount, /* The resulting convertor will be set to the position zero */ ompi_convertor_copy_and_prepare_for_send( ompi_mpi_external32_convertor, - datatype, incount, inbuf, &local_convertor ); + datatype, incount, inbuf, 0, &local_convertor ); /* Check for truncation */ ompi_convertor_get_packed_size( &local_convertor, &size ); diff --git a/ompi/mpi/c/pack_external_size.c b/ompi/mpi/c/pack_external_size.c index 3e26cb562f..a53b789f25 100644 --- a/ompi/mpi/c/pack_external_size.c +++ b/ompi/mpi/c/pack_external_size.c @@ -49,7 +49,7 @@ int MPI_Pack_external_size(char *datarep, int incount, } /* the resulting convertor will be set to the position ZERO */ ompi_convertor_copy_and_prepare_for_send( ompi_mpi_external32_convertor, - datatype, incount, NULL, &local_convertor ); + datatype, incount, NULL, 0, &local_convertor ); ompi_convertor_get_packed_size( &local_convertor, &length ); *size = (MPI_Aint)length; diff --git a/ompi/mpi/c/pack_size.c b/ompi/mpi/c/pack_size.c index 4fb3c6b214..5ba3fe497f 100644 --- a/ompi/mpi/c/pack_size.c +++ b/ompi/mpi/c/pack_size.c @@ -52,7 +52,7 @@ int MPI_Pack_size(int incount, MPI_Datatype datatype, MPI_Comm comm, OBJ_CONSTRUCT( &local_convertor, ompi_convertor_t ); /* the resulting convertor will be set to the position ZERO */ - ompi_convertor_copy_and_prepare_for_send( ompi_mpi_local_convertor, datatype, incount, NULL, &local_convertor ); + ompi_convertor_copy_and_prepare_for_send( ompi_mpi_local_convertor, datatype, incount, NULL, 0, &local_convertor ); ompi_convertor_get_packed_size( &local_convertor, &length ); *size = (int)length; diff --git a/ompi/mpi/c/sendrecv_replace.c b/ompi/mpi/c/sendrecv_replace.c index 9867dd93be..e54ac54553 100644 --- a/ompi/mpi/c/sendrecv_replace.c +++ b/ompi/mpi/c/sendrecv_replace.c @@ -81,7 +81,7 @@ int MPI_Sendrecv_replace(void * buf, int count, MPI_Datatype datatype, /* initialize convertor to unpack recv buffer */ OBJ_CONSTRUCT(&convertor, ompi_convertor_t); ompi_convertor_copy_and_prepare_for_recv( proc->proc_convertor, datatype, - count, buf, &convertor ); + count, buf, 0, &convertor ); /* setup a buffer for recv */ ompi_convertor_get_packed_size( &convertor, &packed_size ); diff --git a/ompi/mpi/c/unpack.c b/ompi/mpi/c/unpack.c index 6b36f45200..2cf2856b3c 100644 --- a/ompi/mpi/c/unpack.c +++ b/ompi/mpi/c/unpack.c @@ -65,7 +65,7 @@ int MPI_Unpack(void *inbuf, int insize, int *position, if( insize > 0 ) { OBJ_CONSTRUCT( &local_convertor, ompi_convertor_t ); /* the resulting convertor will be set the the position ZERO */ - ompi_convertor_copy_and_prepare_for_recv( ompi_mpi_local_convertor, datatype, outcount, outbuf, &local_convertor ); + ompi_convertor_copy_and_prepare_for_recv( ompi_mpi_local_convertor, datatype, outcount, outbuf, 0, &local_convertor ); /* Check for truncation */ ompi_convertor_get_packed_size( &local_convertor, &size ); diff --git a/ompi/mpi/c/unpack_external.c b/ompi/mpi/c/unpack_external.c index c16447e575..c822072626 100644 --- a/ompi/mpi/c/unpack_external.c +++ b/ompi/mpi/c/unpack_external.c @@ -55,7 +55,7 @@ int MPI_Unpack_external (char *datarep, void *inbuf, MPI_Aint insize, /* the resulting convertor will be set to the position ZERO */ ompi_convertor_copy_and_prepare_for_send( ompi_mpi_external32_convertor, - datatype, outcount, NULL, &local_convertor ); + datatype, outcount, NULL, 0, &local_convertor ); /* Check for truncation */ ompi_convertor_get_packed_size( &local_convertor, &size ); diff --git a/opal/util/crc.h b/opal/util/crc.h index 5b7549aed1..bb1d6f598e 100644 --- a/opal/util/crc.h +++ b/opal/util/crc.h @@ -22,6 +22,15 @@ #define CRC_POLYNOMIAL ((unsigned int)0x04c11db7) #define CRC_INITIAL_REGISTER ((unsigned int)0xffffffff) + +#define OPAL_CSUM( SRC, LEN ) opal_uicsum( SRC, LEN ) +#define OPAL_CSUM_PARTIAL( SRC, LEN, UI1, UI2 ) \ + opal_uicsum_partial( SRC, LEN, UI1, UI2 ) +#define OPAL_CSUM_BCOPY_PARTIAL( SRC, DST, LEN1, LEN2, UI1, UI2 ) \ + opal_bcopy_uicsum_partial( SRC, DST, LEN1, LEN2, UI1, UI2 ) +#define OPAL_CSUM_ZERO 0 + + unsigned long opal_bcopy_csum_partial( const void * source,