diff --git a/ompi/mca/osc/pt2pt/osc_pt2pt_active_target.c b/ompi/mca/osc/pt2pt/osc_pt2pt_active_target.c index 9779a09df1..5e5e5d414a 100644 --- a/ompi/mca/osc/pt2pt/osc_pt2pt_active_target.c +++ b/ompi/mca/osc/pt2pt/osc_pt2pt_active_target.c @@ -12,6 +12,8 @@ * reserved. * Copyright (c) 2010 IBM Corporation. All rights reserved. * Copyright (c) 2012-2013 Sandia National Laboratories. All rights reserved. + * Copyright (c) 2015 Research Organization for Information Science + * and Technology (RIST). All rights reserved. * $COPYRIGHT$ * * Additional copyrights may follow @@ -274,7 +276,6 @@ ompi_osc_pt2pt_complete(ompi_win_t *win) int i; int *ranks = NULL; ompi_group_t *group; - int my_rank = ompi_comm_rank (module->comm); OPAL_OUTPUT_VERBOSE((50, ompi_osc_base_framework.framework_output, "ompi_osc_pt2pt_complete entering...")); @@ -307,7 +308,8 @@ ompi_osc_pt2pt_complete(ompi_win_t *win) At the same time, clean out the outgoing count for the next round. */ for (i = 0 ; i < ompi_group_size(module->sc_group) ; ++i) { - if (my_rank == ranks[i]) { + ompi_proc_t *proc = ompi_comm_peer_lookup(module->comm, ranks[i]); + if (ompi_proc_local() == proc) { /* shortcut for self */ OPAL_OUTPUT_VERBOSE((50, ompi_osc_base_framework.framework_output, "ompi_osc_pt2pt_complete self complete")); module->num_complete_msgs++; @@ -316,7 +318,12 @@ ompi_osc_pt2pt_complete(ompi_win_t *win) complete_req.base.type = OMPI_OSC_PT2PT_HDR_TYPE_COMPLETE; complete_req.base.flags = OMPI_OSC_PT2PT_HDR_FLAG_VALID; +#if OPAL_ENABLE_HETEROGENEOUS_SUPPORT && OPAL_ENABLE_DEBUG + complete_req.padding[0] = 0; + complete_req.padding[1] = 0; +#endif complete_req.frag_count = module->epoch_outgoing_frag_count[ranks[i]]; + osc_pt2pt_hton(&complete_req, proc); peer = module->peers + ranks[i]; @@ -388,7 +395,6 @@ ompi_osc_pt2pt_post(ompi_group_t *group, int ret = OMPI_SUCCESS; ompi_osc_pt2pt_module_t *module = GET_MODULE(win); ompi_osc_pt2pt_header_post_t post_req; - int my_rank = ompi_comm_rank(module->comm); 
/* can't check for all access epoch here due to fence */ if (module->pw_group) { @@ -430,17 +436,19 @@ ompi_osc_pt2pt_post(ompi_group_t *group, /* send a hello counter to everyone in group */ for (int i = 0 ; i < ompi_group_size(module->pw_group) ; ++i) { OPAL_OUTPUT_VERBOSE((50, ompi_osc_base_framework.framework_output, "Sending post message to rank %d", ranks[i])); + ompi_proc_t *proc = ompi_comm_peer_lookup(module->comm, ranks[i]); /* shortcut for self */ - if (my_rank == ranks[i]) { + if (ompi_proc_local() == proc) { OPAL_OUTPUT_VERBOSE((50, ompi_osc_base_framework.framework_output, "ompi_osc_pt2pt_complete self post")); - osc_pt2pt_incoming_post (module, my_rank); + osc_pt2pt_incoming_post (module, ompi_comm_rank(module->comm)); continue; } post_req.base.type = OMPI_OSC_PT2PT_HDR_TYPE_POST; post_req.base.flags = OMPI_OSC_PT2PT_HDR_FLAG_VALID; post_req.windx = ompi_comm_get_cid(module->comm); + osc_pt2pt_hton(&post_req, proc); /* we don't want to send any data, since we're the exposure epoch only, so use an unbuffered send */ diff --git a/ompi/mca/osc/pt2pt/osc_pt2pt_comm.c b/ompi/mca/osc/pt2pt/osc_pt2pt_comm.c index e915cace46..1ba8e287a5 100644 --- a/ompi/mca/osc/pt2pt/osc_pt2pt_comm.c +++ b/ompi/mca/osc/pt2pt/osc_pt2pt_comm.c @@ -12,6 +12,8 @@ * reserved. * Copyright (c) 2010 Cisco Systems, Inc. All rights reserved. * Copyright (c) 2012-2013 Sandia National Laboratories. All rights reserved. + * Copyright (c) 2015 Research Organization for Information Science + * and Technology (RIST). All rights reserved. 
* $COPYRIGHT$ * * Additional copyrights may follow @@ -403,6 +405,7 @@ static inline int ompi_osc_pt2pt_put_w_req (void *origin_addr, int origin_count, if (!is_long_msg) { header->base.type = OMPI_OSC_PT2PT_HDR_TYPE_PUT; + osc_pt2pt_hton(header, proc); osc_pt2pt_copy_for_send (ptr, payload_len, origin_addr, proc, origin_count, origin_dt); @@ -414,8 +417,8 @@ static inline int ompi_osc_pt2pt_put_w_req (void *origin_addr, int origin_count, } } else { header->base.type = OMPI_OSC_PT2PT_HDR_TYPE_PUT_LONG; - header->tag = tag; + osc_pt2pt_hton(header, proc); /* increase the outgoing signal count */ ompi_osc_signal_outgoing (module, target, 1); @@ -580,6 +583,7 @@ ompi_osc_pt2pt_accumulate_w_req (void *origin_addr, int origin_count, if (!is_long_msg) { header->base.type = OMPI_OSC_PT2PT_HDR_TYPE_ACC; + osc_pt2pt_hton(header, proc); osc_pt2pt_copy_for_send (ptr, payload_len, origin_addr, proc, origin_count, origin_dt); @@ -591,8 +595,8 @@ ompi_osc_pt2pt_accumulate_w_req (void *origin_addr, int origin_count, } } else { header->base.type = OMPI_OSC_PT2PT_HDR_TYPE_ACC_LONG; - header->tag = tag; + osc_pt2pt_hton(header, proc); OPAL_OUTPUT_VERBOSE((25, ompi_osc_base_framework.framework_output, "acc: starting long accumulate with tag %d", tag)); @@ -708,6 +712,7 @@ int ompi_osc_pt2pt_compare_and_swap (void *origin_addr, void *compare_addr, header->len = frag_len; header->displacement = target_disp; header->tag = tag; + osc_pt2pt_hton(header, proc); ptr += sizeof(ompi_osc_pt2pt_header_cswap_t); ret = ompi_datatype_get_pack_description(dt, &packed_ddt); @@ -880,6 +885,7 @@ static inline int ompi_osc_pt2pt_rget_internal (void *origin_addr, int origin_co header->count = target_count; header->displacement = target_disp; header->tag = tag; + OSC_PT2PT_HTON(header, module, target); ptr += sizeof(ompi_osc_pt2pt_header_get_t); do { @@ -1115,6 +1121,7 @@ int ompi_osc_pt2pt_rget_accumulate_internal (void *origin_addr, int origin_count header->displacement = target_disp; header->op = 
op->o_f_to_c_index; header->tag = tag; + ptr = (char *)(header + 1); do { @@ -1151,6 +1158,7 @@ int ompi_osc_pt2pt_rget_accumulate_internal (void *origin_addr, int origin_count if (!is_long_msg) { header->base.type = OMPI_OSC_PT2PT_HDR_TYPE_GET_ACC; + osc_pt2pt_hton(header, proc); if (&ompi_mpi_op_no_op.op != op) { osc_pt2pt_copy_for_send (ptr, payload_len, origin_addr, proc, origin_count, @@ -1158,6 +1166,7 @@ int ompi_osc_pt2pt_rget_accumulate_internal (void *origin_addr, int origin_count } } else { header->base.type = OMPI_OSC_PT2PT_HDR_TYPE_GET_ACC_LONG; + osc_pt2pt_hton(header, proc); ret = ompi_osc_pt2pt_isend_w_cb (origin_addr, origin_count, origin_datatype, target_rank, tag, module->comm, ompi_osc_pt2pt_req_comm_complete, pt2pt_request); diff --git a/ompi/mca/osc/pt2pt/osc_pt2pt_data_move.c b/ompi/mca/osc/pt2pt/osc_pt2pt_data_move.c index 068edf9d79..4f1de661ea 100644 --- a/ompi/mca/osc/pt2pt/osc_pt2pt_data_move.c +++ b/ompi/mca/osc/pt2pt/osc_pt2pt_data_move.c @@ -12,7 +12,7 @@ * reserved. * Copyright (c) 2009-2011 Oracle and/or its affiliates. All rights reserved. * Copyright (c) 2012-2013 Sandia National Laboratories. All rights reserved. - * Copyright (c) 2014 Research Organization for Information Science + * Copyright (c) 2014-2015 Research Organization for Information Science * and Technology (RIST). All rights reserved. 
* $COPYRIGHT$ * @@ -561,7 +561,6 @@ static inline int process_get (ompi_osc_pt2pt_module_t* module, int target, static inline int osc_pt2pt_accumulate_buffer (void *target, void *source, size_t source_len, ompi_proc_t *proc, int count, ompi_datatype_t *datatype, ompi_op_t *op) { - void *buffer = source; int ret; assert (NULL != target && NULL != source); @@ -576,6 +575,7 @@ static inline int osc_pt2pt_accumulate_buffer (void *target, void *source, size_ ompi_datatype_t *primitive_datatype = NULL; uint32_t primitive_count; size_t buflen; + void *buffer; ompi_osc_base_get_primitive_type_info(datatype, &primitive_datatype, &primitive_count); primitive_count *= count; @@ -589,20 +589,19 @@ static inline int osc_pt2pt_accumulate_buffer (void *target, void *source, size_ return OMPI_ERR_OUT_OF_RESOURCE; } - osc_pt2pt_copy_on_recv (buffer, source, source_len, proc, count, datatype); - } + osc_pt2pt_copy_on_recv (buffer, source, source_len, proc, primitive_count, primitive_datatype); + + ret = ompi_osc_base_process_op(target, buffer, source_len, datatype, + count, op); + + free(buffer); + } else #endif /* copy the data from the temporary buffer into the user window */ - ret = ompi_osc_base_process_op(target, buffer, source_len, datatype, + ret = ompi_osc_base_process_op(target, source, source_len, datatype, count, op); -#if OPAL_ENABLE_HETEROGENEOUS_SUPPORT - if (proc->super.proc_arch != ompi_proc_local()->super.proc_arch) { - free(buffer); - } -#endif - return ret; } @@ -682,8 +681,19 @@ static int accumulate_cb (ompi_request_t *request) /* no more requests needed before the buffer can be accumulated */ if (acc_data->source) { - ret = osc_pt2pt_accumulate_buffer (acc_data->target, acc_data->source, acc_data->source_len, - acc_data->proc, acc_data->count, acc_data->datatype, acc_data->op); + ompi_datatype_t *primitive_datatype = NULL; + uint32_t primitive_count; + + assert (NULL != acc_data->target && NULL != acc_data->source); + + 
ompi_osc_base_get_primitive_type_info(acc_data->datatype, &primitive_datatype, &primitive_count); + primitive_count *= acc_data->count; + + if (acc_data->op == &ompi_mpi_op_replace.op) { + ret = ompi_datatype_sndrcv(acc_data->source, primitive_count, primitive_datatype, acc_data->target, acc_data->count, acc_data->datatype); + } else { + ret = ompi_osc_base_process_op(acc_data->target, acc_data->source, acc_data->source_len, acc_data->datatype, acc_data->count, acc_data->op); + } } /* drop the accumulate lock */ @@ -1215,6 +1225,7 @@ static inline int process_get_acc(ompi_osc_pt2pt_module_t *module, int source, struct ompi_datatype_t *datatype; void *buffer = NULL; uint64_t data_len; + ompi_proc_t * proc; int ret; OPAL_OUTPUT_VERBOSE((50, ompi_osc_base_framework.framework_output, @@ -1222,7 +1233,7 @@ static inline int process_get_acc(ompi_osc_pt2pt_module_t *module, int source, ompi_comm_rank(module->comm), source)); - ret = datatype_create (module, source, NULL, &datatype, (void **) &data); + ret = datatype_create (module, source, &proc, &datatype, (void **) &data); if (OPAL_UNLIKELY(OMPI_SUCCESS != ret)) { return ret; } @@ -1232,13 +1243,18 @@ static inline int process_get_acc(ompi_osc_pt2pt_module_t *module, int source, if (0 == ompi_osc_pt2pt_accumulate_trylock (module)) { /* make a copy of the data since the buffer needs to be returned */ if (data_len) { + ompi_datatype_t *primitive_datatype = NULL; + uint32_t primitive_count; buffer = malloc (data_len); if (OPAL_UNLIKELY(NULL == buffer)) { OBJ_RELEASE(datatype); return OMPI_ERR_OUT_OF_RESOURCE; } - memcpy (buffer, data, data_len); + ompi_osc_base_get_primitive_type_info(datatype, &primitive_datatype, &primitive_count); + primitive_count *= acc_header->count; + + osc_pt2pt_copy_on_recv (buffer, data, data_len, proc, primitive_count, primitive_datatype); } ret = ompi_osc_pt2pt_gacc_start (module, source, buffer, data_len, datatype, @@ -1543,6 +1559,7 @@ static inline int process_frag (ompi_osc_pt2pt_module_t 
*module, header->base.flags)); if (OPAL_LIKELY(!(header->base.flags & OMPI_OSC_PT2PT_HDR_FLAG_LARGE_DATATYPE))) { + osc_pt2pt_ntoh(header); switch (header->base.type) { case OMPI_OSC_PT2PT_HDR_TYPE_PUT: ret = process_put(module, frag->source, &header->put); @@ -1617,8 +1634,8 @@ static inline int process_frag (ompi_osc_pt2pt_module_t *module, static int ompi_osc_pt2pt_callback (ompi_request_t *request) { ompi_osc_pt2pt_module_t *module = (ompi_osc_pt2pt_module_t *) request->req_complete_cb_data; - ompi_osc_pt2pt_header_base_t *base_header = - (ompi_osc_pt2pt_header_base_t *) module->incoming_buffer; + ompi_osc_pt2pt_header_t *base_header = + (ompi_osc_pt2pt_header_t *) module->incoming_buffer; size_t incoming_length = request->req_status._ucount; int source = request->req_status.MPI_SOURCE; @@ -1628,14 +1645,15 @@ static int ompi_osc_pt2pt_callback (ompi_request_t *request) OPAL_OUTPUT_VERBOSE((50, ompi_osc_base_framework.framework_output, "received pt2pt callback for fragment. source = %d, count = %u, type = 0x%x", - source, (unsigned) incoming_length, base_header->type)); + source, (unsigned) incoming_length, base_header->base.type)); - switch (base_header->type) { + osc_pt2pt_ntoh(base_header); + switch (base_header->base.type) { case OMPI_OSC_PT2PT_HDR_TYPE_FRAG: process_frag(module, (ompi_osc_pt2pt_frag_header_t *) base_header); /* only data fragments should be included in the completion counters */ - mark_incoming_completion (module, (base_header->flags & OMPI_OSC_PT2PT_HDR_FLAG_PASSIVE_TARGET) ? source : MPI_PROC_NULL); + mark_incoming_completion (module, (base_header->base.flags & OMPI_OSC_PT2PT_HDR_FLAG_PASSIVE_TARGET) ? 
source : MPI_PROC_NULL); break; case OMPI_OSC_PT2PT_HDR_TYPE_POST: (void) osc_pt2pt_incoming_post (module, source); @@ -1652,7 +1670,7 @@ static int ompi_osc_pt2pt_callback (ompi_request_t *request) default: OPAL_OUTPUT_VERBOSE((50, ompi_osc_base_framework.framework_output, "received unexpected message of type %x", - (int) base_header->type)); + (int) base_header->base.type)); } OPAL_OUTPUT_VERBOSE((50, ompi_osc_base_framework.framework_output, diff --git a/ompi/mca/osc/pt2pt/osc_pt2pt_frag.c b/ompi/mca/osc/pt2pt/osc_pt2pt_frag.c index 9bde910135..0e0c588bef 100644 --- a/ompi/mca/osc/pt2pt/osc_pt2pt_frag.c +++ b/ompi/mca/osc/pt2pt/osc_pt2pt_frag.c @@ -59,6 +59,7 @@ static int frag_send (ompi_osc_pt2pt_module_t *module, ompi_osc_pt2pt_frag_t *fr "osc pt2pt: frag_send called to %d, frag = %p, count = %d", frag->target, (void *) frag, count)); + OSC_PT2PT_HTON(frag->header, module, frag->target); return ompi_osc_pt2pt_isend_w_cb (frag->buffer, count, MPI_BYTE, frag->target, OSC_PT2PT_FRAG_TAG, module->comm, frag_send_cb, frag); } diff --git a/ompi/mca/osc/pt2pt/osc_pt2pt_header.h b/ompi/mca/osc/pt2pt/osc_pt2pt_header.h index 51d20ccd25..6b5f00d2ce 100644 --- a/ompi/mca/osc/pt2pt/osc_pt2pt_header.h +++ b/ompi/mca/osc/pt2pt/osc_pt2pt_header.h @@ -13,6 +13,8 @@ * Copyright (c) 2010 Cisco Systems, Inc. All rights reserved. * Copyright (c) 2010 Oracle and/or its affiliates. All rights reserved. * Copyright (c) 2012-2013 Sandia National Laboratories. All rights reserved. + * Copyright (c) 2015 Research Organization for Information Science + * and Technology (RIST). All rights reserved. 
* $COPYRIGHT$ * * Additional copyrights may follow @@ -28,6 +30,7 @@ #endif #include "opal/types.h" +#include "opal/util/arch.h" enum ompi_osc_pt2pt_hdr_type_t { OMPI_OSC_PT2PT_HDR_TYPE_PUT = 0x01, @@ -79,9 +82,9 @@ struct ompi_osc_pt2pt_header_acc_t { uint16_t tag; uint32_t count; - uint32_t op; uint64_t len; uint64_t displacement; + uint32_t op; }; typedef struct ompi_osc_pt2pt_header_acc_t ompi_osc_pt2pt_header_acc_t; @@ -97,6 +100,9 @@ typedef struct ompi_osc_pt2pt_header_get_t ompi_osc_pt2pt_header_get_t; struct ompi_osc_pt2pt_header_complete_t { ompi_osc_pt2pt_header_base_t base; +#if OPAL_ENABLE_HETEROGENEOUS_SUPPORT + uint8_t padding[2]; +#endif int frag_count; }; typedef struct ompi_osc_pt2pt_header_complete_t ompi_osc_pt2pt_header_complete_t; @@ -105,7 +111,6 @@ struct ompi_osc_pt2pt_header_cswap_t { ompi_osc_pt2pt_header_base_t base; uint16_t tag; - uint32_t len; uint64_t displacement; }; @@ -119,6 +124,9 @@ typedef struct ompi_osc_pt2pt_header_post_t ompi_osc_pt2pt_header_post_t; struct ompi_osc_pt2pt_header_lock_t { ompi_osc_pt2pt_header_base_t base; +#if OPAL_ENABLE_HETEROGENEOUS_SUPPORT + uint8_t padding[2]; +#endif int32_t lock_type; uint64_t lock_ptr; }; @@ -134,20 +142,29 @@ typedef struct ompi_osc_pt2pt_header_lock_ack_t ompi_osc_pt2pt_header_lock_ack_t struct ompi_osc_pt2pt_header_unlock_t { ompi_osc_pt2pt_header_base_t base; +#if OPAL_ENABLE_HETEROGENEOUS_SUPPORT + uint8_t padding[2]; +#endif int32_t lock_type; - uint32_t frag_count; uint64_t lock_ptr; + uint32_t frag_count; }; typedef struct ompi_osc_pt2pt_header_unlock_t ompi_osc_pt2pt_header_unlock_t; struct ompi_osc_pt2pt_header_unlock_ack_t { ompi_osc_pt2pt_header_base_t base; +#if OPAL_ENABLE_HETEROGENEOUS_SUPPORT + uint8_t padding[6]; +#endif uint64_t lock_ptr; }; typedef struct ompi_osc_pt2pt_header_unlock_ack_t ompi_osc_pt2pt_header_unlock_ack_t; struct ompi_osc_pt2pt_header_flush_t { ompi_osc_pt2pt_header_base_t base; +#if OPAL_ENABLE_HETEROGENEOUS_SUPPORT + uint8_t padding[2]; 
+#endif uint32_t frag_count; uint64_t serial_number; }; @@ -155,6 +172,9 @@ typedef struct ompi_osc_pt2pt_header_flush_t ompi_osc_pt2pt_header_flush_t; struct ompi_osc_pt2pt_header_flush_ack_t { ompi_osc_pt2pt_header_base_t base; +#if OPAL_ENABLE_HETEROGENEOUS_SUPPORT + uint8_t padding[6]; +#endif uint64_t serial_number; }; typedef struct ompi_osc_pt2pt_header_flush_ack_t ompi_osc_pt2pt_header_flush_ack_t; @@ -186,4 +206,248 @@ union ompi_osc_pt2pt_header_t { }; typedef union ompi_osc_pt2pt_header_t ompi_osc_pt2pt_header_t; +#if OPAL_ENABLE_HETEROGENEOUS_SUPPORT +#define MCA_OSC_PT2PT_FRAG_HDR_NTOH(h) \ + (h).windx = ntohs((h).windx); \ + (h).source = ntohl((h).source); \ + (h).num_ops = ntohl((h).num_ops); \ + (h).pad = ntohl((h).pad); +#define MCA_OSC_PT2PT_FRAG_HDR_HTON(h) \ + (h).windx = htons((h).windx); \ + (h).source = htonl((h).source); \ + (h).num_ops = htonl((h).num_ops); \ + (h).pad = htonl((h).pad); + +#define MCA_OSC_PT2PT_PUT_HDR_NTOH(h) \ + (h).tag = ntohs((h).tag); \ + (h).count = ntohl((h).count); \ + (h).len = ntoh64((h).len); \ + (h).displacement = ntoh64((h).displacement); +#define MCA_OSC_PT2PT_PUT_HDR_HTON(h) \ + (h).tag = htons((h).tag); \ + (h).count = htonl((h).count); \ + (h).len = hton64((h).len); \ + (h).displacement = hton64((h).displacement); + +#define MCA_OSC_PT2PT_GET_HDR_NTOH(h) \ + (h).tag = ntohs((h).tag); \ + (h).count = ntohl((h).count); \ + (h).len = ntoh64((h).len); \ + (h).displacement = ntoh64((h).displacement); +#define MCA_OSC_PT2PT_GET_HDR_HTON(h) \ + (h).tag = htons((h).tag); \ + (h).count = htonl((h).count); \ + (h).len = hton64((h).len); \ + (h).displacement = hton64((h).displacement); + +#define MCA_OSC_PT2PT_ACC_HDR_NTOH(h) \ + (h).tag = ntohs((h).tag); \ + (h).count = ntohl((h).count); \ + (h).len = ntoh64((h).len); \ + (h).displacement = ntoh64((h).displacement);\ + (h).op = ntohl((h).op); +#define MCA_OSC_PT2PT_ACC_HDR_HTON(h) \ + (h).tag = htons((h).tag); \ + (h).count = htonl((h).count); \ + (h).len = 
hton64((h).len); \ + (h).displacement = hton64((h).displacement);\ + (h).op = htonl((h).op); + +#define MCA_OSC_PT2PT_LOCK_HDR_NTOH(h) \ + (h).lock_type = ntohl((h).lock_type); \ + (h).lock_ptr = ntoh64((h).lock_ptr) +#define MCA_OSC_PT2PT_LOCK_HDR_HTON(h) \ + (h).lock_type = htonl((h).lock_type); \ + (h).lock_ptr = hton64((h).lock_ptr) + +#define MCA_OSC_PT2PT_UNLOCK_HDR_NTOH(h) \ + (h).lock_type = ntohl((h).lock_type); \ + (h).lock_ptr = ntoh64((h).lock_ptr); \ + (h).frag_count = ntohl((h).frag_count) +#define MCA_OSC_PT2PT_UNLOCK_HDR_HTON(h) \ + (h).lock_type = htonl((h).lock_type); \ + (h).lock_ptr = hton64((h).lock_ptr); \ + (h).frag_count = htonl((h).frag_count) + +#define MCA_OSC_PT2PT_LOCK_ACK_HDR_NTOH(h) \ + (h).windx = ntohs((h).windx); \ + (h).source = ntohl((h).source); \ + (h).lock_ptr = ntoh64((h).lock_ptr) +#define MCA_OSC_PT2PT_LOCK_ACK_HDR_HTON(h) \ + (h).windx = htons((h).windx); \ + (h).source= htonl((h).source); \ + (h).lock_ptr = hton64((h).lock_ptr) + +#define MCA_OSC_PT2PT_UNLOCK_ACK_HDR_NTOH(h) \ + (h).lock_ptr = ntoh64((h).lock_ptr); +#define MCA_OSC_PT2PT_UNLOCK_ACK_HDR_HTON(h) \ + (h).lock_ptr = hton64((h).lock_ptr); + +#define MCA_OSC_PT2PT_COMPLETE_HDR_NTOH(h) \ + (h).frag_count = ntohl((h).frag_count) +#define MCA_OSC_PT2PT_COMPLETE_HDR_HTON(h) \ + (h).frag_count = htonl((h).frag_count) + +#define MCA_OSC_PT2PT_FLUSH_HDR_NTOH(h) \ + (h).frag_count = ntohl((h).frag_count); \ + (h).serial_number = ntoh64((h).serial_number) +#define MCA_OSC_PT2PT_FLUSH_HDR_HTON(h) \ + (h).frag_count = htonl((h).frag_count); \ + (h).serial_number = hton64((h).serial_number) + +#define MCA_OSC_PT2PT_FLUSH_ACK_HDR_NTOH(h) \ + (h).serial_number = ntoh64((h).serial_number) +#define MCA_OSC_PT2PT_FLUSH_ACK_HDR_HTON(h) \ + (h).serial_number = hton64((h).serial_number) + +#define MCA_OSC_PT2PT_POST_HDR_NTOH(h) \ + (h).windx = ntohs((h).windx) +#define MCA_OSC_PT2PT_POST_HDR_HTON(h) \ + (h).windx = htons((h).windx) + +#define MCA_OSC_PT2PT_CSWAP_HDR_NTOH(h) \ + 
(h).tag = ntohs((h).tag); \ + (h).len = ntohl((h).len); \ + (h).displacement = ntoh64((h).displacement) +#define MCA_OSC_PT2PT_CSWAP_HDR_HTON(h) \ + (h).tag = htons((h).tag); \ + (h).len = htonl((h).len); \ + (h).displacement = hton64((h).displacement) +#endif /* OPAL_ENABLE_HETEROGENEOUS_SUPPORT */ + +#if !defined(WORDS_BIGENDIAN) && OPAL_ENABLE_HETEROGENEOUS_SUPPORT +static inline __opal_attribute_always_inline__ void +osc_pt2pt_ntoh(ompi_osc_pt2pt_header_t *hdr) +{ + if(!(hdr->base.flags & OMPI_OSC_PT2PT_HDR_FLAG_NBO)) + return; + + switch(hdr->base.type) { + case OMPI_OSC_PT2PT_HDR_TYPE_PUT: + case OMPI_OSC_PT2PT_HDR_TYPE_PUT_LONG: + MCA_OSC_PT2PT_PUT_HDR_NTOH(hdr->put); + break; + case OMPI_OSC_PT2PT_HDR_TYPE_ACC: + case OMPI_OSC_PT2PT_HDR_TYPE_ACC_LONG: + case OMPI_OSC_PT2PT_HDR_TYPE_GET_ACC: + case OMPI_OSC_PT2PT_HDR_TYPE_GET_ACC_LONG: + MCA_OSC_PT2PT_ACC_HDR_NTOH(hdr->acc); + break; + case OMPI_OSC_PT2PT_HDR_TYPE_GET: + MCA_OSC_PT2PT_GET_HDR_NTOH(hdr->get); + break; + case OMPI_OSC_PT2PT_HDR_TYPE_CSWAP: + case OMPI_OSC_PT2PT_HDR_TYPE_CSWAP_LONG: + MCA_OSC_PT2PT_CSWAP_HDR_NTOH(hdr->cswap); + break; + case OMPI_OSC_PT2PT_HDR_TYPE_COMPLETE: + MCA_OSC_PT2PT_COMPLETE_HDR_NTOH(hdr->complete); + break; + case OMPI_OSC_PT2PT_HDR_TYPE_POST: + MCA_OSC_PT2PT_POST_HDR_NTOH(hdr->post); + break; + case OMPI_OSC_PT2PT_HDR_TYPE_LOCK_REQ: + MCA_OSC_PT2PT_LOCK_HDR_NTOH(hdr->lock); + break; + case OMPI_OSC_PT2PT_HDR_TYPE_LOCK_ACK: + MCA_OSC_PT2PT_LOCK_ACK_HDR_NTOH(hdr->lock_ack); + break; + case OMPI_OSC_PT2PT_HDR_TYPE_UNLOCK_REQ: + MCA_OSC_PT2PT_UNLOCK_HDR_NTOH(hdr->unlock); + break; + case OMPI_OSC_PT2PT_HDR_TYPE_UNLOCK_ACK: + MCA_OSC_PT2PT_UNLOCK_ACK_HDR_NTOH(hdr->unlock_ack); + break; + case OMPI_OSC_PT2PT_HDR_TYPE_FLUSH_REQ: + MCA_OSC_PT2PT_FLUSH_HDR_NTOH(hdr->flush); + break; + case OMPI_OSC_PT2PT_HDR_TYPE_FLUSH_ACK: + MCA_OSC_PT2PT_FLUSH_ACK_HDR_NTOH(hdr->flush_ack); + break; + case OMPI_OSC_PT2PT_HDR_TYPE_FRAG: + MCA_OSC_PT2PT_FRAG_HDR_NTOH(hdr->frag); + break; + 
default: + assert(0); + break; + } +} +#else +#define osc_pt2pt_ntoh(h) \ + do { } while (0) +#endif /* !defined(WORDS_BIGENDIAN) && OPAL_ENABLE_HETEROGENEOUS_SUPPORT */ + +#if OPAL_ENABLE_HETEROGENEOUS_SUPPORT +#define osc_pt2pt_hton(h, p) \ + osc_pt2pt_hton_intr((ompi_osc_pt2pt_header_t *)(h), (p)) +static inline __opal_attribute_always_inline__ void +osc_pt2pt_hton_intr(ompi_osc_pt2pt_header_t *hdr, const ompi_proc_t *proc) +{ +#ifdef WORDS_BIGENDIAN + hdr->base.flags |= OMPI_OSC_PT2PT_HDR_FLAG_NBO; +#else + if(!(proc->super.proc_arch & OPAL_ARCH_ISBIGENDIAN)) + return; + + hdr->base.flags |= OMPI_OSC_PT2PT_HDR_FLAG_NBO; + switch(hdr->base.type) { + case OMPI_OSC_PT2PT_HDR_TYPE_PUT: + case OMPI_OSC_PT2PT_HDR_TYPE_PUT_LONG: + MCA_OSC_PT2PT_PUT_HDR_HTON(hdr->put); + break; + case OMPI_OSC_PT2PT_HDR_TYPE_ACC: + case OMPI_OSC_PT2PT_HDR_TYPE_ACC_LONG: + case OMPI_OSC_PT2PT_HDR_TYPE_GET_ACC: + case OMPI_OSC_PT2PT_HDR_TYPE_GET_ACC_LONG: + MCA_OSC_PT2PT_ACC_HDR_HTON(hdr->acc); + break; + case OMPI_OSC_PT2PT_HDR_TYPE_GET: + MCA_OSC_PT2PT_GET_HDR_HTON(hdr->get); + break; + case OMPI_OSC_PT2PT_HDR_TYPE_CSWAP: + case OMPI_OSC_PT2PT_HDR_TYPE_CSWAP_LONG: + MCA_OSC_PT2PT_CSWAP_HDR_HTON(hdr->cswap); + break; + case OMPI_OSC_PT2PT_HDR_TYPE_COMPLETE: + MCA_OSC_PT2PT_COMPLETE_HDR_HTON(hdr->complete); + break; + case OMPI_OSC_PT2PT_HDR_TYPE_POST: + MCA_OSC_PT2PT_POST_HDR_HTON(hdr->post); + break; + case OMPI_OSC_PT2PT_HDR_TYPE_LOCK_REQ: + MCA_OSC_PT2PT_LOCK_HDR_HTON(hdr->lock); + break; + case OMPI_OSC_PT2PT_HDR_TYPE_LOCK_ACK: + MCA_OSC_PT2PT_LOCK_ACK_HDR_HTON(hdr->lock_ack); + break; + case OMPI_OSC_PT2PT_HDR_TYPE_UNLOCK_REQ: + MCA_OSC_PT2PT_UNLOCK_HDR_HTON(hdr->unlock); + break; + case OMPI_OSC_PT2PT_HDR_TYPE_UNLOCK_ACK: + MCA_OSC_PT2PT_UNLOCK_ACK_HDR_HTON(hdr->unlock_ack); + break; + case OMPI_OSC_PT2PT_HDR_TYPE_FLUSH_REQ: + MCA_OSC_PT2PT_FLUSH_HDR_HTON(hdr->flush); + break; + case OMPI_OSC_PT2PT_HDR_TYPE_FLUSH_ACK: + MCA_OSC_PT2PT_FLUSH_ACK_HDR_HTON(hdr->flush_ack); + break; + 
case OMPI_OSC_PT2PT_HDR_TYPE_FRAG: + MCA_OSC_PT2PT_FRAG_HDR_HTON(hdr->frag); + break; + default: + assert(0); + break; + } +#endif /* WORDS_BIGENDIAN */ +} +#define OSC_PT2PT_HTON(h, m, r) \ + osc_pt2pt_hton_intr((ompi_osc_pt2pt_header_t *)(h), ompi_comm_peer_lookup((m)->comm, (r))) +#else +#define osc_pt2pt_hton(h, p) \ + do { } while (0) +#define OSC_PT2PT_HTON(h, m, r) \ + do { } while (0) +#endif /* OPAL_ENABLE_HETEROGENEOUS_SUPPORT */ + #endif /* OMPI_MCA_OSC_PT2PT_HDR_H */ diff --git a/ompi/mca/osc/pt2pt/osc_pt2pt_passive_target.c b/ompi/mca/osc/pt2pt/osc_pt2pt_passive_target.c index c4395da63d..9de24ae67c 100644 --- a/ompi/mca/osc/pt2pt/osc_pt2pt_passive_target.c +++ b/ompi/mca/osc/pt2pt/osc_pt2pt_passive_target.c @@ -13,6 +13,8 @@ * Copyright (c) 2010 IBM Corporation. All rights reserved. * Copyright (c) 2012-2013 Sandia National Laboratories. All rights reserved. * Copyright (c) 2015 Intel, Inc. All rights reserved. + * Copyright (c) 2015 Research Organization for Information Science + * and Technology (RIST). All rights reserved. 
* $COPYRIGHT$ * * Additional copyrights may follow @@ -170,13 +172,19 @@ static inline void ompi_osc_pt2pt_unlock_self (ompi_osc_pt2pt_module_t *module, static inline int ompi_osc_pt2pt_lock_remote (ompi_osc_pt2pt_module_t *module, int target, ompi_osc_pt2pt_outstanding_lock_t *lock) { ompi_osc_pt2pt_header_lock_t lock_req; + int ret; /* generate a lock request */ lock_req.base.type = OMPI_OSC_PT2PT_HDR_TYPE_LOCK_REQ; lock_req.base.flags = OMPI_OSC_PT2PT_HDR_FLAG_VALID | OMPI_OSC_PT2PT_HDR_FLAG_PASSIVE_TARGET; +#if OPAL_ENABLE_HETEROGENEOUS_SUPPORT && OPAL_ENABLE_DEBUG + lock_req.padding[0] = 0; + lock_req.padding[1] = 0; +#endif lock_req.lock_type = lock->type; lock_req.lock_ptr = (uint64_t) (uintptr_t) lock; + OSC_PT2PT_HTON(&lock_req, module, target); ret = ompi_osc_pt2pt_control_send (module, target, &lock_req, sizeof (lock_req)); if (OPAL_UNLIKELY(OMPI_SUCCESS != ret)) { @@ -197,9 +205,14 @@ static inline int ompi_osc_pt2pt_unlock_remote (ompi_osc_pt2pt_module_t *module, unlock_req.base.type = OMPI_OSC_PT2PT_HDR_TYPE_UNLOCK_REQ; unlock_req.base.flags = OMPI_OSC_PT2PT_HDR_FLAG_VALID | OMPI_OSC_PT2PT_HDR_FLAG_PASSIVE_TARGET; +#if OPAL_ENABLE_HETEROGENEOUS_SUPPORT && OPAL_ENABLE_DEBUG + unlock_req.padding[0] = 0; + unlock_req.padding[1] = 0; +#endif unlock_req.frag_count = frag_count; unlock_req.lock_type = lock->type; unlock_req.lock_ptr = (uint64_t) (uintptr_t) lock; + OSC_PT2PT_HTON(&unlock_req, module, target); if (peer->active_frag && peer->active_frag->remain_len < sizeof (unlock_req)) { /* the peer should expect one more packet */ @@ -239,6 +252,7 @@ static inline int ompi_osc_pt2pt_flush_remote (ompi_osc_pt2pt_module_t *module, target, flush_req.frag_count)); /* send control message with unlock request and count */ + OSC_PT2PT_HTON(&flush_req, module, target); ret = ompi_osc_pt2pt_control_send (module, target, &flush_req, sizeof (flush_req)); if (OPAL_UNLIKELY(OMPI_SUCCESS != ret)) { return ret; @@ -682,6 +696,7 @@ static inline int activate_lock 
(ompi_osc_pt2pt_module_t *module, int requestor, lock_ack.source = ompi_comm_rank(module->comm); lock_ack.windx = ompi_comm_get_cid(module->comm); lock_ack.lock_ptr = lock_ptr; + OSC_PT2PT_HTON(&lock_ack, module, requestor); OPAL_OUTPUT_VERBOSE((25, ompi_osc_base_framework.framework_output, "osc pt2pt: sending lock to %d", requestor)); @@ -904,7 +919,16 @@ int ompi_osc_pt2pt_process_unlock (ompi_osc_pt2pt_module_t *module, int source, unlock_ack.base.type = OMPI_OSC_PT2PT_HDR_TYPE_UNLOCK_ACK; unlock_ack.base.flags = OMPI_OSC_PT2PT_HDR_FLAG_VALID; +#if OPAL_ENABLE_HETEROGENEOUS_SUPPORT && OPAL_ENABLE_DEBUG + unlock_ack.padding[0] = 0; + unlock_ack.padding[1] = 0; + unlock_ack.padding[2] = 0; + unlock_ack.padding[3] = 0; + unlock_ack.padding[4] = 0; + unlock_ack.padding[5] = 0; +#endif unlock_ack.lock_ptr = unlock_header->lock_ptr; + OSC_PT2PT_HTON(&unlock_ack, module, source); ret = ompi_osc_pt2pt_control_send_unbuffered (module, source, &unlock_ack, sizeof (unlock_ack)); if (OPAL_UNLIKELY(OMPI_SUCCESS != ret)) { @@ -942,6 +966,7 @@ int ompi_osc_pt2pt_process_flush (ompi_osc_pt2pt_module_t *module, int source, flush_ack.base.type = OMPI_OSC_PT2PT_HDR_TYPE_FLUSH_ACK; flush_ack.base.flags = OMPI_OSC_PT2PT_HDR_FLAG_VALID; flush_ack.serial_number = flush_header->serial_number; + OSC_PT2PT_HTON(&flush_ack, module, source); return ompi_osc_pt2pt_control_send_unbuffered (module, source, &flush_ack, sizeof (flush_ack)); }