diff --git a/ompi/mca/osc/rdma/osc_rdma.h b/ompi/mca/osc/rdma/osc_rdma.h index 33abef8044..8f70409e1c 100644 --- a/ompi/mca/osc/rdma/osc_rdma.h +++ b/ompi/mca/osc/rdma/osc_rdma.h @@ -8,7 +8,7 @@ * University of Stuttgart. All rights reserved. * Copyright (c) 2004-2005 The Regents of the University of California. * All rights reserved. - * Copyright (c) 2007-2015 Los Alamos National Security, LLC. All rights + * Copyright (c) 2007-2016 Los Alamos National Security, LLC. All rights * reserved. * Copyright (c) 2010 Cisco Systems, Inc. All rights reserved. * Copyright (c) 2012-2013 Sandia National Laboratories. All rights reserved. @@ -86,6 +86,12 @@ struct ompi_osc_rdma_component_t { /** Default value of the no_locks info key for new windows */ bool no_locks; + /** Accumulate operations will only operate on a single intrinsic datatype */ + bool acc_single_intrinsic; + + /** Use network AMOs when available */ + bool acc_use_amo; + /** Priority of the osc/rdma component */ unsigned int priority; @@ -121,12 +127,13 @@ struct ompi_osc_rdma_module_t { /** value of same_size info key for this window */ bool same_size; - /** window should have accumulate ordering... */ - bool accumulate_ordering; - /** passive-target synchronization will not be used in this window */ bool no_locks; + bool acc_single_intrinsic; + + bool acc_use_amo; + /** flavor of this window */ int flavor; diff --git a/ompi/mca/osc/rdma/osc_rdma_accumulate.c b/ompi/mca/osc/rdma/osc_rdma_accumulate.c index a555cace89..ef62b24e46 100644 --- a/ompi/mca/osc/rdma/osc_rdma_accumulate.c +++ b/ompi/mca/osc/rdma/osc_rdma_accumulate.c @@ -1,6 +1,6 @@ /* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */ /* - * Copyright (c) 2014-2015 Los Alamos National Security, LLC. All rights + * Copyright (c) 2014-2016 Los Alamos National Security, LLC. All rights * reserved. 
* $COPYRIGHT$ * @@ -25,7 +25,7 @@ static int ompi_osc_rdma_gacc_local (const void *source_buffer, int source_count int ret = OMPI_SUCCESS; do { - OSC_RDMA_VERBOSE(MCA_BASE_VERBOSE_TRACE, "performing accumulate with local regions"); + OSC_RDMA_VERBOSE(MCA_BASE_VERBOSE_TRACE, "performing accumulate with local region(s)"); if (!ompi_osc_rdma_peer_is_exclusive (peer)) { (void) ompi_osc_rdma_lock_acquire_exclusive (module, peer, offsetof (ompi_osc_rdma_state_t, accumulate_lock)); @@ -70,7 +70,7 @@ static int ompi_osc_rdma_gacc_local (const void *source_buffer, int source_count return ret; } -static inline int ompi_osc_rdma_cas_local (const void *source_buffer, const void *compare_buffer, void *result_buffer, +static inline int ompi_osc_rdma_cas_local (const void *source_addr, const void *compare_addr, void *result_addr, ompi_datatype_t *datatype, ompi_osc_rdma_peer_t *peer, uint64_t target_address, mca_btl_base_registration_handle_t *target_handle, ompi_osc_rdma_module_t *module) @@ -79,10 +79,10 @@ static inline int ompi_osc_rdma_cas_local (const void *source_buffer, const void ompi_osc_rdma_lock_acquire_exclusive (module, peer, offsetof (ompi_osc_rdma_state_t, accumulate_lock)); - memcpy (result_buffer, (void *) (uintptr_t) target_address, datatype->super.size); + memcpy (result_addr, (void *) (uintptr_t) target_address, datatype->super.size); - if (0 == memcmp (compare_buffer, result_buffer, datatype->super.size)) { - memcpy ((void *) (uintptr_t) target_address, source_buffer, datatype->super.size); + if (0 == memcmp (compare_addr, result_addr, datatype->super.size)) { + memcpy ((void *) (uintptr_t) target_address, source_addr, datatype->super.size); } ompi_osc_rdma_lock_release_exclusive (module, peer, offsetof (ompi_osc_rdma_state_t, accumulate_lock)); @@ -258,15 +258,19 @@ static inline int ompi_osc_rdma_gacc_contig (ompi_osc_rdma_sync_t *sync, const v return OMPI_SUCCESS; } - OSC_RDMA_VERBOSE(MCA_BASE_VERBOSE_INFO, "accumulate btl operation faile with opal error code %d", ret); + OSC_RDMA_VERBOSE(MCA_BASE_VERBOSE_INFO, "accumulate btl operation failed with opal error code %d", ret); + + if (!ompi_osc_rdma_peer_is_exclusive (peer)) { + (void) ompi_osc_rdma_lock_release_exclusive (module, peer, offsetof (ompi_osc_rdma_state_t, accumulate_lock)); + } ompi_osc_rdma_cleanup_rdma (sync, frag, NULL, NULL); return ret; } -static inline int ompi_osc_rdma_gacc_master (ompi_osc_rdma_sync_t *sync, const void *source_buffer, int source_count, - ompi_datatype_t *source_datatype, void *result_buffer, int result_count, +static inline int ompi_osc_rdma_gacc_master (ompi_osc_rdma_sync_t *sync, const void *source_addr, int source_count, + ompi_datatype_t *source_datatype, void *result_addr, int result_count, ompi_datatype_t *result_datatype, ompi_osc_rdma_peer_t *peer, uint64_t target_address, mca_btl_base_registration_handle_t *target_handle, int target_count, ompi_datatype_t *target_datatype, ompi_op_t *op, ompi_osc_rdma_request_t *request) @@ -304,15 +308,15 @@ static inline int ompi_osc_rdma_gacc_master (ompi_osc_rdma_sync_t *sync, const v if (source_datatype) { (void) ompi_datatype_get_extent (source_datatype, &lb, &extent); - source_buffer = (void *)((intptr_t) source_buffer + lb); + source_addr = (void *)((intptr_t) source_addr + lb); } if (result_datatype) { (void) ompi_datatype_get_extent (result_datatype, &lb, &extent); - result_buffer = (void *)((intptr_t) result_buffer + lb); + result_addr = (void *)((intptr_t) result_addr + lb); } - ret = ompi_osc_rdma_gacc_contig (sync, source_buffer, 
source_count, source_datatype, result_buffer, + ret = ompi_osc_rdma_gacc_contig (sync, source_addr, source_count, source_datatype, result_addr, result_count, result_datatype, peer, target_address, target_handle, target_count, target_datatype, op, request); @@ -323,12 +327,12 @@ static inline int ompi_osc_rdma_gacc_master (ompi_osc_rdma_sync_t *sync, const v if (source_datatype) { /* the convertors will handle the lb */ (void) ompi_datatype_get_extent (source_datatype, &lb, &extent); - source_buffer = (void *)((intptr_t) source_buffer - lb); + source_addr = (void *)((intptr_t) source_addr - lb); } if (result_datatype) { (void) ompi_datatype_get_extent (result_datatype, &lb, &extent); - result_buffer = (void *)((intptr_t) result_buffer - lb); + result_addr = (void *)((intptr_t) result_addr - lb); } } @@ -362,7 +366,7 @@ static inline int ompi_osc_rdma_gacc_master (ompi_osc_rdma_sync_t *sync, const v /* the source may be NULL if using MPI_OP_NO_OP with MPI_Get_accumulate */ if (source_datatype) { OBJ_CONSTRUCT(&source_convertor, opal_convertor_t); - ret = opal_convertor_copy_and_prepare_for_send (ompi_mpi_local_convertor, &source_datatype->super, source_count, source_buffer, + ret = opal_convertor_copy_and_prepare_for_send (ompi_mpi_local_convertor, &source_datatype->super, source_count, source_addr, 0, &source_convertor); if (OPAL_UNLIKELY(OMPI_SUCCESS != ret)) { return ret; @@ -427,7 +431,7 @@ static inline int ompi_osc_rdma_gacc_master (ompi_osc_rdma_sync_t *sync, const v if (result_datatype) { /* prepare a convertor for this part of the result */ opal_convertor_copy_and_prepare_for_recv (ompi_mpi_local_convertor, &result_datatype->super, result_count, - result_buffer, 0, &subreq->convertor); + result_addr, 0, &subreq->convertor); opal_convertor_set_position (&subreq->convertor, &result_position); subreq->type = OMPI_OSC_RDMA_TYPE_GET_ACC; } else { @@ -478,41 +482,206 @@ static inline int ompi_osc_rdma_gacc_master (ompi_osc_rdma_sync_t *sync, const v return OMPI_SUCCESS; } -#if 0 static void ompi_osc_rdma_cas_atomic_complete (struct mca_btl_base_module_t *btl, struct mca_btl_base_endpoint_t *endpoint, void *local_address, mca_btl_base_registration_handle_t *local_handle, void *context, void *data, int status) { ompi_osc_rdma_sync_t *sync = (ompi_osc_rdma_sync_t *) context; ompi_osc_rdma_frag_t *frag = (ompi_osc_rdma_frag_t *) data; - void *result_buffer = (void *)(intptr_t) ((int64_t *) local_address)[1]; + void *result_addr = (void *)(intptr_t) ((int64_t *) local_address)[1]; + size_t size = ((int64_t *) local_address)[2]; OSC_RDMA_VERBOSE(MCA_BASE_VERBOSE_TRACE, "atomic compare-and-swap complete. 
result: 0x%" PRIx64, *((int64_t *) local_address)); /* copy the result */ - memcpy (result_buffer, local_address, 8); + memcpy (result_addr, local_address, size); ompi_osc_rdma_sync_rdma_dec (sync); ompi_osc_rdma_frag_complete (frag); } -static inline int ompi_osc_rdma_cas_atomic (ompi_osc_rdma_sync_t *sync, const void *source_buffer, const void *compare_buffer, - void *result_buffer, ompi_datatype_t *datatype, ompi_osc_rdma_peer_t *peer, +static inline int ompi_osc_rdma_cas_atomic (ompi_osc_rdma_sync_t *sync, const void *source_addr, const void *compare_addr, + void *result_addr, ompi_datatype_t *datatype, ompi_osc_rdma_peer_t *peer, uint64_t target_address, mca_btl_base_registration_handle_t *target_handle) { ompi_osc_rdma_module_t *module = sync->module; + const size_t size = datatype->super.size; ompi_osc_rdma_frag_t *frag = NULL; + int64_t compare, source; + int ret, flags; char *ptr; - int ret; - /* XXX -- TODO -- Update the BTL interface to allow for other CAS sizes */ - if (datatype->super.size != 8) { + if (8 != size && !(4 == size && (MCA_BTL_ATOMIC_SUPPORTS_32BIT & module->selected_btl->btl_flags))) { return OMPI_ERR_NOT_SUPPORTED; } - OSC_RDMA_VERBOSE(MCA_BASE_VERBOSE_TRACE, "initiating compare-and-swap using 64-bit btl atomics. compare: 0x%" - PRIx64 ", origin: 0x%" PRIx64, *((int64_t *) compare_buffer), *((int64_t *) source_buffer)); + compare = (8 == size) ? ((int64_t *) compare_addr)[0] : ((int32_t *) compare_addr)[0]; + source = (8 == size) ? ((int64_t *) source_addr)[0] : ((int32_t *) source_addr)[0]; + flags = (4 == size) ? MCA_BTL_ATOMIC_FLAG_32BIT : 0; + + OSC_RDMA_VERBOSE(MCA_BASE_VERBOSE_TRACE, "initiating compare-and-swap using %d-bit btl atomics. compare: 0x%" + PRIx64 ", origin: 0x%" PRIx64, size * 8, *((int64_t *) compare_addr), *((int64_t *) source_addr)); + + ret = ompi_osc_rdma_frag_alloc (module, 24, &frag, &ptr); + if (OPAL_UNLIKELY(OMPI_SUCCESS != ret)) { + return ret; + } + + /* store the destination and size in the temporary buffer */ + ((int64_t *) ptr)[1] = (intptr_t) result_addr; + ((int64_t *) ptr)[2] = size; + + ompi_osc_rdma_sync_rdma_inc (sync); + + do { + ret = module->selected_btl->btl_atomic_cswap (module->selected_btl, peer->data_endpoint, ptr, target_address, + frag->handle, target_handle, compare, source, flags, MCA_BTL_NO_ORDER, + ompi_osc_rdma_cas_atomic_complete, sync, frag); + + ompi_osc_rdma_progress (module); + } while (OPAL_UNLIKELY(OMPI_ERR_OUT_OF_RESOURCE == ret || OPAL_ERR_TEMP_OUT_OF_RESOURCE == ret)); + + if (OPAL_SUCCESS != ret) { + ompi_osc_rdma_sync_rdma_dec (sync); + + if (1 == ret) { + memcpy (result_addr, ptr, size); + ret = OMPI_SUCCESS; + } + + ompi_osc_rdma_frag_complete (frag); + } + + return ret; +} + +static inline void ompi_osc_rdma_fetch_and_op_atomic_complete (struct mca_btl_base_module_t *btl, struct mca_btl_base_endpoint_t *endpoint, + void *local_address, mca_btl_base_registration_handle_t *local_handle, + void *context, void *data, int status) +{ + ompi_osc_rdma_sync_t *sync = (ompi_osc_rdma_sync_t *) context; + ompi_osc_rdma_frag_t *frag = (ompi_osc_rdma_frag_t *) data; + void *result_addr = (void *)(intptr_t) ((int64_t *) local_address)[1]; + ompi_osc_rdma_request_t *req = (ompi_osc_rdma_request_t *) (intptr_t) ((int64_t *) local_address)[2]; + size_t size = ((int64_t *) local_address)[3]; + + OSC_RDMA_VERBOSE(MCA_BASE_VERBOSE_TRACE, "atomic fetch-and-op complete. 
result: 0x%" PRIx64, + *((int64_t *) local_address)); + + /* copy the result */ + if (result_addr) { + memcpy (result_addr, local_address, size); + } + + ompi_osc_rdma_sync_rdma_dec (sync); + ompi_osc_rdma_frag_complete (frag); + if (req) { + ompi_osc_rdma_request_complete (req, status); + } +} + +static int ompi_osc_rdma_op_mapping[OMPI_OP_NUM_OF_TYPES] = { + [OMPI_OP_MAX] = MCA_BTL_ATOMIC_MAX, + [OMPI_OP_MIN] = MCA_BTL_ATOMIC_MIN, + [OMPI_OP_SUM] = MCA_BTL_ATOMIC_ADD, + [OMPI_OP_BAND] = MCA_BTL_ATOMIC_AND, + [OMPI_OP_BOR] = MCA_BTL_ATOMIC_OR, + [OMPI_OP_BXOR] = MCA_BTL_ATOMIC_XOR, + [OMPI_OP_LAND] = MCA_BTL_ATOMIC_LAND, + [OMPI_OP_LOR] = MCA_BTL_ATOMIC_LOR, + [OMPI_OP_LXOR] = MCA_BTL_ATOMIC_LXOR, + [OMPI_OP_REPLACE] = MCA_BTL_ATOMIC_SWAP, +}; + +static int ompi_osc_rdma_fetch_and_op_atomic (ompi_osc_rdma_sync_t *sync, const void *origin_addr, void *result_addr, ompi_datatype_t *dt, + ptrdiff_t extent, ompi_osc_rdma_peer_t *peer, uint64_t target_address, + mca_btl_base_registration_handle_t *target_handle, ompi_op_t *op, ompi_osc_rdma_request_t *req) +{ + ompi_osc_rdma_module_t *module = sync->module; + int32_t atomic_flags = module->selected_btl->btl_atomic_flags; + ompi_osc_rdma_frag_t *frag = NULL; + int ret, btl_op, flags; + char *ptr = NULL; + int64_t origin; + + if ((8 != extent && !((MCA_BTL_ATOMIC_SUPPORTS_32BIT & atomic_flags) && 4 == extent)) || + (!(OMPI_DATATYPE_FLAG_DATA_INT & dt->super.flags) && !(MCA_BTL_ATOMIC_SUPPORTS_FLOAT & atomic_flags)) || + !ompi_op_is_intrinsic (op) || (0 == ompi_osc_rdma_op_mapping[op->op_type])) { + return OMPI_ERR_NOT_SUPPORTED; + } + + flags = (4 == extent) ? MCA_BTL_ATOMIC_FLAG_32BIT : 0; + if (OMPI_DATATYPE_FLAG_DATA_FLOAT & dt->super.flags) { + flags |= MCA_BTL_ATOMIC_FLAG_FLOAT; + } + + btl_op = ompi_osc_rdma_op_mapping[op->op_type]; + + OSC_RDMA_VERBOSE(MCA_BASE_VERBOSE_TRACE, "initiating fetch-and-op using %d-bit btl atomics. origin: 0x%" PRIx64, + (4 == extent) ? 32 : 64, *((int64_t *) origin_addr)); + + ret = ompi_osc_rdma_frag_alloc (module, 32, &frag, &ptr); + if (OPAL_UNLIKELY(OMPI_SUCCESS != ret)) { + return ret; + } + + origin = (8 == extent) ? 
((int64_t *) origin_addr)[0] : ((int32_t *) origin_addr)[0]; + + /* store the destination, request, and extent in the temporary buffer for the callback */ + ((int64_t *) ptr)[1] = (intptr_t) result_addr; + ((int64_t *) ptr)[2] = (intptr_t) req; + ((int64_t *) ptr)[3] = extent; + + ompi_osc_rdma_sync_rdma_inc (sync); + + do { + ret = module->selected_btl->btl_atomic_fop (module->selected_btl, peer->data_endpoint, ptr, target_address, + frag->handle, target_handle, btl_op, origin, flags, + MCA_BTL_NO_ORDER, ompi_osc_rdma_fetch_and_op_atomic_complete, + sync, frag); + + ompi_osc_rdma_progress (module); + } while (OPAL_UNLIKELY(OMPI_ERR_OUT_OF_RESOURCE == ret || OPAL_ERR_TEMP_OUT_OF_RESOURCE == ret)); + + if (OPAL_SUCCESS != ret) { + ompi_osc_rdma_sync_rdma_dec (sync); + + if (OPAL_LIKELY(1 == ret)) { + memcpy (result_addr, ptr, extent); + if (req) { + ompi_osc_rdma_request_complete (req, OMPI_SUCCESS); + } + ret = OPAL_SUCCESS; + } + + ompi_osc_rdma_frag_complete (frag); + } + + return ret; +} + +static int ompi_osc_rdma_fetch_and_op_cas (ompi_osc_rdma_sync_t *sync, const void *origin_addr, void *result_addr, ompi_datatype_t *dt, + ptrdiff_t extent, ompi_osc_rdma_peer_t *peer, uint64_t target_address, + mca_btl_base_registration_handle_t *target_handle, ompi_op_t *op, ompi_osc_rdma_request_t *req) +{ + ompi_osc_rdma_module_t *module = sync->module; + int32_t atomic_flags = module->selected_btl->btl_atomic_flags; + ompi_osc_rdma_frag_t *frag = NULL; + uint64_t address, offset; + char *ptr = NULL; + int ret, btl_op; + + if (extent > 8) { + return OMPI_ERR_NOT_SUPPORTED; + } + + /* align the address. the user should not call with an unaligned address so don't need to range check here */ + address = target_address & ~7; + offset = target_address & ~address; + + OSC_RDMA_VERBOSE(MCA_BASE_VERBOSE_TRACE, "initiating fetch-and-op using compare-and-swap. 
origin: 0x%" PRIx64, + *((int64_t *) origin_addr)); ret = ompi_osc_rdma_frag_alloc (module, 16, &frag, &ptr); if (OPAL_UNLIKELY(OMPI_SUCCESS != ret)) { @@ -520,27 +689,124 @@ static inline int ompi_osc_rdma_cas_atomic (ompi_osc_rdma_sync_t *sync, const vo } /* store the destination in the temporary buffer */ - ((int64_t *) ptr)[1] = (intptr_t) result_buffer; + do { + bool complete = false; - ret = module->selected_btl->btl_atomic_cswap (module->selected_btl, peer->data_endpoint, ptr, target_address, - frag->handle, target_handle, ((int64_t *)compare_buffer)[0], - *((int64_t *) source_buffer), 0, MCA_BTL_NO_ORDER, - ompi_osc_rdma_cas_atomic_complete, module, frag); - if (OPAL_UNLIKELY(0 > ret)) { - return ret; + ret = ompi_osc_get_data_blocking (module, peer->data_endpoint, address, target_handle, ptr, 8); + if (OMPI_SUCCESS != ret) { + ompi_osc_rdma_frag_complete (frag); + return ret; + } + + ((int64_t *) ptr)[1] = ((int64_t *) ptr)[0]; + + if (&ompi_mpi_op_no_op.op == op) { + memcpy (ptr + offset, origin_addr, extent); + } else { + ompi_op_reduce (op, (void *) origin_addr, ptr + offset, 1, dt); + } + + do { + ret = module->selected_btl->btl_atomic_cswap (module->selected_btl, peer->data_endpoint, ptr, address, + frag->handle, target_handle, ((int64_t *) ptr)[1], + ((int64_t *) ptr)[0], 0, MCA_BTL_NO_ORDER, + ompi_osc_rdma_atomic_complete, (void *) &complete, NULL); + + ompi_osc_rdma_progress (module); + } while (OPAL_UNLIKELY(OPAL_ERR_OUT_OF_RESOURCE == ret || OPAL_ERR_TEMP_OUT_OF_RESOURCE == ret)); + + if (OPAL_UNLIKELY(OPAL_SUCCESS != ret)) { + break; + } + + while (!complete) { + ompi_osc_rdma_progress (module); + } + + if (((int64_t *) ptr)[1] == ((int64_t *) ptr)[0]) { + break; + } + } while (1); + + if (result_addr) { + memcpy (result_addr, ptr + 8 + offset, extent); } - if (1 != ret) { - ompi_osc_rdma_sync_rdma_inc (sync); - } else { - memcpy (result_buffer, ptr, 8); + ompi_osc_rdma_frag_complete (frag); - ompi_osc_rdma_frag_complete (frag); - } - - return OMPI_SUCCESS; + return ret; +} + +static void ompi_osc_rdma_acc_single_atomic_complete (struct mca_btl_base_module_t *btl, struct mca_btl_base_endpoint_t *endpoint, + void *local_address, mca_btl_base_registration_handle_t *local_handle, + void *context, void *data, int status) +{ + ompi_osc_rdma_sync_t *sync = (ompi_osc_rdma_sync_t *) context; + ompi_osc_rdma_request_t *req = (ompi_osc_rdma_request_t *) data; + + OSC_RDMA_VERBOSE(MCA_BASE_VERBOSE_TRACE, "atomic accumulate complete"); + + ompi_osc_rdma_sync_rdma_dec (sync); + if (req) { + ompi_osc_rdma_request_complete (req, status); + } +} + +static int ompi_osc_rdma_acc_single_atomic (ompi_osc_rdma_sync_t *sync, const void *origin_addr, ompi_datatype_t *dt, ptrdiff_t extent, + ompi_osc_rdma_peer_t *peer, uint64_t target_address, mca_btl_base_registration_handle_t *target_handle, + ompi_op_t *op, ompi_osc_rdma_request_t *req) +{ + ompi_osc_rdma_module_t *module = sync->module; + int32_t atomic_flags = module->selected_btl->btl_atomic_flags; + int ret, btl_op, flags; + int64_t origin; + + if (!(module->selected_btl->btl_flags & MCA_BTL_FLAGS_ATOMIC_OPS)) { + /* btl put atomics not supported or disabled. 
fall back on fetch-and-op */ + return ompi_osc_rdma_fetch_and_op_atomic (sync, origin_addr, NULL, dt, extent, peer, target_address, target_handle, op, req); + } + + if ((8 != extent && !((MCA_BTL_ATOMIC_SUPPORTS_32BIT & atomic_flags) && 4 == extent)) || + (!(OMPI_DATATYPE_FLAG_DATA_INT & dt->super.flags) && !(MCA_BTL_ATOMIC_SUPPORTS_FLOAT & atomic_flags)) || + !ompi_op_is_intrinsic (op) || (0 == ompi_osc_rdma_op_mapping[op->op_type])) { + return OMPI_ERR_NOT_SUPPORTED; + } + + origin = (8 == extent) ? ((uint64_t *) origin_addr)[0] : ((uint32_t *) origin_addr)[0]; + + /* set the appropriate flags for this atomic */ + flags = (4 == extent) ? MCA_BTL_ATOMIC_FLAG_32BIT : 0; + if (OMPI_DATATYPE_FLAG_DATA_FLOAT & dt->super.flags) { + flags |= MCA_BTL_ATOMIC_FLAG_FLOAT; + } + + btl_op = ompi_osc_rdma_op_mapping[op->op_type]; + + OSC_RDMA_VERBOSE(MCA_BASE_VERBOSE_TRACE, "initiating accumulate using 64-bit btl atomics. origin: 0x%" PRIx64, + *((int64_t *) origin_addr)); + + ompi_osc_rdma_sync_rdma_inc (sync); + + do { + ret = module->selected_btl->btl_atomic_op (module->selected_btl, peer->data_endpoint, target_address, + target_handle, btl_op, origin, flags, MCA_BTL_NO_ORDER, + ompi_osc_rdma_acc_single_atomic_complete, sync, req); + + ompi_osc_rdma_progress (module); + } while (OPAL_UNLIKELY(OMPI_ERR_OUT_OF_RESOURCE == ret || OPAL_ERR_TEMP_OUT_OF_RESOURCE == ret)); + + if (OPAL_SUCCESS != ret) { + ompi_osc_rdma_sync_rdma_dec (sync); + if (1 == ret) { + if (req) { + ompi_osc_rdma_request_complete (req, OMPI_SUCCESS); + } + ret = OMPI_SUCCESS; + } + } + + return ret; } -#endif /** * ompi_osc_rdma_cas_get_complete: @@ -561,45 +827,49 @@ static void ompi_osc_rdma_cas_get_complete (struct mca_btl_base_module_t *btl, s OSC_RDMA_VERBOSE(status ? MCA_BASE_VERBOSE_ERROR : MCA_BASE_VERBOSE_TRACE, "remote compare-and-swap get complete on sync %p. " "status %d", (void *) sync, status); - if (OMPI_SUCCESS == status) { - /* copy data to the user buffer (for gacc) */ - memcpy (request->result_addr, (void *) source, request->len); - - if (0 == memcmp ((void *) source, request->compare_addr, request->len)) { - /* the target and compare buffers match so write the source to the target */ - memcpy ((void *) source, request->origin_addr, request->len); - - ret = module->selected_btl->btl_put (module->selected_btl, peer->data_endpoint, local_address, - request->target_address, local_handle, - (mca_btl_base_registration_handle_t *) request->ctx, - request->len, 0, MCA_BTL_NO_ORDER, - ompi_osc_rdma_acc_put_complete, request, NULL); - if (OPAL_UNLIKELY(OPAL_SUCCESS != ret)) { - OSC_RDMA_VERBOSE(MCA_BASE_VERBOSE_ERROR, "could not start put to complete accumulate operation. opal return code " - "%d", ret); - } - - /* TODO -- we can do better. probably should queue up the next step and handle it in progress */ - assert (OPAL_SUCCESS == ret); - } else { - /* this is a no-op. 
nothing more to do except release the accumulate lock */ - ompi_osc_rdma_frag_complete (frag); - - if (!ompi_osc_rdma_peer_is_exclusive (peer)) { - (void) ompi_osc_rdma_lock_release_exclusive (module, request->peer, - offsetof (ompi_osc_rdma_state_t, accumulate_lock)); - } - - /* the request is now complete and the outstanding rdma operation is complete */ - ompi_osc_rdma_request_complete (request, status); - - ompi_osc_rdma_sync_rdma_dec (sync); - peer->flags &= ~OMPI_OSC_RDMA_PEER_ACCUMULATING; - } + if (OPAL_UNLIKELY(OMPI_SUCCESS != status)) { + return; } + + /* copy data to the user buffer (for gacc) */ + memcpy (request->result_addr, (void *) source, request->len); + + if (0 == memcmp ((void *) source, request->compare_addr, request->len)) { + /* the target and compare buffers match. write the source to the target */ + memcpy ((void *) source, request->origin_addr, request->len); + + ret = module->selected_btl->btl_put (module->selected_btl, peer->data_endpoint, local_address, + request->target_address, local_handle, + (mca_btl_base_registration_handle_t *) request->ctx, + request->len, 0, MCA_BTL_NO_ORDER, + ompi_osc_rdma_acc_put_complete, request, NULL); + if (OPAL_UNLIKELY(OPAL_SUCCESS != ret)) { + OSC_RDMA_VERBOSE(MCA_BASE_VERBOSE_ERROR, "could not start put to complete accumulate operation. opal return code " + "%d", ret); + } + + /* TODO -- we can do better. probably should queue up the next step and handle it in progress */ + assert (OPAL_SUCCESS == ret); + + return; + } + + /* this is a no-op. nothing more to do except release the accumulate lock */ + ompi_osc_rdma_frag_complete (frag); + + if (!ompi_osc_rdma_peer_is_exclusive (peer)) { + (void) ompi_osc_rdma_lock_release_exclusive (module, request->peer, + offsetof (ompi_osc_rdma_state_t, accumulate_lock)); + } + + /* the request is now complete and the outstanding rdma operation is complete */ + ompi_osc_rdma_request_complete (request, status); + + ompi_osc_rdma_sync_rdma_dec (sync); + peer->flags &= ~OMPI_OSC_RDMA_PEER_ACCUMULATING; } -static inline int cas_rdma (ompi_osc_rdma_sync_t *sync, const void *source_buffer, const void *compare_buffer, void *result_buffer, +static inline int cas_rdma (ompi_osc_rdma_sync_t *sync, const void *source_addr, const void *compare_addr, void *result_addr, ompi_datatype_t *datatype, ompi_osc_rdma_peer_t *peer, uint64_t target_address, mca_btl_base_registration_handle_t *target_handle) { @@ -649,10 +919,10 @@ static inline int cas_rdma (ompi_osc_rdma_sync_t *sync, const void *source_buffe /* set up the request */ request->frag = frag; - request->origin_addr = (void *) source_buffer; + request->origin_addr = (void *) source_addr; request->ctx = (void *) target_handle; - request->result_addr = result_buffer; - request->compare_addr = compare_buffer; + request->result_addr = result_addr; + request->compare_addr = compare_addr; request->result_dt = datatype; request->offset = (ptrdiff_t) offset; request->target_address = target_address; @@ -670,6 +940,9 @@ static inline int cas_rdma (ompi_osc_rdma_sync_t *sync, const void *source_buffe } if (OPAL_UNLIKELY(OPAL_ERR_OUT_OF_RESOURCE != ret && OPAL_ERR_TEMP_OUT_OF_RESOURCE != ret)) { + if (!ompi_osc_rdma_peer_is_exclusive (peer)) { + (void) ompi_osc_rdma_lock_release_exclusive (module, peer, offsetof (ompi_osc_rdma_state_t, accumulate_lock)); + } ompi_osc_rdma_frag_complete (frag); return ret; } @@ -684,8 +957,8 @@ static inline int cas_rdma (ompi_osc_rdma_sync_t *sync, const void *source_buffe int ompi_osc_rdma_compare_and_swap (const void 
*origin_addr, const void *compare_addr, void *result_addr, - struct ompi_datatype_t *dt, int target_rank, OPAL_PTRDIFF_TYPE target_disp, - struct ompi_win_t *win) + ompi_datatype_t *dt, int target_rank, OPAL_PTRDIFF_TYPE target_disp, + ompi_win_t *win) { ompi_osc_rdma_module_t *module = GET_MODULE(win); ompi_osc_rdma_peer_t *peer; @@ -708,8 +981,7 @@ int ompi_osc_rdma_compare_and_swap (const void *origin_addr, const void *compare return ret; } -#if 0 - if (MCA_OSC_RDMA_SAME_OP <= module->accumulate_ops) { + if (win->w_acc_ops <= OMPI_WIN_ACCUMULATE_OPS_SAME_OP) { /* the user has indicated that they will only use the same op (or same op and no op) * for operations on overlapping memory ranges. that indicates it is safe to go ahead * and use network atomic operations. */ @@ -718,8 +990,7 @@ int ompi_osc_rdma_compare_and_swap (const void *origin_addr, const void *compare if (OMPI_SUCCESS == ret) { return OMPI_SUCCESS; } - } else -#endif + } if (ompi_osc_rdma_peer_local_base (peer)) { return ompi_osc_rdma_cas_local (origin_addr, compare_addr, result_addr, dt, @@ -733,15 +1004,16 @@ int ompi_osc_rdma_compare_and_swap (const void *origin_addr, const void *compare static inline int ompi_osc_rdma_rget_accumulate_internal (ompi_osc_rdma_sync_t *sync, const void *origin_addr, int origin_count, - struct ompi_datatype_t *origin_datatype, void *result_addr, int result_count, - struct ompi_datatype_t *result_datatype, ompi_osc_rdma_peer_t *peer, + ompi_datatype_t *origin_datatype, void *result_addr, int result_count, + ompi_datatype_t *result_datatype, ompi_osc_rdma_peer_t *peer, int target_rank, MPI_Aint target_disp, int target_count, - struct ompi_datatype_t *target_datatype, struct ompi_op_t *op, + ompi_datatype_t *target_datatype, ompi_op_t *op, ompi_osc_rdma_request_t *request) { ompi_osc_rdma_module_t *module = sync->module; mca_btl_base_registration_handle_t *target_handle; uint64_t target_address; + ptrdiff_t lb, extent; int ret; /* short-circuit case. 
note that origin_count may be 0 if op is MPI_NO_OP */ @@ -753,12 +1025,35 @@ int ompi_osc_rdma_rget_accumulate_internal (ompi_osc_rdma_sync_t *sync, const vo return OMPI_SUCCESS; } - ret = osc_rdma_get_remote_segment (module, peer, target_disp, target_datatype->super.size * target_count, - &target_address, &target_handle); + (void) ompi_datatype_get_extent (origin_datatype, &lb, &extent); + + ret = osc_rdma_get_remote_segment (module, peer, target_disp, extent * target_count, &target_address, &target_handle); if (OPAL_UNLIKELY(OMPI_SUCCESS != ret)) { return ret; } + if (module->acc_single_intrinsic && extent <= 8) { + if (module->acc_use_amo && ompi_datatype_is_predefined (origin_datatype)) { + if (NULL == result_addr) { + ret = ompi_osc_rdma_acc_single_atomic (sync, origin_addr, origin_datatype, extent, peer, target_address, + target_handle, op, request); + } else { + ret = ompi_osc_rdma_fetch_and_op_atomic (sync, origin_addr, result_addr, origin_datatype, extent, peer, target_address, + target_handle, op, request); + } + + if (OMPI_SUCCESS == ret) { + return OMPI_SUCCESS; + } + } + + ret = ompi_osc_rdma_fetch_and_op_cas (sync, origin_addr, result_addr, origin_datatype, extent, peer, target_address, + target_handle, op, request); + if (OMPI_SUCCESS == ret) { + return OMPI_SUCCESS; + } + } + if (ompi_osc_rdma_peer_local_base (peer)) { /* local/self optimization */ return ompi_osc_rdma_gacc_local (origin_addr, origin_count, origin_datatype, result_addr, result_count, @@ -771,13 +1066,10 @@ int ompi_osc_rdma_rget_accumulate_internal (ompi_osc_rdma_sync_t *sync, const vo target_datatype, op, request); } -int ompi_osc_rdma_get_accumulate (const void *origin_addr, int origin_count, - struct ompi_datatype_t *origin_datatype, - void *result_addr, int result_count, - struct ompi_datatype_t *result_datatype, - int target_rank, MPI_Aint target_disp, - int target_count, struct ompi_datatype_t *target_datatype, - struct ompi_op_t *op, struct ompi_win_t *win) +int ompi_osc_rdma_get_accumulate (const void *origin_addr, int origin_count, ompi_datatype_t *origin_datatype, + void *result_addr, int result_count, ompi_datatype_t *result_datatype, + int target_rank, MPI_Aint target_disp, int target_count, ompi_datatype_t *target_datatype, + ompi_op_t *op, ompi_win_t *win) { ompi_osc_rdma_module_t *module = GET_MODULE(win); ompi_osc_rdma_peer_t *peer; @@ -801,14 +1093,10 @@ int ompi_osc_rdma_get_accumulate (const void *origin_addr, int origin_count, } -int ompi_osc_rdma_rget_accumulate (const void *origin_addr, int origin_count, - struct ompi_datatype_t *origin_datatype, - void *result_addr, int result_count, - struct ompi_datatype_t *result_datatype, - int target_rank, MPI_Aint target_disp, - int target_count, struct ompi_datatype_t *target_datatype, - struct ompi_op_t *op, struct ompi_win_t *win, - ompi_request_t **request) +int ompi_osc_rdma_rget_accumulate (const void *origin_addr, int origin_count, ompi_datatype_t *origin_datatype, + void *result_addr, int result_count, ompi_datatype_t *result_datatype, + int target_rank, MPI_Aint target_disp, int target_count, ompi_datatype_t *target_datatype, + ompi_op_t *op, ompi_win_t *win, ompi_request_t **request) { ompi_osc_rdma_module_t *module = GET_MODULE(win); ompi_osc_rdma_peer_t *peer; @@ -842,31 +1130,9 @@ int ompi_osc_rdma_rget_accumulate (const void *origin_addr, int origin_count, return OMPI_SUCCESS; } -int ompi_osc_rdma_fetch_and_op (const void *origin_addr, void *result_addr, struct ompi_datatype_t *dt, int target_rank, - OPAL_PTRDIFF_TYPE target_disp, 
struct ompi_op_t *op, struct ompi_win_t *win) -{ - ompi_osc_rdma_module_t *module = GET_MODULE(win); - ompi_osc_rdma_peer_t *peer; - ompi_osc_rdma_sync_t *sync; - - OSC_RDMA_VERBOSE(MCA_BASE_VERBOSE_TRACE, "fop: %p, %s, %d, %lu, %s, %s", result_addr, dt->name, target_rank, - (unsigned long) target_disp, op->o_name, win->w_name); - - sync = ompi_osc_rdma_module_sync_lookup (module, target_rank, &peer); - if (OPAL_UNLIKELY(NULL == sync)) { - return OMPI_ERR_RMA_SYNC; - } - - return ompi_osc_rdma_rget_accumulate_internal (sync, origin_addr, 1, dt, result_addr, 1, dt, peer, target_rank, - target_disp, 1, dt, op, NULL); -} - - -int ompi_osc_rdma_raccumulate (const void *origin_addr, int origin_count, - struct ompi_datatype_t *origin_datatype, int target_rank, - OPAL_PTRDIFF_TYPE target_disp, int target_count, - struct ompi_datatype_t *target_datatype, struct ompi_op_t *op, - struct ompi_win_t *win, struct ompi_request_t **request) +int ompi_osc_rdma_raccumulate (const void *origin_addr, int origin_count, ompi_datatype_t *origin_datatype, int target_rank, + OPAL_PTRDIFF_TYPE target_disp, int target_count, ompi_datatype_t *target_datatype, ompi_op_t *op, + ompi_win_t *win, ompi_request_t **request) { ompi_osc_rdma_module_t *module = GET_MODULE(win); ompi_osc_rdma_peer_t *peer; @@ -898,11 +1164,9 @@ int ompi_osc_rdma_raccumulate (const void *origin_addr, int origin_count, return OMPI_SUCCESS; } -int ompi_osc_rdma_accumulate (const void *origin_addr, int origin_count, - struct ompi_datatype_t *origin_datatype, int target_rank, - OPAL_PTRDIFF_TYPE target_disp, int target_count, - struct ompi_datatype_t *target_datatype, struct ompi_op_t *op, - struct ompi_win_t *win) +int ompi_osc_rdma_accumulate (const void *origin_addr, int origin_count, ompi_datatype_t *origin_datatype, int target_rank, + OPAL_PTRDIFF_TYPE target_disp, int target_count, ompi_datatype_t *target_datatype, ompi_op_t *op, + ompi_win_t *win) { ompi_osc_rdma_module_t *module = GET_MODULE(win); ompi_osc_rdma_peer_t *peer; @@ -921,3 +1185,24 @@ int ompi_osc_rdma_accumulate (const void *origin_addr, int origin_count, NULL, peer, target_rank, target_disp, target_count, target_datatype, op, NULL); } + + +int ompi_osc_rdma_fetch_and_op (const void *origin_addr, void *result_addr, ompi_datatype_t *dt, int target_rank, + OPAL_PTRDIFF_TYPE target_disp, ompi_op_t *op, ompi_win_t *win) +{ + ompi_osc_rdma_module_t *module = GET_MODULE(win); + ompi_osc_rdma_peer_t *peer; + ompi_osc_rdma_sync_t *sync; + int ret; + + OSC_RDMA_VERBOSE(MCA_BASE_VERBOSE_TRACE, "fop: %p, %s, %d, %lu, %s, %s", result_addr, dt->name, + target_rank, (unsigned long) target_disp, op->o_name, win->w_name); + + sync = ompi_osc_rdma_module_sync_lookup (module, target_rank, &peer); + if (OPAL_UNLIKELY(NULL == sync)) { + return OMPI_ERR_RMA_SYNC; + } + + return ompi_osc_rdma_rget_accumulate_internal (sync, origin_addr, 1, dt, result_addr, 1, dt, peer, + target_rank, target_disp, 1, dt, op, NULL); +} diff --git a/ompi/mca/osc/rdma/osc_rdma_accumulate.h b/ompi/mca/osc/rdma/osc_rdma_accumulate.h index 8f6f1bb4b7..7ab370ab2b 100644 --- a/ompi/mca/osc/rdma/osc_rdma_accumulate.h +++ b/ompi/mca/osc/rdma/osc_rdma_accumulate.h @@ -1,6 +1,6 @@ /* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */ /* - * Copyright (c) 2014-2015 Los Alamos National Security, LLC. All rights + * Copyright (c) 2014-2016 Los Alamos National Security, LLC. All rights * reserved. 
* $COPYRIGHT$ * @@ -14,44 +14,30 @@ #include "osc_rdma.h" -int ompi_osc_rdma_compare_and_swap (const void *origin_addr, const void *compare_addr, - void *result_addr, struct ompi_datatype_t *dt, - int target, OPAL_PTRDIFF_TYPE target_disp, - struct ompi_win_t *win); +int ompi_osc_rdma_compare_and_swap (const void *origin_addr, const void *compare_addr, void *result_addr, + ompi_datatype_t *dt, int target_rank, OPAL_PTRDIFF_TYPE target_disp, + ompi_win_t *win); -int ompi_osc_rdma_accumulate (const void *origin_addr, int origin_count, - struct ompi_datatype_t *origin_dt, - int target, OPAL_PTRDIFF_TYPE target_disp, - int target_count, struct ompi_datatype_t *target_dt, - struct ompi_op_t *op, struct ompi_win_t *win); -int ompi_osc_rdma_fetch_and_op (const void *origin_addr, void *result_addr, - struct ompi_datatype_t *dt, int target, - OPAL_PTRDIFF_TYPE target_disp, - struct ompi_op_t *op, struct ompi_win_t *win); +int ompi_osc_rdma_accumulate (const void *origin_addr, int origin_count, ompi_datatype_t *origin_datatype, int target_rank, + OPAL_PTRDIFF_TYPE target_disp, int target_count, ompi_datatype_t *target_datatype, ompi_op_t *op, + ompi_win_t *win); -int ompi_osc_rdma_get_accumulate (const void *origin_addr, int origin_count, - struct ompi_datatype_t *origin_datatype, - void *result_addr, int result_count, - struct ompi_datatype_t *result_datatype, - int target_rank, MPI_Aint target_disp, - int target_count, struct ompi_datatype_t *target_datatype, - struct ompi_op_t *op, struct ompi_win_t *win); +int ompi_osc_rdma_fetch_and_op (const void *origin_addr, void *result_addr, ompi_datatype_t *dt, int target_rank, + OPAL_PTRDIFF_TYPE target_disp, ompi_op_t *op, ompi_win_t *win); -int ompi_osc_rdma_raccumulate (const void *origin_addr, int origin_count, - struct ompi_datatype_t *origin_dt, - int target, OPAL_PTRDIFF_TYPE target_disp, - int target_count, struct ompi_datatype_t *target_dt, - struct ompi_op_t *op, struct ompi_win_t *win, - struct ompi_request_t **request); +int ompi_osc_rdma_get_accumulate (const void *origin_addr, int origin_count, ompi_datatype_t *origin_datatype, + void *result_addr, int result_count, ompi_datatype_t *result_datatype, + int target_rank, MPI_Aint target_disp, int target_count, ompi_datatype_t *target_datatype, + ompi_op_t *op, ompi_win_t *win); -int ompi_osc_rdma_rget_accumulate (const void *origin_addr, int origin_count, - struct ompi_datatype_t *origin_datatype, - void *result_addr, int result_count, - struct ompi_datatype_t *result_datatype, - int target_rank, MPI_Aint target_disp, - int target_count, struct ompi_datatype_t *target_datatype, - struct ompi_op_t *op, struct ompi_win_t *win, - struct ompi_request_t **request); +int ompi_osc_rdma_raccumulate (const void *origin_addr, int origin_count, ompi_datatype_t *origin_datatype, int target_rank, + OPAL_PTRDIFF_TYPE target_disp, int target_count, ompi_datatype_t *target_datatype, ompi_op_t *op, + ompi_win_t *win, ompi_request_t **request); + +int ompi_osc_rdma_rget_accumulate (const void *origin_addr, int origin_count, ompi_datatype_t *origin_datatype, + void *result_addr, int result_count, ompi_datatype_t *result_datatype, + int target_rank, MPI_Aint target_disp, int target_count, ompi_datatype_t *target_datatype, + ompi_op_t *op, ompi_win_t *win, ompi_request_t **request); #endif /* OSC_RDMA_ACCUMULATE_H */ diff --git a/ompi/mca/osc/rdma/osc_rdma_comm.c b/ompi/mca/osc/rdma/osc_rdma_comm.c index 5d9335613c..052e4db69c 100644 --- a/ompi/mca/osc/rdma/osc_rdma_comm.c +++ b/ompi/mca/osc/rdma/osc_rdma_comm.c 
@@ -772,9 +772,9 @@ static int ompi_osc_rdma_get_contig (ompi_osc_rdma_sync_t *sync, ompi_osc_rdma_p } static inline int ompi_osc_rdma_put_w_req (ompi_osc_rdma_sync_t *sync, const void *origin_addr, int origin_count, - struct ompi_datatype_t *origin_datatype, ompi_osc_rdma_peer_t *peer, + ompi_datatype_t *origin_datatype, ompi_osc_rdma_peer_t *peer, OPAL_PTRDIFF_TYPE target_disp, int target_count, - struct ompi_datatype_t *target_datatype, ompi_osc_rdma_request_t *request) + ompi_datatype_t *target_datatype, ompi_osc_rdma_request_t *request) { ompi_osc_rdma_module_t *module = sync->module; mca_btl_base_registration_handle_t *target_handle; @@ -807,9 +807,9 @@ static inline int ompi_osc_rdma_put_w_req (ompi_osc_rdma_sync_t *sync, const voi ompi_osc_rdma_put_contig, false); } -static inline int ompi_osc_rdma_get_w_req (ompi_osc_rdma_sync_t *sync, void *origin_addr, int origin_count, struct ompi_datatype_t *origin_datatype, +static inline int ompi_osc_rdma_get_w_req (ompi_osc_rdma_sync_t *sync, void *origin_addr, int origin_count, ompi_datatype_t *origin_datatype, ompi_osc_rdma_peer_t *peer, OPAL_PTRDIFF_TYPE source_disp, int source_count, - struct ompi_datatype_t *source_datatype, ompi_osc_rdma_request_t *request) + ompi_datatype_t *source_datatype, ompi_osc_rdma_request_t *request) { ompi_osc_rdma_module_t *module = sync->module; mca_btl_base_registration_handle_t *source_handle; @@ -841,9 +841,9 @@ static inline int ompi_osc_rdma_get_w_req (ompi_osc_rdma_sync_t *sync, void *ori source_handle, source_count, source_datatype, request, module->selected_btl->btl_get_limit, ompi_osc_rdma_get_contig, true); } -int ompi_osc_rdma_put (const void *origin_addr, int origin_count, struct ompi_datatype_t *origin_datatype, +int ompi_osc_rdma_put (const void *origin_addr, int origin_count, ompi_datatype_t *origin_datatype, int target_rank, OPAL_PTRDIFF_TYPE target_disp, int target_count, - struct ompi_datatype_t *target_datatype, ompi_win_t *win) + ompi_datatype_t *target_datatype, ompi_win_t *win) { ompi_osc_rdma_module_t *module = GET_MODULE(win); ompi_osc_rdma_peer_t *peer; @@ -862,10 +862,10 @@ int ompi_osc_rdma_put (const void *origin_addr, int origin_count, struct ompi_da target_count, target_datatype, NULL); } -int ompi_osc_rdma_rput (const void *origin_addr, int origin_count, struct ompi_datatype_t *origin_datatype, +int ompi_osc_rdma_rput (const void *origin_addr, int origin_count, ompi_datatype_t *origin_datatype, int target_rank, OPAL_PTRDIFF_TYPE target_disp, int target_count, - struct ompi_datatype_t *target_datatype, struct ompi_win_t *win, - struct ompi_request_t **request) + ompi_datatype_t *target_datatype, ompi_win_t *win, + ompi_request_t **request) { ompi_osc_rdma_module_t *module = GET_MODULE(win); ompi_osc_rdma_peer_t *peer; @@ -897,9 +897,9 @@ int ompi_osc_rdma_rput (const void *origin_addr, int origin_count, struct ompi_d return OMPI_SUCCESS; } -int ompi_osc_rdma_get (void *origin_addr, int origin_count, struct ompi_datatype_t *origin_datatype, +int ompi_osc_rdma_get (void *origin_addr, int origin_count, ompi_datatype_t *origin_datatype, int source_rank, OPAL_PTRDIFF_TYPE source_disp, int source_count, - struct ompi_datatype_t *source_datatype, struct ompi_win_t *win) + ompi_datatype_t *source_datatype, ompi_win_t *win) { ompi_osc_rdma_module_t *module = GET_MODULE(win); ompi_osc_rdma_peer_t *peer; @@ -918,10 +918,10 @@ int ompi_osc_rdma_get (void *origin_addr, int origin_count, struct ompi_datatype source_disp, source_count, source_datatype, NULL); } -int ompi_osc_rdma_rget (void 
*origin_addr, int origin_count, struct ompi_datatype_t *origin_datatype, +int ompi_osc_rdma_rget (void *origin_addr, int origin_count, ompi_datatype_t *origin_datatype, int source_rank, OPAL_PTRDIFF_TYPE source_disp, int source_count, - struct ompi_datatype_t *source_datatype, struct ompi_win_t *win, - struct ompi_request_t **request) + ompi_datatype_t *source_datatype, ompi_win_t *win, + ompi_request_t **request) { ompi_osc_rdma_module_t *module = GET_MODULE(win); ompi_osc_rdma_peer_t *peer; diff --git a/ompi/mca/osc/rdma/osc_rdma_comm.h b/ompi/mca/osc/rdma/osc_rdma_comm.h index c011eea3ed..e9b048c56e 100644 --- a/ompi/mca/osc/rdma/osc_rdma_comm.h +++ b/ompi/mca/osc/rdma/osc_rdma_comm.h @@ -96,23 +96,23 @@ static inline int osc_rdma_get_remote_segment (ompi_osc_rdma_module_t *module, o /* prototypes for implementations of MPI RMA window functions. these will be called from the * mpi interface (ompi/mpi/c) */ -int ompi_osc_rdma_put (const void *origin_addr, int origin_count, struct ompi_datatype_t *origin_dt, +int ompi_osc_rdma_put (const void *origin_addr, int origin_count, ompi_datatype_t *origin_dt, int target, OPAL_PTRDIFF_TYPE target_disp, int target_count, - struct ompi_datatype_t *target_dt, struct ompi_win_t *win); + ompi_datatype_t *target_dt, ompi_win_t *win); -int ompi_osc_rdma_get (void *origin_addr, int origin_count, struct ompi_datatype_t *origin_dt, +int ompi_osc_rdma_get (void *origin_addr, int origin_count, ompi_datatype_t *origin_dt, int target, OPAL_PTRDIFF_TYPE target_disp, int target_count, - struct ompi_datatype_t *target_dt, struct ompi_win_t *win); + ompi_datatype_t *target_dt, ompi_win_t *win); -int ompi_osc_rdma_rput (const void *origin_addr, int origin_count, struct ompi_datatype_t *origin_dt, +int ompi_osc_rdma_rput (const void *origin_addr, int origin_count, ompi_datatype_t *origin_dt, int target, OPAL_PTRDIFF_TYPE target_disp, int target_count, - struct ompi_datatype_t *target_dt, struct ompi_win_t *win, - struct ompi_request_t **request); + ompi_datatype_t *target_dt, ompi_win_t *win, + ompi_request_t **request); -int ompi_osc_rdma_rget (void *origin_addr, int origin_count, struct ompi_datatype_t *origin_dt, +int ompi_osc_rdma_rget (void *origin_addr, int origin_count, ompi_datatype_t *origin_dt, int target, OPAL_PTRDIFF_TYPE target_disp, int target_count, - struct ompi_datatype_t *target_dt, struct ompi_win_t *win, - struct ompi_request_t **request); + ompi_datatype_t *target_dt, ompi_win_t *win, + ompi_request_t **request); /** * @brief read data from a remote memory region (blocking) diff --git a/ompi/mca/osc/rdma/osc_rdma_component.c b/ompi/mca/osc/rdma/osc_rdma_component.c index 8b52933007..c951a76761 100644 --- a/ompi/mca/osc/rdma/osc_rdma_component.c +++ b/ompi/mca/osc/rdma/osc_rdma_component.c @@ -173,6 +173,20 @@ static int ompi_osc_rdma_component_register (void) MCA_BASE_VAR_TYPE_BOOL, NULL, 0, 0, OPAL_INFO_LVL_5, MCA_BASE_VAR_SCOPE_GROUP, &mca_osc_rdma_component.no_locks); + mca_osc_rdma_component.acc_single_intrinsic = false; + (void) mca_base_component_var_register(&mca_osc_rdma_component.super.osc_version, "acc_single_intrinsic", + "Enable optimizations for MPI_Fetch_and_op, MPI_Accumulate, etc for codes " + "that will not use anything more than a single predefined datatype (default: false)", + MCA_BASE_VAR_TYPE_BOOL, NULL, 0, 0, OPAL_INFO_LVL_5, + MCA_BASE_VAR_SCOPE_GROUP, &mca_osc_rdma_component.acc_single_intrinsic); + + mca_osc_rdma_component.acc_use_amo = true; + (void) 
mca_base_component_var_register(&mca_osc_rdma_component.super.osc_version, "acc_use_amo", + "Enable the use of network atomic memory operations when using single " + "intrinsic optimizations. If not set, network compare-and-swap will be " + "used instead (default: true)", MCA_BASE_VAR_TYPE_BOOL, NULL, 0, 0, OPAL_INFO_LVL_5, + MCA_BASE_VAR_SCOPE_GROUP, &mca_osc_rdma_component.acc_use_amo); + mca_osc_rdma_component.buffer_size = 32768; (void) mca_base_component_var_register (&mca_osc_rdma_component.super.osc_version, "buffer_size", "Size of temporary buffers (default: 32k)", MCA_BASE_VAR_TYPE_UNSIGNED_INT, @@ -585,7 +599,7 @@ static int allocate_state_shared (ompi_osc_rdma_module_t *module, void **base, s } } - if (MPI_WIN_FLAVOR_DYNAMIC != module->flavor) { + if (MPI_WIN_FLAVOR_CREATE == module->flavor) { ret = ompi_osc_rdma_initialize_region (module, base, size); if (OMPI_SUCCESS != ret) { break; @@ -600,6 +614,20 @@ static int allocate_state_shared (ompi_osc_rdma_module_t *module, void **base, s opal_shmem_unlink (&module->seg_ds); } + if (MPI_WIN_FLAVOR_ALLOCATE == module->flavor) { + ompi_osc_rdma_region_t *region = (ompi_osc_rdma_region_t *) module->state->regions; + module->state->disp_unit = module->disp_unit; + module->state->region_count = 1; + region->base = state_region->base + my_base_offset; + region->len = size; + if (module->selected_btl->btl_register_mem) { + memcpy (region->btl_handle_data, state_region->btl_handle_data, module->selected_btl->btl_registration_handle_size); + } + } + + /* barrier to make sure all ranks have attached */ + shared_comm->c_coll.coll_barrier(shared_comm, shared_comm->c_coll.coll_barrier_module); + offset = data_base; for (int i = 0 ; i < local_size ; ++i) { ompi_osc_rdma_peer_extended_t *ex_peer; @@ -646,21 +674,18 @@ static int allocate_state_shared (ompi_osc_rdma_module_t *module, void **base, s if (MPI_WIN_FLAVOR_ALLOCATE == module->flavor) { if (temp[i].size) { - ex_peer->super.base = (uint64_t) (uintptr_t) module->segment_base + offset; + ex_peer->super.base = state_region->base + offset; + offset += temp[i].size; } else { ex_peer->super.base = 0; } + } - peer->flags |= OMPI_OSC_RDMA_PEER_LOCAL_BASE; + ompi_osc_rdma_region_t *peer_region = (ompi_osc_rdma_region_t *) peer_state->regions; - offset += temp[i].size; - } else { - ompi_osc_rdma_region_t *peer_region = (ompi_osc_rdma_region_t *) peer_state->regions; - - ex_peer->super.base = peer_region->base; - if (module->selected_btl->btl_register_mem) { - ex_peer->super.base_handle = (mca_btl_base_registration_handle_t *) peer_region->btl_handle_data; - } + ex_peer->super.base = peer_region->base; + if (module->selected_btl->btl_register_mem) { + ex_peer->super.base_handle = (mca_btl_base_registration_handle_t *) peer_region->btl_handle_data; } } @@ -1020,6 +1045,8 @@ static int ompi_osc_rdma_component_select (struct ompi_win_t *win, void **base, module->same_disp_unit = check_config_value_bool ("same_disp_unit", info); module->same_size = check_config_value_bool ("same_size", info); module->no_locks = check_config_value_bool ("no_locks", info); + module->acc_single_intrinsic = check_config_value_bool ("ompi_single_accumulate", info); + module->acc_use_amo = mca_osc_rdma_component.acc_use_amo; module->all_sync.module = module; @@ -1047,14 +1074,6 @@ static int ompi_osc_rdma_component_select (struct ompi_win_t *win, void **base, } } - /* options */ - /* FIX ME: should actually check this value...
*/ -#if 1 - module->accumulate_ordering = 1; -#else - ompi_osc_base_config_value_equal("accumulate_ordering", info, "none"); -#endif - ret = ompi_comm_dup(comm, &module->comm); if (OMPI_SUCCESS != ret) { ompi_osc_rdma_free (win); @@ -1132,17 +1151,6 @@ static int ompi_osc_rdma_component_select (struct ompi_win_t *win, void **base, } } - ret = ompi_osc_rdma_share_data (module); - if (OMPI_SUCCESS != ret) { - OSC_RDMA_VERBOSE(MCA_BASE_VERBOSE_ERROR, "failed to share window data with peers"); - ompi_osc_rdma_free (win); - return ret; - } - - - /* for now the leader is always rank 0 in the communicator */ - module->leader = ompi_osc_rdma_module_peer (module, 0); - /* lock data */ if (module->no_locks) { win->w_flags |= OMPI_WIN_NO_LOCKS; @@ -1177,20 +1185,19 @@ static int ompi_osc_rdma_component_select (struct ompi_win_t *win, void **base, /* sync memory - make sure all initialization completed */ opal_atomic_mb(); - /* barrier to prevent arrival of lock requests before we're - fully created */ - ret = module->comm->c_coll.coll_barrier(module->comm, - module->comm->c_coll.coll_barrier_module); + ret = ompi_osc_rdma_share_data (module); if (OMPI_SUCCESS != ret) { + OSC_RDMA_VERBOSE(MCA_BASE_VERBOSE_ERROR, "failed to share window data with peers"); ompi_osc_rdma_free (win); - return ret; + } else { + /* for now the leader is always rank 0 in the communicator */ + module->leader = ompi_osc_rdma_module_peer (module, 0); + + OSC_RDMA_VERBOSE(MCA_BASE_VERBOSE_INFO, "finished creating osc/rdma window with id %d", + ompi_comm_get_cid(module->comm)); } - - OSC_RDMA_VERBOSE(MCA_BASE_VERBOSE_INFO, "finished creating osc/rdma window with id %d", - ompi_comm_get_cid(module->comm)); - - return OMPI_SUCCESS; + return ret; } diff --git a/ompi/mca/osc/rdma/osc_rdma_passive_target.c b/ompi/mca/osc/rdma/osc_rdma_passive_target.c index f3e1a0ac85..720fbbb64a 100644 --- a/ompi/mca/osc/rdma/osc_rdma_passive_target.c +++ b/ompi/mca/osc/rdma/osc_rdma_passive_target.c @@ -43,12 +43,6 @@ int ompi_osc_rdma_flush (int target, struct ompi_win_t *win) OSC_RDMA_VERBOSE(MCA_BASE_VERBOSE_TRACE, "flush: %d, %s", target, win->w_name); - if (ompi_comm_rank (module->comm) == target) { - /* nothing to flush. call one round of progress */ - ompi_osc_rdma_progress (module); - return OMPI_SUCCESS; - } - OPAL_THREAD_LOCK(&module->lock); lock = ompi_osc_rdma_module_sync_lookup (module, target, &peer); diff --git a/ompi/mca/osc/rdma/osc_rdma_peer.c b/ompi/mca/osc/rdma/osc_rdma_peer.c index e5f2f5e185..8008512403 100644 --- a/ompi/mca/osc/rdma/osc_rdma_peer.c +++ b/ompi/mca/osc/rdma/osc_rdma_peer.c @@ -218,6 +218,10 @@ static int ompi_osc_rdma_peer_setup (ompi_osc_rdma_module_t *module, ompi_osc_rd memcpy (ex_peer->super.base_handle, base_region->btl_handle_data, registration_handle_size); } + + if (MPI_WIN_FLAVOR_ALLOCATE == module->flavor) { + ex_peer->super.super.data_endpoint = ex_peer->super.super.state_endpoint; + } } return OMPI_SUCCESS; diff --git a/ompi/mca/osc/rdma/osc_rdma_peer.h b/ompi/mca/osc/rdma/osc_rdma_peer.h index 34fb22a388..6716733a43 100644 --- a/ompi/mca/osc/rdma/osc_rdma_peer.h +++ b/ompi/mca/osc/rdma/osc_rdma_peer.h @@ -75,6 +75,9 @@ struct ompi_osc_rdma_peer_basic_t { /** remote peer's base pointer */ osc_rdma_base_t base; + /** local pointer to peer's base */ + osc_rdma_base_t local_base; + /** registration handle associated with the base */ mca_btl_base_registration_handle_t *base_handle; };
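
Reviewer note: the two MCA parameters registered in osc_rdma_component.c (osc/rdma acc_single_intrinsic and acc_use_amo) only benefit applications that limit each accumulate call to a single predefined datatype, and the network-atomic path for MPI_Compare_and_swap is additionally gated on the window's accumulate_ops info value. Below is a minimal sketch of the kind of program this fast path targets. It is illustrative only and not part of the patch; the launch line in the leading comment is inferred from the parameter name registered above (standard MCA naming) rather than taken from this diff.

/* Illustrative sketch only: every rank atomically increments a 64-bit counter
 * on rank 0 with MPI_Fetch_and_op on a single predefined datatype.  Assumed
 * launch line to enable the new fast path:
 *   mpirun --mca osc_rdma_acc_single_intrinsic true ./a.out
 */
#include <mpi.h>
#include <inttypes.h>
#include <stdint.h>
#include <stdio.h>

int main (int argc, char *argv[])
{
    int64_t *base, one = 1, prev = -1;
    MPI_Info info;
    MPI_Win win;

    MPI_Init (&argc, &argv);

    /* promise that overlapping accumulates use a single op (plus MPI_NO_OP);
     * osc/rdma checks this (win->w_acc_ops) before using network atomics for
     * MPI_Compare_and_swap */
    MPI_Info_create (&info);
    MPI_Info_set (info, "accumulate_ops", "same_op_no_op");

    MPI_Win_allocate (sizeof (*base), sizeof (*base), info, MPI_COMM_WORLD, &base, &win);

    *base = 0;                       /* initialize the local counter */
    MPI_Barrier (MPI_COMM_WORLD);    /* all ranks initialized before RMA starts */

    MPI_Win_lock (MPI_LOCK_SHARED, 0, 0, win);
    MPI_Fetch_and_op (&one, &prev, MPI_INT64_T, 0, 0, MPI_SUM, win);
    MPI_Win_unlock (0, win);

    printf ("previous value of rank 0's counter: %" PRId64 "\n", prev);

    MPI_Info_free (&info);
    MPI_Win_free (&win);
    MPI_Finalize ();
    return 0;
}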
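
The new ompi_osc_rdma_fetch_and_op_cas path emulates fetch-and-op when a fetching BTL atomic cannot be used: it reads the 8-byte word containing the operand, applies the reduction locally, and retries a compare-and-swap until the word has not changed underneath it. The self-contained sketch below shows the same read/modify/CAS-retry shape with C11 atomics on local memory; it illustrates the algorithm only and is not the BTL interface.

/* Illustrative only: emulate fetch-and-op with a compare-and-swap retry loop,
 * the same shape as ompi_osc_rdma_fetch_and_op_cas (read the current value,
 * apply the op locally, then swap it in only if the target has not changed). */
#include <stdatomic.h>
#include <stdint.h>
#include <stdio.h>

static int64_t fetch_and_op_via_cas (_Atomic int64_t *target, int64_t origin,
                                     int64_t (*op)(int64_t, int64_t))
{
    int64_t old = atomic_load (target);

    while (1) {
        int64_t desired = op (old, origin);
        /* on failure, old is refreshed with the value actually found */
        if (atomic_compare_exchange_weak (target, &old, desired)) {
            return old;   /* like MPI_Fetch_and_op, return the prior value */
        }
    }
}

static int64_t sum_op (int64_t a, int64_t b) { return a + b; }

int main (void)
{
    _Atomic int64_t counter = 40;
    int64_t prev = fetch_and_op_via_cas (&counter, 2, sum_op);
    printf ("prev=%lld now=%lld\n", (long long) prev, (long long) atomic_load (&counter));
    return 0;
}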
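
One detail worth calling out in ompi_osc_rdma_fetch_and_op_cas is the address arithmetic: address = target_address & ~7 rounds down to the 8-byte word the compare-and-swap will operate on, and offset = target_address & ~address is an indirect way of writing target_address & 7. A tiny worked example follows (illustrative, with a made-up address):

/* Illustrative only: the alignment math used by ompi_osc_rdma_fetch_and_op_cas.
 * The 8-byte CAS must target an aligned address, so the code rounds the target
 * address down and remembers where the operand lives inside that word. */
#include <assert.h>
#include <inttypes.h>
#include <stdint.h>
#include <stdio.h>

int main (void)
{
    uint64_t target_address = 0x1000c;            /* e.g. a 4-byte operand at 0x1000c */
    uint64_t address = target_address & ~7;       /* aligned word to swap: 0x10008 */
    uint64_t offset  = target_address & ~address; /* same as target_address & 7 -> 4 */

    assert (offset == (target_address & 7));
    printf ("address=0x%" PRIx64 " offset=%" PRIu64 "\n", address, offset);
    return 0;
}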