diff --git a/ompi/mca/osc/rdma/osc_rdma.h b/ompi/mca/osc/rdma/osc_rdma.h index 1a068358f7..8287163680 100644 --- a/ompi/mca/osc/rdma/osc_rdma.h +++ b/ompi/mca/osc/rdma/osc_rdma.h @@ -8,7 +8,7 @@ * University of Stuttgart. All rights reserved. * Copyright (c) 2004-2005 The Regents of the University of California. * All rights reserved. - * Copyright (c) 2007-2017 Los Alamos National Security, LLC. All rights + * Copyright (c) 2007-2018 Los Alamos National Security, LLC. All rights * reserved. * Copyright (c) 2010 Cisco Systems, Inc. All rights reserved. * Copyright (c) 2012-2013 Sandia National Laboratories. All rights reserved. @@ -50,6 +50,11 @@ #include "opal_stdint.h" +enum { + OMPI_OSC_RDMA_LOCKING_TWO_LEVEL, + OMPI_OSC_RDMA_LOCKING_ON_DEMAND, +}; + /** * @brief osc rdma component structure */ @@ -87,6 +92,9 @@ struct ompi_osc_rdma_component_t { /** Default value of the no_locks info key for new windows */ bool no_locks; + /** Locking mode to use as the default for all windows */ + int locking_mode; + /** Accumulate operations will only operate on a single intrinsic datatype */ bool acc_single_intrinsic; @@ -119,6 +127,8 @@ struct ompi_osc_rdma_module_t { /** Mutex lock protecting module data */ opal_mutex_t lock; + /** locking mode to use */ + int locking_mode; /* window configuration */ @@ -147,10 +157,12 @@ struct ompi_osc_rdma_module_t { /** Local displacement unit. 
*/ int disp_unit; - /** global leader */ ompi_osc_rdma_peer_t *leader; + /** my peer structure */ + ompi_osc_rdma_peer_t *my_peer; + /** pointer to free on cleanup (may be NULL) */ void *free_after; @@ -276,6 +288,16 @@ int ompi_osc_rdma_free (struct ompi_win_t *win); */ int ompi_osc_module_add_peer (ompi_osc_rdma_module_t *module, ompi_osc_rdma_peer_t *peer); +/** + * @brief demand lock a peer + * + * @param[in] module osc rdma module + * @param[in] peer peer to lock + * + * @returns OMPI_SUCCESS on success + */ +int ompi_osc_rdma_demand_lock_peer (ompi_osc_rdma_module_t *module, ompi_osc_rdma_peer_t *peer); + /** * @brief check if a peer object is cached for a remote rank * @@ -449,10 +471,18 @@ static inline ompi_osc_rdma_sync_t *ompi_osc_rdma_module_sync_lookup (ompi_osc_r } return NULL; - case OMPI_OSC_RDMA_SYNC_TYPE_FENCE: case OMPI_OSC_RDMA_SYNC_TYPE_LOCK: - OSC_RDMA_VERBOSE(MCA_BASE_VERBOSE_TRACE, "found fence/lock_all access epoch for target %d", target); + OSC_RDMA_VERBOSE(MCA_BASE_VERBOSE_TRACE, "found lock_all access epoch for target %d", target); + *peer = ompi_osc_rdma_module_peer (module, target); + if (OPAL_UNLIKELY(OMPI_OSC_RDMA_LOCKING_ON_DEMAND == module->locking_mode && + !ompi_osc_rdma_peer_is_demand_locked (*peer))) { + ompi_osc_rdma_demand_lock_peer (module, *peer); + } + + return &module->all_sync; + case OMPI_OSC_RDMA_SYNC_TYPE_FENCE: + OSC_RDMA_VERBOSE(MCA_BASE_VERBOSE_TRACE, "found fence access epoch for target %d", target); /* fence epoch is now active */ module->all_sync.epoch_active = true; *peer = ompi_osc_rdma_module_peer (module, target); @@ -470,6 +500,62 @@ static inline ompi_osc_rdma_sync_t *ompi_osc_rdma_module_sync_lookup (ompi_osc_r return NULL; } +static bool ompi_osc_rdma_use_btl_flush (ompi_osc_rdma_module_t *module) +{ +#if defined(BTL_VERSION) && (BTL_VERSION >= 310) + return !!(module->selected_btl->btl_flush); +#else + return false; +#endif +} + +/** + * @brief increment the outstanding rdma operation counter (atomic) + 
* + * @param[in] rdma_sync osc rdma synchronization object + */ +static inline void ompi_osc_rdma_sync_rdma_inc_always (ompi_osc_rdma_sync_t *rdma_sync) +{ + ompi_osc_rdma_counter_add (&rdma_sync->outstanding_rdma.counter, 1); + + OSC_RDMA_VERBOSE(MCA_BASE_VERBOSE_INFO, "inc: there are %ld outstanding rdma operations", + (unsigned long) rdma_sync->outstanding_rdma.counter); +} + +static inline void ompi_osc_rdma_sync_rdma_inc (ompi_osc_rdma_sync_t *rdma_sync) +{ +#if defined(BTL_VERSION) && (BTL_VERSION >= 310) + if (ompi_osc_rdma_use_btl_flush (rdma_sync->module)) { + return; + } +#endif + ompi_osc_rdma_sync_rdma_inc_always (rdma_sync); +} + +/** + * @brief decrement the outstanding rdma operation counter (atomic) + * + * @param[in] rdma_sync osc rdma synchronization object + */ +static inline void ompi_osc_rdma_sync_rdma_dec_always (ompi_osc_rdma_sync_t *rdma_sync) +{ + opal_atomic_wmb (); + ompi_osc_rdma_counter_add (&rdma_sync->outstanding_rdma.counter, -1); + + OSC_RDMA_VERBOSE(MCA_BASE_VERBOSE_INFO, "dec: there are %ld outstanding rdma operations", + (unsigned long) rdma_sync->outstanding_rdma.counter); +} + +static inline void ompi_osc_rdma_sync_rdma_dec (ompi_osc_rdma_sync_t *rdma_sync) +{ +#if defined(BTL_VERSION) && (BTL_VERSION >= 310) + if (ompi_osc_rdma_use_btl_flush (rdma_sync->module)) { + return; + } +#endif + ompi_osc_rdma_sync_rdma_dec_always (rdma_sync); +} + /** * @brief complete all outstanding rdma operations to all peers * @@ -477,18 +563,31 @@ static inline ompi_osc_rdma_sync_t *ompi_osc_rdma_module_sync_lookup (ompi_osc_r */ static inline void ompi_osc_rdma_sync_rdma_complete (ompi_osc_rdma_sync_t *sync) { - ompi_osc_rdma_aggregation_t *aggregation, *next; - if (opal_list_get_size (&sync->aggregations)) { + ompi_osc_rdma_aggregation_t *aggregation, *next; + OPAL_THREAD_SCOPED_LOCK(&sync->lock, OPAL_LIST_FOREACH_SAFE(aggregation, next, &sync->aggregations, ompi_osc_rdma_aggregation_t) { + fprintf (stderr, "Flushing aggregation %p, peeer 
%p\n", aggregation, aggregation->peer); ompi_osc_rdma_peer_aggregate_flush (aggregation->peer); }); } +#if !defined(BTL_VERSION) || (BTL_VERSION < 310) do { opal_progress (); - } while (sync->outstanding_rdma); + } while (ompi_osc_rdma_sync_get_count (sync)); +#else + mca_btl_base_module_t *btl_module = sync->module->selected_btl; + + do { + if (!ompi_osc_rdma_use_btl_flush (sync->module)) { + opal_progress (); + } else { + btl_module->btl_flush (btl_module, NULL); + } + } while (ompi_osc_rdma_sync_get_count (sync) || (sync->module->rdma_frag && (sync->module->rdma_frag->pending > 1))); +#endif } /** diff --git a/ompi/mca/osc/rdma/osc_rdma_accumulate.c b/ompi/mca/osc/rdma/osc_rdma_accumulate.c index dc49668d16..4ccc68db6b 100644 --- a/ompi/mca/osc/rdma/osc_rdma_accumulate.c +++ b/ompi/mca/osc/rdma/osc_rdma_accumulate.c @@ -1,6 +1,6 @@ /* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */ /* - * Copyright (c) 2014-2017 Los Alamos National Security, LLC. All rights + * Copyright (c) 2014-2018 Los Alamos National Security, LLC. All rights * reserved. * Copyright (c) 2016-2017 Research Organization for Information Science * and Technology (RIST). All rights reserved. 
@@ -18,6 +18,16 @@ #include "ompi/mca/osc/base/osc_base_obj_convert.h" +static inline void ompi_osc_rdma_peer_accumulate_cleanup (ompi_osc_rdma_module_t *module, ompi_osc_rdma_peer_t *peer, bool lock_acquired) +{ + if (lock_acquired) { + (void) ompi_osc_rdma_lock_release_exclusive (module, peer, offsetof (ompi_osc_rdma_state_t, accumulate_lock)); + } + + /* clear out the accumulation flag */ + ompi_osc_rdma_peer_clear_flag (peer, OMPI_OSC_RDMA_PEER_ACCUMULATING); +} + enum ompi_osc_rdma_event_type_t { OMPI_OSC_RDMA_EVENT_TYPE_PUT, }; @@ -70,6 +80,8 @@ static int ompi_osc_rdma_event_queue (ompi_osc_rdma_module_t *module, struct mca ompi_osc_rdma_event_t *event = malloc (sizeof (*event)); void *(*event_func) (int, int, void *); + OSC_RDMA_VERBOSE(MCA_BASE_VERBOSE_TRACE, "queueing event type %d", event_type); + if (OPAL_UNLIKELY(NULL == event)) { return OMPI_ERR_OUT_OF_RESOURCE; } @@ -107,17 +119,13 @@ static int ompi_osc_rdma_gacc_local (const void *source_buffer, int source_count ompi_osc_rdma_peer_t *peer, uint64_t target_address, mca_btl_base_registration_handle_t *target_handle, int target_count, ompi_datatype_t *target_datatype, ompi_op_t *op, ompi_osc_rdma_module_t *module, - ompi_osc_rdma_request_t *request) + ompi_osc_rdma_request_t *request, bool lock_acquired) { int ret = OMPI_SUCCESS; do { OSC_RDMA_VERBOSE(MCA_BASE_VERBOSE_TRACE, "performing accumulate with local region(s)"); - if (!ompi_osc_rdma_peer_is_exclusive (peer)) { - (void) ompi_osc_rdma_lock_acquire_exclusive (module, peer, offsetof (ompi_osc_rdma_state_t, accumulate_lock)); - } - if (NULL != result_buffer) { /* get accumulate */ @@ -138,12 +146,10 @@ static int ompi_osc_rdma_gacc_local (const void *source_buffer, int source_count target_count, target_datatype); } } - - if (!ompi_osc_rdma_peer_is_exclusive (peer)) { - (void) ompi_osc_rdma_lock_release_exclusive (module, peer, offsetof (ompi_osc_rdma_state_t, accumulate_lock)); - } } while (0); + ompi_osc_rdma_peer_accumulate_cleanup (module, 
peer, lock_acquired); + if (OPAL_UNLIKELY(OMPI_SUCCESS != ret)) { OSC_RDMA_VERBOSE(MCA_BASE_VERBOSE_ERROR, "local accumulate failed with ompi error code %d", ret); return ret; @@ -160,113 +166,24 @@ static int ompi_osc_rdma_gacc_local (const void *source_buffer, int source_count static inline int ompi_osc_rdma_cas_local (const void *source_addr, const void *compare_addr, void *result_addr, ompi_datatype_t *datatype, ompi_osc_rdma_peer_t *peer, uint64_t target_address, mca_btl_base_registration_handle_t *target_handle, - ompi_osc_rdma_module_t *module) + ompi_osc_rdma_module_t *module, bool lock_acquired) { OSC_RDMA_VERBOSE(MCA_BASE_VERBOSE_TRACE, "performing compare-and-swap with local regions"); - ompi_osc_rdma_lock_acquire_exclusive (module, peer, offsetof (ompi_osc_rdma_state_t, accumulate_lock)); - memcpy (result_addr, (void *) (uintptr_t) target_address, datatype->super.size); if (0 == memcmp (compare_addr, result_addr, datatype->super.size)) { memcpy ((void *) (uintptr_t) target_address, source_addr, datatype->super.size); } - ompi_osc_rdma_lock_release_exclusive (module, peer, offsetof (ompi_osc_rdma_state_t, accumulate_lock)); + ompi_osc_rdma_peer_accumulate_cleanup (module, peer, lock_acquired); return OMPI_SUCCESS; } -/* completion of an accumulate put */ -static void ompi_osc_rdma_acc_put_complete (struct mca_btl_base_module_t *btl, struct mca_btl_base_endpoint_t *endpoint, - void *local_address, mca_btl_base_registration_handle_t *local_handle, - void *context, void *data, int status) -{ - ompi_osc_rdma_request_t *request = (ompi_osc_rdma_request_t *) context; - ompi_osc_rdma_sync_t *sync = request->sync; - ompi_osc_rdma_peer_t *peer = request->peer; - - OSC_RDMA_VERBOSE(status ? MCA_BASE_VERBOSE_ERROR : MCA_BASE_VERBOSE_TRACE, "remote accumulate (put/get) complete on " - "sync %p. local address %p. 
opal status %d", (void *) sync, local_address, status); - - ompi_osc_rdma_frag_complete (request->frag); - ompi_osc_rdma_request_complete (request, status); - - if (!ompi_osc_rdma_peer_is_exclusive (peer)) { - (void) ompi_osc_rdma_lock_release_exclusive (sync->module, peer, offsetof (ompi_osc_rdma_state_t, accumulate_lock)); - } - - ompi_osc_rdma_sync_rdma_dec (sync); - ompi_osc_rdma_peer_clear_flag (peer, OMPI_OSC_RDMA_PEER_ACCUMULATING); -} - -/* completion of an accumulate get operation */ -static void ompi_osc_rdma_acc_get_complete (struct mca_btl_base_module_t *btl, struct mca_btl_base_endpoint_t *endpoint, - void *local_address, mca_btl_base_registration_handle_t *local_handle, - void *context, void *data, int status) -{ - ompi_osc_rdma_request_t *request = (ompi_osc_rdma_request_t *) context; - intptr_t source = (intptr_t) local_address + request->offset; - ompi_osc_rdma_sync_t *sync = request->sync; - ompi_osc_rdma_module_t *module = sync->module; - - assert (OMPI_SUCCESS == status); - - OSC_RDMA_VERBOSE(status ? MCA_BASE_VERBOSE_ERROR : MCA_BASE_VERBOSE_TRACE, "remote accumulate get complete on sync %p. " - "status %d. request type %d", (void *) sync, status, request->type); - - if (OMPI_SUCCESS == status && OMPI_OSC_RDMA_TYPE_GET_ACC == request->type) { - OSC_RDMA_VERBOSE(MCA_BASE_VERBOSE_TRACE, "unpacking get accumulate result into user buffer"); - if (NULL == request->result_addr) { - /* result buffer is not necessarily contiguous. 
use the opal datatype engine to - * copy the data over in this case */ - struct iovec iov = {.iov_base = (void *) source, request->len}; - uint32_t iov_count = 1; - size_t size = request->len; - - opal_convertor_unpack (&request->convertor, &iov, &iov_count, &size); - opal_convertor_cleanup (&request->convertor); - } else { - /* copy contiguous data to the result buffer */ - ompi_datatype_sndrcv ((void *) source, request->len, MPI_BYTE, request->result_addr, - request->result_count, request->result_dt); - } - - if (&ompi_mpi_op_no_op.op == request->op) { - /* this is a no-op. nothing more to do except release resources and the accumulate lock */ - ompi_osc_rdma_acc_put_complete (btl, endpoint, local_address, local_handle, context, data, status); - - return; - } - } - - /* accumulate the data */ - if (&ompi_mpi_op_replace.op != request->op) { - ompi_op_reduce (request->op, request->origin_addr, (void *) source, request->origin_count, request->origin_dt); - } else { - memcpy ((void *) source, request->origin_addr, request->len); - } - - OSC_RDMA_VERBOSE(MCA_BASE_VERBOSE_TRACE, "putting locally accumulated result into target window"); - - /* initiate the put of the accumulated data */ - status = module->selected_btl->btl_put (module->selected_btl, endpoint, (void *) source, - request->target_address, local_handle, - (mca_btl_base_registration_handle_t *) request->ctx, - request->len, 0, MCA_BTL_NO_ORDER, ompi_osc_rdma_acc_put_complete, - request, NULL); - if (OPAL_SUCCESS != status) { - status = ompi_osc_rdma_event_queue (module, endpoint, OMPI_OSC_RDMA_EVENT_TYPE_PUT, (void *) source, local_handle, - request->target_address, (mca_btl_base_registration_handle_t *) request->ctx, - request->len, ompi_osc_rdma_acc_put_complete, request, NULL); - } - - assert (OPAL_SUCCESS == status); -} - -static inline int ompi_osc_rdma_gacc_contig (ompi_osc_rdma_sync_t *sync, const void *source, int source_count, ompi_datatype_t *source_datatype, - void *result, int result_count, 
ompi_datatype_t *result_datatype, - ompi_osc_rdma_peer_t *peer, uint64_t target_address, +static inline int ompi_osc_rdma_gacc_contig (ompi_osc_rdma_sync_t *sync, const void *source, int source_count, + ompi_datatype_t *source_datatype, void *result, int result_count, + ompi_datatype_t *result_datatype, ompi_osc_rdma_peer_t *peer, uint64_t target_address, mca_btl_base_registration_handle_t *target_handle, int target_count, ompi_datatype_t *target_datatype, ompi_op_t *op, ompi_osc_rdma_request_t *request) { @@ -274,90 +191,69 @@ static inline int ompi_osc_rdma_gacc_contig (ompi_osc_rdma_sync_t *sync, const v const size_t btl_alignment_mask = ALIGNMENT_MASK(module->selected_btl->btl_get_alignment); unsigned long len = target_count * target_datatype->super.size; ompi_osc_rdma_frag_t *frag = NULL; - unsigned long aligned_len, offset; + volatile bool complete = false; char *ptr = NULL; int ret; OSC_RDMA_VERBOSE(MCA_BASE_VERBOSE_TRACE, "initiating accumulate on contiguous region of %lu bytes to remote address %" PRIx64 ", sync %p", len, target_address, (void *) sync); - offset = target_address & btl_alignment_mask;; - aligned_len = (len + offset + btl_alignment_mask) & ~btl_alignment_mask; - - ret = ompi_osc_rdma_frag_alloc (module, aligned_len, &frag, &ptr); - if (OPAL_UNLIKELY(OMPI_SUCCESS != ret)) { - OSC_RDMA_VERBOSE(MCA_BASE_VERBOSE_WARN, "could not allocate a temporary buffer for accumulate"); - return OMPI_ERR_OUT_OF_RESOURCE; - } - - OPAL_THREAD_LOCK(&module->lock); - /* to ensure order wait until the previous accumulate completes */ - while (!ompi_osc_rdma_peer_test_set_flag (peer, OMPI_OSC_RDMA_PEER_ACCUMULATING)) { - OPAL_THREAD_UNLOCK(&module->lock); - ompi_osc_rdma_progress (module); - OPAL_THREAD_LOCK(&module->lock); - } - - OPAL_THREAD_UNLOCK(&module->lock); - - if (!ompi_osc_rdma_peer_is_exclusive (peer)) { - (void) ompi_osc_rdma_lock_acquire_exclusive (module, peer, offsetof (ompi_osc_rdma_state_t, accumulate_lock)); - } - - /* set up the request */ - 
request->frag = frag; - request->origin_addr = (void *) source; - request->origin_dt = source_datatype; - request->origin_count = source_count; - request->ctx = (void *) target_handle; - request->result_addr = result; - request->result_count = result_count; - request->result_dt = result_datatype; - request->offset = (ptrdiff_t) target_address & btl_alignment_mask; - request->target_address = target_address; - request->len = len; - request->op = op; - request->sync = sync; - - ompi_osc_rdma_sync_rdma_inc (sync); - if (&ompi_mpi_op_replace.op != op || OMPI_OSC_RDMA_TYPE_GET_ACC == request->type) { - /* align the target address */ - target_address = target_address & ~btl_alignment_mask; + ptr = malloc (len); + if (OPAL_UNLIKELY(NULL == ptr)) { + OSC_RDMA_VERBOSE(MCA_BASE_VERBOSE_WARN, "could not allocate a temporary buffer for accumulate"); + return OMPI_ERR_OUT_OF_RESOURCE; + } - OSC_RDMA_VERBOSE(MCA_BASE_VERBOSE_TRACE, "initiating btl get. local: %p (handle %p), remote: 0x%" PRIx64 - " (handle %p)", (void*)ptr, (void *) frag->handle, target_address, (void *) target_handle); + /* set up the request */ + request->to_free = ptr; - ret = module->selected_btl->btl_get (module->selected_btl, peer->data_endpoint, ptr, - target_address, frag->handle, target_handle, aligned_len, - 0, MCA_BTL_NO_ORDER, ompi_osc_rdma_acc_get_complete, - request, NULL); - } else { - /* copy the put accumulate data */ - memcpy (ptr, source, len); + ret = ompi_osc_get_data_blocking (module, peer->data_endpoint, target_address, target_handle, ptr, len); + if (OPAL_UNLIKELY(OMPI_SUCCESS != ret)) { + return ret; + } - OSC_RDMA_VERBOSE(MCA_BASE_VERBOSE_TRACE, "initiating btl put. local: %p (handle %p), remote: 0x%" PRIx64 - " (handle %p)", (void*)ptr, (void *) frag->handle, target_address, (void *) target_handle); + if (OMPI_OSC_RDMA_TYPE_GET_ACC == request->type) { + if (NULL == result) { + /* result buffer is not necessarily contiguous. 
use the opal datatype engine to + * copy the data over in this case */ + struct iovec iov = {.iov_base = ptr, len}; + uint32_t iov_count = 1; + size_t size = request->len; - ret = module->selected_btl->btl_put (module->selected_btl, peer->data_endpoint, ptr, - target_address, frag->handle, target_handle, len, 0, - MCA_BTL_NO_ORDER, ompi_osc_rdma_acc_put_complete, - request, NULL); - } + opal_convertor_unpack (&request->convertor, &iov, &iov_count, &size); + opal_convertor_cleanup (&request->convertor); + } else { + /* copy contiguous data to the result buffer */ + ompi_datatype_sndrcv (ptr, len, MPI_BYTE, result, result_count, result_datatype); + } + } + + if (&ompi_mpi_op_replace.op == op) { + return ompi_osc_rdma_put_contig (sync, peer, target_address, target_handle, (void *) source, len, request); + } + + if (&ompi_mpi_op_no_op.op != op) { + /* NTH: need to cast away const for the source buffer. the buffer will not be modified by this call */ + ompi_op_reduce (op, (void *) source, ptr, source_count, source_datatype); + + return ompi_osc_rdma_put_contig (sync, peer, target_address, target_handle, ptr, len, request); + } + + if (request) { + /* nothing more to do for this request */ + ompi_osc_rdma_request_complete (request, MPI_SUCCESS); + } - if (OPAL_UNLIKELY(OMPI_SUCCESS == ret)) { return OMPI_SUCCESS; } - OSC_RDMA_VERBOSE(MCA_BASE_VERBOSE_INFO, "accumulate btl operation failed with opal error code %d", ret); + return ompi_osc_rdma_put_contig (sync, peer, target_address, target_handle, (void *) source, len, request); +} - if (!ompi_osc_rdma_peer_is_exclusive (peer)) { - (void) ompi_osc_rdma_lock_release_exclusive (module, peer, offsetof (ompi_osc_rdma_state_t, accumulate_lock)); - } - - ompi_osc_rdma_cleanup_rdma (sync, frag, NULL, NULL); - - return ret; +static void ompi_osc_rdma_gacc_master_cleanup (ompi_osc_rdma_request_t *request) +{ + ompi_osc_rdma_peer_accumulate_cleanup (request->module, request->peer, !ompi_osc_rdma_peer_is_exclusive (request->peer)); 
} static inline int ompi_osc_rdma_gacc_master (ompi_osc_rdma_sync_t *sync, const void *source_addr, int source_count, @@ -382,6 +278,14 @@ static inline int ompi_osc_rdma_gacc_master (ompi_osc_rdma_sync_t *sync, const v int ret, acc_len; bool done; + if (!request) { + OMPI_OSC_RDMA_REQUEST_ALLOC(module, peer, request); + request->internal = true; + } + + request->cleanup = ompi_osc_rdma_gacc_master_cleanup; + request->type = result_datatype ? OMPI_OSC_RDMA_TYPE_GET_ACC : OMPI_OSC_RDMA_TYPE_ACC; + (void) ompi_datatype_get_extent (target_datatype, &lb, &extent); target_address += lb; @@ -390,13 +294,6 @@ static inline int ompi_osc_rdma_gacc_master (ompi_osc_rdma_sync_t *sync, const v ompi_datatype_is_predefined (target_datatype) && (!result_count || ompi_datatype_is_predefined (result_datatype)) && (target_datatype->super.size * target_count <= acc_limit))) { - if (NULL == request) { - OMPI_OSC_RDMA_REQUEST_ALLOC(module, peer, request); - request->internal = true; - } - - request->type = result_datatype ? 
OMPI_OSC_RDMA_TYPE_GET_ACC : OMPI_OSC_RDMA_TYPE_ACC; - if (source_datatype) { (void) ompi_datatype_get_extent (source_datatype, &lb, &extent); source_addr = (void *)((intptr_t) source_addr + lb); @@ -472,14 +369,13 @@ static inline int ompi_osc_rdma_gacc_master (ompi_osc_rdma_sync_t *sync, const v return ret; } - if (request) { - /* keep the request from completing until all the transfers have started */ - request->outstanding_requests = 1; - } + /* keep the request from completing until all the transfers have started */ + request->outstanding_requests = 1; target_iov_index = 0; target_iov_count = 0; result_position = 0; + subreq = NULL; do { /* decode segments of the source data */ @@ -512,10 +408,10 @@ static inline int ompi_osc_rdma_gacc_master (ompi_osc_rdma_sync_t *sync, const v acc_len = min((size_t) acc_len, acc_limit); /* execute the get */ - OMPI_OSC_RDMA_REQUEST_ALLOC(module, peer, subreq); - subreq->internal = true; - subreq->parent_request = request; - if (request) { + if (!subreq) { + OMPI_OSC_RDMA_REQUEST_ALLOC(module, peer, subreq); + subreq->internal = true; + subreq->parent_request = request; (void) OPAL_THREAD_ADD_FETCH32 (&request->outstanding_requests, 1); } @@ -530,10 +426,13 @@ static inline int ompi_osc_rdma_gacc_master (ompi_osc_rdma_sync_t *sync, const v } ret = ompi_osc_rdma_gacc_contig (sync, source_iovec[source_iov_index].iov_base, acc_len / target_primitive->super.size, - target_primitive, NULL, 0, NULL, peer, (uint64_t) (intptr_t) target_iovec[target_iov_index].iov_base, - target_handle, acc_len / target_primitive->super.size, target_primitive, op, subreq); + target_primitive, NULL, 0, NULL, peer, + (uint64_t) (intptr_t) target_iovec[target_iov_index].iov_base, target_handle, + acc_len / target_primitive->super.size, target_primitive, op, subreq); if (OPAL_UNLIKELY(OMPI_SUCCESS != ret)) { if (OPAL_UNLIKELY(OMPI_ERR_OUT_OF_RESOURCE != ret)) { + OMPI_OSC_RDMA_REQUEST_RETURN(subreq); + (void) OPAL_THREAD_ADD_FETCH32 
(&request->outstanding_requests, -1); /* something bad happened. need to figure out how to handle these errors */ return ret; } @@ -543,6 +442,8 @@ static inline int ompi_osc_rdma_gacc_master (ompi_osc_rdma_sync_t *sync, const v continue; } + subreq = NULL; + /* adjust io vectors */ target_iovec[target_iov_index].iov_len -= acc_len; source_iovec[source_iov_index].iov_len -= acc_len; @@ -555,10 +456,8 @@ static inline int ompi_osc_rdma_gacc_master (ompi_osc_rdma_sync_t *sync, const v } } while (!done); - if (request) { - /* release our reference so the request can complete */ - (void) OPAL_THREAD_ADD_FETCH32 (&request->outstanding_requests, -1); - } + /* release our reference so the request can complete */ + ompi_osc_rdma_request_deref (request); if (source_datatype) { opal_convertor_cleanup (&source_convertor); @@ -573,35 +472,15 @@ static inline int ompi_osc_rdma_gacc_master (ompi_osc_rdma_sync_t *sync, const v return OMPI_SUCCESS; } -static void ompi_osc_rdma_cas_atomic_complete (struct mca_btl_base_module_t *btl, struct mca_btl_base_endpoint_t *endpoint, - void *local_address, mca_btl_base_registration_handle_t *local_handle, - void *context, void *data, int status) -{ - ompi_osc_rdma_sync_t *sync = (ompi_osc_rdma_sync_t *) context; - ompi_osc_rdma_frag_t *frag = (ompi_osc_rdma_frag_t *) data; - void *result_addr = (void *)(intptr_t) ((int64_t *) local_address)[1]; - size_t size = ((int64_t *) local_address)[2]; - - OSC_RDMA_VERBOSE(MCA_BASE_VERBOSE_TRACE, "atomic compare-and-swap complete. 
result: 0x%" PRIx64, - *((int64_t *) local_address)); - - /* copy the result */ - memcpy (result_addr, local_address, size); - - ompi_osc_rdma_sync_rdma_dec (sync); - ompi_osc_rdma_frag_complete (frag); -} - static inline int ompi_osc_rdma_cas_atomic (ompi_osc_rdma_sync_t *sync, const void *source_addr, const void *compare_addr, void *result_addr, ompi_datatype_t *datatype, ompi_osc_rdma_peer_t *peer, - uint64_t target_address, mca_btl_base_registration_handle_t *target_handle) + uint64_t target_address, mca_btl_base_registration_handle_t *target_handle, + bool lock_acquired) { ompi_osc_rdma_module_t *module = sync->module; const size_t size = datatype->super.size; - ompi_osc_rdma_frag_t *frag = NULL; int64_t compare, source; int ret, flags; - char *ptr; if (8 != size && !(4 == size && (MCA_BTL_ATOMIC_SUPPORTS_32BIT & module->selected_btl->btl_flags))) { return OMPI_ERR_NOT_SUPPORTED; @@ -614,65 +493,16 @@ static inline int ompi_osc_rdma_cas_atomic (ompi_osc_rdma_sync_t *sync, const vo OSC_RDMA_VERBOSE(MCA_BASE_VERBOSE_TRACE, "initiating compare-and-swap using %d-bit btl atomics. 
compare: 0x%" PRIx64 ", origin: 0x%" PRIx64, (int) size * 8, *((int64_t *) compare_addr), *((int64_t *) source_addr)); - ret = ompi_osc_rdma_frag_alloc (module, 24, &frag, &ptr); - if (OPAL_UNLIKELY(OMPI_SUCCESS != ret)) { - return ret; - } - - /* store the destination and size in the temporary buffer */ - ((int64_t *) ptr)[1] = (intptr_t) result_addr; - ((int64_t *) ptr)[2] = size; - - ompi_osc_rdma_sync_rdma_inc (sync); - - do { - ret = module->selected_btl->btl_atomic_cswap (module->selected_btl, peer->data_endpoint, ptr, target_address, - frag->handle, target_handle, compare, source, flags, MCA_BTL_NO_ORDER, - ompi_osc_rdma_cas_atomic_complete, sync, frag); - - ompi_osc_rdma_progress (module); - } while (OPAL_UNLIKELY(OMPI_ERR_OUT_OF_RESOURCE == ret || OPAL_ERR_TEMP_OUT_OF_RESOURCE == ret)); - - if (OPAL_SUCCESS != ret) { - ompi_osc_rdma_sync_rdma_dec (sync); - - if (1 == ret) { - memcpy (result_addr, ptr, size); - ret = OMPI_SUCCESS; - } - - ompi_osc_rdma_frag_complete (frag); + ret = ompi_osc_rdma_btl_cswap (module, peer->data_endpoint, target_address, target_handle, compare, source, flags, + result_addr); + if (OPAL_LIKELY(OMPI_SUCCESS == ret)) { + ompi_osc_rdma_peer_accumulate_cleanup (module, peer, lock_acquired); } return ret; } -static inline void ompi_osc_rdma_fetch_and_op_atomic_complete (struct mca_btl_base_module_t *btl, struct mca_btl_base_endpoint_t *endpoint, - void *local_address, mca_btl_base_registration_handle_t *local_handle, - void *context, void *data, int status) -{ - ompi_osc_rdma_sync_t *sync = (ompi_osc_rdma_sync_t *) context; - ompi_osc_rdma_frag_t *frag = (ompi_osc_rdma_frag_t *) data; - void *result_addr = (void *)(intptr_t) ((int64_t *) local_address)[1]; - ompi_osc_rdma_request_t *req = (ompi_osc_rdma_request_t *) (intptr_t) ((int64_t *) local_address)[2]; - size_t size = ((int64_t *) local_address)[3]; - - OSC_RDMA_VERBOSE(MCA_BASE_VERBOSE_TRACE, "atomic fetch-and-op complete. 
result: 0x%" PRIx64, - *((int64_t *) local_address)); - - /* copy the result */ - if (result_addr) { - memcpy (result_addr, local_address, size); - } - - ompi_osc_rdma_sync_rdma_dec (sync); - ompi_osc_rdma_frag_complete (frag); - if (req) { - ompi_osc_rdma_request_complete (req, status); - } -} - -static int ompi_osc_rdma_op_mapping[OMPI_OP_NUM_OF_TYPES] = { +static int ompi_osc_rdma_op_mapping[OMPI_OP_NUM_OF_TYPES + 1] = { [OMPI_OP_MAX] = MCA_BTL_ATOMIC_MAX, [OMPI_OP_MIN] = MCA_BTL_ATOMIC_MIN, [OMPI_OP_SUM] = MCA_BTL_ATOMIC_ADD, @@ -687,14 +517,13 @@ static int ompi_osc_rdma_op_mapping[OMPI_OP_NUM_OF_TYPES] = { static int ompi_osc_rdma_fetch_and_op_atomic (ompi_osc_rdma_sync_t *sync, const void *origin_addr, void *result_addr, ompi_datatype_t *dt, ptrdiff_t extent, ompi_osc_rdma_peer_t *peer, uint64_t target_address, - mca_btl_base_registration_handle_t *target_handle, ompi_op_t *op, ompi_osc_rdma_request_t *req) + mca_btl_base_registration_handle_t *target_handle, ompi_op_t *op, ompi_osc_rdma_request_t *req, + bool lock_acquired) { ompi_osc_rdma_module_t *module = sync->module; int32_t atomic_flags = module->selected_btl->btl_atomic_flags; - ompi_osc_rdma_frag_t *frag = NULL; int ret, btl_op, flags; - char *ptr = NULL; - int64_t origin; + int64_t origin, result; if ((8 != extent && !((MCA_BTL_ATOMIC_SUPPORTS_32BIT & atomic_flags) && 4 == extent)) || (!(OMPI_DATATYPE_FLAG_DATA_INT & dt->super.flags) && !(MCA_BTL_ATOMIC_SUPPORTS_FLOAT & atomic_flags)) || @@ -702,51 +531,30 @@ static int ompi_osc_rdma_fetch_and_op_atomic (ompi_osc_rdma_sync_t *sync, const return OMPI_ERR_NOT_SUPPORTED; } + btl_op = ompi_osc_rdma_op_mapping[op->op_type]; + if (0 == btl_op) { + return OMPI_ERR_NOT_SUPPORTED; + } + flags = (4 == extent) ? 
MCA_BTL_ATOMIC_FLAG_32BIT : 0; if (OMPI_DATATYPE_FLAG_DATA_FLOAT & dt->super.flags) { flags |= MCA_BTL_ATOMIC_FLAG_FLOAT; } - btl_op = ompi_osc_rdma_op_mapping[op->op_type]; - OSC_RDMA_VERBOSE(MCA_BASE_VERBOSE_TRACE, "initiating fetch-and-op using %d-bit btl atomics. origin: 0x%" PRIx64, (4 == extent) ? 32 : 64, *((int64_t *) origin_addr)); - ret = ompi_osc_rdma_frag_alloc (module, 32, &frag, &ptr); - if (OPAL_UNLIKELY(OMPI_SUCCESS != ret)) { - return ret; - } - origin = (8 == extent) ? ((int64_t *) origin_addr)[0] : ((int32_t *) origin_addr)[0]; - /* store the destination, request, and extent in the temporary buffer for the callback */ - ((int64_t *) ptr)[1] = (intptr_t) result_addr; - ((int64_t *) ptr)[2] = (intptr_t) req; - ((int64_t *) ptr)[3] = extent; + ret = ompi_osc_rdma_btl_fop (module, peer->data_endpoint, target_address, target_handle, btl_op, origin, flags, + result_addr, true, NULL, NULL, NULL); + if (OPAL_SUCCESS == ret) { + /* done. release the lock */ + ompi_osc_rdma_peer_accumulate_cleanup (module, peer, lock_acquired); - ompi_osc_rdma_sync_rdma_inc (sync); - - do { - ret = module->selected_btl->btl_atomic_fop (module->selected_btl, peer->data_endpoint, ptr, target_address, - frag->handle, target_handle, btl_op, origin, flags, - MCA_BTL_NO_ORDER, ompi_osc_rdma_fetch_and_op_atomic_complete, - sync, frag); - - ompi_osc_rdma_progress (module); - } while (OPAL_UNLIKELY(OMPI_ERR_OUT_OF_RESOURCE == ret || OPAL_ERR_TEMP_OUT_OF_RESOURCE == ret)); - - if (OPAL_SUCCESS != ret) { - ompi_osc_rdma_sync_rdma_dec (sync); - - if (OPAL_LIKELY(1 == ret)) { - memcpy (result_addr, ptr, extent); - if (req) { - ompi_osc_rdma_request_complete (req, OMPI_SUCCESS); - } - ret = OPAL_SUCCESS; + if (req) { + ompi_osc_rdma_request_complete (req, MPI_SUCCESS); } - - ompi_osc_rdma_frag_complete (frag); } return ret; @@ -754,12 +562,11 @@ static int ompi_osc_rdma_fetch_and_op_atomic (ompi_osc_rdma_sync_t *sync, const static int ompi_osc_rdma_fetch_and_op_cas (ompi_osc_rdma_sync_t 
*sync, const void *origin_addr, void *result_addr, ompi_datatype_t *dt, ptrdiff_t extent, ompi_osc_rdma_peer_t *peer, uint64_t target_address, - mca_btl_base_registration_handle_t *target_handle, ompi_op_t *op, ompi_osc_rdma_request_t *req) + mca_btl_base_registration_handle_t *target_handle, ompi_op_t *op, ompi_osc_rdma_request_t *req, + bool lock_acquired) { ompi_osc_rdma_module_t *module = sync->module; - ompi_osc_rdma_frag_t *frag = NULL; - uint64_t address, offset; - char *ptr = NULL; + uint64_t address, offset, new_value, old_value; int ret; if (extent > 8) { @@ -773,78 +580,49 @@ static int ompi_osc_rdma_fetch_and_op_cas (ompi_osc_rdma_sync_t *sync, const voi OSC_RDMA_VERBOSE(MCA_BASE_VERBOSE_TRACE, "initiating fetch-and-op using compare-and-swap. origin: 0x%" PRIx64, *((int64_t *) origin_addr)); - ret = ompi_osc_rdma_frag_alloc (module, 16, &frag, &ptr); + ret = ompi_osc_get_data_blocking (module, peer->data_endpoint, address, target_handle, &old_value, 8); if (OPAL_UNLIKELY(OMPI_SUCCESS != ret)) { return ret; } /* store the destination in the temporary buffer */ do { - volatile bool complete = false; + new_value = old_value; - ret = ompi_osc_get_data_blocking (module, peer->data_endpoint, address, target_handle, ptr, 8); - if (OMPI_SUCCESS != ret) { - ompi_osc_rdma_frag_complete (frag); - return ret; + if (&ompi_mpi_op_replace.op == op) { + memcpy ((void *)((intptr_t) &new_value) + offset, origin_addr, extent); + } else if (&ompi_mpi_op_no_op.op != op) { + ompi_op_reduce (op, (void *) origin_addr, (void *)((intptr_t) &new_value) + offset, 1, dt); } - ((int64_t *) ptr)[1] = ((int64_t *) ptr)[0]; - - if (&ompi_mpi_op_no_op.op == op) { - memcpy (ptr + offset, origin_addr, extent); - } else { - ompi_op_reduce (op, (void *) origin_addr, ptr + offset, 1, dt); - } - - do { - ret = module->selected_btl->btl_atomic_cswap (module->selected_btl, peer->data_endpoint, ptr, address, - frag->handle, target_handle, ((int64_t *) ptr)[1], - ((int64_t *) ptr)[0], 0, 
MCA_BTL_NO_ORDER, - ompi_osc_rdma_atomic_complete, (void *) &complete, NULL); - - ompi_osc_rdma_progress (module); - } while (OPAL_UNLIKELY(OPAL_ERR_OUT_OF_RESOURCE == ret || OPAL_ERR_TEMP_OUT_OF_RESOURCE == ret)); - - if (OPAL_UNLIKELY(OPAL_SUCCESS != ret)) { + ret = ompi_osc_rdma_btl_cswap (module, peer->data_endpoint, address, target_handle, + old_value, new_value, 0, &new_value); + if (OPAL_SUCCESS != ret || new_value == old_value) { break; } - while (!complete) { - ompi_osc_rdma_progress (module); - } - - if (((int64_t *) ptr)[1] == ((int64_t *) ptr)[0]) { - break; - } + old_value = new_value; } while (1); if (result_addr) { - memcpy (result_addr, ptr + 8 + offset, extent); + memcpy (result_addr, (void *)((intptr_t) &new_value) + offset, extent); } - ompi_osc_rdma_frag_complete (frag); + if (OPAL_SUCCESS == ret) { + /* done. release the lock */ + ompi_osc_rdma_peer_accumulate_cleanup (module, peer, lock_acquired); + + if (req) { + ompi_osc_rdma_request_complete (req, MPI_SUCCESS); + } + } return ret; } -static void ompi_osc_rdma_acc_single_atomic_complete (struct mca_btl_base_module_t *btl, struct mca_btl_base_endpoint_t *endpoint, - void *local_address, mca_btl_base_registration_handle_t *local_handle, - void *context, void *data, int status) -{ - ompi_osc_rdma_sync_t *sync = (ompi_osc_rdma_sync_t *) context; - ompi_osc_rdma_request_t *req = (ompi_osc_rdma_request_t *) data; - - OSC_RDMA_VERBOSE(MCA_BASE_VERBOSE_TRACE, "atomic accumulate complete"); - - ompi_osc_rdma_sync_rdma_dec (sync); - if (req) { - ompi_osc_rdma_request_complete (req, status); - } -} - static int ompi_osc_rdma_acc_single_atomic (ompi_osc_rdma_sync_t *sync, const void *origin_addr, ompi_datatype_t *dt, ptrdiff_t extent, ompi_osc_rdma_peer_t *peer, uint64_t target_address, mca_btl_base_registration_handle_t *target_handle, - ompi_op_t *op, ompi_osc_rdma_request_t *req) + ompi_op_t *op, ompi_osc_rdma_request_t *req, bool lock_acquired) { ompi_osc_rdma_module_t *module = sync->module; 
int32_t atomic_flags = module->selected_btl->btl_atomic_flags; @@ -853,7 +631,8 @@ static int ompi_osc_rdma_acc_single_atomic (ompi_osc_rdma_sync_t *sync, const vo if (!(module->selected_btl->btl_flags & MCA_BTL_FLAGS_ATOMIC_OPS)) { /* btl put atomics not supported or disabled. fall back on fetch-and-op */ - return ompi_osc_rdma_fetch_and_op_atomic (sync, origin_addr, NULL, dt, extent, peer, target_address, target_handle, op, req); + return ompi_osc_rdma_fetch_and_op_atomic (sync, origin_addr, NULL, dt, extent, peer, target_address, target_handle, + op, req, lock_acquired); } if ((8 != extent && !((MCA_BTL_ATOMIC_SUPPORTS_32BIT & atomic_flags) && 4 == extent)) || @@ -875,23 +654,15 @@ static int ompi_osc_rdma_acc_single_atomic (ompi_osc_rdma_sync_t *sync, const vo OSC_RDMA_VERBOSE(MCA_BASE_VERBOSE_TRACE, "initiating accumulate using 64-bit btl atomics. origin: 0x%" PRIx64, *((int64_t *) origin_addr)); - ompi_osc_rdma_sync_rdma_inc (sync); + /* if we locked the peer its best to wait for completion before returning */ + ret = ompi_osc_rdma_btl_op (module, peer->data_endpoint, target_address, target_handle, btl_op, origin, + flags, true, NULL, NULL, NULL); + if (OPAL_SUCCESS == ret) { + /* done. 
release the lock */ + ompi_osc_rdma_peer_accumulate_cleanup (module, peer, lock_acquired); - do { - ret = module->selected_btl->btl_atomic_op (module->selected_btl, peer->data_endpoint, target_address, - target_handle, btl_op, origin, flags, MCA_BTL_NO_ORDER, - ompi_osc_rdma_acc_single_atomic_complete, sync, req); - - ompi_osc_rdma_progress (module); - } while (OPAL_UNLIKELY(OMPI_ERR_OUT_OF_RESOURCE == ret || OPAL_ERR_TEMP_OUT_OF_RESOURCE == ret)); - - if (OPAL_SUCCESS != ret) { - ompi_osc_rdma_sync_rdma_dec (sync); - if (1 == ret) { - if (req) { - ompi_osc_rdma_request_complete (req, OMPI_SUCCESS); - } - ret = OMPI_SUCCESS; + if (req) { + ompi_osc_rdma_request_complete (req, MPI_SUCCESS); } } @@ -902,147 +673,100 @@ static int ompi_osc_rdma_acc_single_atomic (ompi_osc_rdma_sync_t *sync, const vo * ompi_osc_rdma_cas_get_complete: * Note: This function will not work as is in a heterogeneous environment. */ -static void ompi_osc_rdma_cas_get_complete (struct mca_btl_base_module_t *btl, struct mca_btl_base_endpoint_t *endpoint, +static void ompi_osc_rdma_cas_put_complete (struct mca_btl_base_module_t *btl, struct mca_btl_base_endpoint_t *endpoint, void *local_address, mca_btl_base_registration_handle_t *local_handle, void *context, void *data, int status) { - ompi_osc_rdma_request_t *request = (ompi_osc_rdma_request_t *) context; - ompi_osc_rdma_sync_t *sync = request->sync; - ompi_osc_rdma_module_t *module = sync->module; - intptr_t source = (intptr_t) local_address + request->offset; - ompi_osc_rdma_frag_t *frag = request->frag; - ompi_osc_rdma_peer_t *peer = request->peer; - int ret; + bool *complete = (bool *) context; - OSC_RDMA_VERBOSE(status ? MCA_BASE_VERBOSE_ERROR : MCA_BASE_VERBOSE_TRACE, "remote compare-and-swap get complete on sync %p. 
" - "status %d", (void *) sync, status); - - if (OPAL_UNLIKELY(OMPI_SUCCESS != status)) { - return; - } - - /* copy data to the user buffer (for gacc) */ - memcpy (request->result_addr, (void *) source, request->len); - - if (0 == memcmp ((void *) source, request->compare_addr, request->len)) { - /* the target and compare buffers match. write the source to the target */ - memcpy ((void *) source, request->origin_addr, request->len); - - ret = module->selected_btl->btl_put (module->selected_btl, peer->data_endpoint, local_address, - request->target_address, local_handle, - (mca_btl_base_registration_handle_t *) request->ctx, - request->len, 0, MCA_BTL_NO_ORDER, - ompi_osc_rdma_acc_put_complete, request, NULL); - if (OPAL_UNLIKELY(OPAL_SUCCESS != ret)) { - OSC_RDMA_VERBOSE(MCA_BASE_VERBOSE_ERROR, "could not start put to complete accumulate operation. opal return code " - "%d. queuing operation...", ret); - - ret = ompi_osc_rdma_event_queue (module, peer->data_endpoint, OMPI_OSC_RDMA_EVENT_TYPE_PUT, local_address, local_handle, - request->target_address, (mca_btl_base_registration_handle_t *) request->ctx, request->len, - ompi_osc_rdma_acc_put_complete, request, NULL); - } - - return; - } - - /* this is a no-op. nothing more to do except release the accumulate lock */ - ompi_osc_rdma_frag_complete (frag); - - if (!ompi_osc_rdma_peer_is_exclusive (peer)) { - (void) ompi_osc_rdma_lock_release_exclusive (module, request->peer, - offsetof (ompi_osc_rdma_state_t, accumulate_lock)); - } - - /* the request is now complete and the outstanding rdma operation is complete */ - ompi_osc_rdma_request_complete (request, status); - - ompi_osc_rdma_sync_rdma_dec (sync); - ompi_osc_rdma_peer_clear_flag (peer, OMPI_OSC_RDMA_PEER_ACCUMULATING); + *complete = true; } +/** + * @brief Support for compare-and-swap on arbitraty-sized datatypes + * + * This function is necessary to support compare-and-swap on types larger + * than 64-bits. 
As of MPI-3.1 this can include MPI_INTEGER16 and possibly + * MPI_LON_LONG_INT. The former is a 128-bit value and the later *may* + * be depending on the platform, compiler, etc. This function currently + * blocks until the operation is complete. + */ static inline int cas_rdma (ompi_osc_rdma_sync_t *sync, const void *source_addr, const void *compare_addr, void *result_addr, ompi_datatype_t *datatype, ompi_osc_rdma_peer_t *peer, uint64_t target_address, - mca_btl_base_registration_handle_t *target_handle) + mca_btl_base_registration_handle_t *target_handle, bool lock_acquired) { ompi_osc_rdma_module_t *module = sync->module; const size_t btl_alignment_mask = ALIGNMENT_MASK(module->selected_btl->btl_get_alignment); unsigned long offset, aligned_len, len = datatype->super.size; + mca_btl_base_registration_handle_t *local_handle = NULL; ompi_osc_rdma_frag_t *frag = NULL; ompi_osc_rdma_request_t *request; - char *ptr = NULL; + volatile bool complete = false; + /* drop the const. this code will not attempt to change the value */ + char *ptr = (char *) source_addr; int ret; OSC_RDMA_VERBOSE(MCA_BASE_VERBOSE_TRACE, "initiating compare-and-swap using RMDA on %lu bytes to remote address %" PRIx64 ", sync %p", len, target_address, (void *) sync); - OMPI_OSC_RDMA_REQUEST_ALLOC(module, peer, request); - - request->internal = true; - request->type = OMPI_OSC_RDMA_TYPE_CSWAP; - request->sync = sync; - - OPAL_THREAD_LOCK(&module->lock); - /* to ensure order wait until the previous accumulate completes */ - while (!ompi_osc_rdma_peer_test_set_flag (peer, OMPI_OSC_RDMA_PEER_ACCUMULATING)) { - OPAL_THREAD_UNLOCK(&module->lock); - ompi_osc_rdma_progress (module); - OPAL_THREAD_LOCK(&module->lock); - } - OPAL_THREAD_UNLOCK(&module->lock); - - offset = target_address & btl_alignment_mask;; - aligned_len = (len + offset + btl_alignment_mask) & ~btl_alignment_mask; - - do { - ret = ompi_osc_rdma_frag_alloc (module, aligned_len, &frag, &ptr); - if (OPAL_UNLIKELY(OMPI_SUCCESS == ret)) { - 
break; - } - - OSC_RDMA_VERBOSE(MCA_BASE_VERBOSE_WARN, "could not allocate an rdma fragment for compare-and-swap"); - ompi_osc_rdma_progress (module); - } while (1); - - if (!ompi_osc_rdma_peer_is_exclusive (peer)) { - (void) ompi_osc_rdma_lock_acquire_exclusive (module, peer, offsetof (ompi_osc_rdma_state_t, accumulate_lock)); + OSC_RDMA_VERBOSE(MCA_BASE_VERBOSE_TRACE, "RDMA compare-and-swap initiating blocking btl get..."); + ret = ompi_osc_get_data_blocking (module, peer->data_endpoint, target_address, target_handle, result_addr, len); + if (OPAL_UNLIKELY(OMPI_SUCCESS != ret)) { + return ret; } - /* set up the request */ - request->frag = frag; - request->origin_addr = (void *) source_addr; - request->ctx = (void *) target_handle; - request->result_addr = result_addr; - request->compare_addr = compare_addr; - request->result_dt = datatype; - request->offset = (ptrdiff_t) offset; - request->target_address = target_address; - request->len = len; + if (0 != memcmp (result_addr, compare_addr, len)) { + /* value does not match compare value, nothing more to do*/ + ompi_osc_rdma_peer_accumulate_cleanup (module, peer, lock_acquired); + return OMPI_SUCCESS; + } - OSC_RDMA_VERBOSE(MCA_BASE_VERBOSE_TRACE, "RDMA compare-and-swap initiating btl get"); - - do { - ret = module->selected_btl->btl_get (module->selected_btl, peer->data_endpoint, ptr, - target_address, frag->handle, target_handle, - aligned_len, 0, MCA_BTL_NO_ORDER, - ompi_osc_rdma_cas_get_complete, request, NULL); - if (OPAL_LIKELY(OPAL_SUCCESS == ret)) { - break; - } - - if (OPAL_UNLIKELY(OPAL_ERR_OUT_OF_RESOURCE != ret && OPAL_ERR_TEMP_OUT_OF_RESOURCE != ret)) { - if (!ompi_osc_rdma_peer_is_exclusive (peer)) { - (void) ompi_osc_rdma_lock_release_exclusive (module, peer, offsetof (ompi_osc_rdma_state_t, accumulate_lock)); + if (module->selected_btl->btl_register_mem && len > module->selected_btl->btl_put_local_registration_threshold) { + do { + ret = ompi_osc_rdma_frag_alloc (module, len, &frag, &ptr); + if 
(OPAL_UNLIKELY(OMPI_SUCCESS == ret)) { + break; } - ompi_osc_rdma_frag_complete (frag); - return ret; + + ompi_osc_rdma_progress (module); + } while (1); + + memcpy (ptr, source_addr, len); + local_handle = frag->handle; + } + + OSC_RDMA_VERBOSE(MCA_BASE_VERBOSE_TRACE, "RDMA compare-and-swap initiating blocking btl put..."); + + do { + ret = module->selected_btl->btl_put (module->selected_btl, peer->data_endpoint, ptr, target_address, + local_handle, target_handle, len, 0, MCA_BTL_NO_ORDER, + ompi_osc_rdma_cas_put_complete, (void *) &complete, NULL); + if (OPAL_SUCCESS == ret || (OPAL_ERR_OUT_OF_RESOURCE != ret && OPAL_ERR_TEMP_OUT_OF_RESOURCE != ret)) { + break; } + /* spin a bit on progress */ ompi_osc_rdma_progress (module); } while (1); - ompi_osc_rdma_sync_rdma_inc (sync); + if (OPAL_SUCCESS != ret) { + /* something went horribly wrong */ + return ret; + } - return OMPI_SUCCESS; + while (!complete) { + ompi_osc_rdma_progress (module); + } + + OSC_RDMA_VERBOSE(MCA_BASE_VERBOSE_TRACE, "RDMA compare-and-swap compare-and-swap complete"); + + if (frag) { + ompi_osc_rdma_frag_complete (frag); + } + + ompi_osc_rdma_peer_accumulate_cleanup (module, peer, lock_acquired); + + return ret; } @@ -1056,6 +780,7 @@ int ompi_osc_rdma_compare_and_swap (const void *origin_addr, const void *compare ompi_osc_rdma_sync_t *sync; uint64_t target_address; ptrdiff_t true_lb, true_extent; + bool lock_acquired = false; int ret; OSC_RDMA_VERBOSE(MCA_BASE_VERBOSE_TRACE, "cswap: 0x%lx, 0x%lx, 0x%lx, %s, %d, %d, %s", @@ -1077,24 +802,49 @@ int ompi_osc_rdma_compare_and_swap (const void *origin_addr, const void *compare return ret; } - if (win->w_acc_ops <= OMPI_WIN_ACCUMULATE_OPS_SAME_OP) { - /* the user has indicated that they will only use the same op (or same op and no op) - * for operations on overlapping memory ranges. that indicates it is safe to go ahead - * and use network atomic operations. 
*/ - ret = ompi_osc_rdma_cas_atomic (sync, origin_addr, compare_addr, result_addr, dt, - peer, target_address, target_handle); - if (OMPI_SUCCESS == ret) { - return OMPI_SUCCESS; - } + /* to ensure order wait until the previous accumulate completes */ + while (!ompi_osc_rdma_peer_test_set_flag (peer, OMPI_OSC_RDMA_PEER_ACCUMULATING)) { + ompi_osc_rdma_progress (module); + } + + /* get an exclusive lock on the peer */ + if (!ompi_osc_rdma_peer_is_exclusive (peer) && !(module->acc_single_intrinsic || win->w_acc_ops <= OMPI_WIN_ACCUMULATE_OPS_SAME_OP)) { + (void) ompi_osc_rdma_lock_acquire_exclusive (module, peer, offsetof (ompi_osc_rdma_state_t, accumulate_lock)); + lock_acquired = true; + } + + /* either we have and exclusive lock (via MPI_Win_lock() or the accumulate lock) or the + * user has indicated that they will only use the same op (or same op and no op) for + * operations on overlapping memory ranges. that indicates it is safe to go ahead and + * use network atomic operations. */ + ret = ompi_osc_rdma_cas_atomic (sync, origin_addr, compare_addr, result_addr, dt, + peer, target_address, target_handle, lock_acquired); + if (OMPI_SUCCESS == ret) { + return OMPI_SUCCESS; + } + + if (!(lock_acquired || ompi_osc_rdma_peer_is_exclusive (peer))) { + (void) ompi_osc_rdma_lock_acquire_exclusive (module, peer, offsetof (ompi_osc_rdma_state_t, accumulate_lock)); + lock_acquired = true; } if (ompi_osc_rdma_peer_local_base (peer)) { - return ompi_osc_rdma_cas_local (origin_addr, compare_addr, result_addr, dt, - peer, target_address, target_handle, module); + ret = ompi_osc_rdma_cas_local (origin_addr, compare_addr, result_addr, dt, + peer, target_address, target_handle, module, + lock_acquired); + } else { + ret = cas_rdma (sync, origin_addr, compare_addr, result_addr, dt, peer, target_address, + target_handle, lock_acquired); } - return cas_rdma (sync, origin_addr, compare_addr, result_addr, dt, peer, target_address, - target_handle); + if (OPAL_UNLIKELY(OMPI_SUCCESS != 
ret)) { + /* operation failed. the application will most likely abort but we still want to leave the window + * in working state if possible. on successful completion the above calls with clear the lock + * and accumulate state */ + ompi_osc_rdma_peer_accumulate_cleanup (module, peer, lock_acquired); + } + + return ret; } @@ -1110,6 +860,7 @@ int ompi_osc_rdma_rget_accumulate_internal (ompi_osc_rdma_sync_t *sync, const vo mca_btl_base_registration_handle_t *target_handle; uint64_t target_address; ptrdiff_t lb, origin_extent, target_span; + bool lock_acquired = false; int ret; /* short-circuit case. note that origin_count may be 0 if op is MPI_NO_OP */ @@ -1132,14 +883,28 @@ int ompi_osc_rdma_rget_accumulate_internal (ompi_osc_rdma_sync_t *sync, const vo (void) ompi_datatype_get_extent (origin_datatype, &lb, &origin_extent); - if (module->acc_single_intrinsic && origin_extent <= 8) { + /* to ensure order wait until the previous accumulate completes */ + while (!ompi_osc_rdma_peer_test_set_flag (peer, OMPI_OSC_RDMA_PEER_ACCUMULATING)) { + ompi_osc_rdma_progress (module); + } + + /* get an exclusive lock on the peer if needed */ + if (!ompi_osc_rdma_peer_is_exclusive (peer) && !module->acc_single_intrinsic) { + lock_acquired = true; + (void) ompi_osc_rdma_lock_acquire_exclusive (module, peer, offsetof (ompi_osc_rdma_state_t, accumulate_lock)); + } + + /* if the datatype is small enough (and the count is 1) then try to directly use the hardware to execute + * the atomic operation. 
this should be safe in all cases as either 1) the user has assured us they will + * never use atomics with count > 1, 2) we have the accumulate lock, or 3) we have an exclusive lock */ + if (origin_extent <= 8 && 1 == origin_count) { if (module->acc_use_amo && ompi_datatype_is_predefined (origin_datatype)) { if (NULL == result_addr) { ret = ompi_osc_rdma_acc_single_atomic (sync, origin_addr, origin_datatype, origin_extent, peer, target_address, - target_handle, op, request); + target_handle, op, request, lock_acquired); } else { ret = ompi_osc_rdma_fetch_and_op_atomic (sync, origin_addr, result_addr, origin_datatype, origin_extent, peer, target_address, - target_handle, op, request); + target_handle, op, request, lock_acquired); } if (OMPI_SUCCESS == ret) { @@ -1148,22 +913,36 @@ int ompi_osc_rdma_rget_accumulate_internal (ompi_osc_rdma_sync_t *sync, const vo } ret = ompi_osc_rdma_fetch_and_op_cas (sync, origin_addr, result_addr, origin_datatype, origin_extent, peer, target_address, - target_handle, op, request); + target_handle, op, request, lock_acquired); if (OMPI_SUCCESS == ret) { return OMPI_SUCCESS; } } - if (ompi_osc_rdma_peer_local_base (peer)) { - /* local/self optimization */ - return ompi_osc_rdma_gacc_local (origin_addr, origin_count, origin_datatype, result_addr, result_count, - result_datatype, peer, target_address, target_handle, target_count, - target_datatype, op, module, request); + /* could not use network atomics. acquire the lock if needed and continue. 
*/ + if (!lock_acquired && !ompi_osc_rdma_peer_is_exclusive (peer)) { + lock_acquired = true; + (void) ompi_osc_rdma_lock_acquire_exclusive (module, peer, offsetof (ompi_osc_rdma_state_t, accumulate_lock)); } - return ompi_osc_rdma_gacc_master (sync, origin_addr, origin_count, origin_datatype, result_addr, result_count, - result_datatype, peer, target_address, target_handle, target_count, - target_datatype, op, request); + if (ompi_osc_rdma_peer_local_base (peer)) { + /* local/self optimization */ + ret = ompi_osc_rdma_gacc_local (origin_addr, origin_count, origin_datatype, result_addr, result_count, + result_datatype, peer, target_address, target_handle, target_count, + target_datatype, op, module, request, lock_acquired); + } else { + /* do not need to pass the lock acquired flag to this function. the value of the flag can be obtained + * just by calling ompi_osc_rdma_peer_is_exclusive() in this case. */ + ret = ompi_osc_rdma_gacc_master (sync, origin_addr, origin_count, origin_datatype, result_addr, result_count, + result_datatype, peer, target_address, target_handle, target_count, + target_datatype, op, request); + } + + if (OPAL_UNLIKELY(OMPI_SUCCESS != ret)) { + ompi_osc_rdma_peer_accumulate_cleanup (module, peer, lock_acquired); + } + + return ret; } int ompi_osc_rdma_get_accumulate (const void *origin_addr, int origin_count, ompi_datatype_t *origin_datatype, diff --git a/ompi/mca/osc/rdma/osc_rdma_active_target.c b/ompi/mca/osc/rdma/osc_rdma_active_target.c index b045ebf3ec..b4fb3dec64 100644 --- a/ompi/mca/osc/rdma/osc_rdma_active_target.c +++ b/ompi/mca/osc/rdma/osc_rdma_active_target.c @@ -8,7 +8,7 @@ * University of Stuttgart. All rights reserved. * Copyright (c) 2004-2005 The Regents of the University of California. * All rights reserved. - * Copyright (c) 2007-2017 Los Alamos National Security, LLC. All rights + * Copyright (c) 2007-2018 Los Alamos National Security, LLC. All rights * reserved. * Copyright (c) 2010 IBM Corporation. 
All rights reserved. * Copyright (c) 2012-2013 Sandia National Laboratories. All rights reserved. @@ -55,6 +55,7 @@ static void ompi_osc_rdma_pending_op_construct (ompi_osc_rdma_pending_op_t *pend pending_op->op_buffer = NULL; pending_op->op_result = NULL; pending_op->op_complete = false; + pending_op->cbfunc = NULL; } static void ompi_osc_rdma_pending_op_destruct (ompi_osc_rdma_pending_op_t *pending_op) @@ -79,10 +80,16 @@ void ompi_osc_rdma_atomic_complete (mca_btl_base_module_t *btl, struct mca_btl_b { ompi_osc_rdma_pending_op_t *pending_op = (ompi_osc_rdma_pending_op_t *) context; + OSC_RDMA_VERBOSE(MCA_BASE_VERBOSE_INFO, "pending atomic %p complete with status %d", pending_op, status); + if (pending_op->op_result) { memmove (pending_op->op_result, pending_op->op_buffer, pending_op->op_size); } + if (NULL != pending_op->cbfunc) { + pending_op->cbfunc (pending_op->cbdata, pending_op->cbcontext, status); + } + if (NULL != pending_op->op_frag) { ompi_osc_rdma_frag_complete (pending_op->op_frag); pending_op->op_frag = NULL; @@ -194,7 +201,8 @@ static void ompi_osc_rdma_handle_post (ompi_osc_rdma_module_t *module, int rank, if (rank == peers[j]->rank) { OSC_RDMA_VERBOSE(MCA_BASE_VERBOSE_INFO, "got expected post from %d. 
still expecting posts from %d processes", rank, (int) (npeers - state->num_post_msgs - 1)); - ++state->num_post_msgs; + /* an atomic is not really necessary as this function is currently used but it doesn't hurt */ + ompi_osc_rdma_counter_add (&state->num_post_msgs, 1); return; } } @@ -206,13 +214,91 @@ static void ompi_osc_rdma_handle_post (ompi_osc_rdma_module_t *module, int rank, OPAL_THREAD_SCOPED_LOCK(&module->lock, opal_list_append (&module->pending_posts, &pending_post->super)); } +static void ompi_osc_rdma_check_posts (ompi_osc_rdma_module_t *module) +{ + ompi_osc_rdma_state_t *state = module->state; + ompi_osc_rdma_sync_t *sync = &module->all_sync; + int count = 0; + + if (OMPI_OSC_RDMA_SYNC_TYPE_PSCW == sync->type) { + count = sync->num_peers; + } + + for (int i = 0 ; i < OMPI_OSC_RDMA_POST_PEER_MAX ; ++i) { + /* no post at this index (yet) */ + if (0 == state->post_peers[i]) { + continue; + } + + ompi_osc_rdma_handle_post (module, state->post_peers[i] - 1, sync->peer_list.peers, count); + state->post_peers[i] = 0; + } +} + +static int ompi_osc_rdma_post_peer (ompi_osc_rdma_module_t *module, ompi_osc_rdma_peer_t *peer) +{ + uint64_t target = (uint64_t) (intptr_t) peer->state + offsetof (ompi_osc_rdma_state_t, post_index); + ompi_osc_rdma_lock_t post_index, result, _tmp_value; + int my_rank = ompi_comm_rank (module->comm); + int ret; + + if (peer->rank == my_rank) { + ompi_osc_rdma_handle_post (module, my_rank, NULL, 0); + return OMPI_SUCCESS; + } + + /* get a post index */ + if (!ompi_osc_rdma_peer_local_state (peer)) { + ret = ompi_osc_rdma_lock_btl_fop (module, peer, target, MCA_BTL_ATOMIC_ADD, 1, &post_index, true); + if (OPAL_UNLIKELY(OMPI_SUCCESS != ret)) { + return ret; + } + } else { + post_index = ompi_osc_rdma_counter_add ((osc_rdma_counter_t *) (intptr_t) target, 1) - 1; + } + + post_index &= OMPI_OSC_RDMA_POST_PEER_MAX - 1; + + target = (uint64_t) (intptr_t) peer->state + offsetof (ompi_osc_rdma_state_t, post_peers) + + sizeof 
(osc_rdma_counter_t) * post_index; + + do { + OSC_RDMA_VERBOSE(MCA_BASE_VERBOSE_TRACE, "attempting to post to index %d @ rank %d", (int)post_index, peer->rank); + + _tmp_value = 0; + + /* try to post. if the value isn't 0 then another rank is occupying this index */ + if (!ompi_osc_rdma_peer_local_state (peer)) { + ret = ompi_osc_rdma_lock_btl_cswap (module, peer, target, 0, 1 + (int64_t) my_rank, &result); + if (OPAL_UNLIKELY(OMPI_SUCCESS != ret)) { + return ret; + } + } else { + result = !ompi_osc_rdma_lock_compare_exchange ((osc_rdma_counter_t *) target, &_tmp_value, + 1 + (osc_rdma_counter_t) my_rank); + } + + if (OPAL_LIKELY(0 == result)) { + break; + } + + /* prevent circular wait by checking for post messages received */ + ompi_osc_rdma_check_posts (module); + + /* zzzzzzzzzzzzz */ + nanosleep (&(struct timespec) {.tv_sec = 0, .tv_nsec = 100}, NULL); + } while (1); + + return OMPI_SUCCESS; +} + int ompi_osc_rdma_post_atomic (ompi_group_t *group, int assert, ompi_win_t *win) { ompi_osc_rdma_module_t *module = GET_MODULE(win); ompi_osc_rdma_peer_t **peers; int my_rank = ompi_comm_rank (module->comm); ompi_osc_rdma_state_t *state = module->state; - int ret; + int ret = OMPI_SUCCESS; OSC_RDMA_VERBOSE(MCA_BASE_VERBOSE_TRACE, "post: %p, %d, %s", (void*) group, assert, win->w_name); @@ -253,67 +339,17 @@ int ompi_osc_rdma_post_atomic (ompi_group_t *group, int assert, ompi_win_t *win) /* send a hello counter to everyone in group */ for (int i = 0 ; i < ompi_group_size(module->pw_group) ; ++i) { - ompi_osc_rdma_peer_t *peer = peers[i]; - uint64_t target = (uint64_t) (intptr_t) peer->state + offsetof (ompi_osc_rdma_state_t, post_index); - ompi_osc_rdma_lock_t post_index; - - if (peer->rank == my_rank) { - ompi_osc_rdma_handle_post (module, my_rank, NULL, 0); - continue; + ret = ompi_osc_rdma_post_peer (module, peers[i]); + if (OPAL_UNLIKELY(OMPI_SUCCESS != ret)) { + break; } - - /* get a post index */ - if (!ompi_osc_rdma_peer_local_state (peer)) { - ret = 
ompi_osc_rdma_lock_btl_fop (module, peer, target, MCA_BTL_ATOMIC_ADD, 1, &post_index, true); - assert (OMPI_SUCCESS == ret); - } else { - post_index = ompi_osc_rdma_counter_add ((osc_rdma_counter_t *) (intptr_t) target, 1) - 1; - } - - post_index &= OMPI_OSC_RDMA_POST_PEER_MAX - 1; - - target = (uint64_t) (intptr_t) peer->state + offsetof (ompi_osc_rdma_state_t, post_peers) + - sizeof (osc_rdma_counter_t) * post_index; - - do { - ompi_osc_rdma_lock_t result; - - OSC_RDMA_VERBOSE(MCA_BASE_VERBOSE_TRACE, "attempting to post to index %d @ rank %d", (int)post_index, peer->rank); - - /* try to post. if the value isn't 0 then another rank is occupying this index */ - if (!ompi_osc_rdma_peer_local_state (peer)) { - ret = ompi_osc_rdma_lock_btl_cswap (module, peer, target, 0, 1 + (int64_t) my_rank, &result); - assert (OMPI_SUCCESS == ret); - } else { - ompi_osc_rdma_lock_t _tmp_value = 0; - - result = !ompi_osc_rdma_lock_compare_exchange ((osc_rdma_counter_t *) target, &_tmp_value, 1 + (osc_rdma_counter_t) my_rank); - } - - if (OPAL_LIKELY(0 == result)) { - break; - } - - /* prevent circular wait by checking for post messages received */ - for (int j = 0 ; j < OMPI_OSC_RDMA_POST_PEER_MAX ; ++j) { - /* no post at this index (yet) */ - if (0 == state->post_peers[j]) { - continue; - } - - ompi_osc_rdma_handle_post (module, state->post_peers[j] - 1, NULL, 0); - state->post_peers[j] = 0; - } - - usleep (100); - } while (1); } ompi_osc_rdma_release_peers (peers, ompi_group_size(module->pw_group)); OSC_RDMA_VERBOSE(MCA_BASE_VERBOSE_TRACE, "post complete"); - return OMPI_SUCCESS; + return ret; } int ompi_osc_rdma_start_atomic (ompi_group_t *group, int assert, ompi_win_t *win) @@ -379,8 +415,7 @@ int ompi_osc_rdma_start_atomic (ompi_group_t *group, int assert, ompi_win_t *win "from %d processes", peer->rank, (int) (group_size - state->num_post_msgs - 1)); opal_list_remove_item (&module->pending_posts, &pending_post->super); OBJ_RELEASE(pending_post); - /* only one thread can 
process post messages so there is no need of atomics here */ - ++state->num_post_msgs; + ompi_osc_rdma_counter_add (&state->num_post_msgs, 1); break; } } @@ -390,16 +425,7 @@ int ompi_osc_rdma_start_atomic (ompi_group_t *group, int assert, ompi_win_t *win while (state->num_post_msgs != group_size) { OSC_RDMA_VERBOSE(MCA_BASE_VERBOSE_TRACE, "waiting for post messages. have %d of %d", (int) state->num_post_msgs, group_size); - for (int i = 0 ; i < OMPI_OSC_RDMA_POST_PEER_MAX ; ++i) { - /* no post at this index (yet) */ - if (0 == state->post_peers[i]) { - continue; - } - - ompi_osc_rdma_handle_post (module, state->post_peers[i] - 1, sync->peer_list.peers, group_size); - state->post_peers[i] = 0; - } - + ompi_osc_rdma_check_posts (module); ompi_osc_rdma_progress (module); } } else { @@ -500,7 +526,6 @@ int ompi_osc_rdma_wait_atomic (ompi_win_t *win) } OPAL_THREAD_LOCK(&module->lock); - state->num_complete_msgs = 0; group = module->pw_group; module->pw_group = NULL; OPAL_THREAD_UNLOCK(&module->lock); @@ -551,6 +576,8 @@ int ompi_osc_rdma_test_atomic (ompi_win_t *win, int *flag) OBJ_RELEASE(group); + OSC_RDMA_VERBOSE(MCA_BASE_VERBOSE_TRACE, "test complete. returning flag: true"); + return OMPI_SUCCESS; } @@ -567,6 +594,8 @@ int ompi_osc_rdma_fence_atomic (int assert, ompi_win_t *win) return OMPI_ERR_RMA_SYNC; } + /* NTH: locking here isn't really needed per-se but it may make user synchronization errors more + * predicable. if the user is using RMA correctly then there will be no contention on this lock. */ OPAL_THREAD_LOCK(&module->lock); /* active sends are now active (we will close the epoch if NOSUCCEED is specified) */ @@ -578,22 +607,17 @@ int ompi_osc_rdma_fence_atomic (int assert, ompi_win_t *win) } /* technically it is possible to enter a lock epoch (which will close the fence epoch) if - * no communication has occurred. this flag will be set on the next put, get, accumulate, etc. */ + * no communication has occurred. 
this flag will be set to true on the next put, get, + * accumulate, etc if no other synchronization call is made. yay fence */ module->all_sync.epoch_active = false; - /* short-circuit the noprecede case */ - if (0 != (assert & MPI_MODE_NOPRECEDE)) { - OSC_RDMA_VERBOSE(MCA_BASE_VERBOSE_TRACE, "fence complete (short circuit)"); - /* no communication can occur until a peer has entered the same fence epoch. for now - * a barrier is used to ensure this is the case. */ - ret = module->comm->c_coll->coll_barrier(module->comm, module->comm->c_coll->coll_barrier_module); - OPAL_THREAD_UNLOCK(&module->lock); - return ret; - } + /* there really is no practical difference between NOPRECEDE and the normal case. in both cases there + * may be local stores that will not be visible as they should if we do not barrier. since that is the + * case there is no optimization for NOPRECEDE */ ompi_osc_rdma_sync_rdma_complete (&module->all_sync); - /* ensure all writes to my memory are complete */ + /* ensure all writes to my memory are complete (both local stores, and RMA operations) */ ret = module->comm->c_coll->coll_barrier(module->comm, module->comm->c_coll->coll_barrier_module); if (assert & MPI_MODE_NOSUCCEED) { diff --git a/ompi/mca/osc/rdma/osc_rdma_comm.c b/ompi/mca/osc/rdma/osc_rdma_comm.c index 3357d1049c..0d506374c9 100644 --- a/ompi/mca/osc/rdma/osc_rdma_comm.c +++ b/ompi/mca/osc/rdma/osc_rdma_comm.c @@ -1,6 +1,6 @@ /* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */ /* - * Copyright (c) 2014-2016 Los Alamos National Security, LLC. All rights + * Copyright (c) 2014-2018 Los Alamos National Security, LLC. All rights * reserved. * Copyright (c) 2016 Intel, Inc. All rights reserved. 
* Copyright (c) 2017 Research Organization for Information Science @@ -21,6 +21,27 @@ #include "ompi/mca/osc/base/osc_base_obj_convert.h" #include "opal/align.h" +/* helper functions */ +static inline void ompi_osc_rdma_cleanup_rdma (ompi_osc_rdma_sync_t *sync, bool dec_always, ompi_osc_rdma_frag_t *frag, + mca_btl_base_registration_handle_t *handle, ompi_osc_rdma_request_t *request) +{ + if (frag) { + ompi_osc_rdma_frag_complete (frag); + } else { + ompi_osc_rdma_deregister (sync->module, handle); + } + + if (request) { + (void) OPAL_THREAD_ADD_FETCH32 (&request->outstanding_requests, -1); + } + + if (dec_always) { + ompi_osc_rdma_sync_rdma_dec_always (sync); + } else { + ompi_osc_rdma_sync_rdma_dec (sync); + } +} + static int ompi_osc_rdma_get_contig (ompi_osc_rdma_sync_t *sync, ompi_osc_rdma_peer_t *peer, uint64_t source_address, mca_btl_base_registration_handle_t *source_handle, void *target_buffer, size_t size, ompi_osc_rdma_request_t *request); @@ -37,17 +58,30 @@ int ompi_osc_get_data_blocking (ompi_osc_rdma_module_t *module, struct mca_btl_b uint64_t source_address, mca_btl_base_registration_handle_t *source_handle, void *data, size_t len) { + const size_t btl_alignment_mask = ALIGNMENT_MASK(module->selected_btl->btl_get_alignment); mca_btl_base_registration_handle_t *local_handle = NULL; ompi_osc_rdma_frag_t *frag = NULL; volatile bool read_complete = false; + size_t aligned_len, offset; + uint64_t aligned_addr = (source_address + btl_alignment_mask) & ~btl_alignment_mask; char *ptr = data; int ret; - OSC_RDMA_VERBOSE(MCA_BASE_VERBOSE_TRACE, "reading state data from endpoint %p. source: 0x%" PRIx64 ", len: %lu", - (void *) endpoint, source_address, (unsigned long) len); + offset = source_address & btl_alignment_mask; + aligned_len = (len + offset + btl_alignment_mask) & ~btl_alignment_mask; + + OSC_RDMA_VERBOSE(MCA_BASE_VERBOSE_TRACE, "reading data from endpoint %p. 
source: 0x%" PRIx64 " (aligned: 0x%" PRIx64 + "), len: %lu (aligned: %lu)", (void *) endpoint, source_address, aligned_addr, (unsigned long) len, + (unsigned long) aligned_len); if (module->selected_btl->btl_register_mem && len >= module->selected_btl->btl_get_local_registration_threshold) { - ret = ompi_osc_rdma_frag_alloc (module, len, &frag, &ptr); + do { + ret = ompi_osc_rdma_frag_alloc (module, aligned_len, &frag, &ptr); + if (OPAL_UNLIKELY(OMPI_ERR_OUT_OF_RESOURCE == ret)) { + ompi_osc_rdma_progress (module); + } + } while (OMPI_ERR_OUT_OF_RESOURCE == ret); + if (OPAL_UNLIKELY(OMPI_SUCCESS != ret)) { OSC_RDMA_VERBOSE(MCA_BASE_VERBOSE_ERROR, "error allocating temporary buffer"); return ret; @@ -61,10 +95,10 @@ int ompi_osc_get_data_blocking (ompi_osc_rdma_module_t *module, struct mca_btl_b assert (!(source_address & ALIGNMENT_MASK(module->selected_btl->btl_get_alignment))); do { - ret = module->selected_btl->btl_get (module->selected_btl, endpoint, ptr, source_address, - local_handle, source_handle, len, 0, MCA_BTL_NO_ORDER, + ret = module->selected_btl->btl_get (module->selected_btl, endpoint, ptr, aligned_addr, + local_handle, source_handle, aligned_len, 0, MCA_BTL_NO_ORDER, ompi_osc_get_data_complete, (void *) &read_complete, NULL); - if (OPAL_LIKELY(OMPI_ERR_OUT_OF_RESOURCE != ret)) { + if (!ompi_osc_rdma_oor (ret)) { break; } @@ -91,7 +125,7 @@ int ompi_osc_get_data_blocking (ompi_osc_rdma_module_t *module, struct mca_btl_b opal_memchecker_base_mem_defined (ptr, len); if (frag) { - memcpy (data, ptr, len); + memcpy (data, ptr + offset, len); /* done with the fragment */ ompi_osc_rdma_frag_complete (frag); @@ -191,7 +225,7 @@ static int ompi_osc_rdma_master_noncontig (ompi_osc_rdma_sync_t *sync, void *loc remote_iov_count = OMPI_OSC_RDMA_DECODE_MAX; remote_iov_index = 0; - /* opal_convertor_raw returns done when it has reached the end of the data */ + /* opal_convertor_raw returns true when it has reached the end of the data */ done = opal_convertor_raw 
(&remote_convertor, remote_iovec, &remote_iov_count, &remote_size); /* loop on the target segments until we have exhaused the decoded source data */ @@ -232,7 +266,7 @@ static int ompi_osc_rdma_master_noncontig (ompi_osc_rdma_sync_t *sync, void *loc if (OPAL_UNLIKELY(OMPI_SUCCESS != ret)) { if (OPAL_UNLIKELY(OMPI_ERR_OUT_OF_RESOURCE != ret)) { if (request) { - (void) OPAL_THREAD_ADD_FETCH32 (&request->outstanding_requests, -1); + ompi_osc_rdma_request_deref (request); } if (alloc_reqs) { @@ -262,11 +296,7 @@ static int ompi_osc_rdma_master_noncontig (ompi_osc_rdma_sync_t *sync, void *loc if (request) { /* release our reference so the request can complete */ - if (1 == request->outstanding_requests) { - ompi_osc_rdma_request_complete (request, OMPI_SUCCESS); - } - - (void) OPAL_THREAD_ADD_FETCH32 (&request->outstanding_requests, -1); + ompi_osc_rdma_request_deref (request); } OSC_RDMA_VERBOSE(MCA_BASE_VERBOSE_TRACE, "finished scheduling rdma on non-contiguous datatype(s)"); @@ -353,14 +383,12 @@ static void ompi_osc_rdma_put_complete (struct mca_btl_base_module_t *btl, struc void *context, void *data, int status) { ompi_osc_rdma_sync_t *sync = (ompi_osc_rdma_sync_t *) context; - ompi_osc_rdma_frag_t *frag = (ompi_osc_rdma_frag_t *) data; - ompi_osc_rdma_request_t *request = NULL; assert (OPAL_SUCCESS == status); /* the lowest bit is used as a flag indicating this put operation has a request */ if ((intptr_t) context & 0x1) { - request = (ompi_osc_rdma_request_t *) ((intptr_t) context & ~1); + ompi_osc_rdma_request_t *request = request = (ompi_osc_rdma_request_t *) ((intptr_t) context & ~1); sync = request->sync; /* NTH -- TODO: better error handling */ @@ -370,15 +398,42 @@ static void ompi_osc_rdma_put_complete (struct mca_btl_base_module_t *btl, struc OSC_RDMA_VERBOSE(status ? MCA_BASE_VERBOSE_ERROR : MCA_BASE_VERBOSE_TRACE, "btl put complete on sync %p. local " "address %p. 
opal status %d", (void *) sync, local_address, status); - if (frag) { - ompi_osc_rdma_frag_complete (frag); - } else { + if (data) { + ompi_osc_rdma_frag_complete ((ompi_osc_rdma_frag_t *) data); + } else if (local_handle) { ompi_osc_rdma_deregister (sync->module, local_handle); } ompi_osc_rdma_sync_rdma_dec (sync); } +static void ompi_osc_rdma_put_complete_flush (struct mca_btl_base_module_t *btl, struct mca_btl_base_endpoint_t *endpoint, + void *local_address, mca_btl_base_registration_handle_t *local_handle, + void *context, void *data, int status) +{ + ompi_osc_rdma_module_t *module = (ompi_osc_rdma_module_t *) context; + + assert (OPAL_SUCCESS == status); + + /* the lowest bit is used as a flag indicating this put operation has a request */ + if ((intptr_t) context & 0x1) { + ompi_osc_rdma_request_t *request = request = (ompi_osc_rdma_request_t *) ((intptr_t) context & ~1); + module = request->module; + + /* NTH -- TODO: better error handling */ + ompi_osc_rdma_request_complete (request, status); + } + + OSC_RDMA_VERBOSE(status ? MCA_BASE_VERBOSE_ERROR : MCA_BASE_VERBOSE_TRACE, "btl put complete on module %p. local " + "address %p. 
opal status %d", (void *) module, local_address, status); + + if (data) { + ompi_osc_rdma_frag_complete ((ompi_osc_rdma_frag_t *) data); + } else if (local_handle) { + ompi_osc_rdma_deregister (module, local_handle); + } +} + static void ompi_osc_rdma_aggregate_put_complete (struct mca_btl_base_module_t *btl, struct mca_btl_base_endpoint_t *endpoint, void *local_address, mca_btl_base_registration_handle_t *local_handle, void *context, void *data, int status) @@ -424,14 +479,12 @@ static int ompi_osc_rdma_put_real (ompi_osc_rdma_sync_t *sync, ompi_osc_rdma_pee ++module->put_retry_count; - if (OPAL_ERR_OUT_OF_RESOURCE != ret && OPAL_ERR_TEMP_OUT_OF_RESOURCE != ret) { + if (!ompi_osc_rdma_oor (ret)) { break; } /* spin a bit on progress */ - for (int i = 0 ; i < 10 ; ++i) { - ompi_osc_rdma_progress (module); - } + ompi_osc_rdma_progress (module); } while (1); OSC_RDMA_VERBOSE(10, "btl put failed with opal error code %d", ret); @@ -498,18 +551,20 @@ static int ompi_osc_rdma_aggregate_alloc (ompi_osc_rdma_sync_t *sync, ompi_osc_r return OMPI_SUCCESS; } -static int ompi_osc_rdma_put_contig (ompi_osc_rdma_sync_t *sync, ompi_osc_rdma_peer_t *peer, uint64_t target_address, - mca_btl_base_registration_handle_t *target_handle, void *source_buffer, size_t size, - ompi_osc_rdma_request_t *request) +int ompi_osc_rdma_put_contig (ompi_osc_rdma_sync_t *sync, ompi_osc_rdma_peer_t *peer, uint64_t target_address, + mca_btl_base_registration_handle_t *target_handle, void *source_buffer, size_t size, + ompi_osc_rdma_request_t *request) { ompi_osc_rdma_module_t *module = sync->module; ompi_osc_rdma_aggregation_t *aggregation = peer->aggregate; mca_btl_base_registration_handle_t *local_handle = NULL; + mca_btl_base_rdma_completion_fn_t cbfunc = NULL; ompi_osc_rdma_frag_t *frag = NULL; char *ptr = source_buffer; void *cbcontext; int ret; +#if 0 if (aggregation) { if (size <= (aggregation->buffer_size - aggregation->buffer_used) && (target_handle == aggregation->target_handle) && 
(target_address == aggregation->target_address + aggregation->buffer_used)) { @@ -535,6 +590,7 @@ static int ompi_osc_rdma_put_contig (ompi_osc_rdma_sync_t *sync, ompi_osc_rdma_p return ret; } } +#endif if (module->selected_btl->btl_register_mem && size > module->selected_btl->btl_put_local_registration_threshold) { ret = ompi_osc_rdma_frag_alloc (module, size, &frag, &ptr); @@ -549,23 +605,36 @@ static int ompi_osc_rdma_put_contig (ompi_osc_rdma_sync_t *sync, ompi_osc_rdma_p } } + if (ompi_osc_rdma_use_btl_flush (module)) { + /* NTH: when using the btl_flush function there is no guarantee that the callback will happen + * before the flush is complete. because of this there is a chance that the sync object will be + * released before there is a callback. to handle this case we call a different callback that doesn't + * use the sync object. it's possible the btl semantics will change in the future and the callback + * will happen *before* flush is considered complete. if that is the case this workaround can be + * removed */ + cbcontext = (void *) module; + if (request || local_handle || frag) { + cbfunc = ompi_osc_rdma_put_complete_flush; + } + /* else the callback function is a no-op so do not bother specifying one */ + } else { + cbcontext = (void *) sync; + cbfunc = ompi_osc_rdma_put_complete; + } + /* increment the outstanding request counter in the request object */ if (request) { (void) OPAL_THREAD_ADD_FETCH32 (&request->outstanding_requests, 1); cbcontext = (void *) ((intptr_t) request | 1); request->sync = sync; - } else { - cbcontext = (void *) sync; } - ret = ompi_osc_rdma_put_real (sync, peer, target_address, target_handle, ptr, local_handle, size, ompi_osc_rdma_put_complete, + ret = ompi_osc_rdma_put_real (sync, peer, target_address, target_handle, ptr, local_handle, size, cbfunc, + cbcontext, frag); - if (OPAL_UNLIKELY(OMPI_SUCCESS == ret)) { - return OMPI_SUCCESS; + if (OPAL_UNLIKELY(OMPI_SUCCESS != ret)) { + ompi_osc_rdma_cleanup_rdma (sync, false, frag, 
local_handle, request); } - ompi_osc_rdma_cleanup_rdma (sync, frag, local_handle, request); - return ret; } @@ -584,20 +653,26 @@ static void ompi_osc_rdma_get_complete (struct mca_btl_base_module_t *btl, struc assert (OPAL_SUCCESS == status); - if (request->buffer || NULL != frag) { + if (request->buffer || frag) { if (OPAL_LIKELY(OMPI_SUCCESS == status)) { memcpy (origin_addr, (void *) source, request->len); } } + if (NULL == request->buffer) { + /* completion detection can handle this case without the counter when using btl_flush */ + ompi_osc_rdma_sync_rdma_dec (sync); + } else { + /* the counter was needed to keep track of the number of outstanding operations */ + ompi_osc_rdma_sync_rdma_dec_always (sync); + } + if (NULL != frag) { ompi_osc_rdma_frag_complete (frag); } else { ompi_osc_rdma_deregister (sync->module, local_handle); } - ompi_osc_rdma_sync_rdma_dec (sync); - ompi_osc_rdma_request_complete (request, status); } @@ -624,7 +699,7 @@ int ompi_osc_rdma_peer_aggregate_flush (ompi_osc_rdma_peer_t *peer) return OMPI_SUCCESS; } - ompi_osc_rdma_cleanup_rdma (aggregation->sync, aggregation->frag, NULL, NULL); + ompi_osc_rdma_cleanup_rdma (aggregation->sync, false, aggregation->frag, NULL, NULL); ompi_osc_rdma_aggregation_return (aggregation); @@ -648,7 +723,7 @@ static int ompi_osc_rdma_get_partial (ompi_osc_rdma_sync_t *sync, ompi_osc_rdma_ ret = ompi_osc_rdma_get_contig (sync, peer, source_address, source_handle, target_buffer, size, subreq); if (OPAL_UNLIKELY(OMPI_SUCCESS != ret)) { OMPI_OSC_RDMA_REQUEST_RETURN(subreq); - (void) OPAL_THREAD_ADD_FETCH32 (&request->outstanding_requests, -1); + ompi_osc_rdma_request_deref (request); } return ret; @@ -665,6 +740,7 @@ static int ompi_osc_rdma_get_contig (ompi_osc_rdma_sync_t *sync, ompi_osc_rdma_p osc_rdma_size_t aligned_len; osc_rdma_base_t aligned_source_base, aligned_source_bound; char *ptr = target_buffer; + bool counter_needs_inc = false; int ret; aligned_source_base = source_address & ~btl_alignment_mask; 
@@ -746,19 +822,31 @@ static int ompi_osc_rdma_get_contig (ompi_osc_rdma_sync_t *sync, ompi_osc_rdma_p request->origin_addr = target_buffer; request->sync = sync; - ompi_osc_rdma_sync_rdma_inc (sync); + if (request->buffer) { + /* always increment the outstanding RDMA counter as the btl_flush function does not guarantee callback completion, + * just operation completion. */ + counter_needs_inc = true; + ompi_osc_rdma_sync_rdma_inc_always (sync); + } else { + /* if this operation is being buffered with a frag then ompi_osc_rdma_sync_rdma_complete() can use the number + * of pending operations on the rdma_frag as an indicator as to whether the operation is complete. this can + * only be done since there is only one rdma frag per module. if that changes this logic will need to be changed + * as well. this path also covers the case where the get operation is not buffered. */ + ompi_osc_rdma_sync_rdma_inc (sync); + } do { - ret = module->selected_btl->btl_get (module->selected_btl, peer->data_endpoint, ptr, aligned_source_base, local_handle, - source_handle, aligned_len, 0, MCA_BTL_NO_ORDER, ompi_osc_rdma_get_complete, + ret = module->selected_btl->btl_get (module->selected_btl, peer->data_endpoint, ptr, + aligned_source_base, local_handle, source_handle, + aligned_len, 0, MCA_BTL_NO_ORDER, ompi_osc_rdma_get_complete, request, frag); - if (OPAL_UNLIKELY(OMPI_SUCCESS == ret)) { + if (OPAL_LIKELY(OMPI_SUCCESS == ret)) { return OMPI_SUCCESS; } ++module->get_retry_count; - if (OPAL_ERR_OUT_OF_RESOURCE != ret && OPAL_ERR_TEMP_OUT_OF_RESOURCE != ret) { + if (!ompi_osc_rdma_oor (ret)) { break; } @@ -770,7 +858,7 @@ static int ompi_osc_rdma_get_contig (ompi_osc_rdma_sync_t *sync, ompi_osc_rdma_p OSC_RDMA_VERBOSE(MCA_BASE_VERBOSE_ERROR, "btl get failed with opal error code %d", ret); - ompi_osc_rdma_cleanup_rdma (sync, frag, local_handle, request); + ompi_osc_rdma_cleanup_rdma (sync, counter_needs_inc, frag, local_handle, request); return ret; } diff --git 
a/ompi/mca/osc/rdma/osc_rdma_comm.h b/ompi/mca/osc/rdma/osc_rdma_comm.h index e6d6950575..0f3d9f19c5 100644 --- a/ompi/mca/osc/rdma/osc_rdma_comm.h +++ b/ompi/mca/osc/rdma/osc_rdma_comm.h @@ -24,23 +24,6 @@ #define min(a,b) ((a) < (b) ? (a) : (b)) #define ALIGNMENT_MASK(x) ((x) ? (x) - 1 : 0) -/* helper functions */ -static inline void ompi_osc_rdma_cleanup_rdma (ompi_osc_rdma_sync_t *sync, ompi_osc_rdma_frag_t *frag, - mca_btl_base_registration_handle_t *handle, ompi_osc_rdma_request_t *request) -{ - if (frag) { - ompi_osc_rdma_frag_complete (frag); - } else { - ompi_osc_rdma_deregister (sync->module, handle); - } - - if (request) { - (void) OPAL_THREAD_ADD_FETCH32 (&request->outstanding_requests, -1); - } - - ompi_osc_rdma_sync_rdma_dec (sync); -} - /** * @brief find a remote segment associate with the memory region * @@ -134,4 +117,8 @@ int ompi_osc_get_data_blocking (ompi_osc_rdma_module_t *module, struct mca_btl_b uint64_t source_address, mca_btl_base_registration_handle_t *source_handle, void *data, size_t len); +int ompi_osc_rdma_put_contig (ompi_osc_rdma_sync_t *sync, ompi_osc_rdma_peer_t *peer, uint64_t target_address, + mca_btl_base_registration_handle_t *target_handle, void *source_buffer, size_t size, + ompi_osc_rdma_request_t *request); + #endif /* OMPI_OSC_RDMA_COMM_H */ diff --git a/ompi/mca/osc/rdma/osc_rdma_component.c b/ompi/mca/osc/rdma/osc_rdma_component.c index a7698522f4..9dc09efe4b 100644 --- a/ompi/mca/osc/rdma/osc_rdma_component.c +++ b/ompi/mca/osc/rdma/osc_rdma_component.c @@ -9,7 +9,7 @@ * University of Stuttgart. All rights reserved. * Copyright (c) 2004-2005 The Regents of the University of California. * All rights reserved. - * Copyright (c) 2007-2017 Los Alamos National Security, LLC. All rights + * Copyright (c) 2007-2018 Los Alamos National Security, LLC. All rights * reserved. * Copyright (c) 2006-2008 University of Houston. All rights reserved. * Copyright (c) 2010 Oracle and/or its affiliates. All rights reserved. 
@@ -78,6 +78,12 @@ static char* ompi_osc_rdma_set_no_lock_info(opal_infosubscriber_t *obj, char *ke static char *ompi_osc_rdma_btl_names; static char *ompi_osc_rdma_mtl_names; +static const mca_base_var_enum_value_t ompi_osc_rdma_locking_modes[] = { + {.value = OMPI_OSC_RDMA_LOCKING_TWO_LEVEL, .string = "two_level"}, + {.value = OMPI_OSC_RDMA_LOCKING_ON_DEMAND, .string = "on_demand"}, + {.string = NULL}, +}; + ompi_osc_rdma_component_t mca_osc_rdma_component = { .super = { .osc_version = { @@ -171,88 +177,97 @@ static int ompi_osc_rdma_pvar_read (const struct mca_base_pvar_t *pvar, void *va static int ompi_osc_rdma_component_register (void) { char *description_str; + mca_base_var_enum_t *new_enum; + mca_osc_rdma_component.no_locks = false; asprintf(&description_str, "Enable optimizations available only if MPI_LOCK is " - "not used. Info key of same name overrides this value (default: %s)", - mca_osc_rdma_component.no_locks ? "true" : "false"); - (void) mca_base_component_var_register(&mca_osc_rdma_component.super.osc_version, - "no_locks", description_str, + "not used. Info key of same name overrides this value (default: %s)", + mca_osc_rdma_component.no_locks ? "true" : "false"); + (void) mca_base_component_var_register(&mca_osc_rdma_component.super.osc_version, "no_locks", description_str, MCA_BASE_VAR_TYPE_BOOL, NULL, 0, 0, OPAL_INFO_LVL_5, MCA_BASE_VAR_SCOPE_GROUP, &mca_osc_rdma_component.no_locks); free(description_str); mca_osc_rdma_component.acc_single_intrinsic = false; asprintf(&description_str, "Enable optimizations for MPI_Fetch_and_op, MPI_Accumulate, etc for codes " - "that will not use anything more than a single predefined datatype (default: %s)", - mca_osc_rdma_component.acc_single_intrinsic ? "true" : "false"); + "that will not use anything more than a single predefined datatype (default: %s)", + mca_osc_rdma_component.acc_single_intrinsic ? 
"true" : "false"); (void) mca_base_component_var_register(&mca_osc_rdma_component.super.osc_version, "acc_single_intrinsic", - description_str, - MCA_BASE_VAR_TYPE_BOOL, NULL, 0, 0, OPAL_INFO_LVL_5, + description_str, MCA_BASE_VAR_TYPE_BOOL, NULL, 0, 0, OPAL_INFO_LVL_5, MCA_BASE_VAR_SCOPE_GROUP, &mca_osc_rdma_component.acc_single_intrinsic); free(description_str); mca_osc_rdma_component.acc_use_amo = true; asprintf(&description_str, "Enable the use of network atomic memory operations when using single " - "intrinsic optimizations. If not set network compare-and-swap will be " - "used instread (default: %s)", mca_osc_rdma_component.acc_use_amo ? "true" : "false"); - (void) mca_base_component_var_register(&mca_osc_rdma_component.super.osc_version, "acc_use_amo", - description_str, MCA_BASE_VAR_TYPE_BOOL, NULL, 0, 0, OPAL_INFO_LVL_5, - MCA_BASE_VAR_SCOPE_GROUP, &mca_osc_rdma_component.acc_use_amo); + "intrinsic optimizations. If not set network compare-and-swap will be " + "used instread (default: %s)", mca_osc_rdma_component.acc_use_amo ? 
"true" : "false"); + (void) mca_base_component_var_register(&mca_osc_rdma_component.super.osc_version, "acc_use_amo", description_str, + MCA_BASE_VAR_TYPE_BOOL, NULL, 0, 0, OPAL_INFO_LVL_5, MCA_BASE_VAR_SCOPE_GROUP, + &mca_osc_rdma_component.acc_use_amo); free(description_str); mca_osc_rdma_component.buffer_size = 32768; asprintf(&description_str, "Size of temporary buffers (default: %d)", mca_osc_rdma_component.buffer_size); - (void) mca_base_component_var_register (&mca_osc_rdma_component.super.osc_version, "buffer_size", - description_str, MCA_BASE_VAR_TYPE_UNSIGNED_INT, - NULL, 0, 0, OPAL_INFO_LVL_3, MCA_BASE_VAR_SCOPE_LOCAL, - &mca_osc_rdma_component.buffer_size); + (void) mca_base_component_var_register (&mca_osc_rdma_component.super.osc_version, "buffer_size", description_str, + MCA_BASE_VAR_TYPE_UNSIGNED_INT, NULL, 0, 0, OPAL_INFO_LVL_3, + MCA_BASE_VAR_SCOPE_LOCAL, &mca_osc_rdma_component.buffer_size); free(description_str); mca_osc_rdma_component.max_attach = 32; asprintf(&description_str, "Maximum number of buffers that can be attached to a dynamic window. " - "Keep in mind that each attached buffer will use a potentially limited " - "resource (default: %d)", mca_osc_rdma_component.max_attach); - (void) mca_base_component_var_register (&mca_osc_rdma_component.super.osc_version, "max_attach", - description_str , MCA_BASE_VAR_TYPE_UNSIGNED_INT, NULL, 0, 0, - OPAL_INFO_LVL_3, MCA_BASE_VAR_SCOPE_GROUP, &mca_osc_rdma_component.max_attach); + "Keep in mind that each attached buffer will use a potentially limited " + "resource (default: %d)", mca_osc_rdma_component.max_attach); + (void) mca_base_component_var_register (&mca_osc_rdma_component.super.osc_version, "max_attach", description_str, + MCA_BASE_VAR_TYPE_UNSIGNED_INT, NULL, 0, 0, OPAL_INFO_LVL_3, + MCA_BASE_VAR_SCOPE_GROUP, &mca_osc_rdma_component.max_attach); free(description_str); mca_osc_rdma_component.aggregation_limit = 1024; asprintf(&description_str, "Maximum size of an aggregated put/get. 
Messages are aggregated for consecutive" - "put and get operations. In some cases this may lead to higher latency but " - "should also lead to higher bandwidth utilization. Set to 0 to disable (default: %d)", - mca_osc_rdma_component.aggregation_limit); + "put and get operations. In some cases this may lead to higher latency but " + "should also lead to higher bandwidth utilization. Set to 0 to disable (default: %d)", + mca_osc_rdma_component.aggregation_limit); (void) mca_base_component_var_register (&mca_osc_rdma_component.super.osc_version, "aggregation_limit", description_str, MCA_BASE_VAR_TYPE_UNSIGNED_INT, NULL, 0, 0, OPAL_INFO_LVL_3, MCA_BASE_VAR_SCOPE_GROUP, &mca_osc_rdma_component.aggregation_limit); free(description_str); - mca_osc_rdma_component.priority = 90; + mca_osc_rdma_component.priority = 101; asprintf(&description_str, "Priority of the osc/rdma component (default: %d)", - mca_osc_rdma_component.priority); - (void) mca_base_component_var_register (&mca_osc_rdma_component.super.osc_version, "priority", - description_str, MCA_BASE_VAR_TYPE_UNSIGNED_INT, NULL, 0, 0, OPAL_INFO_LVL_3, + mca_osc_rdma_component.priority); + (void) mca_base_component_var_register (&mca_osc_rdma_component.super.osc_version, "priority", description_str, + MCA_BASE_VAR_TYPE_UNSIGNED_INT, NULL, 0, 0, OPAL_INFO_LVL_3, MCA_BASE_VAR_SCOPE_GROUP, &mca_osc_rdma_component.priority); free(description_str); - ompi_osc_rdma_btl_names = "openib,ugni"; + (void) mca_base_var_enum_create ("osc_rdma_locking_mode", ompi_osc_rdma_locking_modes, &new_enum); + + mca_osc_rdma_component.locking_mode = OMPI_OSC_RDMA_LOCKING_TWO_LEVEL; + (void) mca_base_component_var_register (&mca_osc_rdma_component.super.osc_version, "locking_mode", + "Locking mode to use for passive-target synchronization (default: two_level)", + MCA_BASE_VAR_TYPE_INT, new_enum, 0, 0, OPAL_INFO_LVL_3, + MCA_BASE_VAR_SCOPE_GROUP, &mca_osc_rdma_component.locking_mode); + OBJ_RELEASE(new_enum); + + ompi_osc_rdma_btl_names = 
"openib,ugni,uct,ucp"; asprintf(&description_str, "Comma-delimited list of BTL component names to allow without verifying " - "connectivity. Do not add a BTL to to this list unless it can reach all " - "processes in any communicator used with an MPI window (default: %s)", - ompi_osc_rdma_btl_names); - (void) mca_base_component_var_register (&mca_osc_rdma_component.super.osc_version, "btls", - description_str, MCA_BASE_VAR_TYPE_STRING, NULL, 0, 0, OPAL_INFO_LVL_3, + "connectivity. Do not add a BTL to to this list unless it can reach all " + "processes in any communicator used with an MPI window (default: %s)", + ompi_osc_rdma_btl_names); + (void) mca_base_component_var_register (&mca_osc_rdma_component.super.osc_version, "btls", description_str, + MCA_BASE_VAR_TYPE_STRING, NULL, 0, 0, OPAL_INFO_LVL_3, MCA_BASE_VAR_SCOPE_GROUP, &ompi_osc_rdma_btl_names); free(description_str); ompi_osc_rdma_mtl_names = "psm2"; asprintf(&description_str, "Comma-delimited list of MTL component names to lower the priority of rdma " - "osc component favoring pt2pt osc (default: %s)", ompi_osc_rdma_mtl_names); - (void) mca_base_component_var_register (&mca_osc_rdma_component.super.osc_version, "mtls", - description_str, MCA_BASE_VAR_TYPE_STRING, NULL, 0, 0, OPAL_INFO_LVL_3, + "osc component favoring pt2pt osc (default: %s)", ompi_osc_rdma_mtl_names); + (void) mca_base_component_var_register (&mca_osc_rdma_component.super.osc_version, "mtls", description_str, + MCA_BASE_VAR_TYPE_STRING, NULL, 0, 0, OPAL_INFO_LVL_3, MCA_BASE_VAR_SCOPE_GROUP, &ompi_osc_rdma_mtl_names); free(description_str); + /* register performance variables */ (void) mca_base_component_pvar_register (&mca_osc_rdma_component.super.osc_version, "put_retry_count", @@ -481,6 +496,7 @@ static int allocate_state_single (ompi_osc_rdma_module_t *module, void **base, s return ret; } + module->my_peer = my_peer; module->free_after = module->rank_array; my_peer->flags |= OMPI_OSC_RDMA_PEER_LOCAL_BASE; my_peer->state = (uint64_t) 
(uintptr_t) module->state; @@ -503,8 +519,13 @@ static int allocate_state_single (ompi_osc_rdma_module_t *module, void **base, s ex_peer->size = size; } - if (MPI_WIN_FLAVOR_ALLOCATE == module->flavor) { - ex_peer->super.base_handle = module->state_handle; + if (!module->use_cpu_atomics) { + if (MPI_WIN_FLAVOR_ALLOCATE == module->flavor) { + /* base is local and cpu atomics are available */ + ex_peer->super.base_handle = module->state_handle; + } else { + ex_peer->super.base_handle = module->base_handle; + } } } @@ -701,6 +722,10 @@ static int allocate_state_shared (ompi_osc_rdma_module_t *module, void **base, s peer->state_endpoint = ompi_osc_rdma_peer_btl_endpoint (module, temp[0].rank); } + if (my_rank == peer_rank) { + module->my_peer = peer; + } + if (MPI_WIN_FLAVOR_DYNAMIC == module->flavor || MPI_WIN_FLAVOR_CREATE == module->flavor) { /* use the peer's BTL endpoint directly */ peer->data_endpoint = ompi_osc_rdma_peer_btl_endpoint (module, peer_rank); @@ -745,20 +770,16 @@ static int allocate_state_shared (ompi_osc_rdma_module_t *module, void **base, s static int ompi_osc_rdma_query_mtls (void) { char **mtls_to_use; - bool mtl_match = false; mtls_to_use = opal_argv_split (ompi_osc_rdma_mtl_names, ','); if (mtls_to_use && ompi_mtl_base_selected_component) { - for (int i = 0 ; mtls_to_use[i] ; ++i) { - if (0 == strcmp (mtls_to_use[i], ompi_mtl_base_selected_component->mtl_version.mca_component_name)) { - mtl_match = true; - break; - } - } + for (int i = 0 ; mtls_to_use[i] ; ++i) { + if (0 == strcmp (mtls_to_use[i], ompi_mtl_base_selected_component->mtl_version.mca_component_name)) { + return OMPI_SUCCESS; + } + } } - - opal_argv_free (mtls_to_use); - return mtl_match ? 
OMPI_SUCCESS : OMPI_ERR_NOT_FOUND; + return -1; } static int ompi_osc_rdma_query_btls (ompi_communicator_t *comm, struct mca_btl_base_module_t **btl) @@ -1115,7 +1136,8 @@ static int ompi_osc_rdma_component_select (struct ompi_win_t *win, void **base, module->same_disp_unit = check_config_value_bool ("same_disp_unit", info); module->same_size = check_config_value_bool ("same_size", info); module->no_locks = check_config_value_bool ("no_locks", info); - module->acc_single_intrinsic = check_config_value_bool ("ompi_single_accumulate", info); + module->locking_mode = mca_osc_rdma_component.locking_mode; + module->acc_single_intrinsic = check_config_value_bool ("acc_single_intrinsic", info); module->acc_use_amo = mca_osc_rdma_component.acc_use_amo; module->all_sync.module = module; diff --git a/ompi/mca/osc/rdma/osc_rdma_frag.h b/ompi/mca/osc/rdma/osc_rdma_frag.h index 610ce44700..0ccd064499 100644 --- a/ompi/mca/osc/rdma/osc_rdma_frag.h +++ b/ompi/mca/osc/rdma/osc_rdma_frag.h @@ -1,7 +1,7 @@ /* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */ /* * Copyright (c) 2012 Sandia National Laboratories. All rights reserved. - * Copyright (c) 2014-2015 Los Alamos National Security, LLC. All rights + * Copyright (c) 2014-2018 Los Alamos National Security, LLC. All rights * reserved. 
* $COPYRIGHT$ * @@ -16,34 +16,14 @@ #include "osc_rdma.h" #include "opal/align.h" -/** Communication buffer for packing messages */ -struct ompi_osc_rdma_frag_t { - opal_free_list_item_t super; - - /* start of unused space */ - unsigned char *top; - - /* space remaining in buffer */ - uint32_t remain_len; - /* Number of operations which have started writing into the frag, but not yet completed doing so */ - int32_t pending; - - ompi_osc_rdma_module_t *module; - mca_btl_base_registration_handle_t *handle; -}; -typedef struct ompi_osc_rdma_frag_t ompi_osc_rdma_frag_t; -OBJ_CLASS_DECLARATION(ompi_osc_rdma_frag_t); - - static inline void ompi_osc_rdma_frag_complete (ompi_osc_rdma_frag_t *frag) { + OSC_RDMA_VERBOSE(MCA_BASE_VERBOSE_INFO, "returning frag. pending = %d", frag->pending); if (0 == OPAL_THREAD_ADD_FETCH32(&frag->pending, -1)) { opal_atomic_rmb (); - ompi_osc_rdma_deregister (frag->module, frag->handle); - frag->handle = NULL; - - opal_free_list_return (&mca_osc_rdma_component.frags, (opal_free_list_item_t *) frag); + (void) opal_atomic_swap_32 (&frag->pending, 1); + (void) opal_atomic_swap_64 (&frag->curr_index, 0); } } @@ -53,7 +33,8 @@ static inline void ompi_osc_rdma_frag_complete (ompi_osc_rdma_frag_t *frag) static inline int ompi_osc_rdma_frag_alloc (ompi_osc_rdma_module_t *module, size_t request_len, ompi_osc_rdma_frag_t **buffer, char **ptr) { - ompi_osc_rdma_frag_t *curr; + ompi_osc_rdma_frag_t *curr = module->rdma_frag; + int64_t my_index; int ret; /* ensure all buffers are 8-byte aligned */ @@ -63,59 +44,55 @@ static inline int ompi_osc_rdma_frag_alloc (ompi_osc_rdma_module_t *module, size return OMPI_ERR_VALUE_OUT_OF_BOUNDS; } - OPAL_THREAD_LOCK(&module->lock); - curr = module->rdma_frag; - if (OPAL_UNLIKELY(NULL == curr || curr->remain_len < request_len)) { - if (NULL == curr || (NULL != curr && curr->pending > 1)) { - opal_free_list_item_t *item = NULL; + if (NULL == curr) { + opal_free_list_item_t *item = NULL; - /* release the initial 
reference to the buffer */ - module->rdma_frag = NULL; + item = opal_free_list_get (&mca_osc_rdma_component.frags); + if (OPAL_UNLIKELY(NULL == item)) { + OPAL_THREAD_UNLOCK(&module->lock); + return OMPI_ERR_OUT_OF_RESOURCE; + } - if (curr) { - ompi_osc_rdma_frag_complete (curr); - } + curr = (ompi_osc_rdma_frag_t *) item; - item = opal_free_list_get (&mca_osc_rdma_component.frags); - if (OPAL_UNLIKELY(NULL == item)) { - OPAL_THREAD_UNLOCK(&module->lock); + curr->handle = NULL; + curr->pending = 1; + curr->module = module; + curr->curr_index = 0; + + if (module->selected_btl->btl_register_mem) { + ret = ompi_osc_rdma_register (module, MCA_BTL_ENDPOINT_ANY, curr->super.ptr, mca_osc_rdma_component.buffer_size, + MCA_BTL_REG_FLAG_ACCESS_ANY, &curr->handle); + if (OPAL_UNLIKELY(OMPI_SUCCESS != ret)) { return OMPI_ERR_OUT_OF_RESOURCE; } + } - curr = module->rdma_frag = (ompi_osc_rdma_frag_t *) item; - + if (!opal_atomic_compare_exchange_strong_ptr (&module->rdma_frag, &(void *){NULL}, curr)) { + ompi_osc_rdma_deregister (module, curr->handle); curr->handle = NULL; - curr->pending = 1; - curr->module = module; - } - curr->top = curr->super.ptr; - curr->remain_len = mca_osc_rdma_component.buffer_size; + opal_free_list_return (&mca_osc_rdma_component.frags, &curr->super); - if (curr->remain_len < request_len) { - OPAL_THREAD_UNLOCK(&module->lock); - return OMPI_ERR_TEMP_OUT_OF_RESOURCE; + curr = module->rdma_frag; } } - if (!curr->handle && module->selected_btl->btl_register_mem) { - ret = ompi_osc_rdma_register (module, MCA_BTL_ENDPOINT_ANY, curr->super.ptr, mca_osc_rdma_component.buffer_size, - MCA_BTL_REG_FLAG_ACCESS_ANY, &curr->handle); - if (OPAL_UNLIKELY(OMPI_SUCCESS != ret)) { - OPAL_THREAD_UNLOCK(&module->lock); - return ret; - } - } - - - *ptr = (char *) curr->top; - *buffer = curr; - - curr->top += request_len; - curr->remain_len -= request_len; + OSC_RDMA_VERBOSE(MCA_BASE_VERBOSE_INFO, "allocating frag. 
pending = %d", curr->pending); OPAL_THREAD_ADD_FETCH32(&curr->pending, 1); - OPAL_THREAD_UNLOCK(&module->lock); + my_index = opal_atomic_fetch_add_64 (&curr->curr_index, request_len); + if (my_index + request_len > mca_osc_rdma_component.buffer_size) { + if (my_index <= mca_osc_rdma_component.buffer_size) { + /* this thread caused the buffer to spill over */ + ompi_osc_rdma_frag_complete (curr); + } + ompi_osc_rdma_frag_complete (curr); + return OPAL_ERR_OUT_OF_RESOURCE; + } + + *ptr = (void *) ((intptr_t) curr->super.ptr + my_index); + *buffer = curr; return OMPI_SUCCESS; } diff --git a/ompi/mca/osc/rdma/osc_rdma_lock.h b/ompi/mca/osc/rdma/osc_rdma_lock.h index 8c35018bad..95e4cb12f5 100644 --- a/ompi/mca/osc/rdma/osc_rdma_lock.h +++ b/ompi/mca/osc/rdma/osc_rdma_lock.h @@ -34,9 +34,10 @@ void ompi_osc_rdma_atomic_complete (mca_btl_base_module_t *btl, struct mca_btl_b void *context, void *data, int status); __opal_attribute_always_inline__ -static inline int ompi_osc_rdma_lock_btl_fop (ompi_osc_rdma_module_t *module, ompi_osc_rdma_peer_t *peer, uint64_t address, - int op, ompi_osc_rdma_lock_t operand, ompi_osc_rdma_lock_t *result, - const bool wait_for_completion) +static inline int ompi_osc_rdma_btl_fop (ompi_osc_rdma_module_t *module, struct mca_btl_base_endpoint_t *endpoint, + uint64_t address, mca_btl_base_registration_handle_t *address_handle, int op, + int64_t operand, int flags, int64_t *result, const bool wait_for_completion, + ompi_osc_rdma_pending_op_cb_fn_t cbfunc, void *cbdata, void *cbcontext) { ompi_osc_rdma_pending_op_t *pending_op; int ret; @@ -49,8 +50,13 @@ static inline int ompi_osc_rdma_lock_btl_fop (ompi_osc_rdma_module_t *module, om } pending_op->op_result = (void *) result; - pending_op->op_size = sizeof (ompi_osc_rdma_lock_t); + pending_op->op_size = (MCA_BTL_ATOMIC_FLAG_32BIT & flags) ? 
4 : 8; OBJ_RETAIN(pending_op); + if (cbfunc) { + pending_op->cbfunc = cbfunc; + pending_op->cbdata = cbdata; + pending_op->cbcontext = cbcontext; + } /* spin until the btl has accepted the operation */ do { @@ -59,9 +65,9 @@ static inline int ompi_osc_rdma_lock_btl_fop (ompi_osc_rdma_module_t *module, om } if (NULL != pending_op->op_frag) { - ret = module->selected_btl->btl_atomic_fop (module->selected_btl, peer->state_endpoint, pending_op->op_buffer, - (intptr_t) address, pending_op->op_frag->handle, peer->state_handle, - op, operand, 0, MCA_BTL_NO_ORDER, ompi_osc_rdma_atomic_complete, + ret = module->selected_btl->btl_atomic_fop (module->selected_btl, endpoint, pending_op->op_buffer, + (intptr_t) address, pending_op->op_frag->handle, address_handle, + op, operand, flags, MCA_BTL_NO_ORDER, ompi_osc_rdma_atomic_complete, (void *) pending_op, NULL); } @@ -71,10 +77,78 @@ static inline int ompi_osc_rdma_lock_btl_fop (ompi_osc_rdma_module_t *module, om ompi_osc_rdma_progress (module); } while (1); + if (OPAL_SUCCESS != ret) { + if (OPAL_LIKELY(1 == ret)) { + *result = ((int64_t *) pending_op->op_buffer)[0]; + ret = OMPI_SUCCESS; + ompi_osc_rdma_atomic_complete (module->selected_btl, endpoint, pending_op->op_buffer, + pending_op->op_frag->handle, (void *) pending_op, NULL, OPAL_SUCCESS); + } + + /* need to release here because ompi_osc_rdma_atomic_complet was not called */ + OBJ_RELEASE(pending_op); + } else if (wait_for_completion) { + while (!pending_op->op_complete) { + ompi_osc_rdma_progress (module); + } + } + + OBJ_RELEASE(pending_op); + + return ret; +} + +__opal_attribute_always_inline__ +static inline int ompi_osc_rdma_lock_btl_fop (ompi_osc_rdma_module_t *module, ompi_osc_rdma_peer_t *peer, uint64_t address, + int op, ompi_osc_rdma_lock_t operand, ompi_osc_rdma_lock_t *result, + const bool wait_for_completion) +{ + return ompi_osc_rdma_btl_fop (module, peer->state_endpoint, address, peer->state_handle, op, operand, 0, result, + wait_for_completion, NULL, 
NULL, NULL); +} + +__opal_attribute_always_inline__ +static inline int ompi_osc_rdma_btl_op (ompi_osc_rdma_module_t *module, struct mca_btl_base_endpoint_t *endpoint, + uint64_t address, mca_btl_base_registration_handle_t *address_handle, + int op, int64_t operand, int flags, const bool wait_for_completion, + ompi_osc_rdma_pending_op_cb_fn_t cbfunc, void *cbdata, void *cbcontext) +{ + ompi_osc_rdma_pending_op_t *pending_op; + int ret; + + if (!(module->selected_btl->btl_flags & MCA_BTL_FLAGS_ATOMIC_OPS)) { + return ompi_osc_rdma_btl_fop (module, endpoint, address, address_handle, op, operand, flags, NULL, wait_for_completion, + cbfunc, cbdata, cbcontext); + } + + pending_op = OBJ_NEW(ompi_osc_rdma_pending_op_t); + assert (NULL != pending_op); + OBJ_RETAIN(pending_op); + if (cbfunc) { + pending_op->cbfunc = cbfunc; + pending_op->cbdata = cbdata; + pending_op->cbcontext = cbcontext; + } + + /* spin until the btl has accepted the operation */ + do { + ret = module->selected_btl->btl_atomic_op (module->selected_btl, endpoint, (intptr_t) address, address_handle, + op, operand, flags, MCA_BTL_NO_ORDER, ompi_osc_rdma_atomic_complete, + (void *) pending_op, NULL); + + if (OPAL_LIKELY(!ompi_osc_rdma_oor(ret))) { + break; + } + ompi_osc_rdma_progress (module); + } while (1); + if (OPAL_SUCCESS != ret) { /* need to release here because ompi_osc_rdma_atomic_complet was not called */ OBJ_RELEASE(pending_op); if (OPAL_LIKELY(1 == ret)) { + if (cbfunc) { + cbfunc (cbdata, cbcontext, OMPI_SUCCESS); + } ret = OMPI_SUCCESS; } } else if (wait_for_completion) { @@ -91,23 +165,38 @@ static inline int ompi_osc_rdma_lock_btl_fop (ompi_osc_rdma_module_t *module, om __opal_attribute_always_inline__ static inline int ompi_osc_rdma_lock_btl_op (ompi_osc_rdma_module_t *module, ompi_osc_rdma_peer_t *peer, uint64_t address, int op, ompi_osc_rdma_lock_t operand, const bool wait_for_completion) +{ + return ompi_osc_rdma_btl_op (module, peer->state_endpoint, address, peer->state_handle, op, 
operand, 0, wait_for_completion, + NULL, NULL, NULL); +} + +__opal_attribute_always_inline__ +static inline int ompi_osc_rdma_btl_cswap (ompi_osc_rdma_module_t *module, struct mca_btl_base_endpoint_t *endpoint, + uint64_t address, mca_btl_base_registration_handle_t *address_handle, + int64_t compare, int64_t value, int flags, int64_t *result) { ompi_osc_rdma_pending_op_t *pending_op; int ret; - if (!(module->selected_btl->btl_flags & MCA_BTL_FLAGS_ATOMIC_OPS)) { - return ompi_osc_rdma_lock_btl_fop (module, peer, address, op, operand, NULL, wait_for_completion); - } - pending_op = OBJ_NEW(ompi_osc_rdma_pending_op_t); assert (NULL != pending_op); + OBJ_RETAIN(pending_op); + pending_op->op_result = (void *) result; + pending_op->op_size = (MCA_BTL_ATOMIC_FLAG_32BIT & flags) ? 4 : 8; + /* spin until the btl has accepted the operation */ do { - ret = module->selected_btl->btl_atomic_op (module->selected_btl, peer->state_endpoint, (intptr_t) address, peer->state_handle, - op, operand, 0, MCA_BTL_NO_ORDER, ompi_osc_rdma_atomic_complete, - (void *) pending_op, NULL); + if (NULL == pending_op->op_frag) { + ret = ompi_osc_rdma_frag_alloc (module, 8, &pending_op->op_frag, (char **) &pending_op->op_buffer); + } + if (NULL != pending_op->op_frag) { + ret = module->selected_btl->btl_atomic_cswap (module->selected_btl, endpoint, pending_op->op_buffer, + address, pending_op->op_frag->handle, address_handle, compare, + value, flags, 0, ompi_osc_rdma_atomic_complete, (void *) pending_op, + NULL); + } if (OPAL_LIKELY(!ompi_osc_rdma_oor(ret))) { break; @@ -116,12 +205,14 @@ static inline int ompi_osc_rdma_lock_btl_op (ompi_osc_rdma_module_t *module, omp } while (1); if (OPAL_SUCCESS != ret) { - /* need to release here because ompi_osc_rdma_atomic_complet was not called */ - OBJ_RELEASE(pending_op); if (OPAL_LIKELY(1 == ret)) { + *result = ((int64_t *) pending_op->op_buffer)[0]; ret = OMPI_SUCCESS; } - } else if (wait_for_completion) { + + /* need to release here because 
ompi_osc_rdma_atomic_complete was not called */ + OBJ_RELEASE(pending_op); + } else { while (!pending_op->op_complete) { ompi_osc_rdma_progress (module); } @@ -136,49 +227,7 @@ __opal_attribute_always_inline__ static inline int ompi_osc_rdma_lock_btl_cswap (ompi_osc_rdma_module_t *module, ompi_osc_rdma_peer_t *peer, uint64_t address, ompi_osc_rdma_lock_t compare, ompi_osc_rdma_lock_t value, ompi_osc_rdma_lock_t *result) { - ompi_osc_rdma_pending_op_t *pending_op; - int ret; - - pending_op = OBJ_NEW(ompi_osc_rdma_pending_op_t); - assert (NULL != pending_op); - - OBJ_RETAIN(pending_op); - - pending_op->op_result = (void *) result; - pending_op->op_size = sizeof (*result); - - /* spin until the btl has accepted the operation */ - do { - if (NULL == pending_op->op_frag) { - ret = ompi_osc_rdma_frag_alloc (module, 8, &pending_op->op_frag, (char **) &pending_op->op_buffer); - } - if (NULL != pending_op->op_frag) { - ret = module->selected_btl->btl_atomic_cswap (module->selected_btl, peer->state_endpoint, pending_op->op_buffer, - address, pending_op->op_frag->handle, peer->state_handle, compare, - value, 0, 0, ompi_osc_rdma_atomic_complete, (void *) pending_op, NULL); - } - - if (OPAL_LIKELY(!ompi_osc_rdma_oor(ret))) { - break; - } - ompi_osc_rdma_progress (module); - } while (1); - - if (OPAL_SUCCESS != ret) { - /* need to release here because ompi_osc_rdma_atomic_complet was not called */ - OBJ_RELEASE(pending_op); - if (OPAL_LIKELY(1 == ret)) { - ret = OMPI_SUCCESS; - } - } else { - while (!pending_op->op_complete) { - ompi_osc_rdma_progress (module); - } - } - - OBJ_RELEASE(pending_op); - - return ret; + return ompi_osc_rdma_btl_cswap (module, peer->state_endpoint, address, peer->state_handle, compare, value, 0, result); } /** @@ -311,7 +360,8 @@ static inline int ompi_osc_rdma_lock_try_acquire_exclusive (ompi_osc_rdma_module if (0 == lock_state) { OSC_RDMA_VERBOSE(MCA_BASE_VERBOSE_DEBUG, "exclusive lock acquired"); } else { - OSC_RDMA_VERBOSE(MCA_BASE_VERBOSE_DEBUG, 
"could not acquire exclusive lock"); + OSC_RDMA_VERBOSE(MCA_BASE_VERBOSE_DEBUG, "could not acquire exclusive lock. lock state 0x%" PRIx64, + (unsigned long) lock_state); } #endif @@ -362,11 +412,14 @@ static inline int ompi_osc_rdma_lock_release_exclusive (ompi_osc_rdma_module_t * uint64_t lock = (uint64_t) (intptr_t) peer->state + offset; int ret = OMPI_SUCCESS; - OSC_RDMA_VERBOSE(MCA_BASE_VERBOSE_DEBUG, "releasing exclusive lock %" PRIx64 " on peer %d", lock, peer->rank); + OSC_RDMA_VERBOSE(MCA_BASE_VERBOSE_DEBUG, "releasing exclusive lock %" PRIx64 " on peer %d\n", lock, peer->rank); if (!ompi_osc_rdma_peer_local_state (peer)) { ret = ompi_osc_rdma_lock_btl_op (module, peer, lock, MCA_BTL_ATOMIC_ADD, -OMPI_OSC_RDMA_LOCK_EXCLUSIVE, false); + if (OMPI_SUCCESS != ret) { + abort (); + } } else { ompi_osc_rdma_unlock_local ((volatile ompi_osc_rdma_lock_t *)(intptr_t) lock); } diff --git a/ompi/mca/osc/rdma/osc_rdma_passive_target.c b/ompi/mca/osc/rdma/osc_rdma_passive_target.c index 6358020f98..37b1bee257 100644 --- a/ompi/mca/osc/rdma/osc_rdma_passive_target.c +++ b/ompi/mca/osc/rdma/osc_rdma_passive_target.c @@ -8,7 +8,7 @@ * University of Stuttgart. All rights reserved. * Copyright (c) 2004-2005 The Regents of the University of California. * All rights reserved. - * Copyright (c) 2007-2016 Los Alamos National Security, LLC. All rights + * Copyright (c) 2007-2018 Los Alamos National Security, LLC. All rights * reserved. * Copyright (c) 2010 IBM Corporation. All rights reserved. * Copyright (c) 2012-2013 Sandia National Laboratories. All rights reserved. 
@@ -113,23 +113,29 @@ int ompi_osc_rdma_flush_local_all (struct ompi_win_t *win) static inline int ompi_osc_rdma_lock_atomic_internal (ompi_osc_rdma_module_t *module, ompi_osc_rdma_peer_t *peer, ompi_osc_rdma_sync_t *lock) { + const int locking_mode = module->locking_mode; int ret; if (MPI_LOCK_EXCLUSIVE == lock->sync.lock.type) { do { OSC_RDMA_VERBOSE(MCA_BASE_VERBOSE_DEBUG, "incrementing global exclusive lock"); - /* lock the master lock. this requires no rank has a global shared lock */ - ret = ompi_osc_rdma_lock_acquire_shared (module, module->leader, 1, offsetof (ompi_osc_rdma_state_t, global_lock), 0xffffffff00000000L); - if (OMPI_SUCCESS != ret) { - ompi_osc_rdma_progress (module); - continue; + if (OMPI_OSC_RDMA_LOCKING_TWO_LEVEL == locking_mode) { + /* lock the master lock. this requires no rank has a global shared lock */ + ret = ompi_osc_rdma_lock_acquire_shared (module, module->leader, 1, offsetof (ompi_osc_rdma_state_t, global_lock), + 0xffffffff00000000L); + if (OMPI_SUCCESS != ret) { + ompi_osc_rdma_progress (module); + continue; + } } OSC_RDMA_VERBOSE(MCA_BASE_VERBOSE_DEBUG, "acquiring exclusive lock on peer"); ret = ompi_osc_rdma_lock_try_acquire_exclusive (module, peer, offsetof (ompi_osc_rdma_state_t, local_lock)); if (ret) { /* release the global lock */ - ompi_osc_rdma_lock_release_shared (module, module->leader, -1, offsetof (ompi_osc_rdma_state_t, global_lock)); + if (OMPI_OSC_RDMA_LOCKING_TWO_LEVEL == locking_mode) { + ompi_osc_rdma_lock_release_shared (module, module->leader, -1, offsetof (ompi_osc_rdma_state_t, global_lock)); + } ompi_osc_rdma_progress (module); continue; } @@ -157,20 +163,48 @@ static inline int ompi_osc_rdma_lock_atomic_internal (ompi_osc_rdma_module_t *mo static inline int ompi_osc_rdma_unlock_atomic_internal (ompi_osc_rdma_module_t *module, ompi_osc_rdma_peer_t *peer, ompi_osc_rdma_sync_t *lock) { + const int locking_mode = module->locking_mode; + if (MPI_LOCK_EXCLUSIVE == lock->sync.lock.type) { 
OSC_RDMA_VERBOSE(MCA_BASE_VERBOSE_DEBUG, "releasing exclusive lock on peer"); ompi_osc_rdma_lock_release_exclusive (module, peer, offsetof (ompi_osc_rdma_state_t, local_lock)); - OSC_RDMA_VERBOSE(MCA_BASE_VERBOSE_DEBUG, "decrementing global exclusive lock"); - ompi_osc_rdma_lock_release_shared (module, module->leader, -1, offsetof (ompi_osc_rdma_state_t, global_lock)); + + if (OMPI_OSC_RDMA_LOCKING_TWO_LEVEL == locking_mode) { + OSC_RDMA_VERBOSE(MCA_BASE_VERBOSE_DEBUG, "decrementing global exclusive lock"); + ompi_osc_rdma_lock_release_shared (module, module->leader, -1, offsetof (ompi_osc_rdma_state_t, global_lock)); + } + peer->flags &= ~OMPI_OSC_RDMA_PEER_EXCLUSIVE; } else { OSC_RDMA_VERBOSE(MCA_BASE_VERBOSE_DEBUG, "decrementing global shared lock"); ompi_osc_rdma_lock_release_shared (module, peer, -1, offsetof (ompi_osc_rdma_state_t, local_lock)); + peer->flags &= ~OMPI_OSC_RDMA_PEER_DEMAND_LOCKED; } return OMPI_SUCCESS; } +int ompi_osc_rdma_demand_lock_peer (ompi_osc_rdma_module_t *module, ompi_osc_rdma_peer_t *peer) +{ + ompi_osc_rdma_sync_t *lock = &module->all_sync; + int ret = OMPI_SUCCESS; + + /* check for bad usage */ + assert (OMPI_OSC_RDMA_SYNC_TYPE_LOCK == lock->type); + + OPAL_THREAD_SCOPED_LOCK(&peer->lock, + do { + if (!ompi_osc_rdma_peer_is_demand_locked (peer)) { + ret = ompi_osc_rdma_lock_atomic_internal (module, peer, lock); + OPAL_THREAD_SCOPED_LOCK(&lock->lock, opal_list_append (&lock->demand_locked_peers, &peer->super)); + peer->flags |= OMPI_OSC_RDMA_PEER_DEMAND_LOCKED; + } + } while (0); + ); + + return OMPI_SUCCESS; +} + int ompi_osc_rdma_lock_atomic (int lock_type, int target, int assert, ompi_win_t *win) { ompi_osc_rdma_module_t *module = GET_MODULE(win); @@ -315,9 +349,14 @@ int ompi_osc_rdma_lock_all_atomic (int assert, struct ompi_win_t *win) if (0 == (assert & MPI_MODE_NOCHECK)) { /* increment the global shared lock */ - ret = ompi_osc_rdma_lock_acquire_shared (module, module->leader, 0x0000000100000000UL, - 
offsetof(ompi_osc_rdma_state_t, global_lock), - 0x00000000ffffffffUL); + if (OMPI_OSC_RDMA_LOCKING_TWO_LEVEL == module->locking_mode) { + ret = ompi_osc_rdma_lock_acquire_shared (module, module->leader, 0x0000000100000000UL, + offsetof(ompi_osc_rdma_state_t, global_lock), + 0x00000000ffffffffUL); + } else { + /* always lock myself */ + ret = ompi_osc_rdma_demand_lock_peer (module, module->my_peer); + } } if (OPAL_LIKELY(OMPI_SUCCESS != ret)) { @@ -357,8 +396,19 @@ int ompi_osc_rdma_unlock_all_atomic (struct ompi_win_t *win) ompi_osc_rdma_sync_rdma_complete (lock); if (0 == (lock->sync.lock.assert & MPI_MODE_NOCHECK)) { - /* decrement the master lock shared count */ - (void) ompi_osc_rdma_lock_release_shared (module, module->leader, -0x0000000100000000UL, offsetof (ompi_osc_rdma_state_t, global_lock)); + if (OMPI_OSC_RDMA_LOCKING_ON_DEMAND == module->locking_mode) { + ompi_osc_rdma_peer_t *peer, *next; + + /* drop all on-demand locks */ + OPAL_LIST_FOREACH_SAFE(peer, next, &lock->demand_locked_peers, ompi_osc_rdma_peer_t) { + (void) ompi_osc_rdma_unlock_atomic_internal (module, peer, lock); + opal_list_remove_item (&lock->demand_locked_peers, &peer->super); + } + } else { + /* decrement the master lock shared count */ + (void) ompi_osc_rdma_lock_release_shared (module, module->leader, -0x0000000100000000UL, + offsetof (ompi_osc_rdma_state_t, global_lock)); + } } lock->type = OMPI_OSC_RDMA_SYNC_TYPE_NONE; diff --git a/ompi/mca/osc/rdma/osc_rdma_peer.c b/ompi/mca/osc/rdma/osc_rdma_peer.c index 752b23afa6..81ed0c2d16 100644 --- a/ompi/mca/osc/rdma/osc_rdma_peer.c +++ b/ompi/mca/osc/rdma/osc_rdma_peer.c @@ -303,7 +303,7 @@ static void ompi_osc_rdma_peer_destruct (ompi_osc_rdma_peer_t *peer) } } -OBJ_CLASS_INSTANCE(ompi_osc_rdma_peer_t, opal_object_t, +OBJ_CLASS_INSTANCE(ompi_osc_rdma_peer_t, opal_list_item_t, ompi_osc_rdma_peer_construct, ompi_osc_rdma_peer_destruct); diff --git a/ompi/mca/osc/rdma/osc_rdma_peer.h b/ompi/mca/osc/rdma/osc_rdma_peer.h index 
ad66123815..0e46ec6dfc 100644 --- a/ompi/mca/osc/rdma/osc_rdma_peer.h +++ b/ompi/mca/osc/rdma/osc_rdma_peer.h @@ -22,7 +22,7 @@ struct ompi_osc_rdma_module_t; * This object is used as a cache for information associated with a peer. */ struct ompi_osc_rdma_peer_t { - opal_object_t super; + opal_list_item_t super; /** rdma data endpoint for this peer */ struct mca_btl_base_endpoint_t *data_endpoint; @@ -36,6 +36,9 @@ struct ompi_osc_rdma_peer_t { /** registration handle associated with the state */ mca_btl_base_registration_handle_t *state_handle; + /** lock to protrct peer structure */ + opal_mutex_t lock; + /** rank of this peer in the window */ int rank; @@ -134,6 +137,8 @@ enum { OMPI_OSC_RDMA_PEER_STATE_FREE = 0x20, /** peer base handle should be freed */ OMPI_OSC_RDMA_PEER_BASE_FREE = 0x40, + /** peer was demand locked as part of lock-all (when in demand locking mode) */ + OMPI_OSC_RDMA_PEER_DEMAND_LOCKED = 0x80, }; /** @@ -248,5 +253,15 @@ static inline bool ompi_osc_rdma_peer_local_state (ompi_osc_rdma_peer_t *peer) return !!(peer->flags & OMPI_OSC_RDMA_PEER_LOCAL_STATE); } +/** + * @brief check if the peer has been demand locked as part of the current epoch + * + * @param[in] peer peer object to check + * + */ +static inline bool ompi_osc_rdma_peer_is_demand_locked (ompi_osc_rdma_peer_t *peer) +{ + return !!(peer->flags & OMPI_OSC_RDMA_PEER_DEMAND_LOCKED); +} #endif /* OMPI_OSC_RDMA_PEER_H */ diff --git a/ompi/mca/osc/rdma/osc_rdma_request.c b/ompi/mca/osc/rdma/osc_rdma_request.c index 9c032ca402..44fe9a5e8c 100644 --- a/ompi/mca/osc/rdma/osc_rdma_request.c +++ b/ompi/mca/osc/rdma/osc_rdma_request.c @@ -1,7 +1,7 @@ /* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */ /* * Copyright (c) 2011-2012 Sandia National Laboratories. All rights reserved. - * Copyright (c) 2014-2015 Los Alamos National Security, LLC. All rights + * Copyright (c) 2014-2018 Los Alamos National Security, LLC. All rights * reserved. 
* Copyright (c) 2016 The University of Tennessee and The University * of Tennessee Research Foundation. All rights @@ -44,27 +44,17 @@ static int request_free(struct ompi_request_t **ompi_req) return OMPI_SUCCESS; } -static int request_complete (struct ompi_request_t *request) -{ - ompi_osc_rdma_request_t *parent_request = ((ompi_osc_rdma_request_t *) request)->parent_request; - - if (parent_request && 0 == OPAL_THREAD_ADD_FETCH32 (&parent_request->outstanding_requests, -1)) { - ompi_osc_rdma_request_complete (parent_request, OMPI_SUCCESS); - } - - return OMPI_SUCCESS; -} - static void request_construct(ompi_osc_rdma_request_t *request) { request->super.req_type = OMPI_REQUEST_WIN; request->super.req_status._cancelled = 0; request->super.req_free = request_free; request->super.req_cancel = request_cancel; - request->super.req_complete_cb = request_complete; request->parent_request = NULL; + request->to_free = NULL; request->buffer = NULL; request->internal = false; + request->cleanup = NULL; request->outstanding_requests = 0; OBJ_CONSTRUCT(&request->convertor, opal_convertor_t); } diff --git a/ompi/mca/osc/rdma/osc_rdma_request.h b/ompi/mca/osc/rdma/osc_rdma_request.h index 3cec365a7a..ad052e172c 100644 --- a/ompi/mca/osc/rdma/osc_rdma_request.h +++ b/ompi/mca/osc/rdma/osc_rdma_request.h @@ -1,7 +1,7 @@ /* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */ /* * Copyright (c) 2012 Sandia National Laboratories. All rights reserved. - * Copyright (c) 2014-2015 Los Alamos National Security, LLC. All rights + * Copyright (c) 2014-2018 Los Alamos National Security, LLC. All rights * reserved. 
* $COPYRIGHT$ * @@ -25,26 +25,22 @@ enum ompi_osc_rdma_request_type_t { }; typedef enum ompi_osc_rdma_request_type_t ompi_osc_rdma_request_type_t; +struct ompi_osc_rdma_request_t; + +typedef void (*ompi_osc_rdma_request_cleanup_fn_t) (struct ompi_osc_rdma_request_t *); + struct ompi_osc_rdma_request_t { ompi_request_t super; ompi_osc_rdma_peer_t *peer; + ompi_osc_rdma_request_cleanup_fn_t cleanup; ompi_osc_rdma_request_type_t type; + void *to_free; void *origin_addr; - int origin_count; - struct ompi_datatype_t *origin_dt; - - void *result_addr; - int result_count; - struct ompi_datatype_t *result_dt; - - const void *compare_addr; - - ompi_op_t *op; ompi_osc_rdma_module_t *module; - int32_t outstanding_requests; + volatile int32_t outstanding_requests; bool internal; ptrdiff_t offset; @@ -69,35 +65,45 @@ OBJ_CLASS_DECLARATION(ompi_osc_rdma_request_t); rdma_rget, etc.), so it's ok to spin here... */ #define OMPI_OSC_RDMA_REQUEST_ALLOC(rmodule, rpeer, req) \ do { \ - opal_free_list_item_t *item; \ - do { \ - item = opal_free_list_get (&mca_osc_rdma_component.requests); \ - if (NULL == item) { \ - ompi_osc_rdma_progress (rmodule); \ - } \ - } while (NULL == item); \ - req = (ompi_osc_rdma_request_t*) item; \ - OMPI_REQUEST_INIT(&req->super, false); \ - req->super.req_mpi_object.win = module->win; \ - req->super.req_state = OMPI_REQUEST_ACTIVE; \ - req->module = rmodule; \ - req->peer = (rpeer); \ + (req) = OBJ_NEW(ompi_osc_rdma_request_t); \ + OMPI_REQUEST_INIT(&(req)->super, false); \ + (req)->super.req_mpi_object.win = (rmodule)->win; \ + (req)->super.req_state = OMPI_REQUEST_ACTIVE; \ + (req)->module = rmodule; \ + (req)->peer = (rpeer); \ } while (0) #define OMPI_OSC_RDMA_REQUEST_RETURN(req) \ do { \ OMPI_REQUEST_FINI(&(req)->super); \ free ((req)->buffer); \ - (req)->buffer = NULL; \ - (req)->parent_request = NULL; \ - (req)->internal = false; \ - (req)->outstanding_requests = 0; \ - opal_free_list_return (&mca_osc_rdma_component.requests, \ - 
(opal_free_list_item_t *) (req)); \ + free (req); \ } while (0) +static inline void ompi_osc_rdma_request_complete (ompi_osc_rdma_request_t *request, int mpi_error); + + +static inline void ompi_osc_rdma_request_deref (ompi_osc_rdma_request_t *request) +{ + if (1 == OPAL_THREAD_FETCH_ADD32 (&request->outstanding_requests, -1)) { + ompi_osc_rdma_request_complete (request, OMPI_SUCCESS); + } +} + static inline void ompi_osc_rdma_request_complete (ompi_osc_rdma_request_t *request, int mpi_error) { + ompi_osc_rdma_request_t *parent_request = request->parent_request; + + if (request->cleanup) { + request->cleanup (request); + } + + free (request->to_free); + + if (parent_request) { + ompi_osc_rdma_request_deref (parent_request); + } + if (!request->internal) { request->super.req_status.MPI_ERROR = mpi_error; diff --git a/ompi/mca/osc/rdma/osc_rdma_sync.c b/ompi/mca/osc/rdma/osc_rdma_sync.c index dca7e328d8..f07ea4f783 100644 --- a/ompi/mca/osc/rdma/osc_rdma_sync.c +++ b/ompi/mca/osc/rdma/osc_rdma_sync.c @@ -1,6 +1,6 @@ /* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */ /* - * Copyright (c) 2015 Los Alamos National Security, LLC. All rights + * Copyright (c) 2015-2018 Los Alamos National Security, LLC. All rights * reserved. 
* $COPYRIGHT$ * @@ -16,15 +16,17 @@ static void ompi_osc_rdma_sync_constructor (ompi_osc_rdma_sync_t *rdma_sync) { rdma_sync->type = OMPI_OSC_RDMA_SYNC_TYPE_NONE; rdma_sync->epoch_active = false; - rdma_sync->outstanding_rdma = 0; + rdma_sync->outstanding_rdma.counter = 0; OBJ_CONSTRUCT(&rdma_sync->aggregations, opal_list_t); OBJ_CONSTRUCT(&rdma_sync->lock, opal_mutex_t); + OBJ_CONSTRUCT(&rdma_sync->demand_locked_peers, opal_list_t); } static void ompi_osc_rdma_sync_destructor (ompi_osc_rdma_sync_t *rdma_sync) { OBJ_DESTRUCT(&rdma_sync->aggregations); OBJ_DESTRUCT(&rdma_sync->lock); + OBJ_DESTRUCT(&rdma_sync->demand_locked_peers); } OBJ_CLASS_INSTANCE(ompi_osc_rdma_sync_t, opal_object_t, ompi_osc_rdma_sync_constructor, diff --git a/ompi/mca/osc/rdma/osc_rdma_sync.h b/ompi/mca/osc/rdma/osc_rdma_sync.h index c4ffbbd4c3..e33b32d437 100644 --- a/ompi/mca/osc/rdma/osc_rdma_sync.h +++ b/ompi/mca/osc/rdma/osc_rdma_sync.h @@ -1,6 +1,6 @@ /* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */ /* - * Copyright (c) 2015 Los Alamos National Security, LLC. All rights + * Copyright (c) 2015-2018 Los Alamos National Security, LLC. All rights * reserved. 
* $COPYRIGHT$ * @@ -33,6 +33,13 @@ typedef enum ompi_osc_rdma_sync_type_t ompi_osc_rdma_sync_type_t; struct ompi_osc_rdma_module_t; +struct ompi_osc_rdma_sync_aligned_counter_t { + volatile osc_rdma_counter_t counter; + /* pad out to next cache line */ + uint64_t padding[7]; +}; +typedef struct ompi_osc_rdma_sync_aligned_counter_t ompi_osc_rdma_sync_aligned_counter_t; + /** * @brief synchronization object * @@ -78,6 +85,9 @@ struct ompi_osc_rdma_sync_t { struct ompi_osc_rdma_peer_t *peer; } peer_list; + /** demand locked peers (lock-all) */ + opal_list_t demand_locked_peers; + /** number of peers */ int num_peers; @@ -85,7 +95,7 @@ struct ompi_osc_rdma_sync_t { bool epoch_active; /** outstanding rdma operations on epoch */ - osc_rdma_counter_t outstanding_rdma; + ompi_osc_rdma_sync_aligned_counter_t outstanding_rdma __opal_attribute_aligned__(64); /** aggregated operations in this epoch */ opal_list_t aggregations; @@ -129,30 +139,10 @@ void ompi_osc_rdma_sync_return (ompi_osc_rdma_sync_t *rdma_sync); */ bool ompi_osc_rdma_sync_pscw_peer (struct ompi_osc_rdma_module_t *module, int target, struct ompi_osc_rdma_peer_t **peer); -/** - * @brief increment the outstanding rdma operation counter (atomic) - * - * @param[in] rdma_sync osc rdma synchronization object - */ -static inline void ompi_osc_rdma_sync_rdma_inc (ompi_osc_rdma_sync_t *rdma_sync) + +static inline int64_t ompi_osc_rdma_sync_get_count (ompi_osc_rdma_sync_t *rdma_sync) { - ompi_osc_rdma_counter_add (&rdma_sync->outstanding_rdma, 1); - - OSC_RDMA_VERBOSE(MCA_BASE_VERBOSE_INFO, "inc: there are %ld outstanding rdma operations", - (unsigned long) rdma_sync->outstanding_rdma); -} - -/** - * @brief decrement the outstanding rdma operation counter (atomic) - * - * @param[in] rdma_sync osc rdma synchronization object - */ -static inline void ompi_osc_rdma_sync_rdma_dec (ompi_osc_rdma_sync_t *rdma_sync) -{ - ompi_osc_rdma_counter_add (&rdma_sync->outstanding_rdma, -1); - - OSC_RDMA_VERBOSE(MCA_BASE_VERBOSE_INFO, 
"dec: there are %ld outstanding rdma operations", - (unsigned long) rdma_sync->outstanding_rdma); + return rdma_sync->outstanding_rdma.counter; } #endif /* OSC_RDMA_SYNC_H */ diff --git a/ompi/mca/osc/rdma/osc_rdma_types.h b/ompi/mca/osc/rdma/osc_rdma_types.h index fc23f0f343..e11b03b38e 100644 --- a/ompi/mca/osc/rdma/osc_rdma_types.h +++ b/ompi/mca/osc/rdma/osc_rdma_types.h @@ -205,6 +205,8 @@ typedef struct ompi_osc_rdma_aggregation_t ompi_osc_rdma_aggregation_t; OBJ_CLASS_DECLARATION(ompi_osc_rdma_aggregation_t); +typedef void (*ompi_osc_rdma_pending_op_cb_fn_t) (void *, void *, int); + struct ompi_osc_rdma_pending_op_t { opal_list_item_t super; struct ompi_osc_rdma_frag_t *op_frag; @@ -212,12 +214,29 @@ struct ompi_osc_rdma_pending_op_t { void *op_result; size_t op_size; volatile bool op_complete; + ompi_osc_rdma_pending_op_cb_fn_t cbfunc; + void *cbdata; + void *cbcontext; }; typedef struct ompi_osc_rdma_pending_op_t ompi_osc_rdma_pending_op_t; OBJ_CLASS_DECLARATION(ompi_osc_rdma_pending_op_t); +/** Communication buffer for packing messages */ +struct ompi_osc_rdma_frag_t { + opal_free_list_item_t super; + + /* Number of operations which have started writing into the frag, but not yet completed doing so */ + volatile int32_t pending; + volatile int64_t curr_index; + + struct ompi_osc_rdma_module_t *module; + mca_btl_base_registration_handle_t *handle; +}; +typedef struct ompi_osc_rdma_frag_t ompi_osc_rdma_frag_t; +OBJ_CLASS_DECLARATION(ompi_osc_rdma_frag_t); + #define OSC_RDMA_VERBOSE(x, ...) OPAL_OUTPUT_VERBOSE((x, ompi_osc_base_framework.framework_output, __VA_ARGS__)) #endif /* OMPI_OSC_RDMA_TYPES_H */