1
1
openmpi/ompi/mca/osc/rdma/osc_rdma_sync.h
Nathan Hjelm 7f4872d483 osc/rdma: performance improvements and bug fixes
This commit is a large update to the osc/rdma component. Included in
this commit:

 - Add support for using hardware atomics for fetch-and-op and single
   count accumulate  when using the accumulate lock. This will improve
   the performance of these operations even when not setting the
   single intrinsic info key.

 - Rework how large accumulates are done. They now block on the get
   operation to fix some bugs discovered by an IBM one-sided test. I
   may roll back some of the changes if the underlying bug in the
   original design is discovered. There appears to be no real
   difference (on the hardware this was tested with) in performance so
   it's probably a non-issue. References #2530.

 - Add support for an additional lock-all algorithm: on-demand. The
   on-demand algorithm will attempt to acquire the peer lock when
   starting an RMA operation. The lock algorithm default has not
   changed. The algorithm can be selected by setting the
   osc_rdma_locking_mode MCA variable. The valid values are two_level
   and on_demand.

 - Make use of the btl_flush function if available. This can improve
   performance with some btls.

 - When using btl_flush do not keep track of the number of put
   operations. This reduces the number of atomic operations in the
   critical path.

 - Make the window buffers more friendly to multi-threaded
   applications. This was done by dropping support for multiple
   buffers per MPI window. I intend to re-add that support once the
   underlying performance bug under the old buffering scheme is
   fixed.

 - Fix a bug in request completion in the accumulate, get, and put
   paths. This also helps with #2530.

 - General code cleanup and fixes.

Signed-off-by: Nathan Hjelm <hjelmn@lanl.gov>
2018-03-15 14:53:53 -06:00

149 строки
4.1 KiB
C

/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */
/*
* Copyright (c) 2015-2018 Los Alamos National Security, LLC. All rights
* reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
*
* $HEADER$
*/
#if !defined(OSC_RDMA_SYNC_H)
#define OSC_RDMA_SYNC_H
#include "osc_rdma_types.h"
#include "opal/class/opal_object.h"
#include "opal/threads/threads.h"
/**
* @brief synchronization types
*
* Identifies the kind of access epoch described by a synchronization object
* (stored in the type member of struct ompi_osc_rdma_sync_t).
*/
enum ompi_osc_rdma_sync_type_t {
/** default value (first enumerator, so its value is 0) */
OMPI_OSC_RDMA_SYNC_TYPE_NONE,
/** lock access epoch */
OMPI_OSC_RDMA_SYNC_TYPE_LOCK,
/** fence access epoch */
OMPI_OSC_RDMA_SYNC_TYPE_FENCE,
/** post-start-complete-wait access epoch */
OMPI_OSC_RDMA_SYNC_TYPE_PSCW,
};
/** typedef so the enumeration can be named without the enum keyword */
typedef enum ompi_osc_rdma_sync_type_t ompi_osc_rdma_sync_type_t;
/* forward declaration: this header only uses pointers to the module type */
struct ompi_osc_rdma_module_t;
/**
* @brief counter followed by explicit padding
*
* The padding member exists to push whatever follows the counter out to the
* next cache line. The structure is used for the outstanding_rdma member of
* struct ompi_osc_rdma_sync_t, which is additionally aligned to 64 bytes.
*
* NOTE(review): 7 * sizeof(uint64_t) is 56 bytes of padding, so the structure
* is 64 bytes only if the counter (plus any alignment padding after it)
* occupies 8 bytes. The definition of osc_rdma_counter_t is not visible in
* this header (presumably in osc_rdma_types.h) -- confirm.
*/
struct ompi_osc_rdma_sync_aligned_counter_t {
/** counter value; volatile, so every access goes to memory */
volatile osc_rdma_counter_t counter;
/* pad out to next cache line */
uint64_t padding[7];
};
/** typedef so the structure can be named without the struct keyword */
typedef struct ompi_osc_rdma_sync_aligned_counter_t ompi_osc_rdma_sync_aligned_counter_t;
/**
* @brief synchronization object
*
* This structure holds information about an access epoch: the kind of epoch,
* the epoch specific data, the peers involved, and bookkeeping for the rdma
* operations issued during the epoch.
*/
struct ompi_osc_rdma_sync_t {
/** OPAL object base (this type is declared as an OPAL class below) */
opal_object_t super;
/** osc rdma module */
struct ompi_osc_rdma_module_t *module;
/** synchronization type */
ompi_osc_rdma_sync_type_t type;
/** synchronization data. NOTE(review): the valid member is presumably
* selected by the type member above -- confirm against the users of this
* structure. */
union {
/** lock specific synchronization data */
struct {
/** lock target rank (-1 for all) */
int target;
/** lock type: MPI_LOCK_SHARED, MPI_LOCK_EXCLUSIVE */
int16_t type;
/** assert specified at lock acquire time. At this time Open MPI
* only uses 5 bits for asserts. If this number goes over 16 the
* type of this member will need to be changed to accommodate. */
int16_t assert;
} lock;
/** post/start/complete/wait specific synchronization data */
struct {
/** group passed to ompi_osc_rdma_start */
ompi_group_t *group;
} pscw;
} sync;
/** peers for this sync; the members overlap, so only one is meaningful
* for a given epoch */
union {
/** multiple peers (lock all, pscw, fence) */
struct ompi_osc_rdma_peer_t **peers;
/** single peer (targeted lock) */
struct ompi_osc_rdma_peer_t *peer;
} peer_list;
/** demand locked peers (lock-all) */
opal_list_t demand_locked_peers;
/** number of peers */
int num_peers;
/** communication has started on this epoch */
bool epoch_active;
/** outstanding rdma operations on epoch. The member is explicitly aligned
* to 64 bytes and its type carries its own padding. Read it through
* ompi_osc_rdma_sync_get_count(). */
ompi_osc_rdma_sync_aligned_counter_t outstanding_rdma __opal_attribute_aligned__(64);
/** aggregated operations in this epoch */
opal_list_t aggregations;
/** lock to protect sync structure members (distinct from the sync.lock
* epoch data above, which shares the name but not the scope) */
opal_mutex_t lock;
};
/** typedef so the structure can be named without the struct keyword */
typedef struct ompi_osc_rdma_sync_t ompi_osc_rdma_sync_t;
/* declare ompi_osc_rdma_sync_t as an OPAL class (macro presumably provided
* by opal/class/opal_object.h, which is included above) */
OBJ_CLASS_DECLARATION(ompi_osc_rdma_sync_t);
/**
* @brief allocate a new synchronization object
*
* @param[in] module osc rdma module
*
* @returns NULL on failure
* @returns a new synchronization object on success
*
* Objects obtained from this function are released with
* ompi_osc_rdma_sync_return().
*/
ompi_osc_rdma_sync_t *ompi_osc_rdma_sync_allocate (struct ompi_osc_rdma_module_t *module);
/**
* @brief release a synchronization object
*
* @param[in] rdma_sync synchronization object allocated by ompi_osc_rdma_sync_allocate()
*
* NOTE(review): the behavior for a NULL rdma_sync is not visible from this
* header -- check the implementation before passing NULL.
*/
void ompi_osc_rdma_sync_return (ompi_osc_rdma_sync_t *rdma_sync);
/**
* @brief check if the target is part of a PSCW access epoch
*
* @param[in] module osc rdma module
* @param[in] target target rank
* @param[out] peer peer object
*
* @returns false if the window is not in a PSCW access epoch or the peer is not
* in the group passed to MPI_Win_start
* @returns true otherwise
*
* This function verifies the target is part of an active PSCW access epoch.
*
* NOTE(review): the value stored in *peer when false is returned is not
* specified by this header -- confirm in the implementation before relying
* on it.
*/
bool ompi_osc_rdma_sync_pscw_peer (struct ompi_osc_rdma_module_t *module, int target, struct ompi_osc_rdma_peer_t **peer);
/**
 * @brief read the outstanding rdma operation counter of a synchronization object
 *
 * @param[in] rdma_sync synchronization object to query
 *
 * @returns the current value of rdma_sync->outstanding_rdma.counter converted
 *          to int64_t
 *
 * The counter is declared volatile, so it is read from memory on every call.
 * This function takes no lock and does not modify the object.
 */
static inline int64_t ompi_osc_rdma_sync_get_count (ompi_osc_rdma_sync_t *rdma_sync)
{
    const ompi_osc_rdma_sync_aligned_counter_t *outstanding = &rdma_sync->outstanding_rdma;

    return (int64_t) outstanding->counter;
}
#endif /* OSC_RDMA_SYNC_H */