openmpi/ompi/mca/osc/rdma/osc_rdma_sync.h

/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */
/*
 * Copyright (c) 2015-2018 Los Alamos National Security, LLC.  All rights
 *                         reserved.
 * $COPYRIGHT$
 *
 * Additional copyrights may follow
 *
 * $HEADER$
 */

#if !defined(OSC_RDMA_SYNC_H)
#define OSC_RDMA_SYNC_H

#include "osc_rdma_types.h"
#include "opal/class/opal_object.h"
#include "opal/threads/threads.h"

/**
 * @brief synchronization types
 *
 * Enumerates the kinds of access epoch recorded in the type field of a
 * synchronization object.
 */
typedef enum ompi_osc_rdma_sync_type_t {
    /** default value */
    OMPI_OSC_RDMA_SYNC_TYPE_NONE  = 0,
    /** lock access epoch */
    OMPI_OSC_RDMA_SYNC_TYPE_LOCK  = 1,
    /** fence access epoch */
    OMPI_OSC_RDMA_SYNC_TYPE_FENCE = 2,
    /** post-start-complete-wait access epoch */
    OMPI_OSC_RDMA_SYNC_TYPE_PSCW  = 3,
} ompi_osc_rdma_sync_type_t;

struct ompi_osc_rdma_module_t;

/**
 * @brief atomic counter followed by padding out to the next cache line
 *
 * NOTE(review): seven 64-bit words of padding presumably assume an 8-byte
 * counter and a 64-byte cache line -- confirm against osc_rdma_types.h.
 */
typedef struct ompi_osc_rdma_sync_aligned_counter_t {
    /** the counter itself */
    osc_rdma_atomic_counter_t counter;
    /** filler: pad out to next cache line */
    uint64_t padding[7];
} ompi_osc_rdma_sync_aligned_counter_t;

/**
 * @brief synchronization object
 *
 * Holds the state of one access epoch: which kind of epoch it is, the
 * data specific to that kind, the peer(s) it covers, and the counter of
 * outstanding rdma operations.
 */
typedef struct ompi_osc_rdma_sync_t {
    /** base object */
    opal_object_t super;

    /** osc rdma module this sync belongs to */
    struct ompi_osc_rdma_module_t *module;

    /** kind of synchronization (see ompi_osc_rdma_sync_type_t) */
    ompi_osc_rdma_sync_type_t type;

    /** type-specific synchronization data */
    union {
        /** data used by lock synchronization */
        struct {
            /** rank of the lock target (-1 for all) */
            int target;

            /** MPI_LOCK_SHARED or MPI_LOCK_EXCLUSIVE */
            int16_t type;

            /** assert given when the lock was acquired. Open MPI currently
             * uses only 5 bits for asserts; if that ever exceeds 16 this
             * field will need to be changed to accommodate. */
            int16_t assert;
        } lock;

        /** data used by post/start/complete/wait synchronization */
        struct {
            /** group passed to ompi_osc_rdma_start */
            ompi_group_t *group;
        } pscw;
    } sync;

    /** peer(s) participating in this sync */
    union {
        /** array form: multiple peers (lock all, pscw, fence) */
        struct ompi_osc_rdma_peer_t **peers;
        /** scalar form: single peer (targeted lock) */
        struct ompi_osc_rdma_peer_t *peer;
    } peer_list;

    /** peers locked on demand (lock-all) */
    opal_list_t demand_locked_peers;

    /** number of peers */
    int num_peers;

    /** set once communication has started on this epoch */
    bool epoch_active;

    /** count of outstanding rdma operations on this epoch */
    ompi_osc_rdma_sync_aligned_counter_t outstanding_rdma __opal_attribute_aligned__(64);

    /** lock protecting the members of this structure */
    opal_mutex_t lock;
} ompi_osc_rdma_sync_t;

OBJ_CLASS_DECLARATION(ompi_osc_rdma_sync_t);

/**
 * @brief allocate a new synchronization object
 *
 * @param[in] module   osc rdma module
 *
 * @returns NULL on failure
 * @returns a new synchronization object on success
 *
 * The matching release call is ompi_osc_rdma_sync_return().
 */
ompi_osc_rdma_sync_t *ompi_osc_rdma_sync_allocate (struct ompi_osc_rdma_module_t *module);

/**
 * @brief release a synchronization object
 *
 * @param[in] rdma_sync   synchronization object allocated by ompi_osc_rdma_sync_allocate()
 *
 * NOTE(review): rdma_sync presumably must not be used after this call --
 * confirm against the implementation.
 */
void ompi_osc_rdma_sync_return (ompi_osc_rdma_sync_t *rdma_sync);

/**
 * @brief check if the target is part of a PSCW access epoch
 *
 * @param[in] module   osc rdma module
 * @param[in] target   target rank
 * @param[out] peer    peer object
 *
 * @returns false if the window is not in a PSCW access epoch or the peer is not
 *          in the group passed to MPI_Win_start
 * @returns true otherwise
 *
 * This function verifies the target is part of an active PSCW access epoch.
 */
bool ompi_osc_rdma_sync_pscw_peer (struct ompi_osc_rdma_module_t *module, int target, struct ompi_osc_rdma_peer_t **peer);


/**
 * @brief read the outstanding rdma operation counter of a synchronization object
 *
 * @param[in] rdma_sync   synchronization object to query
 *
 * @returns the current value of rdma_sync->outstanding_rdma.counter
 */
static inline int64_t ompi_osc_rdma_sync_get_count (ompi_osc_rdma_sync_t *rdma_sync)
{
    const int64_t outstanding = rdma_sync->outstanding_rdma.counter;

    return outstanding;
}

#endif /* OSC_RDMA_SYNC_H */
Revert "Update to sync with OMPI master and cleanup to build" This reverts commit cb55c88a8b7817d5891ff06a447ea190b0e77479. 2016-11-22 15:03:20 -08:00			`/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */`
			`/*`
osc/rdma: performance improvments and bug fixes This commit is a large update to the osc/rdma component. Included in this commit: - Add support for using hardware atomics for fetch-and-op and single count accumulate when using the accumulate lock. This will improve the performance of these operations even when not setting the single intrinsic info key. - Rework how large accumulates are done. They now block on the get operation to fix some bugs discovered by an IBM one-sided test. I may roll back some of the changes if the underlying bug in the original design is discovered. There appear to be no real difference (on the hardware this was tested with) in performance so its probably a non-issue. References #2530. - Add support for an additional lock-all algorithm: on-demand. The on-demand algorithm will attempt to acquire the peer lock when starting an RMA operation. The lock algorithm default has not changed. The algorithm can be selected by setting the osc_rdma_locking_mode MCA variable. The valid values are two_level and on_demand. - Make use of the btl_flush function if available. This can improve performance with some btls. - When using btl_flush do not keep track of the number of put operations. This reduces the number of atomic operations in the critical path. - Make the window buffers more friendly to multi-threaded applications. This was done by dropping support for multiple buffers per MPI window. I intend to re-add that support once the underlying performance bug under the old buffering scheme is fixed. - Fix a bug in request completion in the accumulate, get, and put paths. This also helps with #2530. - General code cleanup and fixes. Signed-off-by: Nathan Hjelm <hjelmn@lanl.gov> 2018-03-15 12:20:41 -06:00			`* Copyright (c) 2015-2018 Los Alamos National Security, LLC. All rights`
Revert "Update to sync with OMPI master and cleanup to build" This reverts commit cb55c88a8b7817d5891ff06a447ea190b0e77479. 2016-11-22 15:03:20 -08:00			`* reserved.`
			`* $COPYRIGHT$`
			`*`
			`* Additional copyrights may follow`
			`*`
			`* $HEADER$`
			`*/`

			`#if !defined(OSC_RDMA_SYNC_H)`
			`#define OSC_RDMA_SYNC_H`

			`#include "osc_rdma_types.h"`
			`#include "opal/class/opal_object.h"`
			`#include "opal/threads/threads.h"`

			`/**`
			`* @brief synchronization types`
			`*/`
			`enum ompi_osc_rdma_sync_type_t {`
			`/** default value */`
			`OMPI_OSC_RDMA_SYNC_TYPE_NONE,`
			`/** lock access epoch */`
			`OMPI_OSC_RDMA_SYNC_TYPE_LOCK,`
			`/** fence access epoch */`
			`OMPI_OSC_RDMA_SYNC_TYPE_FENCE,`
			`/* post-start-complete-wait access epoch */`
			`OMPI_OSC_RDMA_SYNC_TYPE_PSCW,`
			`};`
			`typedef enum ompi_osc_rdma_sync_type_t ompi_osc_rdma_sync_type_t;`

			`struct ompi_osc_rdma_module_t;`

osc/rdma: performance improvments and bug fixes This commit is a large update to the osc/rdma component. Included in this commit: - Add support for using hardware atomics for fetch-and-op and single count accumulate when using the accumulate lock. This will improve the performance of these operations even when not setting the single intrinsic info key. - Rework how large accumulates are done. They now block on the get operation to fix some bugs discovered by an IBM one-sided test. I may roll back some of the changes if the underlying bug in the original design is discovered. There appear to be no real difference (on the hardware this was tested with) in performance so its probably a non-issue. References #2530. - Add support for an additional lock-all algorithm: on-demand. The on-demand algorithm will attempt to acquire the peer lock when starting an RMA operation. The lock algorithm default has not changed. The algorithm can be selected by setting the osc_rdma_locking_mode MCA variable. The valid values are two_level and on_demand. - Make use of the btl_flush function if available. This can improve performance with some btls. - When using btl_flush do not keep track of the number of put operations. This reduces the number of atomic operations in the critical path. - Make the window buffers more friendly to multi-threaded applications. This was done by dropping support for multiple buffers per MPI window. I intend to re-add that support once the underlying performance bug under the old buffering scheme is fixed. - Fix a bug in request completion in the accumulate, get, and put paths. This also helps with #2530. - General code cleanup and fixes. Signed-off-by: Nathan Hjelm <hjelmn@lanl.gov> 2018-03-15 12:20:41 -06:00			`struct ompi_osc_rdma_sync_aligned_counter_t {`
opal: add types for atomic variables This commit updates the entire codebase to use specific opal types for all atomic variables. This is a change from the prior atomic support which required the use of the volatile keyword. This is the first step towards implementing support for C11 atomics as that interface requires the use of types declared with the _Atomic keyword. Signed-off-by: Nathan Hjelm <hjelmn@lanl.gov> 2018-07-11 13:34:03 -06:00			`osc_rdma_atomic_counter_t counter;`
osc/rdma: performance improvments and bug fixes This commit is a large update to the osc/rdma component. Included in this commit: - Add support for using hardware atomics for fetch-and-op and single count accumulate when using the accumulate lock. This will improve the performance of these operations even when not setting the single intrinsic info key. - Rework how large accumulates are done. They now block on the get operation to fix some bugs discovered by an IBM one-sided test. I may roll back some of the changes if the underlying bug in the original design is discovered. There appear to be no real difference (on the hardware this was tested with) in performance so its probably a non-issue. References #2530. - Add support for an additional lock-all algorithm: on-demand. The on-demand algorithm will attempt to acquire the peer lock when starting an RMA operation. The lock algorithm default has not changed. The algorithm can be selected by setting the osc_rdma_locking_mode MCA variable. The valid values are two_level and on_demand. - Make use of the btl_flush function if available. This can improve performance with some btls. - When using btl_flush do not keep track of the number of put operations. This reduces the number of atomic operations in the critical path. - Make the window buffers more friendly to multi-threaded applications. This was done by dropping support for multiple buffers per MPI window. I intend to re-add that support once the underlying performance bug under the old buffering scheme is fixed. - Fix a bug in request completion in the accumulate, get, and put paths. This also helps with #2530. - General code cleanup and fixes. Signed-off-by: Nathan Hjelm <hjelmn@lanl.gov> 2018-03-15 12:20:41 -06:00			`/* pad out to next cache line */`
			`uint64_t padding[7];`
			`};`
			`typedef struct ompi_osc_rdma_sync_aligned_counter_t ompi_osc_rdma_sync_aligned_counter_t;`

Revert "Update to sync with OMPI master and cleanup to build" This reverts commit cb55c88a8b7817d5891ff06a447ea190b0e77479. 2016-11-22 15:03:20 -08:00			`/**`
			`* @brief synchronization object`
			`*`
			`* This structure holds information about an access epoch.`
			`*/`
			`struct ompi_osc_rdma_sync_t {`
			`opal_object_t super;`

			`/** osc rdma module */`
			`struct ompi_osc_rdma_module_t *module;`

			`/** synchronization type */`
			`ompi_osc_rdma_sync_type_t type;`

			`/** synchronization data */`
			`union {`
			`/** lock specific synchronization data */`
			`struct {`
			`/** lock target rank (-1 for all) */`
			`int target;`

			`/** lock type: MPI_LOCK_SHARED, MPI_LOCK_EXCLUSIVE */`
			`int16_t type;`

			`/** assert specified at lock acquire time. at this time Open MPI`
			`* only uses 5-bits for asserts. if this number goes over 16 this`
			`* will need to be changed to accommodate. */`
			`int16_t assert;`
			`} lock;`

			`/** post/start/complete/wait specific synchronization data */`
			`struct {`
			`/** group passed to ompi_osc_rdma_start */`
			`ompi_group_t *group;`
			`} pscw;`
			`} sync;`

			`/** array of peers for this sync */`
			`union {`
			`/** multiple peers (lock all, pscw, fence) */`
			`struct ompi_osc_rdma_peer_t **peers;`
			`/** single peer (targeted lock) */`
			`struct ompi_osc_rdma_peer_t *peer;`
			`} peer_list;`

osc/rdma: performance improvments and bug fixes This commit is a large update to the osc/rdma component. Included in this commit: - Add support for using hardware atomics for fetch-and-op and single count accumulate when using the accumulate lock. This will improve the performance of these operations even when not setting the single intrinsic info key. - Rework how large accumulates are done. They now block on the get operation to fix some bugs discovered by an IBM one-sided test. I may roll back some of the changes if the underlying bug in the original design is discovered. There appear to be no real difference (on the hardware this was tested with) in performance so its probably a non-issue. References #2530. - Add support for an additional lock-all algorithm: on-demand. The on-demand algorithm will attempt to acquire the peer lock when starting an RMA operation. The lock algorithm default has not changed. The algorithm can be selected by setting the osc_rdma_locking_mode MCA variable. The valid values are two_level and on_demand. - Make use of the btl_flush function if available. This can improve performance with some btls. - When using btl_flush do not keep track of the number of put operations. This reduces the number of atomic operations in the critical path. - Make the window buffers more friendly to multi-threaded applications. This was done by dropping support for multiple buffers per MPI window. I intend to re-add that support once the underlying performance bug under the old buffering scheme is fixed. - Fix a bug in request completion in the accumulate, get, and put paths. This also helps with #2530. - General code cleanup and fixes. Signed-off-by: Nathan Hjelm <hjelmn@lanl.gov> 2018-03-15 12:20:41 -06:00			`/** demand locked peers (lock-all) */`
			`opal_list_t demand_locked_peers;`

Revert "Update to sync with OMPI master and cleanup to build" This reverts commit cb55c88a8b7817d5891ff06a447ea190b0e77479. 2016-11-22 15:03:20 -08:00			`/** number of peers */`
			`int num_peers;`

			`/** communication has started on this epoch */`
			`bool epoch_active;`

			`/** outstanding rdma operations on epoch */`
osc/rdma: performance improvments and bug fixes This commit is a large update to the osc/rdma component. Included in this commit: - Add support for using hardware atomics for fetch-and-op and single count accumulate when using the accumulate lock. This will improve the performance of these operations even when not setting the single intrinsic info key. - Rework how large accumulates are done. They now block on the get operation to fix some bugs discovered by an IBM one-sided test. I may roll back some of the changes if the underlying bug in the original design is discovered. There appear to be no real difference (on the hardware this was tested with) in performance so its probably a non-issue. References #2530. - Add support for an additional lock-all algorithm: on-demand. The on-demand algorithm will attempt to acquire the peer lock when starting an RMA operation. The lock algorithm default has not changed. The algorithm can be selected by setting the osc_rdma_locking_mode MCA variable. The valid values are two_level and on_demand. - Make use of the btl_flush function if available. This can improve performance with some btls. - When using btl_flush do not keep track of the number of put operations. This reduces the number of atomic operations in the critical path. - Make the window buffers more friendly to multi-threaded applications. This was done by dropping support for multiple buffers per MPI window. I intend to re-add that support once the underlying performance bug under the old buffering scheme is fixed. - Fix a bug in request completion in the accumulate, get, and put paths. This also helps with #2530. - General code cleanup and fixes. Signed-off-by: Nathan Hjelm <hjelmn@lanl.gov> 2018-03-15 12:20:41 -06:00			`ompi_osc_rdma_sync_aligned_counter_t outstanding_rdma __opal_attribute_aligned__(64);`
Revert "Update to sync with OMPI master and cleanup to build" This reverts commit cb55c88a8b7817d5891ff06a447ea190b0e77479. 2016-11-22 15:03:20 -08:00
			`/** lock to protect sync structure members */`
			`opal_mutex_t lock;`
			`};`
			`typedef struct ompi_osc_rdma_sync_t ompi_osc_rdma_sync_t;`

			`OBJ_CLASS_DECLARATION(ompi_osc_rdma_sync_t);`

			`/**`
			`* @brief allocate a new synchronization object`
			`*`
			`* @param[in] module osc rdma module`
			`*`
			`* @returns NULL on failure`
			`* @returns a new synchronization object on success`
			`*/`
			`ompi_osc_rdma_sync_t *ompi_osc_rdma_sync_allocate (struct ompi_osc_rdma_module_t *module);`

			`/**`
			`* @brief release a synchronization object`
			`*`
			`* @param[in] rdma_sync synchronization object allocated by ompi_osc_rdma_sync_allocate()`
			`*/`
			`void ompi_osc_rdma_sync_return (ompi_osc_rdma_sync_t *rdma_sync);`

			`/**`
			`* Check if the target is part of a PSCW access epoch`
			`*`
			`* @param[in] module osc rdma module`
			`* @param[in] target target rank`
			`* @param[out] peer peer object`
			`*`
			`* @returns false if the window is not in a PSCW access epoch or the peer is not`
			`* in the group passed to MPI_Win_start`
			`* @returns true otherwise`
			`*`
			`* This functions verifies the target is part of an active PSCW access epoch.`
			`*/`
			`bool ompi_osc_rdma_sync_pscw_peer (struct ompi_osc_rdma_module_t *module, int target, struct ompi_osc_rdma_peer_t **peer);`


osc/rdma: performance improvments and bug fixes This commit is a large update to the osc/rdma component. Included in this commit: - Add support for using hardware atomics for fetch-and-op and single count accumulate when using the accumulate lock. This will improve the performance of these operations even when not setting the single intrinsic info key. - Rework how large accumulates are done. They now block on the get operation to fix some bugs discovered by an IBM one-sided test. I may roll back some of the changes if the underlying bug in the original design is discovered. There appear to be no real difference (on the hardware this was tested with) in performance so its probably a non-issue. References #2530. - Add support for an additional lock-all algorithm: on-demand. The on-demand algorithm will attempt to acquire the peer lock when starting an RMA operation. The lock algorithm default has not changed. The algorithm can be selected by setting the osc_rdma_locking_mode MCA variable. The valid values are two_level and on_demand. - Make use of the btl_flush function if available. This can improve performance with some btls. - When using btl_flush do not keep track of the number of put operations. This reduces the number of atomic operations in the critical path. - Make the window buffers more friendly to multi-threaded applications. This was done by dropping support for multiple buffers per MPI window. I intend to re-add that support once the underlying performance bug under the old buffering scheme is fixed. - Fix a bug in request completion in the accumulate, get, and put paths. This also helps with #2530. - General code cleanup and fixes. Signed-off-by: Nathan Hjelm <hjelmn@lanl.gov> 2018-03-15 12:20:41 -06:00			`static inline int64_t ompi_osc_rdma_sync_get_count (ompi_osc_rdma_sync_t *rdma_sync)`
Revert "Update to sync with OMPI master and cleanup to build" This reverts commit cb55c88a8b7817d5891ff06a447ea190b0e77479. 2016-11-22 15:03:20 -08:00			`{`
osc/rdma: performance improvments and bug fixes This commit is a large update to the osc/rdma component. Included in this commit: - Add support for using hardware atomics for fetch-and-op and single count accumulate when using the accumulate lock. This will improve the performance of these operations even when not setting the single intrinsic info key. - Rework how large accumulates are done. They now block on the get operation to fix some bugs discovered by an IBM one-sided test. I may roll back some of the changes if the underlying bug in the original design is discovered. There appear to be no real difference (on the hardware this was tested with) in performance so its probably a non-issue. References #2530. - Add support for an additional lock-all algorithm: on-demand. The on-demand algorithm will attempt to acquire the peer lock when starting an RMA operation. The lock algorithm default has not changed. The algorithm can be selected by setting the osc_rdma_locking_mode MCA variable. The valid values are two_level and on_demand. - Make use of the btl_flush function if available. This can improve performance with some btls. - When using btl_flush do not keep track of the number of put operations. This reduces the number of atomic operations in the critical path. - Make the window buffers more friendly to multi-threaded applications. This was done by dropping support for multiple buffers per MPI window. I intend to re-add that support once the underlying performance bug under the old buffering scheme is fixed. - Fix a bug in request completion in the accumulate, get, and put paths. This also helps with #2530. - General code cleanup and fixes. Signed-off-by: Nathan Hjelm <hjelmn@lanl.gov> 2018-03-15 12:20:41 -06:00			`return rdma_sync->outstanding_rdma.counter;`
Revert "Update to sync with OMPI master and cleanup to build" This reverts commit cb55c88a8b7817d5891ff06a447ea190b0e77479. 2016-11-22 15:03:20 -08:00			`}`

			`#endif /* OSC_RDMA_SYNC_H */`