openmpi/ompi/mca/coll/sm/coll_sm_barrier.c

/*
 * Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
 *                         University Research and Technology
 *                         Corporation.  All rights reserved.
 * Copyright (c) 2004-2005 The University of Tennessee and The University
 *                         of Tennessee Research Foundation.  All rights
 *                         reserved.
 * Copyright (c) 2004-2005 High Performance Computing Center Stuttgart, 
 *                         University of Stuttgart.  All rights reserved.
 * Copyright (c) 2004-2005 The Regents of the University of California.
 *                         All rights reserved.
 * $COPYRIGHT$
 * 
 * Additional copyrights may follow
 * 
 * $HEADER$
 */
/** @file */

#include "ompi_config.h"

#include "ompi/constants.h"
#include "ompi/communicator/communicator.h"
#include "ompi/mca/coll/coll.h"
#include "opal/sys/atomic.h"
#include "coll_sm.h"

/**
 * Shared memory barrier.
 *
 * Tree-based algorithm for a barrier: a fan in to rank 0 followed by
 * a fan out using the barrier segments in the shared memory area.
 *
 * There are 2 sets of barrier buffers -- since there can only be, at
 * most, 2 outstanding barriers at any time, there is no need for more
 * than this.  The generalized in-use flags, control, and data
 * segments are not used.
 *
 * The general algorithm is for a given process to wait for its N
 * children to fan in by monitoring a uint32_t in its barrier "in"
 * buffer.  When this value reaches N (i.e., each of the children has
 * atomically incremented the value), the process atomically
 * increments the uint32_t in its parent's "in" buffer.  Then the
 * process waits for the parent to set a "1" in the process' "out"
 * buffer.  Once this happens, the process writes a "1" in each of its
 * children's "out" buffers, and returns.
 *
 * There are corner cases, of course: rank 0 has no parent (it skips
 * the "notify parent / wait for release" step), and leaves have no
 * children (they skip the fan-in wait and the fan-out writes).
 *
 * @param comm    Communicator on which the barrier is performed; only
 *                the calling process' rank is read from it here.
 * @param module  Collective module for this communicator; treated as
 *                a mca_coll_sm_module_t.  It is lazily enabled on the
 *                first call if it has not been enabled yet.
 *
 * @return OMPI_SUCCESS once the barrier has completed, or the error
 *         code returned by ompi_coll_sm_lazy_enable() if the module
 *         could not be enabled.
 */
int mca_coll_sm_barrier_intra(struct ompi_communicator_t *comm,
                              mca_coll_base_module_t *module)
{
    int rank, buffer_set;
    mca_coll_sm_comm_t *data;
    uint32_t i, num_children;
    volatile uint32_t *me_in, *me_out, *children = NULL;
    int uint_control_size;
    mca_coll_sm_module_t *sm_module = (mca_coll_sm_module_t*) module;

    /* Lazily enable the module the first time we invoke a collective
       on it */
    if (!sm_module->enabled) {
        int ret;
        if (OMPI_SUCCESS != (ret = ompi_coll_sm_lazy_enable(module, comm))) {
            return ret;
        }
    }

    /* Size of one control segment, expressed in uint32_t elements so
       that it can be used directly in uint32_t pointer arithmetic */
    uint_control_size =
        mca_coll_sm_component.sm_control_size / sizeof(uint32_t);
    data = sm_module->sm_comm_data;
    rank = ompi_comm_rank(comm);
    num_children = data->mcb_tree[rank].mcstn_num_children;

    /* Alternate between the two buffer sets on successive barriers.
       NOTE(review): the resulting offset (0 or 2) is applied as a
       uint32_t index below -- confirm against the layout established
       when the barrier control segments are set up. */
    buffer_set = ((data->mcb_barrier_count++) % 2) * 2;
    me_in = &data->mcb_barrier_control_me[buffer_set];

    /* My *out* flag sits exactly one control segment (in bytes) past
       my *in* flag.  Compute it once, in bytes, and keep the volatile
       qualifier since this memory is polled below. */
    me_out = (volatile uint32_t*)
        (((volatile char*) me_in) + mca_coll_sm_component.sm_control_size);

    /* Wait for my children to write to my *in* buffer */

    if (0 != num_children) {
        /* Get children *out* buffer */
        children = data->mcb_barrier_control_children + buffer_set +
            uint_control_size;
        SPIN_CONDITION(*me_in == num_children, exit_label1);
        /* Reset the counter so this buffer set is clean for reuse */
        *me_in = 0;
    }

    /* Send to my parent and wait for a response (don't poll on
       parent's out buffer -- that would cause a lot of network
       traffic / contention / faults / etc.  Instead, children poll on
       local memory and therefore only num_children messages are sent
       across the network [vs. num_children *each* time all the
       children poll] -- i.e., the memory is only being polled by one
       process, and it is only changed *once* by an external
       process) */

    if (0 != rank) {
        /* Get parent *in* buffer */
        volatile uint32_t *parent =
            &data->mcb_barrier_control_parent[buffer_set];
        opal_atomic_add(parent, 1);

        SPIN_CONDITION(0 != *me_out, exit_label2);
        /* Reset the release flag so this buffer set is clean for reuse */
        *me_out = 0;
    }

    /* Send to my children.  Consecutive children's *out* flags are
       (uint_control_size * 4) uint32_t elements apart -- presumably 2
       buffer sets x (in, out) control segments per process; verify
       against the segment setup code.  When num_children is 0 the
       loop does not execute, so the NULL "children" pointer is never
       dereferenced. */

    for (i = 0; i < num_children; ++i) {
        children[i * uint_control_size * 4] = 1;
    }

    /* All done!  End state of the control segment:

       me_in: 0
       me_out: 0
    */

    return OMPI_SUCCESS;
}
Template functions for Shared Memory (SM) collective module - no implementation This commit was SVN r2252. 2004-08-21 04:49:07 +04:00			`/*`
Update the copyright notices for IU and UTK. This commit was SVN r7999. 2005-11-05 22:57:48 +03:00			`* Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana`
			`* University Research and Technology`
			`* Corporation. All rights reserved.`
			`* Copyright (c) 2004-2005 The University of Tennessee and The University`
			`* of Tennessee Research Foundation. All rights`
			`* reserved.`
Add HLRS copyright This commit was SVN r3665. 2004-11-28 23:09:25 +03:00			`* Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,`
			`* University of Stuttgart. All rights reserved.`
Add UC copyright This commit was SVN r5009. 2005-03-24 15:43:37 +03:00			`* Copyright (c) 2004-2005 The Regents of the University of California.`
			`* All rights reserved.`
First cut at copyrights: IU, UTK, and some OSU. LANL and HLRS still pending. This commit was SVN r3655. 2004-11-22 04:38:40 +03:00			`* $COPYRIGHT$`
			`*`
			`* Additional copyrights may follow`
			`*`
Template functions for Shared Memory (SM) collective module - no implementation This commit was SVN r2252. 2004-08-21 04:49:07 +04:00			`* $HEADER$`
			`*/`
Tree-based barrier and broadcast seem to be working. Now on to reduce / allreduce... This commit was SVN r7149. 2005-09-02 16:57:47 +04:00			`/** @file */`
Template functions for Shared Memory (SM) collective module - no implementation This commit was SVN r2252. 2004-08-21 04:49:07 +04:00
			`#include "ompi_config.h"`

Next step in the project split, mainly source code re-arranging - move files out of toplevel include/ and etc/, moving it into the sub-projects - rather than including config headers with <project>/include, have them as <project> - require all headers to be included with a project prefix, with the exception of the config headers ({opal,orte,ompi}_config.h mpi.h, and mpif.h) This commit was SVN r8985. 2006-02-12 04:33:29 +03:00			`#include "ompi/constants.h"`
First cut of sm coll component infrastrcutre (this is what took so much time) and somewhat-lame implementation of barrier (need to precompute some more stuff rather than calculate it every time). Checkpointing so I can try this on another machine... This commit was SVN r6985. 2005-08-24 01:22:00 +04:00			`#include "ompi/communicator/communicator.h"`
			`#include "ompi/mca/coll/coll.h"`
Next step in the project split, mainly source code re-arranging - move files out of toplevel include/ and etc/, moving it into the sub-projects - rather than including config headers with <project>/include, have them as <project> - require all headers to be included with a project prefix, with the exception of the config headers ({opal,orte,ompi}_config.h mpi.h, and mpif.h) This commit was SVN r8985. 2006-02-12 04:33:29 +03:00			`#include "opal/sys/atomic.h"`
Template functions for Shared Memory (SM) collective module - no implementation This commit was SVN r2252. 2004-08-21 04:49:07 +04:00			`#include "coll_sm.h"`

Tree-based barrier and broadcast seem to be working. Now on to reduce / allreduce... This commit was SVN r7149. 2005-09-02 16:57:47 +04:00			`/**`
			`* Shared memory barrier.`
			`*`
Update the docs on the actual algorithms used This commit was SVN r7216. 2005-09-07 19:46:33 +04:00			`* Tree-based algorithm for a barrier: a fan in to rank 0 followed by`
			`* a fan out using the barrier segments in the shared memory area.`
Tree-based barrier and broadcast seem to be working. Now on to reduce / allreduce... This commit was SVN r7149. 2005-09-02 16:57:47 +04:00			`*`
Update the docs on the actual algorithms used This commit was SVN r7216. 2005-09-07 19:46:33 +04:00			`* There are 2 sets of barrier buffers -- since there can only be, at`
			`* most, 2 outstanding barriers at any time, there is no need for more`
			`* than this. The generalized in-use flags, control, and data`
			`* segments are not used.`
Template functions for Shared Memory (SM) collective module - no implementation This commit was SVN r2252. 2004-08-21 04:49:07 +04:00			`*`
Update the docs on the actual algorithms used This commit was SVN r7216. 2005-09-07 19:46:33 +04:00			`* The general algorithm is for a given process to wait for its N`
			`* children to fan in by monitoring a uint32_t in its barrier "in"`
			`* buffer. When this value reaches N (i.e., each of the children have`
			`* atomically incremented the value), then the process atomically`
			`* increases the uint32_t in its parent's "in" buffer. Then the`
			`* process waits for the parent to set a "1" in the process' "out"`
			`* buffer. Once this happens, the process writes a "1" in each of its`
			`* children's "out" buffers, and returns.`
			`*`
			`* There's corner cases, of course, such as the root that has no`
			`* parent, and the leaves that have no children. But that's the`
			`* general idea.`
Template functions for Shared Memory (SM) collective module - no implementation This commit was SVN r2252. 2004-08-21 04:49:07 +04:00			`*/`
Update collectives selection logic to allow for multiple components to be used at nce (up to one unique collective module per collective function). Matches r15795:15921 of the tmp/bwb-coll-select branch This commit was SVN r15924. The following SVN revisions from the original message are invalid or inconsistent and therefore were not cross-referenced: r15795 r15921 2007-08-19 07:37:49 +04:00			`int mca_coll_sm_barrier_intra(struct ompi_communicator_t *comm,`
Fixes trac:1392, #1400 * add "register" function to mca_base_component_t * converted coll:basic and paffinity:linux and paffinity:solaris to use this function * we'll convert the rest over time (I'll file a ticket once all this is committed) * add 32 bytes of "reserved" space to the end of mca_base_component_t and mca_base_component_data_2_0_0_t to make future upgrades [slightly] easier * new mca_base_component_t size: 196 bytes * new mca_base_component_data_2_0_0_t size: 36 bytes * MCA base version bumped to v2.0 * '''We now refuse to load components that are not MCA v2.0.x''' * all MCA frameworks versions bumped to v2.0 * be a little more explicit about version numbers in the MCA base * add big comment in mca.h about versioning philosophy This commit was SVN r19073. The following Trac tickets were found above: Ticket 1392 --> https://svn.open-mpi.org/trac/ompi/ticket/1392 2008-07-29 02:40:57 +04:00			`mca_coll_base_module_t *module)`
Template functions for Shared Memory (SM) collective module - no implementation This commit was SVN r2252. 2004-08-21 04:49:07 +04:00			`{`
Tree-based barrier and broadcast seem to be working. Now on to reduce / allreduce... This commit was SVN r7149. 2005-09-02 16:57:47 +04:00			`int rank, buffer_set;`
Update collectives selection logic to allow for multiple components to be used at nce (up to one unique collective module per collective function). Matches r15795:15921 of the tmp/bwb-coll-select branch This commit was SVN r15924. The following SVN revisions from the original message are invalid or inconsistent and therefore were not cross-referenced: r15795 r15921 2007-08-19 07:37:49 +04:00			`mca_coll_sm_comm_t *data;`
Tree-based barrier and broadcast seem to be working. Now on to reduce / allreduce... This commit was SVN r7149. 2005-09-02 16:57:47 +04:00			`uint32_t i, num_children;`
changed some barrier variables for shared-memory to volatile This commit was SVN r11403. 2006-08-24 20:53:10 +04:00			`volatile uint32_t me_in, me_out, parent, children = NULL;`
Tree-based barrier and broadcast seem to be working. Now on to reduce / allreduce... This commit was SVN r7149. 2005-09-02 16:57:47 +04:00			`int uint_control_size;`
Update collectives selection logic to allow for multiple components to be used at nce (up to one unique collective module per collective function). Matches r15795:15921 of the tmp/bwb-coll-select branch This commit was SVN r15924. The following SVN revisions from the original message are invalid or inconsistent and therefore were not cross-referenced: r15795 r15921 2007-08-19 07:37:49 +04:00			`mca_coll_sm_module_t sm_module = (mca_coll_sm_module_t) module;`
Tree-based barrier and broadcast seem to be working. Now on to reduce / allreduce... This commit was SVN r7149. 2005-09-02 16:57:47 +04:00
Refs trac:2023, #2027. This commit does a bunch of things: * Address all remaining code review items from CMR #2023: * Defer mmap setup to be lazy; only set it up the first time we invoke a collective. In this way, we don't penalize apps that make lots of communicators but don't invoke collectives on them (per #2027). * Remove the extra assignments of mca_coll_sm_one (fixing a convertor count setup that was the real problem). * Remove another extra/unnecessary assignment. * Increase libevent polling frequency when using the RML to bootstrap mmap'ed memory. * Fix a minor procs-related memory leak in btl_sm. * Commit a datatype fix that George and I discovered along the way to fixing the coll sm. * Improve error messages when mmap fails, potentially trying to de-alloc any allocated memory when that happens. * Fix a previously-unnoticed confusion between extent and true_extent in coll sm reduce. This commit was SVN r22049. The following Trac tickets were found above: Ticket 2023 --> https://svn.open-mpi.org/trac/ompi/ticket/2023 2009-10-02 21:13:56 +04:00			`/* Lazily enable the module the first time we invoke a collective`
			`on it */`
			`if (!sm_module->enabled) {`
			`int ret;`
			`if (OMPI_SUCCESS != (ret = ompi_coll_sm_lazy_enable(module, comm))) {`
			`return ret;`
			`}`
			`}`

Tree-based barrier and broadcast seem to be working. Now on to reduce / allreduce... This commit was SVN r7149. 2005-09-02 16:57:47 +04:00			`uint_control_size =`
			`mca_coll_sm_component.sm_control_size / sizeof(uint32_t);`
Fixes trac:1988. The little bug that turned out to be huge. Yoinks. * Various cosmetic/style updates in the btl sm * Clean up concept of mpool module (I think that code was written way back when the concept of "modules" was fuzzy) * Bring over some old fixes from the /tmp/timattox-sm-coll/ tree to fix potential segv's when mmap'ed regions were at different addresses in different processes (thanks Tim!). * Change sm coll to no longer use mpool as its main source of shmem; rather, just mmap its own segment (because it's fixed size -- there was nothing to be gained by using mpool; shedding the use of mpool saved a lot of complexity in the sm coll setup). This effectively made Tim's fixes moot (because now everything is an offset into the mmap that is computed locally; there are no global pointers). :-) * Slightly updated common/sm to allow making mmap's for a specific set of procs (vs. ''all'' procs in the process). This potentially allows for same-host-inter-proc mmaps -- yay! * Fixed many, many things in the coll sm (particularly in reduce): * Fixed handling of MPI_IN_PLACE in reduce and allreduce * Fixed handling of non-contiguous datatypes in reduce * Changed the order of reductions to go from process (n-1)'s data to process 0's data, because that's how all other OMPI coll components work * Fixed lots of usage of ddt functions * When using a non-contiguous datatype, if the root process is not (n-1), now we used a 2nd convertor to copy from shmem to the rbuf (saves a memory copy vs. what was done before) * Lots and lots of little cleanups, clarifications, and minor optimizations (although still more could be done -- e.g., I think the use of write memory barriers is fairly sub-optimal; they could be ganged together at the root, for example) I'm marking this as "fixes trac:1988" and closing the ticket; if something is still broken, we can re-open the ticket. This commit was SVN r21967. 
The following Trac tickets were found above: Ticket 1988 --> https://svn.open-mpi.org/trac/ompi/ticket/1988 2009-09-15 04:25:21 +04:00			`data = sm_module->sm_comm_data;`
Now pre-compute some things rather than compute them during every barrier This commit was SVN r6988. 2005-08-24 02:02:28 +04:00			`rank = ompi_comm_rank(comm);`
Tree-based barrier and broadcast seem to be working. Now on to reduce / allreduce... This commit was SVN r7149. 2005-09-02 16:57:47 +04:00			`num_children = data->mcb_tree[rank].mcstn_num_children;`
			`buffer_set = ((data->mcb_barrier_count++) % 2) * 2;`
			`me_in = &data->mcb_barrier_control_me[buffer_set];`
			`me_out = me_in + uint_control_size;`
			`me_out = (uint32_t*)`
			`(((char*) me_in) + mca_coll_sm_component.sm_control_size);`

			`/* Wait for my children to write to my in buffer */`

			`if (0 != num_children) {`
			`/* Get children out buffer */`
			`children = data->mcb_barrier_control_children + buffer_set +`
			`uint_control_size;`
A slight optimization: no longer call sched_yield() when polling for shmem progress (or the Windows equiv). Instead, poll hard on the condition, but periocially call opal_progress(). This allows badly-formed apps (e.g., the ibm test communicator/bsend_free) to actually complete. To be clear, there are far too many apps out there that assume that MPI collectives will actually progress the rest of MPI. I don't like putting in a feature to enable broken apps, but I have a dim recollection of this issue coming up before (apps "hanging" when testing the sm coll because they assumed that calling collectives would trigger other MPI progress). Rather than have people claim that OMPI is broken, I prefer to put in this "workaround". :-( Indeed, the bsend_free test ''may'' be coded that way for exactly that reason...? I don't remember offhand... This commit was SVN r21984. 2009-09-22 02:20:44 +04:00			`SPIN_CONDITION(*me_in == num_children, exit_label1);`
Tree-based barrier and broadcast seem to be working. Now on to reduce / allreduce... This commit was SVN r7149. 2005-09-02 16:57:47 +04:00			`*me_in = 0;`
First cut of sm coll component infrastrcutre (this is what took so much time) and somewhat-lame implementation of barrier (need to precompute some more stuff rather than calculate it every time). Checkpointing so I can try this on another machine... This commit was SVN r6985. 2005-08-24 01:22:00 +04:00			`}`

Tree-based barrier and broadcast seem to be working. Now on to reduce / allreduce... This commit was SVN r7149. 2005-09-02 16:57:47 +04:00			`/* Send to my parent and wait for a response (don't poll on`
First cut of sm coll component infrastrcutre (this is what took so much time) and somewhat-lame implementation of barrier (need to precompute some more stuff rather than calculate it every time). Checkpointing so I can try this on another machine... This commit was SVN r6985. 2005-08-24 01:22:00 +04:00			`parent's out buffer -- that would cause a lot of network`
Tree-based barrier and broadcast seem to be working. Now on to reduce / allreduce... This commit was SVN r7149. 2005-09-02 16:57:47 +04:00			`traffic / contention / faults / etc. Instead, children poll on`
			`local memory and therefore only num_children messages are sent`
			`across the network [vs. num_children each time all the`
			`children poll] -- i.e., the memory is only being polled by one`
			`process, and it is only changed once by an external`
First cut of sm coll component infrastrcutre (this is what took so much time) and somewhat-lame implementation of barrier (need to precompute some more stuff rather than calculate it every time). Checkpointing so I can try this on another machine... This commit was SVN r6985. 2005-08-24 01:22:00 +04:00			`process) */`

Tree-based barrier and broadcast seem to be working. Now on to reduce / allreduce... This commit was SVN r7149. 2005-09-02 16:57:47 +04:00			`if (0 != rank) {`
			`/* Get parent in buffer */`
			`parent = &data->mcb_barrier_control_parent[buffer_set];`
			`opal_atomic_add(parent, 1);`

A slight optimization: no longer call sched_yield() when polling for shmem progress (or the Windows equiv). Instead, poll hard on the condition, but periocially call opal_progress(). This allows badly-formed apps (e.g., the ibm test communicator/bsend_free) to actually complete. To be clear, there are far too many apps out there that assume that MPI collectives will actually progress the rest of MPI. I don't like putting in a feature to enable broken apps, but I have a dim recollection of this issue coming up before (apps "hanging" when testing the sm coll because they assumed that calling collectives would trigger other MPI progress). Rather than have people claim that OMPI is broken, I prefer to put in this "workaround". :-( Indeed, the bsend_free test ''may'' be coded that way for exactly that reason...? I don't remember offhand... This commit was SVN r21984. 2009-09-22 02:20:44 +04:00			`SPIN_CONDITION(0 != *me_out, exit_label2);`
Tree-based barrier and broadcast seem to be working. Now on to reduce / allreduce... This commit was SVN r7149. 2005-09-02 16:57:47 +04:00			`*me_out = 0;`
First cut of sm coll component infrastrcutre (this is what took so much time) and somewhat-lame implementation of barrier (need to precompute some more stuff rather than calculate it every time). Checkpointing so I can try this on another machine... This commit was SVN r6985. 2005-08-24 01:22:00 +04:00			`}`

Tree-based barrier and broadcast seem to be working. Now on to reduce / allreduce... This commit was SVN r7149. 2005-09-02 16:57:47 +04:00			`/* Send to my children */`
First cut of sm coll component infrastrcutre (this is what took so much time) and somewhat-lame implementation of barrier (need to precompute some more stuff rather than calculate it every time). Checkpointing so I can try this on another machine... This commit was SVN r6985. 2005-08-24 01:22:00 +04:00
Tree-based barrier and broadcast seem to be working. Now on to reduce / allreduce... This commit was SVN r7149. 2005-09-02 16:57:47 +04:00			`for (i = 0; i < num_children; ++i) {`
			`children[i * uint_control_size * 4] = 1;`
First cut of sm coll component infrastrcutre (this is what took so much time) and somewhat-lame implementation of barrier (need to precompute some more stuff rather than calculate it every time). Checkpointing so I can try this on another machine... This commit was SVN r6985. 2005-08-24 01:22:00 +04:00			`}`

			`/* All done! End state of the control segment:`

Tree-based barrier and broadcast seem to be working. Now on to reduce / allreduce... This commit was SVN r7149. 2005-09-02 16:57:47 +04:00			`me_in: 0`
			`me_out: 0`
First cut of sm coll component infrastrcutre (this is what took so much time) and somewhat-lame implementation of barrier (need to precompute some more stuff rather than calculate it every time). Checkpointing so I can try this on another machine... This commit was SVN r6985. 2005-08-24 01:22:00 +04:00			`*/`

			`return OMPI_SUCCESS;`
Template functions for Shared Memory (SM) collective module - no implementation This commit was SVN r2252. 2004-08-21 04:49:07 +04:00			`}`