openmpi/ompi/mca/coll/sm/coll_sm_barrier.c

/*
 * Copyright (c) 2004-2005 The Trustees of Indiana University.
 *                         All rights reserved.
 * Copyright (c) 2004-2005 The Trustees of the University of Tennessee.
 *                         All rights reserved.
 * Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
 *                         University of Stuttgart.  All rights reserved.
 * Copyright (c) 2004-2005 The Regents of the University of California.
 *                         All rights reserved.
 * $COPYRIGHT$
 *
 * Additional copyrights may follow
 *
 * $HEADER$
 */
/** @file */
#include "ompi_config.h"
#include "ompi/include/constants.h"
#include "ompi/communicator/communicator.h"
#include "ompi/mca/coll/coll.h"
#include "opal/include/sys/atomic.h"
#include "coll_sm.h"

/**
 * Shared memory barrier.
 *
 * Tree-based algorithm for a barrier: a fan-in to rank 0 followed by
 * a fan-out, using only the control segments in the shared memory
 * area.  The data segments are not used.
 *
 * During the fan-in, I wait for all N of my children to report in;
 * each child atomically increments a uint32_t in my "in" control
 * segment.  Once that value equals N, I atomically increment the
 * corresponding counter in my parent's "in" control segment.
 *
 * If I have no parent (i.e., I am rank 0) and all N children have
 * reported in, the fan-out begins: I write a 1 into each of my
 * children's "out" control segments.  Once the children see the 1,
 * they do the same for their own children.
 */
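
/*
 * Illustrative example (the actual tree shape comes from
 * data->mcb_tree, built elsewhere in this module): with 4 ranks in a
 * tree where rank 0 has children 1 and 2, and rank 1 has child 3:
 *
 *   fan in:  rank 3 increments rank 1's "in" counter; ranks 1 and 2
 *            increment rank 0's "in" counter (rank 1 only after it
 *            has seen rank 3);
 *   fan out: rank 0 writes a 1 into the "out" words of ranks 1 and 2;
 *            rank 1 does the same for rank 3, and all ranks leave the
 *            barrier.
 */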
int mca_coll_sm_barrier_intra(struct ompi_communicator_t *comm)
{
    int rank, buffer_set;
    mca_coll_base_comm_t *data;
    uint32_t i, num_children;
    uint32_t *me_in, *me_out, *parent, *children;
    int uint_control_size;
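
    /* Number of uint32_t elements in one control segment; the control
       pointers below are uint32_t*, so this is the element stride from
       one control segment to the next. */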
    uint_control_size =
        mca_coll_sm_component.sm_control_size / sizeof(uint32_t);
    data = comm->c_coll_selected_data;
    rank = ompi_comm_rank(comm);
    num_children = data->mcb_tree[rank].mcstn_num_children;
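
    /* Alternate between two sets of control words ("buffer sets") on
       successive barriers, so that a process racing ahead into the
       next barrier cannot disturb control words still in use by a
       slower process finishing this one.  Each set consists of an
       "in" and an "out" control segment, so the set index is 0 or 2. */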
    buffer_set = ((data->mcb_barrier_count++) % 2) * 2;
    me_in = &data->mcb_barrier_control_me[buffer_set];
    me_out = (uint32_t*)
        (((char*) me_in) + mca_coll_sm_component.sm_control_size);
    D(("rank %d barrier set %d: in %p, out %p\n", rank, buffer_set, me_in, me_out));

    /* Wait for my children to write to my *in* buffer */

    if (0 != num_children) {

        /* Get children *out* buffer */
        children = data->mcb_barrier_control_children + buffer_set +
            uint_control_size;

        D(("rank %d waiting for fan in from %d children...\n", rank, num_children));
        while (*me_in != num_children) {
            continue;
        }
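
        /* All children have checked in; reset the counter so that this
           buffer set is clean for its next use (two barriers from now). */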
        *me_in = 0;
        D(("rank %d got fan in\n", rank));
    }

    /* Send to my parent and wait for a response (don't poll on
       parent's out buffer -- that would cause a lot of network
       traffic / contention / faults / etc.  Instead, children poll on
       local memory and therefore only num_children messages are sent
       across the network [vs. num_children *each* time all the
       children poll] -- i.e., the memory is only being polled by one
       process, and it is only changed *once* by an external
       process) */

    if (0 != rank) {

        /* Get parent *in* buffer */
        parent = &data->mcb_barrier_control_parent[buffer_set];

        D(("rank %d writing to parent\n", rank));
        opal_atomic_add(parent, 1);

        D(("rank %d waiting for fan out from parent: %p\n", rank, me_out));
        while (0 == *me_out) {
            continue;
        }

        D(("rank %d got fan out from parent\n", rank));
        *me_out = 0;
    }

    /* Send to my children */
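    /* Fan out: write a 1 into each child's "out" control word for this
       buffer set; the children are spin-waiting on these words in the
       fan-out loop above.  The stride of (uint_control_size * 4)
       appears to reflect that each child's barrier control area spans
       four control segments (two buffer sets, each with an "in" and an
       "out" segment). */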
    for (i = 0; i < num_children; ++i) {
        D(("rank %d sending to child %d: %p\n", rank, i,
           children + (i * uint_control_size * 4)));
        children[i * uint_control_size * 4] = 1;
    }

    D(("rank %d done with barrier\n", rank));

    /* All done!  End state of the control segment:
         me_in:  0
         me_out: 0
    */

    return OMPI_SUCCESS;
}