diff --git a/ompi/mca/bcol/base/bcol_base_frame.c b/ompi/mca/bcol/base/bcol_base_frame.c index 4b829bd6a1..ee421af295 100644 --- a/ompi/mca/bcol/base/bcol_base_frame.c +++ b/ompi/mca/bcol/base/bcol_base_frame.c @@ -1,7 +1,10 @@ +/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */ /* * Copyright (c) 2009-2012 Oak Ridge National Laboratory. All rights reserved. * Copyright (c) 2009-2012 Mellanox Technologies. All rights reserved. - * Copyright (c) 2013 Cisco Systems, Inc. All rights reserved. + * Copyright (c) 2013 Cisco Systems, Inc. All rights reserved. + * Copyright (c) 2013 Los Alamos National Security, LLC. All rights + * reserved. * $COPYRIGHT$ * * Additional copyrights may follow @@ -125,7 +128,6 @@ static int mca_bcol_base_set_components_to_use(opal_list_t *bcol_components_avai opal_list_t *bcol_components_in_use) { /* local variables */ - opal_list_item_t *b_item; const mca_base_component_t *b_component; mca_base_component_list_item_t *b_cli; @@ -156,13 +158,10 @@ static int mca_bcol_base_set_components_to_use(opal_list_t *bcol_components_avai /* loop over list of components requested */ for (i = 0; i < cnt; i++) { /* loop over discovered components */ - for (b_item = opal_list_get_first(bcol_components_avail); - opal_list_get_end(bcol_components_avail) != b_item; - b_item = opal_list_get_next(b_item)) { - - b_cli = (mca_base_component_list_item_t *) b_item; + OPAL_LIST_FOREACH(b_cli, bcol_components_avail, mca_base_component_list_item_t) { b_component = b_cli->cli_component; + b_component_name = b_component->mca_component_name; b_str_len = strlen(b_component_name); diff --git a/ompi/mca/bcol/base/bcol_base_init.c b/ompi/mca/bcol/base/bcol_base_init.c index a3850369b0..5eaa8ab6d1 100644 --- a/ompi/mca/bcol/base/bcol_base_init.c +++ b/ompi/mca/bcol/base/bcol_base_init.c @@ -1,6 +1,9 @@ +/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */ /* * Copyright (c) 2009-2012 Oak Ridge National Laboratory. All rights reserved. * Copyright (c) 2009-2012 Mellanox Technologies. All rights reserved. + * Copyright (c) 2013 Los Alamos National Security, LLC. All rights + * reserved. 
* $COPYRIGHT$ * * Additional copyrights may follow @@ -20,13 +23,9 @@ int mca_bcol_base_init(bool enable_progress_threads, bool enable_mpi_threads) { mca_bcol_base_component_t *bcol_component; mca_base_component_list_item_t *cli; - opal_list_item_t *item; int ret; - for (item = opal_list_get_first((opal_list_t *) &mca_bcol_base_components_in_use); - opal_list_get_end((opal_list_t *) &mca_bcol_base_components_in_use) != item; - item = opal_list_get_next(item)) { - cli = (mca_base_component_list_item_t *) item; + OPAL_LIST_FOREACH(cli, &mca_bcol_base_components_in_use, mca_base_component_list_item_t) { bcol_component = (mca_bcol_base_component_t *) cli->cli_component; if (false == bcol_component->init_done) { diff --git a/ompi/mca/bcol/basesmuma/Makefile.am b/ompi/mca/bcol/basesmuma/Makefile.am index 6f80e17b8e..761d26ec47 100644 --- a/ompi/mca/bcol/basesmuma/Makefile.am +++ b/ompi/mca/bcol/basesmuma/Makefile.am @@ -18,14 +18,22 @@ sources = \ bcol_basesmuma_module.c \ bcol_basesmuma_buf_mgmt.c \ bcol_basesmuma_mem_mgmt.c \ - bcol_basesmuma_fanin.c \ + bcol_basesmuma_fanin.c \ bcol_basesmuma_fanout.c \ bcol_basesmuma_progress.c \ + bcol_basesmuma_reduce.h \ + bcol_basesmuma_reduce.c \ + bcol_basesmuma_allreduce.c \ bcol_basesmuma_setup.c \ - bcol_basesmuma_rk_barrier.c \ + bcol_basesmuma_rd_barrier.c \ bcol_basesmuma_rd_nb_barrier.c \ + bcol_basesmuma_rk_barrier.c \ bcol_basesmuma_utils.c \ bcol_basesmuma_bcast_prime.c \ + bcol_basesmuma_lmsg_knomial_bcast.c \ + bcol_basesmuma_lmsg_bcast.c \ + bcol_basesmuma_gather.c \ + bcol_basesmuma_allgather.c \ bcol_basesmuma_smcm.h \ bcol_basesmuma_smcm.c diff --git a/ompi/mca/bcol/basesmuma/bcol_basesmuma.h b/ompi/mca/bcol/basesmuma/bcol_basesmuma.h index c63ffa4d31..1c6f605bd5 100644 --- a/ompi/mca/bcol/basesmuma/bcol_basesmuma.h +++ b/ompi/mca/bcol/basesmuma/bcol_basesmuma.h @@ -1,7 +1,10 @@ +/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */ /* - * Copyright (c) 2009-2012 Oak Ridge National Laboratory. All rights reserved. + * Copyright (c) 2009-2013 Oak Ridge National Laboratory. All rights reserved. * Copyright (c) 2009-2012 Mellanox Technologies. All rights reserved. - * Copyright (c) 2014 Cisco Systems, Inc. All rights reserved. + * Copyright (c) 2013-2014 Los Alamos National Security, LLC. All rights + * reserved. + * Copyright (c) 2014 Cisco Systems, Inc. All rights reserved. 
* $COPYRIGHT$ * * Additional copyrights may follow @@ -77,28 +80,28 @@ OBJ_CLASS_DECLARATION(list_data_t); #define EXTRACT_FLAG(INPUT, OUTPUT, OUTPUT_TYPE, FIELD_BASE, FIELD_MASK) \ - OUTPUT = (OUTPUT_TYPE) ( (INPUT SHIFT_DOWN FIELD_BASE ) & FIELD_MASK ) + OUTPUT = (OUTPUT_TYPE) ( (INPUT SHIFT_DOWN FIELD_BASE ) & FIELD_MASK ) #define STORE_FLAG(INPUT, OUTPUT, INPUT_TYPE, OUTPUT_TYPE, FIELD_BASE, INPLACE_FIELD_MASK ) \ - OUTPUT = \ - ( \ - /* 3 */ \ - ( \ - /* 2 */ \ - ( \ - /* 1 - shift the input field to the proper location */ \ - (OUTPUT_TYPE)( \ - ((OUTPUT_TYPE)((INPUT_TYPE) (INPUT))) \ - SHIFT_UP FIELD_BASE ) \ - /* mask off the extra bits */ \ - & ((OUTPUT_TYPE)INPLACE_FIELD_MASK) \ - ) \ - /* store back to the OUTPUT field, w/o destroying other fields */ \ - ) | OUTPUT \ - ) + OUTPUT = \ + ( \ + /* 3 */ \ + ( \ + /* 2 */ \ + ( \ + /* 1 - shift the input field to the proper location */ \ + (OUTPUT_TYPE)( \ + ((OUTPUT_TYPE)((INPUT_TYPE) (INPUT))) \ + SHIFT_UP FIELD_BASE ) \ + /* mask off the extra bits */ \ + & ((OUTPUT_TYPE)INPLACE_FIELD_MASK) \ + ) \ + /* store back to the OUTPUT field, w/o destroying other fields */ \ + ) | OUTPUT \ + ) -/** - * Structure to hold the basic shared memory bcoll component. +/** + * Structure to hold the basic shared memory bcoll component. */ struct mca_bcol_basesmuma_component_t { /** Base coll component */ @@ -128,8 +131,8 @@ struct mca_bcol_basesmuma_component_t { size_t mpool_size; - /* mpool inited - will use this to test whether or not the - * shared memory has been inited + /* mpool inited - will use this to test whether or not the + * shared memory has been inited */ bool mpool_inited; @@ -137,7 +140,7 @@ struct mca_bcol_basesmuma_component_t { * in shared memory */ bcol_basesmuma_smcm_mmap_t *sm_ctl_structs; - /* shared memory payload buffer + /* shared memory payload buffer */ bcol_basesmuma_smcm_mmap_t *sm_payload_structs; @@ -147,14 +150,14 @@ struct mca_bcol_basesmuma_component_t { opal_list_t ctl_structures; - /** opal list in which the list of peers that I am "connected" to is stored + /** opal list in which the list of peers that I am "connected" to is stored */ opal_list_t sm_connections_list; /* opal list in which the list of payload peers that I am "connected" to * is stored */ - opal_list_t sm_payload_connections_list; + opal_list_t sm_payload_connections_list; /* * list of non-blocking admin barriers to progress */ @@ -172,7 +175,7 @@ struct mca_bcol_basesmuma_component_t { int radix_fanout; /* - * Order of read tree + * Order of read tree */ int radix_read_tree; @@ -192,17 +195,17 @@ struct mca_bcol_basesmuma_component_t { int scatter_kary_radix; /* - * number of polling loops + * number of polling loops */ int num_to_probe; - /* - * Portals addressing info - * void*: because wanted to keep portal library dependencies - * as local as possible - */ - void *portals_info; - bool portals_init; + /* + * Portals addressing info + * void*: because wanted to keep portal library dependencies + * as local as possible + */ + void *portals_info; + bool portals_init; /* * verbosity level @@ -246,344 +249,400 @@ static inline int mca_bcol_basesmuma_err(const char* fmt, ...) 
} #if OPAL_ENABLE_DEBUG -#define BASESMUMA_VERBOSE(level, args) \ -do { \ - if(mca_bcol_basesmuma_component.verbose >= level) { \ - mca_bcol_basesmuma_err("[%s]%s[%s:%d:%s] BCOL-BASESMUMA ", \ - ompi_process_info.nodename, \ - OMPI_NAME_PRINT(OMPI_PROC_MY_NAME), \ - __FILE__, __LINE__, __func__); \ - mca_bcol_basesmuma_err args; \ - mca_bcol_basesmuma_err("\n"); \ - } \ -} while(0) +#define BASESMUMA_VERBOSE(level, args) \ + do { \ + if(mca_bcol_basesmuma_component.verbose >= level) { \ + mca_bcol_basesmuma_err("[%s]%s[%s:%d:%s] BCOL-BASESMUMA ", \ + ompi_process_info.nodename, \ + OMPI_NAME_PRINT(OMPI_PROC_MY_NAME), \ + __FILE__, __LINE__, __func__); \ + mca_bcol_basesmuma_err args; \ + mca_bcol_basesmuma_err("\n"); \ + } \ + } while(0) #else #define BASESMUMA_VERBOSE(level, args) #endif -/** +/** * Convenience typedef */ typedef struct mca_bcol_basesmuma_component_t mca_bcol_basesmuma_component_t; #if 0 - /* - * Implemented function index list - */ +/* + * Implemented function index list + */ - /* barrier */ - enum{ - FANIN_FAN_OUT_BARRIER_FN, - RECURSIVE_DOUBLING_BARRIER_FN, - N_BARRIER_FNS - }; +/* barrier */ +enum{ + FANIN_FAN_OUT_BARRIER_FN, + RECURSIVE_DOUBLING_BARRIER_FN, + N_BARRIER_FNS +}; - /* reduce */ - enum{ - FANIN_REDUCE_FN, - REDUCE_SCATTER_GATHER_FN, - N_REDUCE_FNS - }; - enum{ - SHORT_DATA_FN_REDUCE, - LONG_DATA_FN_REDUCE, - N_REDUCE_FNS_USED - }; +/* reduce */ +enum{ + FANIN_REDUCE_FN, + REDUCE_SCATTER_GATHER_FN, + N_REDUCE_FNS +}; +enum{ + SHORT_DATA_FN_REDUCE, + LONG_DATA_FN_REDUCE, + N_REDUCE_FNS_USED +}; - /* all-reduce */ - enum{ - FANIN_FANOUT_ALLREDUCE_FN, - REDUCE_SCATTER_ALLGATHER_FN, - N_ALLREDUCE_FNS - }; - enum{ - SHORT_DATA_FN_ALLREDUCE, - LONG_DATA_FN_ALLREDUCE, - N_ALLREDUCE_FNS_USED - }; +/* all-reduce */ +enum{ + FANIN_FANOUT_ALLREDUCE_FN, + REDUCE_SCATTER_ALLGATHER_FN, + N_ALLREDUCE_FNS +}; +enum{ + SHORT_DATA_FN_ALLREDUCE, + LONG_DATA_FN_ALLREDUCE, + N_ALLREDUCE_FNS_USED +}; - /* enum for node type */ - enum{ - ROOT_NODE, - LEAF_NODE, - INTERIOR_NODE - }; +/* enum for node type */ +enum{ + ROOT_NODE, + LEAF_NODE, + INTERIOR_NODE +}; - /* - * N-order tree node description - */ - struct tree_node_t { - /* my rank within the group */ - int my_rank; - /* my node type - root, leaf, or interior */ - int my_node_type; - /* number of nodes in the tree */ - int tree_size; - /* number of parents (0/1) */ - int n_parents; - /* number of children */ - int n_children; - /* parent rank within the group */ - int parent_rank; - /* chidren ranks within the group */ - int *children_ranks; - }; - typedef struct tree_node_t tree_node_t; +/* + * N-order tree node description + */ +struct tree_node_t { + /* my rank within the group */ + int my_rank; + /* my node type - root, leaf, or interior */ + int my_node_type; + /* number of nodes in the tree */ + int tree_size; + /* number of parents (0/1) */ + int n_parents; + /* number of children */ + int n_children; + /* parent rank within the group */ + int parent_rank; + /* chidren ranks within the group */ + int *children_ranks; +}; +typedef struct tree_node_t tree_node_t; - /* - * Pair-wise data exchange - */ - /* enum for node type */ - enum{ - EXCHANGE_NODE, - EXTRA_NODE - }; +/* + * Pair-wise data exchange + */ +/* enum for node type */ +enum{ + EXCHANGE_NODE, + EXTRA_NODE +}; - struct pair_exchange_node_t { +struct pair_exchange_node_t { - /* my rank within the group */ - int my_rank; + /* my rank within the group */ + int my_rank; - /* number of nodes this node will exchange data with */ - int n_exchanges; + /* number of 
nodes this node will exchange data with */ + int n_exchanges; - /* ranks of nodes involved in data exchnge */ - int *rank_exchanges; + /* ranks of nodes involved in data exchnge */ + int *rank_exchanges; - /* number of extra sources of data - outside largest power of 2 in - * this group */ - int n_extra_sources; - - /* rank of the extra source */ - int rank_extra_source; + /* number of extra sources of data - outside largest power of 2 in + * this group */ + int n_extra_sources; - /* number of tags needed per stripe */ - int n_tags; + /* rank of the extra source */ + int rank_extra_source; - /* log 2 of largest full power of 2 for this node set */ - int log_2; + /* number of tags needed per stripe */ + int n_tags; - /* largest power of 2 that fits in this group */ - int n_largest_pow_2; + /* log 2 of largest full power of 2 for this node set */ + int log_2; - /* node type */ - int node_type; + /* largest power of 2 that fits in this group */ + int n_largest_pow_2; - }; - typedef struct pair_exchange_node_t pair_exchange_node_t; + /* node type */ + int node_type; + +}; +typedef struct pair_exchange_node_t pair_exchange_node_t; #endif - /* - * descriptor for managing the admin nonblocking barrier routine. - * This is an sm internal routine, and assumes only 1 outstanding - * nb-barrier collective call per block. +/* + * descriptor for managing the admin nonblocking barrier routine. + * This is an sm internal routine, and assumes only 1 outstanding + * nb-barrier collective call per block. + */ +/* forward declarations */ +struct mca_bcol_basesmuma_module_t; +struct sm_buffer_mgmt; + +struct sm_nbbar_desc_t { + /* make sure we can put this on a list */ + opal_list_item_t super; + + /* phase of the collective operation - needed to know how to continue + * progressing the nb-barrier */ + int collective_phase; + + /* iteration to continue at */ + int recursive_dbl_iteration; + + /* pointer to the collective module this is associated with */ + struct mca_bcol_basesmuma_module_t *sm_module; + + /* pointer to payload/control structs buffers */ + struct sm_buffer_mgmt *coll_buff; + + /* pool index */ + int pool_index; + + /* pointer to the ml_memory_block_desc_t structure + * that is actually managing this registration. + * This is meaningful when these control structures + * are used in conjunction with the user payload + * data that is allocated at the ml level. 
*/ - /* forward declarations */ - struct mca_bcol_basesmuma_module_t; - struct sm_buffer_mgmt; + void *ml_memory_block_descriptor; - struct sm_nbbar_desc_t { - /* make sure we can put this on a list */ - opal_list_item_t super; +}; +typedef struct sm_nbbar_desc_t sm_nbbar_desc_t; - /* phase of the collective operation - needed to know how to continue - * progressing the nb-barrier */ - int collective_phase; +/* + * Barrier request objects + */ - /* iteration to continue at */ - int recursive_dbl_iteration; +/* shared memory data strucutures */ +struct mca_bcol_basesmuma_nb_request_process_shared_mem_t { + volatile uint64_t coll_index; + /* flag used to indicate the status of this memory region */ + volatile uint64_t flag; + volatile uint64_t index; - /* pointer to the collective module this is associated with */ - struct mca_bcol_basesmuma_module_t *sm_module; + /* pading */ + /* Note: need to change this so it takes less memory */ + char padding[BASESMUMA_CACHE_LINE_SIZE-3*sizeof(uint64_t)]; +}; - /* pointer to payload/control structs buffers */ - struct sm_buffer_mgmt *coll_buff; +typedef struct mca_bcol_basesmuma_nb_request_process_shared_mem_t +mca_bcol_basesmuma_nb_request_process_shared_mem_t; - /* pool index */ - int pool_index; +/* enum for phase at which the nb barrier is in */ +enum{ + NB_BARRIER_INACTIVE, - /* pointer to the ml_memory_block_desc_t structure - * that is actually managing this registration. - * This is meaningful when these control structures - * are used in conjunction with the user payload - * data that is allocated at the ml level. - */ - void *ml_memory_block_descriptor; + /* fan-in/fan-out */ + NB_BARRIER_FAN_IN, + NB_BARRIER_FAN_OUT, - }; - typedef struct sm_nbbar_desc_t sm_nbbar_desc_t; + /* recursive doubling */ + NB_PRE_PHASE, + NB_RECURSIVE_DOUBLING, + NB_POST_PHASE, - /* - * Barrier request objects + /* done and not started are the same for all practicle + * purposes, as the init funtion always sets this flag */ - - /* shared memory data strucutures */ - struct mca_bcol_basesmuma_nb_request_process_shared_mem_t { - volatile uint64_t coll_index; - /* flag used to indicate the status of this memory region */ - volatile uint64_t flag; - volatile uint64_t index; - - /* pading */ - /* Note: need to change this so it takes less memory */ - char padding[BASESMUMA_CACHE_LINE_SIZE-3*sizeof(uint64_t)]; - }; - - typedef struct mca_bcol_basesmuma_nb_request_process_shared_mem_t - mca_bcol_basesmuma_nb_request_process_shared_mem_t; - - /* enum for phase at which the nb barrier is in */ - enum{ - NB_BARRIER_INACTIVE, - - /* fan-in/fan-out */ - NB_BARRIER_FAN_IN, - NB_BARRIER_FAN_OUT, - - /* recursive doubling */ - NB_PRE_PHASE, - NB_RECURSIVE_DOUBLING, - NB_POST_PHASE, - - /* done and not started are the same for all practicle - * purposes, as the init funtion always sets this flag - */ - NB_BARRIER_DONE - }; + NB_BARRIER_DONE +}; - /* forward declartion */ - struct mca_bcol_basesmuma_module_t; +/* forward declartion */ +struct mca_bcol_basesmuma_module_t; + +struct mca_basesmuma_ctrl_4_hdl_t { + int fd; + bool status; + volatile char buf[128]; + /*volatile char buf[OPAL_PATH_MAX];*/ +}; +typedef struct mca_basesmuma_ctrl_4_hdl_t mca_basesmuma_ctrl_4_hdl_t; + +/* control segment for shared memory */ +struct mca_bcol_basesmuma_ctl_struct_t { + /* collective identifier */ + volatile int64_t sequence_number; + volatile int64_t flag; + volatile int64_t index; + volatile int64_t offset; + volatile int64_t offset_zip; - /* control segment for shared memory */ - struct 
mca_bcol_basesmuma_ctl_struct_t { - /* collective identifier */ - volatile int64_t sequence_number; - volatile int64_t flag; - volatile int64_t index; - volatile int64_t offset; - volatile int64_t offset_zip; - + /* used for non-blocking algorithms */ + int status; + int active_requests; + int iteration; - /* used for non-blocking algorithms */ - int status; - int active_requests; - int iteration; + int *src_ptr; - int *src_ptr; - - int start; + int start; - /* process private data */ - int starting_flag_value; + /* process private data */ + int starting_flag_value; - /* experiment for large data colls */ - int n_sends; - int length; + /* experiment for large data colls */ + int n_sends; + int length; + /* hdl framework control structure*/ + /* no need to pad at this level anymore */ + volatile int64_t data_hdl; + volatile mca_basesmuma_ctrl_4_hdl_t hdl_ctrl; #ifdef __PORTALS_AVAIL__ - struct mca_bcol_basesmuma_portal_buf_addr_t portals_buf_addr; + struct mca_bcol_basesmuma_portal_buf_addr_t portals_buf_addr; #endif - /* padding */ - /* ok, no room to pad anymore */ - /*char padding[BASESMUMA_CACHE_LINE_SIZE-5*sizeof(int64_t)-8*sizeof(int)];*/ - }; - typedef struct mca_bcol_basesmuma_ctl_struct_t mca_bcol_basesmuma_ctl_struct_t; + /* padding */ + /*char padding[BASESMUMA_CACHE_LINE_SIZE-4*sizeof(uint64_t)-3*sizeof(int)];*/ + char padding[BASESMUMA_CACHE_LINE_SIZE-6*sizeof(int64_t)-5*sizeof(int)]; +}; +typedef struct mca_bcol_basesmuma_ctl_struct_t mca_bcol_basesmuma_ctl_struct_t; #define SM_BCOLS_MAX 2 - enum { - ALLGATHER_FLAG, - ALLREDUCE_FLAG, - BARRIER_FANIN_FLAG, - BARRIER_FANOUT_FLAG, - BARRIER_RKING_FLAG, - BCAST_FLAG, - GATHER_FLAG, - REDUCE_FLAG, - NUM_SIGNAL_FLAGS - }; +/* enum for signaling flag bank, when + * adding to this list, please keep + * it alphabetical + */ +enum { + ALLGATHER_FLAG, + ALLREDUCE_FLAG, + BARRIER_FANIN_FLAG, + BARRIER_FANOUT_FLAG, + BARRIER_RKING_FLAG, + BCAST_FLAG, + GATHER_FLAG, + REDUCE_FLAG, + NUM_SIGNAL_FLAGS +}; - - /* control region for colls with user data - shared memory */ - struct mca_bcol_basesmuma_header_t { - /* collective identifier */ - volatile int64_t sequence_number; - volatile int8_t flags[NUM_SIGNAL_FLAGS][SM_BCOLS_MAX]; - volatile int32_t src; /* src of bcast data for unknown root, - bcol id for known root - */ - /* starting flag - hierarchies */ - int8_t starting_flag_value[SM_BCOLS_MAX]; - }; - typedef struct mca_bcol_basesmuma_header_t mca_bcol_basesmuma_header_t; +/* control region for colls with user data - shared memory */ +struct mca_bcol_basesmuma_header_t { + /* collective identifier */ + volatile int64_t sequence_number; + volatile int8_t flags[NUM_SIGNAL_FLAGS][SM_BCOLS_MAX]; + volatile int32_t src; /* src of bcast data for unknown root, + bcol id for known root + */ + /* starting flag - hierarchies */ + int8_t starting_flag_value[SM_BCOLS_MAX]; + int8_t ready_flag; - /* data needed for large messages */ - struct mca_bcol_basesmuma_large_msg_t { - /* scatter allgather data */ - uint64_t offset; - uint64_t n_sends; - uint64_t length; + /* Manju: Cached array of receive buffer offsets + * + * This array stores the receive buffer offsets (rbuf_offsets) of data buffer. + * In general, we assume that sbuf_offset and rbuf_offset of + * processes invoking the collective primitive is same. This is + * true when the order in which processes invoke their hierarchies are + * same. + * + * For some algorithms (like broadcast, reduce) we split the ML buffer + * and use first half as + * source and second half as receive buffer. 
We swap these buffers for + * each change when we change levels i.e., if first half is source for + * level 1, in the level 2 of hierarchy it becomes the receive buffer. + * For reduce algorithm, each process can invoke hierarchies + * (primitives) in different order. For example, process 1 might have level 1 as SM + * and level 2 as p2p, and process 2 might have different order where its + * level 1 is p2p and level 2 SM. In this case, if in basesmuma reduce + * algorithm, if parent assumes its rbuf_offset as child's rbuf_offset + * it is wrong. So we cache rbuf_offset of each process so + * it could be accessed by processes to obtain the data. + */ - /* portals data */ + volatile int32_t roffsets[SM_BCOLS_MAX]; - }; - typedef struct mca_bcol_basesmuma_large_msg_t mca_bcol_basesmuma_large_msg_t; + /* Manju Start: Experimental ctl fields and should be removed later; + * This is used for lmsg reduce for testing + * during transition to HDL primitives + */ +#if 0 + int lmsg_reduce_snd_completes; + /* There can be atmost 20 ranks in the subgroup. Since this + * only for testing this should be good enough */ + int lmsg_reduce_peers[20]; + int lmsg_reduce_send_offsets[20]; + /* End: Experimental ctl fields */ - /* payload struct */ - struct mca_bcol_basesmuma_payload_t { - /* base pointer to shared memory control structure */ - mca_bcol_basesmuma_header_t *ctl_struct; - void *payload; - - }; + /* no need to pad at this level anymore */ + volatile int64_t data_hdl; +#endif +}; +typedef struct mca_bcol_basesmuma_header_t mca_bcol_basesmuma_header_t; - typedef struct mca_bcol_basesmuma_payload_t mca_bcol_basesmuma_payload_t; +/* data needed for large messages */ +struct mca_bcol_basesmuma_large_msg_t { + /* scatter allgather data */ + uint64_t offset; + uint64_t n_sends; + uint64_t length; + + /* portals data */ + +}; +typedef struct mca_bcol_basesmuma_large_msg_t mca_bcol_basesmuma_large_msg_t; + +/* payload struct */ +struct mca_bcol_basesmuma_payload_t { + + /* base pointer to shared memory control structure */ + mca_bcol_basesmuma_header_t *ctl_struct; + void *payload; + +}; + +typedef struct mca_bcol_basesmuma_payload_t mca_bcol_basesmuma_payload_t; - /* memory bank memory management structure */ - struct mem_bank_management_t { +/* memory bank memory management structure */ +struct mem_bank_management_t { - /* generation counter */ - uint64_t bank_gen_counter; + /* generation counter */ + uint64_t bank_gen_counter; - /* descriptor for the non-blocking barrier. This is - * used to manage this bank of memory. - */ - sm_nbbar_desc_t nb_barrier_desc; + /* descriptor for the non-blocking barrier. This is + * used to manage this bank of memory. + */ + sm_nbbar_desc_t nb_barrier_desc; - /* the number of buffers that are not in use, and are - * available. The assumption is that the buffers are - * recycled all at once, so are available for re-use - * until all buffers have been made available for re-use. - */ - volatile int available_buffers; + /* the number of buffers that are not in use, and are + * available. The assumption is that the buffers are + * recycled all at once, so are available for re-use + * until all buffers have been made available for re-use. 
+ */ + volatile int available_buffers; - /* - * number of buffers freed */ - volatile int n_buffs_freed; + /* + * number of buffers freed */ + volatile int n_buffs_freed; - /* mutex to ensure atomic recycling of resrouces */ - opal_mutex_t mutex; + /* mutex to ensure atomic recycling of resrouces */ + opal_mutex_t mutex; - /* number of buffers being managed */ - int number_of_buffers; + /* number of buffers being managed */ + int number_of_buffers; - /* shared memory control structures */ - int index_shared_mem_ctl_structs; + /* shared memory control structures */ + int index_shared_mem_ctl_structs; - }; - typedef struct mem_bank_management_t mem_bank_management_t; +}; +typedef struct mem_bank_management_t mem_bank_management_t; /* data structure for shared buffers */ struct sm_buffer_mgmt { @@ -614,8 +673,8 @@ struct sm_buffer_mgmt { */ volatile void **ctl_buffs; - /* management data for the control structures - - * one per bank of control structures - Will be used for + /* management data for the control structures - + * one per bank of control structures - Will be used for * the payload buffers as well. */ mem_bank_management_t *ctl_buffs_mgmt; @@ -627,54 +686,54 @@ struct sm_buffer_mgmt { volatile mca_bcol_basesmuma_payload_t *data_buffs; - + }; typedef struct sm_buffer_mgmt sm_buffer_mgmt; struct mca_bcol_basesmuma_nb_coll_buff_desc_t { - void *data_addr; - uint64_t bank_index; - uint64_t buffer_index; - int active_requests; - ompi_request_t **requests; - int data_src; - int radix_mask; - int radix_mask_pow; - int iteration; - int status; - /* this is for testing */ - int tag; + void *data_addr; + uint64_t bank_index; + uint64_t buffer_index; + int active_requests; + ompi_request_t **requests; + int data_src; + int radix_mask; + int radix_mask_pow; + int iteration; + int status; + /* this is for testing */ + int tag; - volatile mca_bcol_basesmuma_ctl_struct_t **ctl_structs; + volatile mca_bcol_basesmuma_ctl_struct_t **ctl_structs; volatile mca_bcol_basesmuma_ctl_struct_t *my_ctl_pointer; - volatile mca_bcol_basesmuma_ctl_struct_t *parent_ctl_pointer; - volatile mca_bcol_basesmuma_ctl_struct_t *extra_partner_ctl_pointer; + volatile mca_bcol_basesmuma_ctl_struct_t *parent_ctl_pointer; + volatile mca_bcol_basesmuma_ctl_struct_t *extra_partner_ctl_pointer; }; typedef struct mca_bcol_basesmuma_nb_coll_buff_desc_t mca_bcol_basesmuma_nb_coll_buff_desc_t; struct mca_bcol_basesmuma_local_mlmem_desc_t { - uint32_t bank_index_for_release; + uint32_t bank_index_for_release; struct ml_memory_block_desc_t *ml_mem_desc; uint32_t num_banks; uint32_t num_buffers_per_bank; uint32_t size_buffer; uint32_t *bank_release_counter; - /* - * Number of descriptors allocated is equivalent to number of ml buffers - * (number of banks * number of buffers per bank) - */ - mca_bcol_basesmuma_nb_coll_buff_desc_t *nb_coll_desc; + /* + * Number of descriptors allocated is equivalent to number of ml buffers + * (number of banks * number of buffers per bank) + */ + mca_bcol_basesmuma_nb_coll_buff_desc_t *nb_coll_desc; }; typedef struct mca_bcol_basesmuma_local_mlmem_desc_t mca_bcol_basesmuma_local_mlmem_desc_t; #ifdef __PORTALS_AVAIL__ -#define MAX_SM_GROUP_SIZE 32 +#define MAX_SM_GROUP_SIZE 32 struct portals_scatter_allgather_nb_bcast_state_t @@ -684,52 +743,52 @@ struct portals_scatter_allgather_nb_bcast_state_t int my_rank, src, matched; int src_list[MAX_SM_GROUP_SIZE]; int group_size; - int64_t ready_flag; + int64_t ready_flag; int pow_2, pow_2_levels; int src_list_index; uint64_t fragment_size; /* user buffer size 
*/ - /* Input argument variables */ - void *my_userbuf; - int64_t sequence_number; + /* Input argument variables */ + void *my_userbuf; + int64_t sequence_number; - /* Extra source variables */ - bool secondary_root; - int partner , extra_partner; + /* Extra source variables */ + bool secondary_root; + int partner , extra_partner; - /* Scatter Allgather offsets */ - uint64_t local_sg_offset , global_sg_offset , partner_offset ; + /* Scatter Allgather offsets */ + uint64_t local_sg_offset , global_sg_offset , partner_offset ; - /* Portals messaging relevant variables */ - /* - * ptl_handle_eq_t allgather_eq_h; - */ - ptl_handle_eq_t read_eq; - ptl_event_t allgather_event; - bool msg_posted; + /* Portals messaging relevant variables */ + /* + * ptl_handle_eq_t allgather_eq_h; + */ + ptl_handle_eq_t read_eq; + ptl_event_t allgather_event; + bool msg_posted; - /* OMPI module and component variables */ + /* OMPI module and component variables */ mca_bcol_basesmuma_component_t *cs; - struct mca_bcol_basesmuma_module_t *bcol_module; + struct mca_bcol_basesmuma_module_t *bcol_module; - /* Control structure and payload variables */ - volatile mca_bcol_basesmuma_ctl_struct_t **ctl_structs; + /* Control structure and payload variables */ + volatile mca_bcol_basesmuma_ctl_struct_t **ctl_structs; volatile mca_bcol_basesmuma_ctl_struct_t *my_ctl_pointer; - volatile mca_bcol_basesmuma_ctl_struct_t *parent_ctl_pointer; /* scatter source */ - volatile mca_bcol_basesmuma_ctl_struct_t *extra_partner_ctl_pointer; /* scatter source */ + volatile mca_bcol_basesmuma_ctl_struct_t *parent_ctl_pointer; /* scatter source */ + volatile mca_bcol_basesmuma_ctl_struct_t *extra_partner_ctl_pointer; /* scatter source */ - int phase; + int phase; }; typedef struct portals_scatter_allgather_nb_bcast_state_t sg_state_t; #endif -#define SM_ARRAY_INDEX(LEAD_DIM,BUF_INDEX,PROC_INDEX) \ +#define SM_ARRAY_INDEX(LEAD_DIM,BUF_INDEX,PROC_INDEX) \ ((LEAD_DIM)*(BUF_INDEX)+(PROC_INDEX)) - /* debug */ +/* debug */ #define BARRIER_BANK_LIST_SIZE 32 - /* end debug */ +/* end debug */ struct mca_bcol_basesmuma_module_t { /* base structure */ @@ -796,9 +855,9 @@ struct mca_bcol_basesmuma_module_t { int scatter_kary_radix; netpatterns_tree_node_t *scatter_kary_tree; - /* Knomial exchange tree */ - /* Currently used for only large message reduce */ - netpatterns_k_exchange_node_t knomial_exchange_tree; + /* Knomial exchange tree */ + /* Currently used for only large message reduce */ + netpatterns_k_exchange_node_t knomial_exchange_tree; /* sequence number offset - want to make sure that we start * id'ing collectives with id 0, so we can have simple @@ -807,7 +866,7 @@ struct mca_bcol_basesmuma_module_t { int64_t squence_number_offset; /* basesmuma specific header size into ml buffer - * was calculated at ml level - it is the sum of + * was calculated at ml level - it is the sum of * all headers from all bcols and then aligned to * whatever alignment was requested */ @@ -819,7 +878,7 @@ struct mca_bcol_basesmuma_module_t { /* Number of possible sources */ int src_size; - /* smallest power of k that is smaller + /* smallest power of k that is smaller * than or equal in size to the uma group */ int pow_k_levels; @@ -827,8 +886,8 @@ struct mca_bcol_basesmuma_module_t { /* size of power-of-k group */ int pow_k; - /* smallest power of 2 that is smaller - * than or equal to the smuma group size + /* smallest power of 2 that is smaller + * than or equal to the smuma group size */ int pow_2_levels; @@ -841,14 +900,14 @@ struct 
mca_bcol_basesmuma_module_t { void **shared_memory_scratch_space; /* - * Caching information for re-entrant collectives - */ - mca_bcol_basesmuma_local_mlmem_desc_t ml_mem; + * Caching information for re-entrant collectives + */ + mca_bcol_basesmuma_local_mlmem_desc_t ml_mem; - /* - * Cached offsets for lmsg reduce - */ - int **reduce_offsets; + /* + * Cached offsets for lmsg reduce + */ + int **reduce_offsets; /*XXX: * Starting to explore the beauty of zero-copy for large message @@ -856,18 +915,18 @@ struct mca_bcol_basesmuma_module_t { struct mca_hdl_base_module_t **hdl_module; #ifdef __PORTALS_AVAIL__ - /* - * Store state for NB blocking functions - */ - sg_state_t sg_state; + /* + * Store state for NB blocking functions + */ + sg_state_t sg_state; -#endif +#endif }; typedef struct mca_bcol_basesmuma_module_t mca_bcol_basesmuma_module_t; OBJ_CLASS_DECLARATION(mca_bcol_basesmuma_module_t); -/* shared memory specific arguments for the bcol registration function */ +/* shared memory specific arguments for the bcol registration function */ typedef struct bcol_basesmuma_registration_data_t { char *file_name; /* filename for payload */ void *base_addr; /* base address to be mapped */ @@ -875,30 +934,11 @@ typedef struct bcol_basesmuma_registration_data_t { size_t size_ctl_structure; size_t data_seg_alignment; bcol_basesmuma_smcm_mmap_t *sm_mmap; /* shared memory map struct */ - mca_coll_ml_release_buff_fn_t buff_release_cb; /* buffer release + mca_coll_ml_release_buff_fn_t buff_release_cb; /* buffer release call back */ -} bcol_basesmuma_registration_data_t; +} bcol_basesmuma_registration_data_t; -/* enum for signaling flag bank, when - * adding to this list, please keep - * it alphabetical - */ -/* -enum { - ALLGATHER_FLAG, - ALLREDUCE_FLAG, - BARRIER_FANIN_FLAG, - BARRIER_FANOUT_FLAG, - BARRIER_RKING_FLAG, - BCAST_FLAG, - GATHER_FLAG, - SCATTER_FLAG, - NUM_SIGNAL_FLAGS -}; - -*/ - enum { BUFFER_AVAILABLE, STARTED, @@ -906,23 +946,23 @@ enum { FANOUT }; -/* enum used for non-blocking large - * message bcast +/* enum used for non-blocking large + * message bcast */ enum { - INIT, - START, - NOT_STARTED, - SCATTER, - ALLGATHER, - EXTRA_RANK, - PROBE, - SCATTER_ROOT_WAIT, - SCATTER_EXTRA_ROOT_WAIT, - SCATTER_PARENT_WAIT, - FINISHED -}; + INIT, + START, + NOT_STARTED, + SCATTER, + ALLGATHER, + EXTRA_RANK, + PROBE, + SCATTER_ROOT_WAIT, + SCATTER_EXTRA_ROOT_WAIT, + SCATTER_PARENT_WAIT, + FINISHED +}; /** * Global component instance @@ -938,7 +978,7 @@ OMPI_MODULE_DECLSPEC extern mca_bcol_basesmuma_component_t mca_bcol_basesmuma_co * satisfy the thread and progress requirements */ int mca_bcol_basesmuma_init_query(bool enable_progress_threads, - bool enable_mpi_threads); + bool enable_mpi_threads); /* query to see if the module is available for use on the given * communicator, and if so, what it's priority is. 
@@ -950,7 +990,7 @@ mca_bcol_basesmuma_comm_query(mca_sbgp_base_module_t *module, int *num_modules); /* shared memory specific memory registration function - this will be passed into the mpool */ int mca_bcol_basesmuma_register_sm(void *context_data, void *base, size_t size, - void **reg); + void **reg); /* shared memory specific memory deregistration function - also needed by the mpool */ int mca_bcol_basesmuma_deregister_sm(void *context_data, void *reg); @@ -959,14 +999,14 @@ int mca_bcol_basesmuma_deregister_sm(void *context_data, void *reg); int bcol_basesmuma_setup_knomial_tree(mca_bcol_base_module_t *super); /* allocate the memory pool for the shared memory control structures */ -int mca_bcol_basesmuma_allocate_pool_memory(mca_bcol_basesmuma_component_t - *component); +int mca_bcol_basesmuma_allocate_pool_memory(mca_bcol_basesmuma_component_t + *component); -/* initialize the internal scratch buffers and control structs that will be +/* initialize the internal scratch buffers and control structs that will be used by the module */ int base_bcol_basesmuma_setup_library_buffers( - mca_bcol_basesmuma_module_t *sm_module, - mca_bcol_basesmuma_component_t *cs); + mca_bcol_basesmuma_module_t *sm_module, + mca_bcol_basesmuma_component_t *cs); /* shared memory recursive doubling initialization */ @@ -974,7 +1014,7 @@ int bcol_basesmuma_rd_barrier_init(mca_bcol_base_module_t *module); /* shared memory recusive double barrier */ int bcol_basesmuma_recursive_double_barrier(bcol_function_args_t *input_args, - coll_ml_function_t *c_input_args); + coll_ml_function_t *c_input_args); /* shared memory fanin */ int bcol_basesmuma_fanin_init(mca_bcol_base_module_t *super); @@ -988,18 +1028,18 @@ int bcol_basesmuma_barrier_init(mca_bcol_base_module_t *super); int bcol_basesmuma_bcast_init(mca_bcol_base_module_t *super); int bcol_basesmuma_bcast(bcol_function_args_t *input_args, - coll_ml_function_t *c_input_args); + coll_ml_function_t *c_input_args); /* Shared memory non-blocking broadcast */ int bcol_basesmuma_bcast_k_nomial_anyroot(bcol_function_args_t *input_args, - coll_ml_function_t *c_input_args); + coll_ml_function_t *c_input_args); int bcol_basesmuma_bcast_k_nomial_knownroot(bcol_function_args_t *input_args, - coll_ml_function_t *c_input_args); + coll_ml_function_t *c_input_args); /* Shared memory non-blocking broadcast - Large message anyroot */ int bcol_basesmuma_binary_scatter_allgather_segment(bcol_function_args_t *input_args, - coll_ml_function_t *c_input_args); + coll_ml_function_t *c_input_args); #if 0 /*FIXME: having fun here*/ @@ -1008,87 +1048,87 @@ int bcol_basesmuma_hdl_zerocopy_bcast(bcol_function_args_t *input_args, #endif int bcol_basesmuma_lmsg_bcast_k_nomial_anyroot(bcol_function_args_t *input_args, - coll_ml_function_t *c_input_args); + coll_ml_function_t *c_input_args); int bcol_basesmuma_lmsg_scatter_allgather_portals_bcast(bcol_function_args_t *input_args, - coll_ml_function_t *c_input_args); + coll_ml_function_t *c_input_args); int bcol_basesmuma_lmsg_scatter_allgather_portals_nb_bcast(bcol_function_args_t *input_args, - coll_ml_function_t *c_input_args); + coll_ml_function_t *c_input_args); int bcol_basesmuma_lmsg_scatter_allgather_portals_nb_knownroot_bcast(bcol_function_args_t *input_args, - coll_ml_function_t *c_input_args); + coll_ml_function_t *c_input_args); /* - * shared memory scatter + * shared memory scatter */ int bcol_basesmuma_scatter_init(mca_bcol_base_module_t *super); /* shared memory nonblocking scatter - known root */ int 
bcol_basesmuma_nb_scatter_k_array_knownroot( - bcol_function_args_t *input_args, - coll_ml_function_t *c_input_args); + bcol_function_args_t *input_args, + coll_ml_function_t *c_input_args); /* shared memory non-blocking k-nomial barrier init */ int bcol_basesmuma_k_nomial_barrier_init(bcol_function_args_t *input_args, - struct coll_ml_function_t *const_args); + struct coll_ml_function_t *const_args); /* shared memory non-blocking k-nomial barrier progress */ int bcol_basesmuma_k_nomial_barrier_progress(bcol_function_args_t *input_args, - struct coll_ml_function_t *const_args); + struct coll_ml_function_t *const_args); /*shared memory non-blocking k-nomial allgather init */ int bcol_basesmuma_k_nomial_allgather_init(bcol_function_args_t *input_args, - struct coll_ml_function_t *const_args); + struct coll_ml_function_t *const_args); /* shared memory non-blocking k-nomial allgather progress */ int bcol_basesmuma_k_nomial_allgather_progress(bcol_function_args_t *input_args, - struct coll_ml_function_t *const_args); + struct coll_ml_function_t *const_args); /* shared memory allgather -- selection logic api */ int bcol_basesmuma_allgather_init(mca_bcol_base_module_t *super); /* shared memory blocking k-nomial gather */ int bcol_basesmuma_k_nomial_gather(bcol_function_args_t *input_args, - coll_ml_function_t *c_input_args); + coll_ml_function_t *c_input_args); /* shared memory non blocking k-nomial gather */ int bcol_basesmuma_k_nomial_gather_init(bcol_function_args_t *input_args, - coll_ml_function_t *c_input_args); + coll_ml_function_t *c_input_args); /* shared memory non blocking k-nomial gather progress*/ int bcol_basesmuma_k_nomial_gather_progress(bcol_function_args_t *input_args, - coll_ml_function_t *c_input_args); + coll_ml_function_t *c_input_args); /* shared memory init */ int bcol_basesmuma_gather_init(mca_bcol_base_module_t *super); - + /* allocate shared memory control memory */ int mca_bcol_basesmuma_allocate_sm_ctl_memory( - mca_bcol_basesmuma_component_t *cs); + mca_bcol_basesmuma_component_t *cs); /* Shared memory basesmuma reduce */ int bcol_basesmuma_reduce_init(mca_bcol_base_module_t *super); int bcol_basesmuma_reduce_intra_fanin(bcol_function_args_t *input_args, - coll_ml_function_t *c_input_args); + coll_ml_function_t *c_input_args); int bcol_basesmuma_reduce_intra_fanin_old(bcol_function_args_t *input_args, - coll_ml_function_t *c_input_args); + coll_ml_function_t *c_input_args); -int bcol_basesmuma_reduce_intra_reducescatter_gather(void *sbuf, void *rbuf, - int count, struct ompi_datatype_t *dtype, - struct ompi_op_t *op, - int root, - struct ompi_communicator_t *comm, - mca_coll_base_module_t *module); +int bcol_basesmuma_reduce_intra_reducescatter_gather(void *sbuf, void *rbuf, + int count, struct ompi_datatype_t *dtype, + struct ompi_op_t *op, + int root, + struct ompi_communicator_t *comm, + mca_coll_base_module_t *module); /* Shared memory basesmuma allreduce */ int bcol_basesmuma_allreduce_init(mca_bcol_base_module_t *super); int bcol_basesmuma_allreduce_intra_fanin_fanout(bcol_function_args_t *input_args, - coll_ml_function_t *c_input_args); + coll_ml_function_t *c_input_args); -int bcol_basesmuma_allreduce_intra_recursive_doubling(bcol_function_args_t *input_args, - coll_ml_function_t *c_input_args); +int bcol_basesmuma_allreduce_intra_recursive_doubling(bcol_function_args_t *input_args, + coll_ml_function_t *c_input_args); /* initialize non-blocking barrier for recycling the memory buffers. 
* This is not a general purpose nb_barrier, and relies on the @@ -1105,48 +1145,48 @@ int bcol_basesmuma_memsync_init(mca_bcol_base_module_t *super); /* smcm allgather function used to exchange file offsets. */ int bcol_basesmuma_smcm_allgather_connection( - mca_bcol_basesmuma_module_t *sm_bcol_module, - mca_sbgp_base_module_t *module, - opal_list_t *peer_list, - bcol_basesmuma_smcm_proc_item_t ***backing_files, - ompi_communicator_t *comm, - bcol_basesmuma_smcm_file_t input, char *base_fname, - bool map_all); + mca_bcol_basesmuma_module_t *sm_bcol_module, + mca_sbgp_base_module_t *module, + opal_list_t *peer_list, + bcol_basesmuma_smcm_proc_item_t ***backing_files, + ompi_communicator_t *comm, + bcol_basesmuma_smcm_file_t input, char *base_fname, + bool map_all); /* * this function initializes the internal scratch buffers and control - * structures that will be used by the module + * structures that will be used by the module */ int base_bcol_masesmuma_setup_library_buffers( - mca_bcol_basesmuma_module_t *sm_bcol_module, - mca_bcol_basesmuma_component_t *sm_bcol_component); + mca_bcol_basesmuma_module_t *sm_bcol_module, + mca_bcol_basesmuma_component_t *sm_bcol_component); /* get the index of the shared memory buffer to be used */ int bcol_basesmuma_get_buff_index( sm_buffer_mgmt * buff_block, - uint64_t buff_id ); + uint64_t buff_id ); int bcol_basesmuma_free_buff( sm_buffer_mgmt * buff_block, - uint64_t buff_id ); + uint64_t buff_id ); -/* This function does bcol_basesmuma specific memory registration and +/* This function does bcol_basesmuma specific memory registration and issues call back for ml level bank recycling - */ +*/ int bcol_basesmuma_bank_init(struct mca_coll_ml_module_t *ml_module, - mca_bcol_base_module_t *bcol_module, - void *reg_data); + mca_bcol_base_module_t *bcol_module, + void *reg_data); -/* bank init which is used for shared memory optimization, fall back to - * the bank init above if this causes problems +/* bank init which is used for shared memory optimization, fall back to + * the bank init above if this causes problems */ int bcol_basesmuma_bank_init_opti(struct mca_coll_ml_module_t *ml_module, - mca_bcol_base_module_t *bcol_module, - void *reg_data); + mca_bcol_base_module_t *bcol_module, + void *reg_data); /* used for shared memory offset exchange */ int base_bcol_basesmuma_exchange_offsets( - mca_bcol_basesmuma_module_t *sm_bcol_module, - void **result_array, uint64_t mem_offset, int loop_limit, - int leading_dim); + mca_bcol_basesmuma_module_t *sm_bcol_module, + void **result_array, uint64_t mem_offset, int loop_limit, + int leading_dim); /* the progress function to be called from the opal progress function @@ -1154,63 +1194,63 @@ int base_bcol_basesmuma_exchange_offsets( int bcol_basesmuma_progress(void); /* Macro for initializing my shared memory control structure */ -#define BASESMUMA_HEADER_INIT(my_ctl_pointer,ready_flag, sequence_number, bcol_id) \ -do{ \ - int i,j; \ - int8_t flag_offset = 0; \ - /* setup resource recycling */ \ - if( my_ctl_pointer->sequence_number < sequence_number ) { \ - /* Signal arrival */ \ - for( j = 0; j < SM_BCOLS_MAX; j++){ \ - my_ctl_pointer->starting_flag_value[j]=0; \ - for( i = 0; i < NUM_SIGNAL_FLAGS; i++){ \ - my_ctl_pointer->flags[i][j] = -1; \ - } \ - } \ - } \ - /* increment the starting flag by one and return */ \ - flag_offset = my_ctl_pointer->starting_flag_value[bcol_id]; \ - ready_flag = flag_offset + 1; \ - MB(); \ - my_ctl_pointer->sequence_number = sequence_number; \ -}while(0) +#define 
BASESMUMA_HEADER_INIT(my_ctl_pointer,ready_flag, seqn, bcol_id) \ + do{ \ + int i,j; \ + int8_t flag_offset = 0; \ + /* setup resource recycling */ \ + if( (my_ctl_pointer)->sequence_number < (seqn) ) { \ + /* Signal arrival */ \ + for( j = 0; j < SM_BCOLS_MAX; j++){ \ + (my_ctl_pointer)->starting_flag_value[j]=0; \ + for( i = 0; i < NUM_SIGNAL_FLAGS; i++){ \ + (my_ctl_pointer)->flags[i][j] = -1; \ + } \ + } \ + } \ + /* increment the starting flag by one and return */ \ + flag_offset = (my_ctl_pointer)->starting_flag_value[(bcol_id)]; \ + (ready_flag) = flag_offset + 1; \ + opal_atomic_wmb(); \ + (my_ctl_pointer)->sequence_number = (seqn); \ + }while(0) /* these are all the same, am using a single macro for all collectives */ -#define IS_PEER_READY(peer, my_flag, my_sequence_number,flag_index, bcol_id)\ - (((peer)->sequence_number == (my_sequence_number) && \ - (peer)->flags[flag_index][bcol_id] >= (my_flag))? true : false ) +#define IS_PEER_READY(peer, my_flag, my_sequence_number,flag_index, bcol_id) \ + (((peer)->sequence_number == (my_sequence_number) && \ + (peer)->flags[flag_index][bcol_id] >= (my_flag))? true : false ) #if 0 -#define IS_AR_DATA_READY(peer, my_flag, my_sequence_number)\ - (((peer)->sequence_number == (my_sequence_number) && \ - (peer)->flags[ALLREDUCE_FLAG][bcol_id] >= (my_flag) \ - )? true : false ) +#define IS_AR_DATA_READY(peer, my_flag, my_sequence_number) \ + (((peer)->sequence_number == (my_sequence_number) && \ + (peer)->flags[ALLREDUCE_FLAG][bcol_id] >= (my_flag) \ + )? true : false ) -#define IS_GDATA_READY(peer, my_flag, my_sequence_number)\ - (((peer)->sequence_number == (my_sequence_number) && \ - (peer)->flags[GATHER_FLAG][bcol_id] == (my_flag) \ - )? true : false ) +#define IS_GDATA_READY(peer, my_flag, my_sequence_number) \ + (((peer)->sequence_number == (my_sequence_number) && \ + (peer)->flags[GATHER_FLAG][bcol_id] == (my_flag) \ + )? true : false ) -#define IS_PEER_READY(peer, my_flag, flag_index, my_sequence_number)\ - ((((volatile int64_t)(peer)->sequence_number > (my_sequence_number)) || \ - (((volatile int64_t)(peer)->sequence_number == (my_sequence_number)) && \ - ((peer)->flags[flag_index][bcol_id] == (my_flag))) \ - )? true : false ) +#define IS_PEER_READY(peer, my_flag, flag_index, my_sequence_number) \ + ((((volatile int64_t)(peer)->sequence_number > (my_sequence_number)) || \ + (((volatile int64_t)(peer)->sequence_number == (my_sequence_number)) && \ + ((peer)->flags[flag_index][bcol_id] == (my_flag))) \ + )? true : false ) -#define IS_ALLREDUCE_PEER_READY(peer, my_flag, my_sequence_number)\ - ((((volatile int64_t)(peer)->sequence_number == (my_sequence_number)) && \ - (((peer)->flags[ALLREDUCE_FLAG][bcol_id] == (my_flag))||((peer)->flags[ALLREDUCE_FLAG][bcol_id] == (my_flag) + 1)) \ - )? true : false ) +#define IS_ALLREDUCE_PEER_READY(peer, my_flag, my_sequence_number) \ + ((((volatile int64_t)(peer)->sequence_number == (my_sequence_number)) && \ + (((peer)->flags[ALLREDUCE_FLAG][bcol_id] == (my_flag))||((peer)->flags[ALLREDUCE_FLAG][bcol_id] == (my_flag) + 1)) \ + )? true : false ) #endif -#define IS_LAST_BCOL_FUNC(ml_args) \ - ((((ml_args)->n_of_this_type_in_collective == \ - (ml_args)->index_of_this_type_in_collective + 1 ) )? true : false) +#define IS_LAST_BCOL_FUNC(ml_args) \ + ((((ml_args)->n_of_this_type_in_collective == \ + (ml_args)->index_of_this_type_in_collective + 1 ) )? 
true : false) static inline __opal_attribute_always_inline__ - size_t bcol_basesmuma_data_offset_calc( - mca_bcol_basesmuma_module_t *basesmuma_module) +size_t bcol_basesmuma_data_offset_calc( + mca_bcol_basesmuma_module_t *basesmuma_module) { uint32_t offset = basesmuma_module->super.header_size; offset = ((offset + BCOL_HEAD_ALIGN - 1) / BCOL_HEAD_ALIGN) * BCOL_HEAD_ALIGN; diff --git a/ompi/mca/bcol/basesmuma/bcol_basesmuma_allgather.c b/ompi/mca/bcol/basesmuma/bcol_basesmuma_allgather.c new file mode 100644 index 0000000000..0c38f62df7 --- /dev/null +++ b/ompi/mca/bcol/basesmuma/bcol_basesmuma_allgather.c @@ -0,0 +1,538 @@ +/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */ +/* + * Copyright (c) 2009-2013 Oak Ridge National Laboratory. All rights reserved. + * Copyright (c) 2009-2012 Mellanox Technologies. All rights reserved. + * Copyright (c) 2013 Los Alamos National Security, LLC. All rights + * reserved. + * $COPYRIGHT$ + * + * Additional copyrights may follow + * + * $HEADER$ + */ + +#include "ompi_config.h" + +#include "ompi/include/ompi/constants.h" +#include "ompi/mca/coll/ml/coll_ml.h" +#include "ompi/mca/bcol/bcol.h" +#include "ompi/mca/bcol/basesmuma/bcol_basesmuma.h" +/* + #define IS_AGDATA_READY(peer, my_flag, my_sequence_number)\ + (((peer)->sequence_number == (my_sequence_number) && \ + (peer)->flags[ALLGATHER_FLAG][bcol_id] >= (my_flag) \ + )? true : false ) +*/ + +#define CALC_ACTIVE_REQUESTS(active_requests,peers, tree_order) \ + do{ \ + for( j = 0; j < (tree_order - 1); j++){ \ + if( 0 > peers[j] ) { \ + /* set the bit */ \ + *active_requests ^= (1<bcol_module; + netpatterns_k_exchange_node_t *exchange_node = &bcol_module->knomial_allgather_tree; + int group_size = bcol_module->colls_no_user_data.size_of_group; + int *list_connected = bcol_module->super.list_n_connected; /* critical for hierarchical colls */ + int bcol_id = (int) bcol_module->super.bcol_id; + mca_bcol_basesmuma_component_t *cm = &mca_bcol_basesmuma_component; + uint32_t buffer_index = input_args->buffer_index; + int *active_requests = + &(bcol_module->ml_mem.nb_coll_desc[buffer_index].active_requests); + + int *iteration = &bcol_module->ml_mem.nb_coll_desc[buffer_index].iteration; + int *status = &bcol_module->ml_mem.nb_coll_desc[buffer_index].status; + int leading_dim, buff_idx, idx; + + int i, j, probe; + int knt; + int src; + int recv_offset, recv_len; + + int pow_k, tree_order; + int max_requests = 0; /* important to initialize this */ + + int matched = 0; + int64_t sequence_number=input_args->sequence_num; + int my_rank = bcol_module->super.sbgp_partner_module->my_index; + int buff_offset = bcol_module->super.hier_scather_offset; + + + int pack_len = input_args->count * input_args->dtype->super.size; + + void *data_addr = (void*)( + (unsigned char *) input_args->sbuf + + (size_t) input_args->sbuf_offset); + volatile mca_bcol_basesmuma_payload_t *data_buffs; + volatile char *peer_data_pointer; + + /* control structures */ + volatile mca_bcol_basesmuma_header_t *my_ctl_pointer; + volatile mca_bcol_basesmuma_header_t *peer_ctl_pointer; + +#if 0 + fprintf(stderr,"entering p2p allgather pack_len %d\n",pack_len); +#endif + /* initialize the iteration counter */ + buff_idx = input_args->src_desc->buffer_index; + leading_dim = bcol_module->colls_no_user_data.size_of_group; + idx=SM_ARRAY_INDEX(leading_dim,buff_idx,0); + data_buffs=(volatile mca_bcol_basesmuma_payload_t *) + bcol_module->colls_with_user_data.data_buffs+idx; + + /* Set pointer to current proc ctrl region */ + my_ctl_pointer = 
data_buffs[my_rank].ctl_struct; + + /* initialize headers and ready flag */ + BASESMUMA_HEADER_INIT(my_ctl_pointer, ready_flag, sequence_number, bcol_id); + + /* initialize these */ + *iteration = 0; + *active_requests = 0; + *status = 0; + + /* k-nomial parameters */ + tree_order = exchange_node->tree_order; + pow_k = exchange_node->log_tree_order; + + /* calculate the maximum number of requests + * at each level each rank communicates with + * at most (k - 1) peers + * so if we set k - 1 bit fields in "max_requests", then + * we have max_request == 2^(k - 1) -1 + */ + for(i = 0; i < (tree_order - 1); i++){ + max_requests ^= (1<node_type ) { + + /* then I will signal to my proxy rank*/ + my_ctl_pointer->flags[ALLGATHER_FLAG][bcol_id] = ready_flag; + ready_flag = flag_offset + 1 + pow_k + 2; + /* now, poll for completion */ + src = exchange_node->rank_extra_sources_array[0]; + peer_data_pointer = data_buffs[src].payload; + peer_ctl_pointer = data_buffs[src].ctl_struct; + + /* calculate the offset */ + knt = 0; + for(i = 0; i < group_size; i++){ + knt += list_connected[i]; + } + for( i = 0; i < cm->num_to_probe && (0 == matched); i++ ) { + if(IS_PEER_READY(peer_ctl_pointer, ready_flag, sequence_number, ALLGATHER_FLAG, bcol_id)){ + matched = 1; + /* we receive the entire message */ + memcpy((void *)((unsigned char *) data_addr + buff_offset), + (void *) ((unsigned char *) peer_data_pointer + buff_offset), + knt * pack_len); + + goto FINISHED; + } + + } + + /* save state and bail */ + *iteration = -1; + return BCOL_FN_STARTED; + + }else if ( 0 < exchange_node->n_extra_sources ) { + + /* I am a proxy for someone */ + src = exchange_node->rank_extra_sources_array[0]; + peer_data_pointer = data_buffs[src].payload; + peer_ctl_pointer = data_buffs[src].ctl_struct; + + + knt = 0; + for(i = 0; i < src; i++){ + knt += list_connected[i]; + } + + /* probe for extra rank's arrival */ + for( i = 0; i < cm->num_to_probe && ( 0 == matched); i++) { + if(IS_PEER_READY(peer_ctl_pointer,ready_flag, sequence_number, ALLGATHER_FLAG, bcol_id)){ + matched = 1; + /* copy it in */ + memcpy((void *)((unsigned char *) data_addr + knt*pack_len), + (void *) ((unsigned char *) peer_data_pointer + knt*pack_len), + pack_len * list_connected[src]); + goto MAIN_PHASE; + } + } + *status = ready_flag; + *iteration = -1; + return BCOL_FN_STARTED; + + + } + +MAIN_PHASE: + /* bump the ready flag */ + ready_flag++; + + + /* we start the recursive k - ing phase */ + for( *iteration = 0; *iteration < pow_k; (*iteration)++) { + /* announce my arrival */ + MB(); + my_ctl_pointer->flags[ALLGATHER_FLAG][bcol_id] = ready_flag; + /* calculate the number of active requests */ + CALC_ACTIVE_REQUESTS(active_requests,exchange_node->rank_exchanges[*iteration],tree_order); + /* Now post the recv's */ + for( j = 0; j < (tree_order - 1); j++ ) { + + /* recv phase */ + src = exchange_node->rank_exchanges[*iteration][j]; + + if( src < 0 ) { + /* then not a valid rank, continue */ + + continue; + } + + peer_data_pointer = data_buffs[src].payload; + peer_ctl_pointer = data_buffs[src].ctl_struct; + if( !(*active_requests&(1<payload_info[*iteration][j].r_offset * pack_len; + recv_len = exchange_node->payload_info[*iteration][j].r_len * pack_len; + /* post the receive */ + /* I am putting the probe loop as the inner most loop to achieve + * better temporal locality + */ + matched = 0; + for( probe = 0; probe < cm->num_to_probe && (0 == matched); probe++){ + if(IS_PEER_READY(peer_ctl_pointer,ready_flag, sequence_number, ALLGATHER_FLAG, bcol_id)){ + 
matched = 1; + /* set this request's bit */ + *active_requests ^= (1<flags[ALLGATHER_FLAG][bcol_id]; + return BCOL_FN_STARTED; + } + } + + /* bump the flag one more time for the extra rank */ + ready_flag = flag_offset + 1 + pow_k + 2; + + /* finish off the last piece, send the data back to the extra */ + if( 0 < exchange_node->n_extra_sources ) { + /* simply announce my arrival */ + MB(); + my_ctl_pointer->flags[ALLGATHER_FLAG][bcol_id] = ready_flag; + + } + +FINISHED: + /* bump this up */ + my_ctl_pointer->starting_flag_value[bcol_id]++; + return BCOL_FN_COMPLETE; +} + + +/* allgather progress function */ + +int bcol_basesmuma_k_nomial_allgather_progress(bcol_function_args_t *input_args, + struct coll_ml_function_t *const_args) +{ + + + /* local variables */ + int8_t flag_offset; + uint32_t buffer_index = input_args->buffer_index; + volatile int8_t ready_flag; + mca_bcol_basesmuma_module_t *bcol_module = (mca_bcol_basesmuma_module_t *) const_args->bcol_module; + netpatterns_k_exchange_node_t *exchange_node = &bcol_module->knomial_allgather_tree; + int group_size = bcol_module->colls_no_user_data.size_of_group; + int *list_connected = bcol_module->super.list_n_connected; /* critical for hierarchical colls */ + int bcol_id = (int) bcol_module->super.bcol_id; + mca_bcol_basesmuma_component_t *cm = &mca_bcol_basesmuma_component; + int *active_requests = + &(bcol_module->ml_mem.nb_coll_desc[buffer_index].active_requests); + + int *iteration = &bcol_module->ml_mem.nb_coll_desc[buffer_index].iteration; + int *iter = iteration; /* double alias */ + int *status = &bcol_module->ml_mem.nb_coll_desc[buffer_index].status; + int leading_dim, idx, buff_idx; + + int i, j, probe; + int knt; + int src; + int recv_offset, recv_len; + int max_requests = 0; /* critical to set this */ + int pow_k, tree_order; + + int matched = 0; + int64_t sequence_number=input_args->sequence_num; + int my_rank = bcol_module->super.sbgp_partner_module->my_index; + int buff_offset = bcol_module->super.hier_scather_offset; + + int pack_len = input_args->count * input_args->dtype->super.size; + + void *data_addr = (void*)( + (unsigned char *) input_args->sbuf + + (size_t) input_args->sbuf_offset); + volatile mca_bcol_basesmuma_payload_t *data_buffs; + volatile char *peer_data_pointer; + + /* control structures */ + volatile mca_bcol_basesmuma_header_t *my_ctl_pointer; + volatile mca_bcol_basesmuma_header_t *peer_ctl_pointer; + +#if 0 + fprintf(stderr,"%d: entering sm allgather progress active requests %d iter %d ready_flag %d\n",my_rank, + *active_requests,*iter,*status); +#endif + buff_idx = input_args->src_desc->buffer_index; + leading_dim=bcol_module->colls_no_user_data.size_of_group; + idx=SM_ARRAY_INDEX(leading_dim,buff_idx,0); + data_buffs=(volatile mca_bcol_basesmuma_payload_t *) + bcol_module->colls_with_user_data.data_buffs+idx; + + /* Set pointer to current proc ctrl region */ + my_ctl_pointer = data_buffs[my_rank].ctl_struct; + + /* increment the starting flag by one and return */ + /* flag offset seems unnecessary here */ + flag_offset = my_ctl_pointer->starting_flag_value[bcol_id]; + ready_flag = *status; + my_ctl_pointer->sequence_number = sequence_number; + /* k-nomial parameters */ + tree_order = exchange_node->tree_order; + pow_k = exchange_node->log_tree_order; + + /* calculate the maximum number of requests + * at each level each rank communicates with + * at most (k - 1) peers + * so if we set k - 1 bit fields in "max_requests", then + * we have max_request == 2^(k - 1) -1 + */ + for(i = 0; i < (tree_order - 
1); i++){ + max_requests ^= (1<node_type ) { + + /* If I'm in here, then I must be looking for data */ + ready_flag = flag_offset + 1 + pow_k + 2; + + src = exchange_node->rank_extra_sources_array[0]; + peer_data_pointer = data_buffs[src].payload; + peer_ctl_pointer = data_buffs[src].ctl_struct; + + /* calculate the offset */ + knt = 0; + for(i = 0; i < group_size; i++){ + knt += list_connected[i]; + } + for( i = 0; i < cm->num_to_probe && (0 == matched); i++ ) { + if(IS_PEER_READY(peer_ctl_pointer, ready_flag, sequence_number, ALLGATHER_FLAG, bcol_id)){ + matched = 1; + /* we receive the entire message */ + memcpy((void *)((unsigned char *) data_addr + buff_offset), + (void *) ((unsigned char *) peer_data_pointer + buff_offset), + knt * pack_len); + + goto FINISHED; + } + + } + + /* haven't found it, state is saved, bail out */ + return BCOL_FN_STARTED; + + }else if ( ( -1 == *iteration ) && (0 < exchange_node->n_extra_sources) ) { + + /* I am a proxy for someone */ + src = exchange_node->rank_extra_sources_array[0]; + peer_data_pointer = data_buffs[src].payload; + peer_ctl_pointer = data_buffs[src].ctl_struct; + + knt = 0; + for(i = 0; i < src; i++){ + knt += list_connected[i]; + } + + /* probe for extra rank's arrival */ + for( i = 0; i < cm->num_to_probe && ( 0 == matched); i++) { + if(IS_PEER_READY(peer_ctl_pointer,ready_flag, sequence_number, ALLGATHER_FLAG, bcol_id)){ + matched = 1; + /* copy it in */ + memcpy((void *)((unsigned char *) data_addr + knt*pack_len), + (void *) ((unsigned char *) peer_data_pointer + knt*pack_len), + pack_len * list_connected[src]); + + ready_flag++; + *iteration = 0; + goto MAIN_PHASE; + } + } + return BCOL_FN_STARTED; + + } + +MAIN_PHASE: + + /* start the recursive k - ing phase */ + for( *iter=*iteration; *iter < pow_k; (*iter)++) { + /* I am ready at this level */ + MB(); + my_ctl_pointer->flags[ALLGATHER_FLAG][bcol_id] = ready_flag; + if( 0 == *active_requests ) { + /* flip some bits, if we don't have active requests from a previous visit */ + CALC_ACTIVE_REQUESTS(active_requests,exchange_node->rank_exchanges[*iter],tree_order); + } + for( j = 0; j < (tree_order - 1); j++ ) { + + /* recv phase */ + src = exchange_node->rank_exchanges[*iter][j]; + + if( src < 0 ) { + /* then not a valid rank, continue + */ + continue; + } + + peer_data_pointer = data_buffs[src].payload; + peer_ctl_pointer = data_buffs[src].ctl_struct; + if( !(*active_requests&(1<payload_info[*iter][j].r_offset * pack_len; + recv_len = exchange_node->payload_info[*iter][j].r_len * pack_len; + /* I am putting the probe loop as the inner most loop to achieve + * better temporal locality + */ + matched = 0; + for( probe = 0; probe < cm->num_to_probe && (0 == matched); probe++){ + if(IS_PEER_READY(peer_ctl_pointer,ready_flag, sequence_number, ALLGATHER_FLAG, bcol_id)){ + matched = 1; + /* flip the request's bit */ + *active_requests ^= (1<flags[ALLGATHER_FLAG][bcol_id]; + return BCOL_FN_STARTED; + } + } + /* bump the flag one more time for the extra rank */ + ready_flag = flag_offset + 1 + pow_k + 2; + + /* finish off the last piece, send the data back to the extra */ + if( 0 < exchange_node->n_extra_sources ) { + /* simply announce my arrival */ + MB(); + my_ctl_pointer->flags[ALLGATHER_FLAG][bcol_id] = ready_flag; + + } + +FINISHED: + /* bump this up for others to see */ + my_ctl_pointer->starting_flag_value[bcol_id]++; + return BCOL_FN_COMPLETE; +} + +/* Register allreduce functions to the BCOL function table, + * so they can be selected + */ +int 
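The max_requests arithmetic above sets one bit field per partner, so a level with at most (k - 1) partners is complete once all of the 2^(k - 1) - 1 possible bits have been accounted for. For reference, a minimal standalone sketch of that style of bookkeeping; the helper name and the exact set/clear convention are illustrative and not taken from this patch:

#include <stdio.h>

/* One bit per partner at the current level; -1 marks an unused slot
 * (fewer than k - 1 real partners at this level). */
static int calc_active_requests(const int *peers, int n_peers)
{
    int mask = 0;

    for (int j = 0; j < n_peers; j++) {
        if (peers[j] >= 0) {
            mask |= 1 << j;
        }
    }
    return mask;
}

int main(void)
{
    /* tree_order (k) = 4, so at most k - 1 = 3 partners per level and
     * max_requests = 2^(k - 1) - 1 = 7 */
    int peers[3] = { 5, 9, -1 };
    int active   = calc_active_requests(peers, 3);

    active ^= 1 << 0;   /* partner in slot 0 delivered its piece: clear its bit */

    printf("outstanding partners mask: 0x%x (level complete at 0)\n", active);
    return 0;
}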
bcol_basesmuma_allgather_init(mca_bcol_base_module_t *super) +{ + mca_bcol_base_coll_fn_comm_attributes_t comm_attribs; + mca_bcol_base_coll_fn_invoke_attributes_t inv_attribs; + + comm_attribs.bcoll_type = BCOL_ALLGATHER; + comm_attribs.comm_size_min = 0; + comm_attribs.comm_size_max = 1024 * 1024; + comm_attribs.waiting_semantics = NON_BLOCKING; + + inv_attribs.bcol_msg_min = 0; + inv_attribs.bcol_msg_max = 20000; /* range 1 */ + + inv_attribs.datatype_bitmap = 0xffffffff; + inv_attribs.op_types_bitmap = 0xffffffff; + + comm_attribs.data_src = DATA_SRC_KNOWN; + + mca_bcol_base_set_attributes(super, &comm_attribs, &inv_attribs, + bcol_basesmuma_k_nomial_allgather_init, + bcol_basesmuma_k_nomial_allgather_progress); + + return OMPI_SUCCESS; +} diff --git a/ompi/mca/bcol/basesmuma/bcol_basesmuma_allreduce.c b/ompi/mca/bcol/basesmuma/bcol_basesmuma_allreduce.c new file mode 100644 index 0000000000..ce3890563e --- /dev/null +++ b/ompi/mca/bcol/basesmuma/bcol_basesmuma_allreduce.c @@ -0,0 +1,602 @@ +/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */ +/* + * Copyright (c) 2009-2013 Oak Ridge National Laboratory. All rights reserved. + * Copyright (c) 2009-2012 Mellanox Technologies. All rights reserved. + * Copyright (c) 2013 Los Alamos National Security, LLC. All rights + * reserved. + * $COPYRIGHT$ + * + * Additional copyrights may follow + * + * $HEADER$ + */ + +#include "ompi_config.h" + +#include "ompi/constants.h" +#include "ompi/op/op.h" +#include "ompi/datatype/ompi_datatype.h" +#include "ompi/communicator/communicator.h" + +#include "opal/include/opal_stdint.h" + +#include "bcol_basesmuma.h" + +static int bcol_basesmuma_allreduce_intra_fanin_fanout_progress (bcol_function_args_t *input_args, coll_ml_function_t *c_input_args); + +int bcol_basesmuma_allreduce_init(mca_bcol_base_module_t *super) +{ + mca_bcol_base_coll_fn_comm_attributes_t comm_attribs; + mca_bcol_base_coll_fn_invoke_attributes_t inv_attribs; + + comm_attribs.bcoll_type = BCOL_ALLREDUCE; + comm_attribs.comm_size_min = 0; + comm_attribs.comm_size_max = 16; + comm_attribs.data_src = DATA_SRC_KNOWN; + + /* selection logic at the ml level specifies a + * request for a non-blocking algorithm + * however, these algorithms are blocking + * following what was done at the p2p level + * we will specify non-blocking, but beware, + * these algorithms are blocking and will not make use + * of the progress engine + */ + comm_attribs.waiting_semantics = NON_BLOCKING; + + inv_attribs.bcol_msg_min = 0; + inv_attribs.bcol_msg_max = 20000; + inv_attribs.datatype_bitmap = 0xffffffff; + inv_attribs.op_types_bitmap = 0xffffffff; + + /* Set attributes for fanin fanout algorithm */ + mca_bcol_base_set_attributes(super, &comm_attribs, &inv_attribs, + bcol_basesmuma_allreduce_intra_fanin_fanout, + bcol_basesmuma_allreduce_intra_fanin_fanout_progress); + /* Differs only in comm size */ + + comm_attribs.data_src = DATA_SRC_UNKNOWN; + comm_attribs.waiting_semantics = BLOCKING; + + comm_attribs.comm_size_min = 0; + comm_attribs.comm_size_max = 8; + + /* Set attributes for recursive doubling algorithm */ + mca_bcol_base_set_attributes(super, &comm_attribs, &inv_attribs, + bcol_basesmuma_allreduce_intra_recursive_doubling, + NULL); + + + return OMPI_SUCCESS; +} + +/* + * Small data fanin reduce + * ML buffers are used for both payload and control structures + * This functions works with hierarchical allreduce and + * progress engine + */ +static inline int reduce_children (mca_bcol_basesmuma_module_t *bcol_module, volatile void *rbuf, 
netpatterns_tree_node_t *my_reduction_node, + int *iteration, volatile mca_bcol_basesmuma_header_t *my_ctl_pointer, ompi_datatype_t *dtype, + volatile mca_bcol_basesmuma_payload_t *data_buffs, int count, struct ompi_op_t *op, int process_shift) +{ + volatile mca_bcol_basesmuma_header_t *child_ctl_pointer; + int bcol_id = (int) bcol_module->super.bcol_id; + int64_t sequence_number = my_ctl_pointer->sequence_number; + int8_t ready_flag = my_ctl_pointer->ready_flag; + int group_size = bcol_module->colls_no_user_data.size_of_group; + + if (LEAF_NODE != my_reduction_node->my_node_type) { + volatile char *child_data_pointer; + volatile void *child_rbuf; + + /* for each child */ + /* my_result_data = child_result_data (op) my_source_data */ + + for (int child = *iteration ; child < my_reduction_node->n_children ; ++child) { + int child_rank = my_reduction_node->children_ranks[child] + process_shift; + + if (group_size <= child_rank){ + child_rank -= group_size; + } + + child_ctl_pointer = data_buffs[child_rank].ctl_struct; + + if (!IS_PEER_READY(child_ctl_pointer, ready_flag, sequence_number, ALLREDUCE_FLAG, bcol_id)) { + *iteration = child; + return BCOL_FN_STARTED; + } + + child_data_pointer = data_buffs[child_rank].payload; + child_rbuf = child_data_pointer + child_ctl_pointer->roffsets[bcol_id]; + + ompi_op_reduce(op, (void *)child_rbuf, (void *)rbuf, count, dtype); + } /* end child loop */ + } + + if (ROOT_NODE != my_reduction_node->my_node_type) { + opal_atomic_wmb (); + my_ctl_pointer->flags[ALLREDUCE_FLAG][bcol_id] = ready_flag; + } + + /* done with this step. move on to fan out */ + *iteration = -1; + + return BCOL_FN_COMPLETE; +} + +static int allreduce_fanout (mca_bcol_basesmuma_module_t *bcol_module, volatile mca_bcol_basesmuma_header_t *my_ctl_pointer, + volatile void *my_data_pointer, int process_shift, volatile mca_bcol_basesmuma_payload_t *data_buffs, + int sequence_number, int group_size, int rbuf_offset, size_t pack_len) +{ + volatile mca_bcol_basesmuma_header_t *parent_ctl_pointer; + int bcol_id = (int) bcol_module->super.bcol_id; + int8_t ready_flag = my_ctl_pointer->ready_flag + 1; + netpatterns_tree_node_t *my_fanout_read_tree; + volatile void *parent_data_pointer; + int my_fanout_parent, my_rank; + void *parent_rbuf, *rbuf; + + my_rank = bcol_module->super.sbgp_partner_module->my_index; + my_fanout_read_tree = &(bcol_module->fanout_read_tree[my_rank]); + + if (ROOT_NODE != my_fanout_read_tree->my_node_type) { + my_fanout_parent = my_fanout_read_tree->parent_rank + process_shift; + if (group_size <= my_fanout_parent) { + my_fanout_parent -= group_size; + } + + rbuf = (void *)((char *) my_data_pointer + rbuf_offset); + + /* + * Get parent payload data and control data. + * Get the pointer to the base address of the parent's payload buffer. + * Get the parent's control buffer. + */ + parent_data_pointer = data_buffs[my_fanout_parent].payload; + parent_ctl_pointer = data_buffs[my_fanout_parent].ctl_struct; + + parent_rbuf = (void *) ((char *) parent_data_pointer + rbuf_offset); + + /* Wait until parent signals that data is ready */ + /* The order of conditions checked in this loop is important, as it can + * result in a race condition. 
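The readiness test used throughout these routines couples a per-collective sequence number with a monotonically increasing signalling flag; checking the sequence number guards against mistaking a stale flag from an earlier collective for progress in this one. A minimal standalone sketch of that test, with a simplified signature rather than the patch's IS_PEER_READY macro:

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

struct peer_ctl {
    volatile int64_t sequence_number;  /* which collective the peer is working on */
    volatile int8_t  flag;             /* how far through it the peer has signalled */
};

static bool peer_ready(const struct peer_ctl *peer, int8_t ready_flag,
                       int64_t sequence_number)
{
    /* Sequence number first: a leftover flag value from a previous collective
     * must not satisfy the test for the current one. */
    return peer->sequence_number == sequence_number &&
           peer->flag >= ready_flag;
}

int main(void)
{
    struct peer_ctl peer = { .sequence_number = 42, .flag = 3 };

    printf("ready: %d\n", peer_ready(&peer, 3, 42));   /* 1: caught up */
    printf("ready: %d\n", peer_ready(&peer, 4, 42));   /* 0: not far enough yet */
    return 0;
}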
+ */ + if (!IS_PEER_READY(parent_ctl_pointer, ready_flag, sequence_number, ALLREDUCE_FLAG, bcol_id)) { + return BCOL_FN_STARTED; + } + + assert (parent_ctl_pointer->flags[ALLREDUCE_FLAG][bcol_id] == ready_flag); + + /* Copy the rank to a shared buffer writable by the current rank */ + memcpy ((void *) rbuf, (const void*) parent_rbuf, pack_len); + } + + if (LEAF_NODE != my_fanout_read_tree->my_node_type) { + opal_atomic_wmb (); + + /* Signal to children that they may read the data from my shared buffer (bump the ready flag) */ + my_ctl_pointer->flags[ALLREDUCE_FLAG][bcol_id] = ready_flag; + } + + my_ctl_pointer->starting_flag_value[bcol_id] += 1; + + return BCOL_FN_COMPLETE; + +} + +static int bcol_basesmuma_allreduce_intra_fanin_fanout_progress (bcol_function_args_t *input_args, coll_ml_function_t *c_input_args) +{ + mca_bcol_basesmuma_module_t *bcol_module = (mca_bcol_basesmuma_module_t *) c_input_args->bcol_module; + int buff_idx = buff_idx = input_args->src_desc->buffer_index; + int *iteration = &bcol_module->ml_mem.nb_coll_desc[buff_idx].iteration; + void *data_addr = (void *) input_args->src_desc->data_addr; + int my_node_index, my_rank, group_size, leading_dim, idx; + volatile mca_bcol_basesmuma_header_t *my_ctl_pointer; + int64_t sequence_number = input_args->sequence_num; + volatile mca_bcol_basesmuma_payload_t *data_buffs; + struct ompi_datatype_t *dtype = input_args->dtype; + netpatterns_tree_node_t *my_reduction_node; + struct ompi_op_t *op = input_args->op; + volatile void *my_data_pointer; + int count = input_args->count; + int rc, process_shift; + ptrdiff_t lb, extent; + volatile void *rbuf; + + /* get addressing information */ + my_rank = bcol_module->super.sbgp_partner_module->my_index; + group_size = bcol_module->colls_no_user_data.size_of_group; + leading_dim = bcol_module->colls_no_user_data.size_of_group; + idx = SM_ARRAY_INDEX(leading_dim,buff_idx,0); + + /* Align node index to around sbgp root */ + process_shift = input_args->root; + my_node_index = my_rank - input_args->root; + if (0 > my_node_index ) { + my_node_index += group_size; + } + + data_buffs = (volatile mca_bcol_basesmuma_payload_t *) bcol_module->colls_with_user_data.data_buffs + idx; + /* Get control structure and payload buffer */ + my_ctl_pointer = data_buffs[my_rank].ctl_struct; + my_data_pointer = (volatile char *) data_addr; + + my_data_pointer = (volatile char *) data_addr; + rbuf = (volatile void *)((char *) my_data_pointer + input_args->rbuf_offset); + + /*************************** + * Fan into root phase + ***************************/ + + my_reduction_node = &(bcol_module->reduction_tree[my_node_index]); + if (-1 != *iteration) { + rc = reduce_children (bcol_module, rbuf, my_reduction_node, iteration, my_ctl_pointer, + dtype, data_buffs, count, op, process_shift); + if (BCOL_FN_COMPLETE != rc) { + return rc; + } + } + + /* there might be non-contig dtype - so compute the length with get_extent */ + ompi_datatype_get_extent(dtype, &lb, &extent); + + /*************************** + * Fan out from root + ***************************/ + + /* all nodes will have the result after fanout */ + input_args->result_in_rbuf = true; + + /* Signal that you are ready for fanout phase */ + return allreduce_fanout (bcol_module, my_ctl_pointer, my_data_pointer, process_shift, data_buffs, + sequence_number, group_size, input_args->rbuf_offset, count * (size_t) extent); +} + +/** + * Shared memory blocking allreduce. 
+ */ +int bcol_basesmuma_allreduce_intra_fanin_fanout(bcol_function_args_t *input_args, coll_ml_function_t *c_input_args) +{ + /* local variables */ + mca_bcol_basesmuma_module_t *bcol_module = (mca_bcol_basesmuma_module_t *) c_input_args->bcol_module; + int buff_idx = buff_idx = input_args->src_desc->buffer_index; + int *iteration = &bcol_module->ml_mem.nb_coll_desc[buff_idx].iteration; + void *data_addr = (void *) input_args->src_desc->data_addr; + volatile mca_bcol_basesmuma_header_t *my_ctl_pointer; + volatile mca_bcol_basesmuma_payload_t *data_buffs; + struct ompi_datatype_t *dtype = input_args->dtype; + int bcol_id = (int) bcol_module->super.bcol_id; + int rc, my_rank, leading_dim, idx; + volatile void *my_data_pointer; + volatile void *sbuf, *rbuf; + int8_t ready_flag; + + /* get addressing information */ + my_rank = bcol_module->super.sbgp_partner_module->my_index; + leading_dim = bcol_module->colls_no_user_data.size_of_group; + idx = SM_ARRAY_INDEX(leading_dim, buff_idx, 0); + + data_buffs = (volatile mca_bcol_basesmuma_payload_t *) bcol_module->colls_with_user_data.data_buffs + idx; + /* Get control structure */ + my_ctl_pointer = data_buffs[my_rank].ctl_struct; + + my_data_pointer = (volatile char *) data_addr; + rbuf = (volatile void *)((char *) my_data_pointer + input_args->rbuf_offset); + sbuf = (volatile void *)((char *) my_data_pointer + input_args->sbuf_offset); + + /* Setup resource recycling */ + /* Set for multiple instances of bcols */ + BASESMUMA_HEADER_INIT(my_ctl_pointer, ready_flag, input_args->sequence_num, bcol_id); + + if (sbuf != rbuf) { + rc = ompi_datatype_copy_content_same_ddt (dtype, input_args->count, (char *)rbuf, + (char *)sbuf); + if( 0 != rc ) { + return OMPI_ERROR; + } + } + + *iteration = 0; + my_ctl_pointer->ready_flag = ready_flag; + + return bcol_basesmuma_allreduce_intra_fanin_fanout_progress (input_args, c_input_args); +} + + + +/* this thing uses the old bcol private control structures */ +int bcol_basesmuma_allreduce_intra_recursive_doubling(bcol_function_args_t *input_args, + coll_ml_function_t *c_input_args) +{ + + int my_rank,group_size,my_node_index; + int pair_rank, exchange, extra_rank, payload_len; + size_t dt_size; + int read_offset, write_offset; + volatile void *my_data_pointer; + volatile mca_bcol_basesmuma_ctl_struct_t *my_ctl_pointer = NULL, + *partner_ctl_pointer = NULL, + *extra_ctl_pointer = NULL; + volatile void *my_read_pointer, *my_write_pointer, *partner_read_pointer, + *extra_rank_readwrite_data_pointer,*extra_rank_read_data_pointer; + mca_bcol_basesmuma_module_t* bcol_module = + (mca_bcol_basesmuma_module_t *)c_input_args->bcol_module; + + int8_t ready_flag; + int sbuf_offset,rbuf_offset,flag_offset; + int root,count; + struct ompi_op_t *op; + int64_t sequence_number=input_args->sequence_num; + struct ompi_datatype_t *dtype; + int first_instance; + int leading_dim,idx; + int buff_idx; + mca_bcol_basesmuma_ctl_struct_t **ctl_structs; + /*volatile void **data_buffs;*/ + volatile mca_bcol_basesmuma_payload_t *data_buffs; + netpatterns_pair_exchange_node_t *my_exchange_node; + + + /* + * Get addressing information + */ + buff_idx = input_args->src_desc->buffer_index; + + my_rank = bcol_module->super.sbgp_partner_module->my_index; + group_size = bcol_module->colls_no_user_data.size_of_group; + leading_dim = bcol_module->colls_no_user_data.size_of_group; + idx = SM_ARRAY_INDEX(leading_dim,buff_idx,0); + + /* + * Get SM control structures and payload buffers + */ + ctl_structs = (mca_bcol_basesmuma_ctl_struct_t **) + 
bcol_module->colls_with_user_data.ctl_buffs+idx; + /*data_buffs = (volatile void **) + bcol_module->colls_with_user_data.data_buffs+idx;*/ + + data_buffs = (volatile mca_bcol_basesmuma_payload_t *) + bcol_module->colls_with_user_data.data_buffs + idx; + + + /* + * Get control structure and payload buffer + */ + my_ctl_pointer = ctl_structs[my_rank]; + if (my_ctl_pointer->sequence_number < sequence_number) { + first_instance=1; + } + my_data_pointer = data_buffs[my_rank].payload; + + /* + * Align node index to around sbgp root + */ + root = input_args->root; + my_node_index = my_rank - root; + if (0 > my_node_index) { + my_node_index += group_size; + } + + /* + * Get data from arguments + */ + sbuf_offset = input_args->sbuf_offset; + rbuf_offset = input_args->rbuf_offset; + op = input_args->op; + count = input_args->count; + dtype = input_args->dtype; + + /* + * Get my node for the reduction tree + */ + my_exchange_node = &(bcol_module->recursive_doubling_tree); + + + if (first_instance) { + my_ctl_pointer->index = 1; + my_ctl_pointer->starting_flag_value = 0; + flag_offset = 0; + my_ctl_pointer->flag = -1; + /* + for( i = 0; i < NUM_SIGNAL_FLAGS; i++){ + my_ctl_pointer->flags[ALLREDUCE_FLAG] = -1; + } + */ + } else { + my_ctl_pointer->index++; + flag_offset = my_ctl_pointer->starting_flag_value; + } + + /* signal that I have arrived */ + /* MB(); */ + my_ctl_pointer->sequence_number = sequence_number; + + /* If we use this buffer more than once by an sm module in + * a given collective, will need to distinguish between instances, so + * we pick up the right data. + */ + ready_flag = flag_offset + sequence_number + 1; + + /* + * Set up pointers for using during recursive doubling phase + */ + read_offset = sbuf_offset; + write_offset = rbuf_offset; + fprintf(stderr,"read offset %d write offset %d\n",read_offset,write_offset); + my_read_pointer = (volatile void *)((char *) my_data_pointer + read_offset); + my_write_pointer = (volatile void *)((char *) my_data_pointer + write_offset); + + /* + * When there are non-power 2 nodes, the extra nodes' data is copied and + * reduced by partner exchange nodes. 
+ * Extra nodes: Nodes with rank greater than the nearest power of 2 + * Exchange nodes: Nodes with rank less than the nearest power of 2 that + * partner with extra nodes during reduction + */ + + if (0 < my_exchange_node->n_extra_sources) { + /* + * Signal extra node that data is ready + */ + opal_atomic_wmb (); + + my_ctl_pointer->flag = ready_flag; + + if (EXCHANGE_NODE == my_exchange_node->node_type) { + extra_rank = my_exchange_node->rank_extra_source; + extra_ctl_pointer = ctl_structs[extra_rank]; + extra_rank_readwrite_data_pointer = (void *) ((char *) data_buffs[extra_rank].payload + + read_offset); + + /* + * Wait for data to get ready + */ + while (!((sequence_number == extra_ctl_pointer->sequence_number) && + (extra_ctl_pointer->flag >= ready_flag))){ + } + + ompi_op_reduce(op,(void *)extra_rank_readwrite_data_pointer, + (void *)my_read_pointer, count, dtype); + } + } + + + /* --Exchange node that reduces with extra node --: Signal to extra node that data is read + * --Exchange node that doesn't reduce data with extra node --: This assignment + * is used so it can sync with other nodes during exchange phase + * --Extra node--: It can pass to next phase + */ + ready_flag++; + /*my_ctl_pointer->flags[ALLREDUCE_FLAG] = ready_flag;*/ + my_ctl_pointer->flag = ready_flag; + + + /* + * Exchange data with all the nodes that are less than max_power_2 + */ + for (exchange=0 ; exchange < my_exchange_node->n_exchanges ; exchange++) { + int tmp=0; + + /*my_ctl_pointer->flags[ALLREDUCE_FLAG] = ready_flag;*/ + my_ctl_pointer->flag = ready_flag; + pair_rank=my_exchange_node->rank_exchanges[exchange]; + partner_ctl_pointer = ctl_structs[pair_rank]; + partner_read_pointer = (volatile void *) ((char *)data_buffs[pair_rank].payload + read_offset); + + my_read_pointer = (volatile void *)((char *) my_data_pointer + read_offset); + my_write_pointer = (volatile void *)((char *) my_data_pointer + write_offset); + + /* + * Wait for partner to be ready, so we can read + */ + /* + JSL ---- FIX ME !!!!!
MAKE ME COMPLIANT WITH NEW BUFFERS + while (!IS_ALLREDUCE_PEER_READY(partner_ctl_pointer, + ready_flag, sequence_number)) { + } + */ + + /* + * Perform reduction operation + */ + ompi_3buff_op_reduce(op,(void *)my_read_pointer, (void *)partner_read_pointer, + (void *)my_write_pointer, count, dtype); + + + /* + * Signal that I am done reading my partner's data + */ + ready_flag++; + /*my_ctl_pointer->flags[ALLREDUCE_FLAG] = ready_flag;*/ + my_ctl_pointer->flag = ready_flag; + + while (ready_flag > partner_ctl_pointer->flag){ + opal_progress(); + } + + /* + * Swap read and write offsets + */ + tmp = read_offset; + read_offset = write_offset; + write_offset = tmp; + + } + + + /* + * Copy data in from the "extra" source, if need be + */ + + if (0 < my_exchange_node->n_extra_sources) { + + if (EXTRA_NODE == my_exchange_node->node_type) { + + int extra_rank_read_offset=-1,my_write_offset=-1; + + /* Offset the ready flag to sync with + * exchange node which might going through exchange phases + * unlike the extra node + */ + ready_flag = ready_flag + my_exchange_node->log_2; + + if (my_exchange_node->log_2%2) { + extra_rank_read_offset = rbuf_offset; + my_write_offset = rbuf_offset; + + } else { + extra_rank_read_offset = sbuf_offset; + my_write_offset = sbuf_offset; + + } + + my_write_pointer = (volatile void*)((char *)my_data_pointer + my_write_offset); + extra_rank = my_exchange_node->rank_extra_source; + extra_ctl_pointer = ctl_structs[extra_rank]; + + extra_rank_read_data_pointer = (volatile void *) ((char *)data_buffs[extra_rank].payload + + extra_rank_read_offset); + + /* + * Wait for the exchange node to be ready + */ + ompi_datatype_type_size(dtype, &dt_size); + payload_len = count*dt_size; +#if 0 + fix me JSL !!!!! + while (!IS_DATA_READY(extra_ctl_pointer, ready_flag, sequence_number)){ + } +#endif + memcpy((void *)my_write_pointer,(const void *) + extra_rank_read_data_pointer, payload_len); + + ready_flag++; + /*my_ctl_pointer->flags[ALLREDUCE_FLAG] = ready_flag;*/ + my_ctl_pointer->flag = ready_flag; + + + } else { + + /* + * Signal parent that data is ready + */ + MB(); + /*my_ctl_pointer->flags[ALLREDUCE_FLAG] = ready_flag;*/ + my_ctl_pointer->flag = ready_flag; + + /* wait until child is done to move on - this buffer will + * be reused for the next stripe, so don't want to move + * on too quick. + */ + extra_rank = my_exchange_node->rank_extra_source; + extra_ctl_pointer = ctl_structs[extra_rank]; + } + } + + input_args->result_in_rbuf = my_exchange_node->log_2 & 1; + + my_ctl_pointer->starting_flag_value += 1; + + return BCOL_FN_COMPLETE; +} diff --git a/ompi/mca/bcol/basesmuma/bcol_basesmuma_bcast_prime.c b/ompi/mca/bcol/basesmuma/bcol_basesmuma_bcast_prime.c index 746a9210b9..12d26fe7b0 100644 --- a/ompi/mca/bcol/basesmuma/bcol_basesmuma_bcast_prime.c +++ b/ompi/mca/bcol/basesmuma/bcol_basesmuma_bcast_prime.c @@ -1,7 +1,10 @@ +/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */ /* - * Copyright (c) 2009-2012 Oak Ridge National Laboratory. All rights reserved. + * Copyright (c) 2009-2013 Oak Ridge National Laboratory. All rights reserved. * Copyright (c) 2009-2012 Mellanox Technologies. All rights reserved. * Copyright (c) 2013 Cisco Systems, Inc. All rights reserved. + * Copyright (c) 2013 Los Alamos National Security, LLC. All rights + * reserved. 
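For reference, the pairing arithmetic behind the recursive doubling exchange above can be shown in isolation. This is a single-process sketch under the assumption of a power-of-two group and a sum operation; the extra-rank folding, shared-memory flags, and read/write buffer swapping from the code above are deliberately left out:

#include <stdio.h>

#define P 8   /* power-of-two group size */

int main(void)
{
    int val[P], tmp[P];

    for (int r = 0; r < P; r++) {
        val[r] = r + 1;                       /* each rank contributes r + 1 */
    }

    for (int e = 0; (1 << e) < P; e++) {      /* log2(P) exchange rounds */
        for (int r = 0; r < P; r++) {
            int pair = r ^ (1 << e);          /* partner for this round */
            tmp[r] = val[r] + val[pair];      /* op = sum */
        }
        for (int r = 0; r < P; r++) {
            val[r] = tmp[r];                  /* stand-in for the buffer swap */
        }
    }

    printf("every rank now holds %d (expected %d)\n", val[0], P * (P + 1) / 2);
    return 0;
}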
* $COPYRIGHT$ * * Additional copyrights may follow @@ -29,197 +32,196 @@ /* includes shared memory optimization */ -#define BCOL_BASESMUMA_SM_PROBE(src_list, n_src, my_index, matched, src) \ -do { \ - int j; \ - for( j = 0; j < n_src; j++) { \ - parent_ctl_pointer = data_buffs[src_list[j]].ctl_struct; \ - parent_data_pointer = data_buffs[src_list[j]].payload; \ - if( IS_DATA_READY(parent_ctl_pointer,ready_flag,sequence_number)) { \ - src = src_list[j]; \ - matched = 1; \ - break; \ - } \ - } \ -} while(0) +#define BCOL_BASESMUMA_SM_PROBE(src_list, n_src, my_index, matched, src) \ + do { \ + int j; \ + for( j = 0; j < n_src; j++) { \ + parent_ctl_pointer = data_buffs[src_list[j]].ctl_struct; \ + parent_data_pointer = data_buffs[src_list[j]].payload; \ + if( IS_DATA_READY(parent_ctl_pointer,ready_flag,sequence_number)) { \ + src = src_list[j]; \ + matched = 1; \ + break; \ + } \ + } \ + } while(0) /* -#define IS_LARGE_DATA_READY(peer, my_flag, my_sequence_number) \ - (((peer)->sequence_number == (my_sequence_number) && \ - (peer)->flags[BCAST_FLAG] >= (my_flag) \ - )? true : false ) + #define IS_LARGE_DATA_READY(peer, my_flag, my_sequence_number) \ + (((peer)->sequence_number == (my_sequence_number) && \ + (peer)->flags[BCAST_FLAG] >= (my_flag) \ + )? true : false ) */ /* -#define IS_KNOWN_ROOT_DATA_READY(peer, my_flag, my_sequence_number) \ - (((peer)->sequence_number == (my_sequence_number) && \ - (peer)->flags[BCAST_FLAG][bcol_id] >= (my_flag) \ - )? true : false ) + #define IS_KNOWN_ROOT_DATA_READY(peer, my_flag, my_sequence_number) \ + (((peer)->sequence_number == (my_sequence_number) && \ + (peer)->flags[BCAST_FLAG][bcol_id] >= (my_flag) \ + )? true : false ) */ -#define BCOL_BASESMUMA_SM_LARGE_MSG_PROBE(src_list, n_src, my_index, matched, src, flag_index, bcol_id) \ -do { \ - int j; \ - for( j = 0; j < n_src; j++) { \ - /* fprintf(stderr,"my_rank %d and %d\n",my_rank,1); */ \ - if(src_list[j] != -1) { \ - parent_ctl_pointer = ctl_structs[src_list[j]]; \ - parent_data_pointer = (void *) data_buffs[src_list[j]].ctl_struct; \ - /*fprintf(stderr,"my_rank %d ready flag %d partner flag %d and %d\n",my_rank,ready_flag,parent_ctl_pointer->flag,2); */ \ - if( IS_PEER_READY(parent_ctl_pointer,ready_flag,sequence_number, flag_index, bcol_id)) { \ - src = src_list[j]; \ - matched = 1; \ - index = j; \ - /* fprintf(stderr,"found it from %d!\n",src);*/ \ - break; \ - } \ - } \ - } \ -} while(0) +#define BCOL_BASESMUMA_SM_LARGE_MSG_PROBE(src_list, n_src, my_index, matched, src, flag_index, bcol_id) \ + do { \ + int j; \ + for( j = 0; j < n_src; j++) { \ + /* fprintf(stderr,"my_rank %d and %d\n",my_rank,1); */ \ + if(src_list[j] != -1) { \ + parent_ctl_pointer = ctl_structs[src_list[j]]; \ + parent_data_pointer = (void *) data_buffs[src_list[j]].ctl_struct; \ + /*fprintf(stderr,"my_rank %d ready flag %d partner flag %d and %d\n",my_rank,ready_flag,parent_ctl_pointer->flag,2); */ \ + if( IS_PEER_READY(parent_ctl_pointer,ready_flag,sequence_number, flag_index, bcol_id)) { \ + src = src_list[j]; \ + matched = 1; \ + index = j; \ + /* fprintf(stderr,"found it from %d!\n",src);*/ \ + break; \ + } \ + } \ + } \ + } while(0) -#define K_NOMIAL_DATA_SRC(radix, my_group_index, group_size, group_root, data_src, radix_mask) \ -do { \ - int relative_rank = (my_group_index >= group_root) ? 
my_group_index - group_root : \ - my_group_index - group_root + group_size; \ - radix_mask = 1; \ - while (radix_mask < group_size) { \ - if (relative_rank % (radix * radix_mask)) { \ - data_src = relative_rank/(radix * radix_mask) * (radix * radix_mask) + group_root; \ - if (data_src >= group_size) data_src -= group_size; \ - break; \ - } \ - radix_mask *= radix; \ - } \ -} while (0) +#define K_NOMIAL_DATA_SRC(radix, my_group_index, group_size, group_root, data_src, radix_mask) \ + do { \ + int relative_rank = (my_group_index >= group_root) ? my_group_index - group_root : \ + my_group_index - group_root + group_size; \ + radix_mask = 1; \ + while (radix_mask < group_size) { \ + if (relative_rank % (radix * radix_mask)) { \ + data_src = relative_rank/(radix * radix_mask) * (radix * radix_mask) + group_root; \ + if (data_src >= group_size) data_src -= group_size; \ + break; \ + } \ + radix_mask *= radix; \ + } \ + } while (0) int bcol_basesmuma_bcast_k_nomial_knownroot(bcol_function_args_t *input_args, - coll_ml_function_t *c_input_args) + coll_ml_function_t *c_input_args) { - /* local variables */ - mca_bcol_basesmuma_module_t* bcol_module= - (mca_bcol_basesmuma_module_t *)c_input_args->bcol_module; - mca_bcol_basesmuma_component_t *cs = &mca_bcol_basesmuma_component; - int i, matched = 0; - int group_size; - int my_rank; - int leading_dim, - buff_idx, - idx; - int count = input_args->count; - struct ompi_datatype_t* dtype = input_args->dtype; - int64_t sequence_number = input_args->sequence_num; - int radix = - mca_bcol_basesmuma_component.k_nomial_radix; - int radix_mask; - int16_t data_src = -1; - - volatile int8_t ready_flag; - int bcol_id = (int) bcol_module->super.bcol_id; - volatile mca_bcol_basesmuma_payload_t *data_buffs; - volatile char* parent_data_pointer; - volatile mca_bcol_basesmuma_header_t *parent_ctl_pointer; - volatile mca_bcol_basesmuma_header_t *my_ctl_pointer; - - size_t pack_len = 0, - dt_size; - void *data_addr = (void *)((unsigned char *)input_args->src_desc->data_addr + - input_args->sbuf_offset); + /* local variables */ + mca_bcol_basesmuma_module_t* bcol_module= + (mca_bcol_basesmuma_module_t *)c_input_args->bcol_module; + mca_bcol_basesmuma_component_t *cs = &mca_bcol_basesmuma_component; + int i, matched = 0; + int group_size; + int my_rank; + int leading_dim, + buff_idx, + idx; + int count = input_args->count; + struct ompi_datatype_t* dtype = input_args->dtype; + int64_t sequence_number = input_args->sequence_num; + int radix = + mca_bcol_basesmuma_component.k_nomial_radix; + int radix_mask; + int16_t data_src = -1; -#if 0 - fprintf(stderr,"Entering nb-sm broadcast input_args->sbuf_offset %d \n",input_args->sbuf_offset); - fflush(stderr); + volatile int8_t ready_flag; + int bcol_id = (int) bcol_module->super.bcol_id; + volatile mca_bcol_basesmuma_payload_t *data_buffs; + volatile char* parent_data_pointer; + volatile mca_bcol_basesmuma_header_t *parent_ctl_pointer; + volatile mca_bcol_basesmuma_header_t *my_ctl_pointer; + + size_t pack_len = 0; + void *data_addr = (void *)((unsigned char *)input_args->src_desc->data_addr + + input_args->sbuf_offset); + +#if 0 + fprintf(stderr,"Entering nb-sm broadcast input_args->sbuf_offset %d \n",input_args->sbuf_offset); + fflush(stderr); #endif - /* we will work only on packed data - so compute the length*/ - BASESMUMA_VERBOSE(3, ("Calling bcol_basesmuma_bcast_k_nomial_knownroot")); - ompi_datatype_type_size(dtype, &dt_size); - pack_len = count * dt_size; - /* Some hierarchical algorithms have data that is accumulated at 
each step - * this factor accounts for this - */ - pack_len = pack_len*input_args->hier_factor; - buff_idx = input_args->buffer_index; + /* we will work only on packed data - so compute the length*/ + BASESMUMA_VERBOSE(3, ("Calling bcol_basesmuma_bcast_k_nomial_knownroot")); - /* Get addressing information */ - my_rank = bcol_module->super.sbgp_partner_module->my_index; - group_size = bcol_module->colls_no_user_data.size_of_group; - leading_dim = bcol_module->colls_no_user_data.size_of_group; - idx = SM_ARRAY_INDEX(leading_dim,buff_idx,0); - data_buffs = (volatile mca_bcol_basesmuma_payload_t *) - bcol_module->colls_with_user_data.data_buffs + idx; + pack_len = mca_bcol_base_get_buff_length(dtype, count); + /* Some hierarchical algorithms have data that is accumulated at each step + * this factor accounts for this + */ + pack_len = pack_len*input_args->hier_factor; + buff_idx = input_args->buffer_index; - /* Set pointer to current proc ctrl region */ - my_ctl_pointer = data_buffs[my_rank].ctl_struct; - - /* setup resource recycling */ - BASESMUMA_HEADER_INIT(my_ctl_pointer, ready_flag, sequence_number, bcol_id); - /* removing dependence on sequence number */ - /* I believe this is resolved now with the signaling flags */ -/* + /* Get addressing information */ + my_rank = bcol_module->super.sbgp_partner_module->my_index; + group_size = bcol_module->colls_no_user_data.size_of_group; + leading_dim = bcol_module->colls_no_user_data.size_of_group; + idx = SM_ARRAY_INDEX(leading_dim,buff_idx,0); + data_buffs = (volatile mca_bcol_basesmuma_payload_t *) + bcol_module->colls_with_user_data.data_buffs + idx; + + /* Set pointer to current proc ctrl region */ + my_ctl_pointer = data_buffs[my_rank].ctl_struct; + + /* setup resource recycling */ + BASESMUMA_HEADER_INIT(my_ctl_pointer, ready_flag, sequence_number, bcol_id); + /* removing dependence on sequence number */ + /* I believe this is resolved now with the signaling flags */ + /* ready_temp = 1 + (int8_t) flag_offset + (int8_t) bcol_id; if( ready_temp >= my_ctl_pointer->flags[BCAST_FLAG][bcol_id]) { - ready_flag = ready_temp; + ready_flag = ready_temp; } else { - ready_flag = my_ctl_pointer->flags[BCAST_FLAG][bcol_id]; + ready_flag = my_ctl_pointer->flags[BCAST_FLAG][bcol_id]; } MB(); my_ctl_pointer->sequence_number = sequence_number; -*/ + */ - /* non-blocking broadcast algorithm */ + /* non-blocking broadcast algorithm */ - /* If I am the root, then signal ready flag */ - if(input_args->root_flag) { - BASESMUMA_VERBOSE(10,("I am the root of the data")); - /* - * signal ready flag - */ - MB(); - my_ctl_pointer->flags[BCAST_FLAG][bcol_id] = ready_flag; - - /* root is finished */ - goto Release; - } - - - /* Calculate source of the data */ - K_NOMIAL_DATA_SRC(radix, my_rank, group_size, - input_args->root_route->rank, data_src, radix_mask); - - - parent_ctl_pointer = data_buffs[data_src].ctl_struct; - parent_data_pointer = data_buffs[data_src].payload; - - for( i = 0; i < cs->num_to_probe && 0 == matched; i++) { - - if(IS_PEER_READY(parent_ctl_pointer,ready_flag,sequence_number, BCAST_FLAG, bcol_id)) { - matched = 1; - break; - } - } - - /* If not matched, then hop out and put me on progress list */ - if(0 == matched ) { - BASESMUMA_VERBOSE(10,("Shared memory probe didn't find a match")); - return BCOL_FN_NOT_STARTED; - } - - /* else, we found our root within the group ... 
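The K_NOMIAL_DATA_SRC macro above computes, for a known root, which rank the caller should copy from: shift into root-relative numbering, find the lowest radix level at which this rank is not aligned, and take the rank with that level cleared, shifted back to absolute numbering. A standalone function with the same arithmetic, for illustration only:

#include <stdio.h>

static int knomial_data_src(int radix, int my_rank, int group_size, int root)
{
    int relative = (my_rank >= root) ? my_rank - root
                                     : my_rank - root + group_size;
    int src = root;                      /* the root itself copies from nobody */

    for (int mask = 1; mask < group_size; mask *= radix) {
        if (relative % (radix * mask)) {
            src = relative / (radix * mask) * (radix * mask) + root;
            if (src >= group_size) {
                src -= group_size;
            }
            break;
        }
    }
    return src;
}

int main(void)
{
    /* radix 2, 8 ranks, root 3: rank 6 (relative rank 3) copies from rank 5 */
    printf("rank 6 copies from rank %d\n", knomial_data_src(2, 6, 8, 3));
    return 0;
}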
*/ - BASESMUMA_VERBOSE(10,("Shared memory probe was matched, the root is %d", data_src)); - - /* copy the data */ - memcpy(data_addr, (void *) parent_data_pointer, pack_len); - /* set the memory barrier to ensure completion */ + /* If I am the root, then signal ready flag */ + if(input_args->root_flag) { + BASESMUMA_VERBOSE(10,("I am the root of the data")); + /* + * signal ready flag + */ MB(); - /* signal that I am done */ my_ctl_pointer->flags[BCAST_FLAG][bcol_id] = ready_flag; + /* root is finished */ + goto Release; + } -Release: - my_ctl_pointer->starting_flag_value[bcol_id]++; - return BCOL_FN_COMPLETE; + + /* Calculate source of the data */ + K_NOMIAL_DATA_SRC(radix, my_rank, group_size, + input_args->root_route->rank, data_src, radix_mask); + + + parent_ctl_pointer = data_buffs[data_src].ctl_struct; + parent_data_pointer = data_buffs[data_src].payload; + + for( i = 0; i < cs->num_to_probe && 0 == matched; i++) { + + if(IS_PEER_READY(parent_ctl_pointer,ready_flag,sequence_number, BCAST_FLAG, bcol_id)) { + matched = 1; + break; + } + } + + /* If not matched, then hop out and put me on progress list */ + if(0 == matched ) { + BASESMUMA_VERBOSE(10,("Shared memory probe didn't find a match")); + return BCOL_FN_NOT_STARTED; + } + + /* else, we found our root within the group ... */ + BASESMUMA_VERBOSE(10,("Shared memory probe was matched, the root is %d", data_src)); + + /* copy the data */ + memcpy(data_addr, (void *) parent_data_pointer, pack_len); + /* set the memory barrier to ensure completion */ + MB(); + /* signal that I am done */ + my_ctl_pointer->flags[BCAST_FLAG][bcol_id] = ready_flag; + + + Release: + my_ctl_pointer->starting_flag_value[bcol_id]++; + return BCOL_FN_COMPLETE; } @@ -227,12 +229,12 @@ Release: * Shared memory non-blocking Broadcast - K-nomial fan-out for small data buffers. * This routine assumes that buf (the input buffer) is a single writer * multi reader (SWMR) shared memory buffer owned by the calling rank - * which is the only rank that can write to this buffers. + * which is the only rank that can write to this buffers. * It is also assumed that the buffers are registered and fragmented * at the ML level and that buf is sufficiently large to hold the data. * * - * @param buf - SWMR shared buffer within a sbgp that the + * @param buf - SWMR shared buffer within a sbgp that the * executing rank can write to. * @param count - the number of elements in the shared buffer. * @param dtype - the datatype of a shared buffer element. @@ -240,623 +242,622 @@ Release: * @param module - basesmuma module. 
*/ int bcol_basesmuma_bcast_k_nomial_anyroot(bcol_function_args_t *input_args, - coll_ml_function_t *c_input_args) + coll_ml_function_t *c_input_args) { - /* local variables */ - mca_bcol_basesmuma_module_t* bcol_module= - (mca_bcol_basesmuma_module_t *)c_input_args->bcol_module; - mca_bcol_basesmuma_component_t *cs = &mca_bcol_basesmuma_component; - int i; - int group_size; - int my_rank; - int leading_dim, buff_idx, idx; - int count=input_args->count; - struct ompi_datatype_t* dtype=input_args->dtype; - int64_t sequence_number=input_args->sequence_num; - int radix = cs->k_nomial_radix; - int radix_mask; - int relative_rank; - int pow_k_group_size; + /* local variables */ + mca_bcol_basesmuma_module_t* bcol_module= + (mca_bcol_basesmuma_module_t *)c_input_args->bcol_module; + mca_bcol_basesmuma_component_t *cs = &mca_bcol_basesmuma_component; + int i; + int group_size; + int my_rank; + int leading_dim, buff_idx, idx; + int count=input_args->count; + struct ompi_datatype_t* dtype=input_args->dtype; + int64_t sequence_number=input_args->sequence_num; + int radix = cs->k_nomial_radix; + int radix_mask; + int relative_rank; + int pow_k_group_size; - volatile int8_t ready_flag; - int bcol_id = (int) bcol_module->super.bcol_id; - volatile mca_bcol_basesmuma_payload_t *data_buffs; - volatile void* parent_data_pointer; - - volatile mca_bcol_basesmuma_header_t *child_ctl_pointer; - volatile mca_bcol_basesmuma_header_t *my_ctl_pointer; - - size_t pack_len = 0, dt_size; - void *data_addr = (void *)((unsigned char *)input_args->src_desc->data_addr + - input_args->sbuf_offset); + volatile int8_t ready_flag; + int bcol_id = (int) bcol_module->super.bcol_id; + volatile mca_bcol_basesmuma_payload_t *data_buffs; + volatile void* parent_data_pointer; -#if 0 - fprintf(stderr,"Entering nb-sm broadcast input_args->sbuf_offset %d \n",input_args->sbuf_offset); - fflush(stderr); + volatile mca_bcol_basesmuma_header_t *child_ctl_pointer; + volatile mca_bcol_basesmuma_header_t *my_ctl_pointer; + + size_t pack_len = 0; + void *data_addr = (void *)((unsigned char *)input_args->src_desc->data_addr + + input_args->sbuf_offset); + +#if 0 + fprintf(stderr,"Entering nb-sm broadcast input_args->sbuf_offset %d \n",input_args->sbuf_offset); + fflush(stderr); #endif - /* we will work only on packed data - so compute the length*/ - ompi_datatype_type_size(dtype, &dt_size); - pack_len=count*dt_size; + /* we will work only on packed data - so compute the length*/ + pack_len = mca_bcol_base_get_buff_length(dtype, count); - buff_idx = input_args->buffer_index; + buff_idx = input_args->buffer_index; - /* Get addressing information */ - my_rank = bcol_module->super.sbgp_partner_module->my_index; - group_size = bcol_module->colls_no_user_data.size_of_group; - leading_dim=bcol_module->colls_no_user_data.size_of_group; - idx=SM_ARRAY_INDEX(leading_dim,buff_idx,0); + /* Get addressing information */ + my_rank = bcol_module->super.sbgp_partner_module->my_index; + group_size = bcol_module->colls_no_user_data.size_of_group; + leading_dim=bcol_module->colls_no_user_data.size_of_group; + idx=SM_ARRAY_INDEX(leading_dim,buff_idx,0); - /* get pow_k_levels and pow_k_group_size */ - pow_k_group_size = bcol_module->pow_k; - - - data_buffs=(volatile mca_bcol_basesmuma_payload_t *) - bcol_module->colls_with_user_data.data_buffs+idx; - - /* Set pointer to current proc ctrl region */ - my_ctl_pointer = data_buffs[my_rank].ctl_struct; - - BASESMUMA_HEADER_INIT(my_ctl_pointer, ready_flag, sequence_number, bcol_id); + /* get pow_k_levels and 
pow_k_group_size */ + pow_k_group_size = bcol_module->pow_k; - /* non-blocking broadcast algorithm */ - /* If I am the root, then signal ready flag */ - if(input_args->root_flag) { - - BASESMUMA_VERBOSE(10,("I am the root of the data")); - /* - * set the radix_mask */ - radix_mask = pow_k_group_size; - /* send to children */ - MB(); - BASESMUMA_K_NOMIAL_SEND_CHILDREN(radix_mask, - radix,0, - my_rank,group_size, ready_flag); - /* root is finished */ - goto Release; - } + data_buffs=(volatile mca_bcol_basesmuma_payload_t *) + bcol_module->colls_with_user_data.data_buffs+idx; - /* If I am not the root, then poll on possible "senders'" control structs */ - for( i = 0; i < cs->num_to_probe; i++) { - - if( ready_flag == my_ctl_pointer->flags[BCAST_FLAG][bcol_id]) { + /* Set pointer to current proc ctrl region */ + my_ctl_pointer = data_buffs[my_rank].ctl_struct; - /* else, we found our root within the group ... */ - parent_data_pointer = data_buffs[my_ctl_pointer->src].payload; - BASESMUMA_VERBOSE(5,("%d found it from %d \n",my_rank,my_ctl_pointer->src)); - /* memcopy the data */ - memcpy(data_addr, (void *) parent_data_pointer, pack_len); - /* compute my relative rank */ - relative_rank = (my_rank - my_ctl_pointer->src) < 0 ? my_rank - - my_ctl_pointer->src + group_size : my_rank - my_ctl_pointer->src; + BASESMUMA_HEADER_INIT(my_ctl_pointer, ready_flag, sequence_number, bcol_id); - /* compute my radix mask */ - radix_mask = 1; - while(radix_mask < group_size ){ - if( 0 != relative_rank % (radix*radix_mask)) { - /* found it */ - break; - } - radix_mask *= radix; - } - /* go one step back */ - radix_mask /= radix; + /* non-blocking broadcast algorithm */ - /* send to children */ - MB(); - BASESMUMA_K_NOMIAL_SEND_CHILDREN(radix_mask, - radix, relative_rank, - my_rank, group_size, ready_flag); - /* bail */ + /* If I am the root, then signal ready flag */ + if(input_args->root_flag) { - goto Release; + BASESMUMA_VERBOSE(10,("I am the root of the data")); + /* + * set the radix_mask */ + radix_mask = pow_k_group_size; + /* send to children */ + MB(); + BASESMUMA_K_NOMIAL_SEND_CHILDREN(radix_mask, + radix,0, + my_rank,group_size, ready_flag); + /* root is finished */ + goto Release; + } + + /* If I am not the root, then poll on possible "senders'" control structs */ + for( i = 0; i < cs->num_to_probe; i++) { + + if( ready_flag == my_ctl_pointer->flags[BCAST_FLAG][bcol_id]) { + + /* else, we found our root within the group ... */ + parent_data_pointer = data_buffs[my_ctl_pointer->src].payload; + BASESMUMA_VERBOSE(5,("%d found it from %d \n",my_rank,my_ctl_pointer->src)); + /* memcopy the data */ + memcpy(data_addr, (void *) parent_data_pointer, pack_len); + /* compute my relative rank */ + relative_rank = (my_rank - my_ctl_pointer->src) < 0 ? 
my_rank - + my_ctl_pointer->src + group_size : my_rank - my_ctl_pointer->src; + + /* compute my radix mask */ + radix_mask = 1; + while(radix_mask < group_size ){ + if( 0 != relative_rank % (radix*radix_mask)) { + /* found it */ + break; } - + radix_mask *= radix; + } + /* go one step back */ + radix_mask /= radix; + + /* send to children */ + MB(); + BASESMUMA_K_NOMIAL_SEND_CHILDREN(radix_mask, + radix, relative_rank, + my_rank, group_size, ready_flag); + /* bail */ + + goto Release; } - - - /* If not matched, then hop out and put me on progress list */ - BASESMUMA_VERBOSE(10,("Shared memory probe didn't find a match")); - /*fprintf(stderr,"bcol_id %d Not started\n",bcol_id);*/ - return BCOL_FN_NOT_STARTED; + } -Release: - + /* If not matched, then hop out and put me on progress list */ + BASESMUMA_VERBOSE(10,("Shared memory probe didn't find a match")); + /*fprintf(stderr,"bcol_id %d Not started\n",bcol_id);*/ + return BCOL_FN_NOT_STARTED; - my_ctl_pointer->starting_flag_value[bcol_id]++; - return BCOL_FN_COMPLETE; + + Release: + + + my_ctl_pointer->starting_flag_value[bcol_id]++; + + return BCOL_FN_COMPLETE; } -/* non-blocking binary scatter allgather anyroot algorithm for large data - * broadcast +/* non-blocking binary scatter allgather anyroot algorithm for large data + * broadcast */ #if 0 /* prototype code for shared memory scatter/allgather algorithm. Signaling scheme - * works, should be used as a reference for other types of shared memory scatter/allgather + * works, should be used as a reference for other types of shared memory scatter/allgather * algorithms. - */ + */ int bcol_basesmuma_binary_scatter_allgather_segment(bcol_function_args_t *input_args, - coll_ml_function_t *c_input_args) + coll_ml_function_t *c_input_args) { - /* local variables */ - int i, j; - int length; - int start; - int my_rank, parent_rank; - int partner; - int src = -1; - int matched = 0; - int group_size; - int first_instance=0; - int leading_dim, buff_idx, idx; - int64_t sequence_number=input_args->sequence_num; - - int64_t ready_flag; - int64_t local_offset; - - int flag_offset; - int pow_2, pow_2_levels; - int index = -1; + /* local variables */ + int i, j; + int length; + int start; + int my_rank, parent_rank; + int partner; + int src = -1; + int matched = 0; + int group_size; + int first_instance=0; + int leading_dim, buff_idx, idx; + int64_t sequence_number=input_args->sequence_num; - mca_bcol_basesmuma_component_t *cs = &mca_bcol_basesmuma_component; - mca_bcol_basesmuma_module_t *bcol_module = - (mca_bcol_basesmuma_module_t *) c_input_args->bcol_module; - /* use the old control structs for large messages, - * otherwise we will destroy the shared memory - * optimization - */ - mca_bcol_basesmuma_ctl_struct_t **ctl_structs; - mca_bcol_basesmuma_ctl_struct_t *my_ctl_pointer; - mca_bcol_basesmuma_ctl_struct_t *parent_ctl_pointer; /* binomial fanout */ - mca_bcol_basesmuma_ctl_struct_t *partner_ctl_pointer; /* recursive double */ + int64_t ready_flag; + int64_t local_offset; - /* for now, we use the payload buffer for single fragment */ - volatile mca_bcol_basesmuma_payload_t *data_buffs; - volatile void *parent_data_pointer; /* binomial scatter */ - volatile void *partner_data_pointer; /* recursive double */ + int flag_offset; + int pow_2, pow_2_levels; + int index = -1; - uint32_t fragment_size; /* ml buffer size for now */ + mca_bcol_basesmuma_component_t *cs = &mca_bcol_basesmuma_component; + mca_bcol_basesmuma_module_t *bcol_module = + (mca_bcol_basesmuma_module_t *) c_input_args->bcol_module; 
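Once an anyroot receiver has computed its rank relative to the discovered source and stepped its radix mask back one level, it forwards the data to the ranks below it in the k-nomial tree. A standalone sketch of one conventional child enumeration consistent with that radix-mask logic; this is not the BASESMUMA_K_NOMIAL_SEND_CHILDREN macro, whose body lies outside this hunk:

#include <stdio.h>

static void knomial_children(int radix, int relative_rank, int group_size)
{
    /* find the level at which this rank received (the root never breaks out) */
    int mask = 1;
    while (mask < group_size && 0 == relative_rank % (radix * mask)) {
        mask *= radix;
    }
    mask /= radix;   /* step back one level, as the code above does */

    /* descend: hand off to up to (radix - 1) children at each remaining level */
    for (; mask >= 1; mask /= radix) {
        for (int i = 1; i < radix; i++) {
            int child = relative_rank + i * mask;
            if (child < group_size) {
                printf("relative rank %d forwards to relative rank %d\n",
                       relative_rank, child);
            }
        }
    }
}

int main(void)
{
    knomial_children(2, 4, 8);   /* binomial case: children 6 and 5 */
    knomial_children(2, 0, 8);   /* relative root: children 4, 2 and 1 */
    return 0;
}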
+ /* use the old control structs for large messages, + * otherwise we will destroy the shared memory + * optimization + */ + mca_bcol_basesmuma_ctl_struct_t **ctl_structs; + mca_bcol_basesmuma_ctl_struct_t *my_ctl_pointer; + mca_bcol_basesmuma_ctl_struct_t *parent_ctl_pointer; /* binomial fanout */ + mca_bcol_basesmuma_ctl_struct_t *partner_ctl_pointer; /* recursive double */ - /* we will transfer the entire buffer, - * so start at the base address of the ml buffer - */ - void *data_addr = (void *) ((unsigned char *) input_args->src_desc->base_data_addr); -#if 0 - fprintf(stderr,"AAA Entering nb-sm large msg broadcast input_args->frag_size %d \n",input_args->frag_size); - fflush(stderr); + /* for now, we use the payload buffer for single fragment */ + volatile mca_bcol_basesmuma_payload_t *data_buffs; + volatile void *parent_data_pointer; /* binomial scatter */ + volatile void *partner_data_pointer; /* recursive double */ + + uint32_t fragment_size; /* ml buffer size for now */ + + /* we will transfer the entire buffer, + * so start at the base address of the ml buffer + */ + void *data_addr = (void *) ((unsigned char *) input_args->src_desc->base_data_addr); +#if 0 + fprintf(stderr,"AAA Entering nb-sm large msg broadcast input_args->frag_size %d \n",input_args->frag_size); + fflush(stderr); #endif - buff_idx = input_args->src_desc->buffer_index; + buff_idx = input_args->src_desc->buffer_index; - group_size = bcol_module->colls_no_user_data.size_of_group; - leading_dim=bcol_module->colls_no_user_data.size_of_group; - - /* get the largest power of two that is smaller than - * or equal to the group size + group_size = bcol_module->colls_no_user_data.size_of_group; + leading_dim=bcol_module->colls_no_user_data.size_of_group; + + /* get the largest power of two that is smaller than + * or equal to the group size + */ + pow_2_levels = bcol_module->pow_2_levels; + pow_2 = bcol_module->pow_2; + + /* get the fragment size + */ + + /* still just the size of the entire buffer */ + fragment_size = input_args->buffer_size; + idx=SM_ARRAY_INDEX(leading_dim,buff_idx,0); + my_rank = bcol_module->super.sbgp_partner_module->my_index; + + + /* grab the control structs */ + ctl_structs = (mca_bcol_basesmuma_ctl_struct_t **) + bcol_module->colls_with_user_data.ctl_buffs+idx; + + /* grab the data buffs */ + data_buffs = (mca_bcol_basesmuma_payload_t *) + bcol_module->colls_with_user_data.data_buffs+idx; + + my_ctl_pointer = ctl_structs[my_rank]; + + if(my_ctl_pointer->sequence_number < sequence_number) { + first_instance = 1; + } + + if(first_instance) { + my_ctl_pointer->flag = -1; + my_ctl_pointer->index = 1; + + my_ctl_pointer->starting_flag_value = 0; + + flag_offset = 0; + + } else { + + my_ctl_pointer->index++; + } + + /* increment the starting flag by one and return */ + flag_offset = my_ctl_pointer->starting_flag_value; + ready_flag = flag_offset + sequence_number + 1; + + my_ctl_pointer->sequence_number = sequence_number; + + /* am I the root */ + if(input_args->root_flag) { + /* if I've already been here, then + * hop down to the allgather */ - pow_2_levels = bcol_module->pow_2_levels; - pow_2 = bcol_module->pow_2; - - /* get the fragment size - */ - - /* still just the size of the entire buffer */ - fragment_size = input_args->buffer_size; - idx=SM_ARRAY_INDEX(leading_dim,buff_idx,0); - my_rank = bcol_module->super.sbgp_partner_module->my_index; - - - /* grab the control structs */ - ctl_structs = (mca_bcol_basesmuma_ctl_struct_t **) - bcol_module->colls_with_user_data.ctl_buffs+idx; - - /* grab the 
data buffs */ - data_buffs = (mca_bcol_basesmuma_payload_t *) - bcol_module->colls_with_user_data.data_buffs+idx; - - my_ctl_pointer = ctl_structs[my_rank]; - - if(my_ctl_pointer->sequence_number < sequence_number) { - first_instance = 1; + if(ALLGATHER == my_ctl_pointer->status) { + goto Allgather; } + BASESMUMA_VERBOSE(10,("I am the root of the data")); + /* debug print */ + /*fprintf(stderr,"I am the root %d\n",my_rank);*/ + /* + * signal ready flag + */ + /* set the offset into the buffer */ + my_ctl_pointer->offset = 0; + /* how many children do I have */ + my_ctl_pointer->n_sends = pow_2_levels; + /* my data length */ + my_ctl_pointer->length = fragment_size; - if(first_instance) { - my_ctl_pointer->flag = -1; - my_ctl_pointer->index = 1; + /* important that these be set before my children + * see the ready flag raised + */ + MB(); + my_ctl_pointer->flag = ready_flag; - my_ctl_pointer->starting_flag_value = 0; - - flag_offset = 0; + /* root is finished */ + if( my_rank < pow_2 ) { + /* if I'm in the power of two group, + * then goto the allgather + */ + my_ctl_pointer->status = ALLGATHER; + goto Allgather; } else { - my_ctl_pointer->index++; + /* if I'm not, then I'm done and release */ + goto Release; } - /* increment the starting flag by one and return */ - flag_offset = my_ctl_pointer->starting_flag_value; - ready_flag = flag_offset + sequence_number + 1; - - my_ctl_pointer->sequence_number = sequence_number; + } - /* am I the root */ - if(input_args->root_flag) { - /* if I've already been here, then - * hop down to the allgather - */ - if(ALLGATHER == my_ctl_pointer->status) { - goto Allgather; - } - BASESMUMA_VERBOSE(10,("I am the root of the data")); - /* debug print */ - /*fprintf(stderr,"I am the root %d\n",my_rank);*/ - /* - * signal ready flag - */ - /* set the offset into the buffer */ - my_ctl_pointer->offset = 0; - /* how many children do I have */ - my_ctl_pointer->n_sends = pow_2_levels; - /* my data length */ - my_ctl_pointer->length = fragment_size; + /* what phase am I participating in + */ + switch(my_ctl_pointer->status) { - /* important that these be set before my children - * see the ready flag raised - */ - MB(); - my_ctl_pointer->flag = ready_flag; + case SCATTER: + goto Scatter; + break; - /* root is finished */ - if( my_rank < pow_2 ) { - /* if I'm in the power of two group, - * then goto the allgather - */ - my_ctl_pointer->status = ALLGATHER; - goto Allgather; + case ALLGATHER: + goto Allgather; + break; - } else { + case EXTRA_RANK: + goto Extra; + break; - /* if I'm not, then I'm done and release */ - goto Release; - } + default: + break; + } - } - /* what phase am I participating in + Extra: + /* am I part of the non-power-of-2 group */ + if( my_rank >= pow_2 ) { + /* find parent to copy from */ + parent_rank = my_rank&(pow_2-1); + parent_ctl_pointer = ctl_structs[parent_rank]; + /* start at the base */ + parent_data_pointer = (void *) data_buffs[parent_rank].ctl_struct; + + /* now, I need to do some arithmetic to + * arrive at the value everyone else does + * when they have completed the algorithm */ - switch(my_ctl_pointer->status) { - case SCATTER: - goto Scatter; - break; + /* compute ready flag value to poll on */ + ready_flag = ready_flag + pow_2_levels; - case ALLGATHER: - goto Allgather; - break; - - case EXTRA_RANK: - goto Extra; - break; - - default: - break; + /* start to poll */ + for( i = 0; i< cs->num_to_probe; i++) { + if(IS_LARGE_DATA_READY(parent_ctl_pointer,ready_flag, sequence_number)) { + /* copy the data and bail */ + 
memcpy(data_addr,(void *)parent_data_pointer,fragment_size); + goto Release; + } + /* + else { + opal_progress(); + } + */ } + my_ctl_pointer->status = EXTRA_RANK; + /* hop out and put me onto a progress queue */ + return BCOL_FN_NOT_STARTED; + } -Extra: - /* am I part of the non-power-of-2 group */ - if( my_rank >= pow_2 ) { - /* find parent to copy from */ - parent_rank = my_rank&(pow_2-1); - parent_ctl_pointer = ctl_structs[parent_rank]; - /* start at the base */ - parent_data_pointer = (void *) data_buffs[parent_rank].ctl_struct; + Scatter: - /* now, I need to do some arithmetic to - * arrive at the value everyone else does - * when they have completed the algorithm - */ + /* on first entry, compute the list of possible sources */ + if( NULL == my_ctl_pointer->src_ptr ) { + my_ctl_pointer->src_ptr = (int *) malloc(sizeof(int)*(pow_2_levels+1)); - /* compute ready flag value to poll on */ - ready_flag = ready_flag + pow_2_levels; - - /* start to poll */ - for( i = 0; i< cs->num_to_probe; i++) { - if(IS_LARGE_DATA_READY(parent_ctl_pointer,ready_flag, sequence_number)) { - /* copy the data and bail */ - memcpy(data_addr,(void *)parent_data_pointer,fragment_size); - goto Release; - } - /* - else { - opal_progress(); - } - */ - } - my_ctl_pointer->status = EXTRA_RANK; - - /* hop out and put me onto a progress queue */ - return BCOL_FN_NOT_STARTED; + for( i = 0; i < pow_2_levels; i++) { + my_ctl_pointer->src_ptr[i] = my_rank ^ (1<src_ptr ) { - my_ctl_pointer->src_ptr = (int *) malloc(sizeof(int)*(pow_2_levels+1)); - - for( i = 0; i < pow_2_levels; i++) { - my_ctl_pointer->src_ptr[i] = my_rank ^ (1<src_ptr[i] = my_rank + pow_2; - } else { - /* no extra rank to worry about */ - my_ctl_pointer->src_ptr[i] = -1; - } + /* am I participating in the non-power of two */ + if((my_rank+pow_2) < group_size) { + /* extra rank that I'm paired with */ + my_ctl_pointer->src_ptr[i] = my_rank + pow_2; + } else { + /* no extra rank to worry about */ + my_ctl_pointer->src_ptr[i] = -1; } - - /* If I am not the root, then poll on possible "senders'" control structs */ - for( i = 0; i < cs->num_to_probe && 0 == matched; i++) { + } - /* Shared memory iprobe */ - BCOL_BASESMUMA_SM_LARGE_MSG_PROBE(my_ctl_pointer->src_ptr, pow_2_levels+1, - my_rank, matched, src); - } + /* If I am not the root, then poll on possible "senders'" control structs */ + for( i = 0; i < cs->num_to_probe && 0 == matched; i++) { - /* If not matched, then hop out and put me on progress list */ - if(0 == matched ) { + /* Shared memory iprobe */ + BCOL_BASESMUMA_SM_LARGE_MSG_PROBE(my_ctl_pointer->src_ptr, pow_2_levels+1, + my_rank, matched, src); + } - BASESMUMA_VERBOSE(10,("Shared memory probe didn't find a match")); - - my_ctl_pointer->status = SCATTER; - return BCOL_FN_NOT_STARTED; - - } else if ( src >= pow_2 ){ + /* If not matched, then hop out and put me on progress list */ + if(0 == matched ) { - /* If matched from an extra rank, then get the whole message from partner */ - memcpy((void *) data_addr, (void *) parent_data_pointer, - parent_ctl_pointer->length); - - /* now I am the psuedo-root in the power-of-two group */ - my_ctl_pointer->offset = 0; - my_ctl_pointer->length = parent_ctl_pointer->length; - my_ctl_pointer->n_sends = parent_ctl_pointer->n_sends; + BASESMUMA_VERBOSE(10,("Shared memory probe didn't find a match")); - /* set the memory barrier */ - MB(); - - /* fire the ready flag */ - my_ctl_pointer->flag = ready_flag; - my_ctl_pointer->status = ALLGATHER; - /* go to the allgather */ - goto Allgather; - } - - - /* we need to 
see whether this is really - * who we are looking for - */ - for( i = 0; i < parent_ctl_pointer->n_sends; i++) { - /* debug print */ - /* - fprintf(stderr,"I am %d checking on a hit from %d with n_sends %d\n",my_rank,src,parent_ctl_pointer->n_sends); - fflush(stderr); - */ - /* end debug */ - if( my_rank == (src^(1<n_sends = i; - - if ( i > 0) { - /* compute the size of the chunk to copy */ - length = (parent_ctl_pointer->length)/ - (1<<(parent_ctl_pointer->n_sends - my_ctl_pointer->n_sends)); - my_ctl_pointer->length = length; - my_ctl_pointer->offset = - parent_ctl_pointer->offset+length; - - /*fprintf(stderr,"%d's offset %d and length %d \n",my_rank,my_ctl_pointer->offset,length);*/ - - /* now we can copy the data */ - memcpy((void *) ((uint64_t) data_addr+my_ctl_pointer->offset), - (void *) ((uint64_t) parent_data_pointer+(uint64_t) parent_ctl_pointer->offset + - (uint64_t) length), - (size_t)length); - } else { - /* this "trick" takes care of the first level - * of recurssive doubling - */ - length = parent_ctl_pointer->length/ - (1<<(parent_ctl_pointer->n_sends - 1)); - my_ctl_pointer->length = length; - my_ctl_pointer->offset = parent_ctl_pointer->offset; - - /*fprintf(stderr,"%d's offset %d and length %d\n",my_rank,my_ctl_pointer->offset,length);*/ - /* now we can copy the data */ - memcpy((void *) ((uint64_t) data_addr+my_ctl_pointer->offset), - (void *) ((uint64_t) parent_data_pointer+(uint64_t) my_ctl_pointer->offset), - (size_t)length); - } - /* set the memory barrier to ensure completion */ - MB(); - /* signal that I am done */ - my_ctl_pointer->flag = ready_flag; - /* set my status */ - my_ctl_pointer->status = ALLGATHER; - /* time for allgather phase */ - goto Allgather; - } - - } - - /* this is not who we are looking for, - * mark as false positive so we don't - * poll here again - */ - my_ctl_pointer->src_ptr[index] = -1; - /* probably we should jump out and put onto progress list */ my_ctl_pointer->status = SCATTER; return BCOL_FN_NOT_STARTED; -Allgather: + } else if ( src >= pow_2 ){ - /* zip it back up - we have already taken care of first level */ - /* needed for non-blocking conditional */ + /* If matched from an extra rank, then get the whole message from partner */ + memcpy((void *) data_addr, (void *) parent_data_pointer, + parent_ctl_pointer->length); + + /* now I am the psuedo-root in the power-of-two group */ + my_ctl_pointer->offset = 0; + my_ctl_pointer->length = parent_ctl_pointer->length; + my_ctl_pointer->n_sends = parent_ctl_pointer->n_sends; + + /* set the memory barrier */ + MB(); + + /* fire the ready flag */ + my_ctl_pointer->flag = ready_flag; + my_ctl_pointer->status = ALLGATHER; + /* go to the allgather */ + goto Allgather; + } + + + /* we need to see whether this is really + * who we are looking for + */ + for( i = 0; i < parent_ctl_pointer->n_sends; i++) { + /* debug print */ + /* + fprintf(stderr,"I am %d checking on a hit from %d with n_sends %d\n",my_rank,src,parent_ctl_pointer->n_sends); + fflush(stderr); + */ + /* end debug */ + if( my_rank == (src^(1<n_sends = i; + + if ( i > 0) { + /* compute the size of the chunk to copy */ + length = (parent_ctl_pointer->length)/ + (1<<(parent_ctl_pointer->n_sends - my_ctl_pointer->n_sends)); + my_ctl_pointer->length = length; + my_ctl_pointer->offset = + parent_ctl_pointer->offset+length; + + /*fprintf(stderr,"%d's offset %d and length %d \n",my_rank,my_ctl_pointer->offset,length);*/ + + /* now we can copy the data */ + memcpy((void *) ((uint64_t) data_addr+my_ctl_pointer->offset), + (void *) ((uint64_t) 
parent_data_pointer+(uint64_t) parent_ctl_pointer->offset + + (uint64_t) length), + (size_t)length); + } else { + /* this "trick" takes care of the first level + * of recurssive doubling + */ + length = parent_ctl_pointer->length/ + (1<<(parent_ctl_pointer->n_sends - 1)); + my_ctl_pointer->length = length; + my_ctl_pointer->offset = parent_ctl_pointer->offset; + + /*fprintf(stderr,"%d's offset %d and length %d\n",my_rank,my_ctl_pointer->offset,length);*/ + /* now we can copy the data */ + memcpy((void *) ((uint64_t) data_addr+my_ctl_pointer->offset), + (void *) ((uint64_t) parent_data_pointer+(uint64_t) my_ctl_pointer->offset), + (size_t)length); + } + /* set the memory barrier to ensure completion */ + MB(); + /* signal that I am done */ + my_ctl_pointer->flag = ready_flag; + /* set my status */ + my_ctl_pointer->status = ALLGATHER; + /* time for allgather phase */ + goto Allgather; + } + + } + + /* this is not who we are looking for, + * mark as false positive so we don't + * poll here again + */ + my_ctl_pointer->src_ptr[index] = -1; + /* probably we should jump out and put onto progress list */ + my_ctl_pointer->status = SCATTER; + return BCOL_FN_NOT_STARTED; + + Allgather: + + /* zip it back up - we have already taken care of first level */ + /* needed for non-blocking conditional */ + matched = 0; + + /* get my local_offset */ + local_offset = my_ctl_pointer->offset; + + /* bump the ready flag */ + ready_flag++; + + /* first level of zip up */ + length = 2*fragment_size/pow_2; + + /* first level of zip-up + * already includes first level of + * recursive doubling + */ + start = 1; + + /* for non-blocking, check to see if I need to reset the state */ + if(my_ctl_pointer->flag >= ready_flag) { + /* then reset the state */ + ready_flag = my_ctl_pointer->flag; + start = my_ctl_pointer->start; + /* get the local offset */ + local_offset = my_ctl_pointer->offset_zip; + /* compute the correct length */ + length = length*(1<<(start - 1)); + /* careful! 
skip over the MB() to avoid the + * cost on every re-entry + */ + goto Loop; + } + + + MB(); + /* I am ready, set the flag */ + my_ctl_pointer->flag = ready_flag; + + Loop: + + for( i = start; i < pow_2_levels; i++) { + /* get my partner for this level */ + partner = my_rank^(1<num_to_probe && matched == 0; j++) { + if(IS_LARGE_DATA_READY(partner_ctl_pointer, ready_flag, sequence_number)) { + + /* debug prints + fprintf(stderr,"666 I am %d and sequence num is %d partner is %d ready_flag %d parent ready_flag %d buff_idx %d partner_offset %d\n", + my_rank,sequence_number,partner, ready_flag,partner_ctl_pointer->flag,buff_idx,partner_ctl_pointer->offset); + */ + /* debug print */ +#if 0 + fprintf(stderr,"I am %d and sequence num is %d partner is %d ready_flag %d parent ready_flag %d buff_idx %d \n", + my_rank,sequence_number,partner, ready_flag,parent_ctl_pointer->flag,buff_idx); +#endif + /* end debug prints */ + + assert(partner_ctl_pointer->flag >= ready_flag); + /* found it */ + matched = 1; + /* only copy it, if you sit at a lower level in the tree */ + if( my_ctl_pointer->n_sends <= partner_ctl_pointer->n_sends ) { + + /* calculate the local offset based on partner's remote offset */ + if( partner_ctl_pointer->offset < my_ctl_pointer->offset ) { + /* then I'm looking "up" the tree */ + local_offset -= length; + /* debug print */ + /*fprintf(stderr,"I am %d and partner is %d partner offset %d length %d \n",my_rank,partner, local_offset,length);*/ + /* end debug */ + memcpy((void *) ((uint64_t) data_addr + (uint64_t) local_offset), + (void *) ((uint64_t) partner_data_pointer + (uint64_t) local_offset), + length); + } else { + /* I'm looking "down" the tree */ + local_offset += length; + /* debug print */ + /*fprintf(stderr,"I am %d and partner is %d partner offset %d length %d \n",my_rank,partner, local_offset,length);*/ + /* end debug */ + memcpy((void *) ((uint64_t) data_addr + (uint64_t) local_offset), + (void *) ((uint64_t) partner_data_pointer + (uint64_t) local_offset), + length); + /* reset my local offset */ + local_offset -= length; + } + + } + /* bump the ready flag */ + ready_flag++; + /* ensure completion */ + MB(); + + /* fire the flag for the next level */ + my_ctl_pointer->flag = ready_flag; + + /* double the length */ + length *= 2; + } + } + /* check to see what kind of progress I've made */ + if( 0 == matched ) { + /* save state, hop out and try again later */ + my_ctl_pointer->start = i; + /* save the local offset */ + my_ctl_pointer->offset_zip = local_offset; + /* put in progress queue */ + return BCOL_FN_STARTED; + } + /* else, start next level of recursive doubling */ matched = 0; - /* get my local_offset */ - local_offset = my_ctl_pointer->offset; - - /* bump the ready flag */ - ready_flag++; - - /* first level of zip up */ - length = 2*fragment_size/pow_2; - - /* first level of zip-up - * already includes first level of - * recursive doubling - */ - start = 1; - - /* for non-blocking, check to see if I need to reset the state */ - if(my_ctl_pointer->flag >= ready_flag) { - /* then reset the state */ - ready_flag = my_ctl_pointer->flag; - start = my_ctl_pointer->start; - /* get the local offset */ - local_offset = my_ctl_pointer->offset_zip; - /* compute the correct length */ - length = length*(1<<(start - 1)); - /* careful! 
skip over the MB() to avoid the - * cost on every re-entry - */ - goto Loop; - } - - - MB(); - /* I am ready, set the flag */ - my_ctl_pointer->flag = ready_flag; - -Loop: - - for( i = start; i < pow_2_levels; i++) { - /* get my partner for this level */ - partner = my_rank^(1<num_to_probe && matched == 0; j++) { - if(IS_LARGE_DATA_READY(partner_ctl_pointer, ready_flag, sequence_number)) { - - /* debug prints - fprintf(stderr,"666 I am %d and sequence num is %d partner is %d ready_flag %d parent ready_flag %d buff_idx %d partner_offset %d\n", - my_rank,sequence_number,partner, ready_flag,partner_ctl_pointer->flag,buff_idx,partner_ctl_pointer->offset); - */ - /* debug print */ -#if 0 - fprintf(stderr,"I am %d and sequence num is %d partner is %d ready_flag %d parent ready_flag %d buff_idx %d \n", - my_rank,sequence_number,partner, ready_flag,parent_ctl_pointer->flag,buff_idx); -#endif - /* end debug prints */ - - assert(partner_ctl_pointer->flag >= ready_flag); - /* found it */ - matched = 1; - /* only copy it, if you sit at a lower level in the tree */ - if( my_ctl_pointer->n_sends <= partner_ctl_pointer->n_sends ) { - - /* calculate the local offset based on partner's remote offset */ - if( partner_ctl_pointer->offset < my_ctl_pointer->offset ) { - /* then I'm looking "up" the tree */ - local_offset -= length; - /* debug print */ - /*fprintf(stderr,"I am %d and partner is %d partner offset %d length %d \n",my_rank,partner, local_offset,length);*/ - /* end debug */ - memcpy((void *) ((uint64_t) data_addr + (uint64_t) local_offset), - (void *) ((uint64_t) partner_data_pointer + (uint64_t) local_offset), - length); - } else { - /* I'm looking "down" the tree */ - local_offset += length; - /* debug print */ - /*fprintf(stderr,"I am %d and partner is %d partner offset %d length %d \n",my_rank,partner, local_offset,length);*/ - /* end debug */ - memcpy((void *) ((uint64_t) data_addr + (uint64_t) local_offset), - (void *) ((uint64_t) partner_data_pointer + (uint64_t) local_offset), - length); - /* reset my local offset */ - local_offset -= length; - } - - } - /* bump the ready flag */ - ready_flag++; - /* ensure completion */ - MB(); - - /* fire the flag for the next level */ - my_ctl_pointer->flag = ready_flag; - - /* double the length */ - length *= 2; - } - } - /* check to see what kind of progress I've made */ - if( 0 == matched ) { - /* save state, hop out and try again later */ - my_ctl_pointer->start = i; - /* save the local offset */ - my_ctl_pointer->offset_zip = local_offset; - /* put in progress queue */ - return BCOL_FN_STARTED; - } - /* else, start next level of recursive doubling */ - matched = 0; - - } + } - /* cleanup */ - if(NULL != my_ctl_pointer->src_ptr) { - free(my_ctl_pointer->src_ptr); - my_ctl_pointer->src_ptr = NULL; - } + /* cleanup */ + if(NULL != my_ctl_pointer->src_ptr) { + free(my_ctl_pointer->src_ptr); + my_ctl_pointer->src_ptr = NULL; + } -Release: + Release: - - /* If I am the last instance, release the resource */ - /* + + /* If I am the last instance, release the resource */ + /* if( IS_LAST_BCOL_FUNC(c_input_args)) { - rc = bcol_basesmuma_free_buff( - &(bcol_module->colls_with_user_data), - sequence_number); + rc = bcol_basesmuma_free_buff( + &(bcol_module->colls_with_user_data), + sequence_number); } - */ + */ - my_ctl_pointer->starting_flag_value++; - my_ctl_pointer->status = FINISHED; - return BCOL_FN_COMPLETE; + my_ctl_pointer->starting_flag_value++; + my_ctl_pointer->status = FINISHED; + return BCOL_FN_COMPLETE; } #endif @@ -864,37 +865,31 @@ Release: #if 
0 int mca_bcol_basesmuma_bcast_binomial_scatter_allgather(void *desc) { - /* local variables */ - int rc, n_frags_sent; - uint32_t stripe_number; - int count, count_processed; - size_t dt_size; - uint32_t n_data_segments_to_schedule; - ompi_datatype_t *dtype; - message_descriptor_t *message_descriptor; - mca_bcol_basesmuma_module_t *bcol_module; - int pipe_depth; + /* local variables */ + int rc, n_frags_sent; + uint32_t stripe_number; + int count, count_processed; + size_t dt_size; + uint32_t n_data_segments_to_schedule; + ompi_datatype_t *dtype; + message_descriptor_t *message_descriptor; + mca_bcol_basesmuma_module_t *bcol_module; + int pipe_depth; - /* get the full message descriptor */ + /* get the full message descriptor */ - /* compute the number of fragments to send */ + /* compute the number of fragments to send */ - /* start to fill the pipeline */ + /* start to fill the pipeline */ - return OMPI_SUCCESS; + return OMPI_SUCCESS; } #endif - - - - - - diff --git a/ompi/mca/bcol/basesmuma/bcol_basesmuma_buf_mgmt.c b/ompi/mca/bcol/basesmuma/bcol_basesmuma_buf_mgmt.c index 6f756f8b4a..26803a5f61 100644 --- a/ompi/mca/bcol/basesmuma/bcol_basesmuma_buf_mgmt.c +++ b/ompi/mca/bcol/basesmuma/bcol_basesmuma_buf_mgmt.c @@ -1,7 +1,8 @@ +/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */ /* - * Copyright (c) 2009-2012 Oak Ridge National Laboratory. All rights reserved. + * Copyright (c) 2009-2013 Oak Ridge National Laboratory. All rights reserved. * Copyright (c) 2009-2012 Mellanox Technologies. All rights reserved. - * Copyright (c) 2012 Los Alamos National Security, LLC. + * Copyright (c) 2013-2014 Los Alamos National Security, LLC. * All rights reserved. * Copyright (c) 2014 Cisco Systems, Inc. All rights reserved. * $COPYRIGHT$ @@ -40,7 +41,7 @@ * correct generation of the bank is ready for use. */ int bcol_basesmuma_get_buff_index( sm_buffer_mgmt *buff_block, - uint64_t buff_id ) + uint64_t buff_id ) { /* local variables */ int memory_bank; @@ -74,11 +75,11 @@ int bcol_basesmuma_get_buff_index( sm_buffer_mgmt *buff_block, return index; } -/* release the shared memory buffers +/* release the shared memory buffers * buf_id is the unique ID assigned to the particular buffer */ int bcol_basesmuma_free_buff( sm_buffer_mgmt * buff_block, - uint64_t buff_id ) + uint64_t buff_id ) { /* local variables */ int ret=OMPI_SUCCESS; @@ -99,98 +100,98 @@ int bcol_basesmuma_free_buff( sm_buffer_mgmt * buff_block, assert(generation == buff_block->ctl_buffs_mgmt[memory_bank].bank_gen_counter); /* - * increment counter of completed buffers + * increment counter of completed buffers */ OPAL_THREAD_ADD32(&(buff_block->ctl_buffs_mgmt[memory_bank].n_buffs_freed), - 1); + 1); /* * If I am the last to checkin - initiate resource recycling */ - if( buff_block->ctl_buffs_mgmt[memory_bank].n_buffs_freed == - buff_block->ctl_buffs_mgmt[memory_bank].number_of_buffers ) { + if( buff_block->ctl_buffs_mgmt[memory_bank].n_buffs_freed == + buff_block->ctl_buffs_mgmt[memory_bank].number_of_buffers ) { - /* Lock to ensure atomic recycling of resources */ - OPAL_THREAD_LOCK(&(buff_block->ctl_buffs_mgmt[memory_bank].mutex)); - - /* make sure someone else did not already get to this */ - if( buff_block->ctl_buffs_mgmt[memory_bank].n_buffs_freed != - buff_block->ctl_buffs_mgmt[memory_bank].number_of_buffers ) { - /* release lock and exit */ - OPAL_THREAD_UNLOCK(&(buff_block->ctl_buffs_mgmt[memory_bank].mutex)); - } else { - sm_nbbar_desc_t *p_sm_nb_desc = NULL; - /* initiate the freeing of resources. 
Need to make sure the other
-             * ranks in the group are also done with their resources before this
-             * block is made available for use again.
-             * No one else will try to allocate from this block or free back to
-             * this block until the next genration counter has been incremented,
-             * so will just reset the number of freed buffers to 0, so no one else
-             * will try to also initialize the recycling of these resrouces
-             */
-            buff_block->ctl_buffs_mgmt[memory_bank].n_buffs_freed=0;
+        /* Lock to ensure atomic recycling of resources */
-            /* Start the nonblocking barrier */
-            p_sm_nb_desc = &(buff_block->ctl_buffs_mgmt[memory_bank].nb_barrier_desc);
-            p_sm_nb_desc->coll_buff = buff_block;
-            bcol_basesmuma_rd_nb_barrier_init_admin(p_sm_nb_desc);
-
-            if( NB_BARRIER_DONE !=
-                buff_block->ctl_buffs_mgmt[memory_bank].
-                nb_barrier_desc.collective_phase) {
+        OPAL_THREAD_LOCK(&(buff_block->ctl_buffs_mgmt[memory_bank].mutex));
+        /* make sure someone else did not already get to this */
+        if( buff_block->ctl_buffs_mgmt[memory_bank].n_buffs_freed !=
+            buff_block->ctl_buffs_mgmt[memory_bank].number_of_buffers ) {
+            /* release lock and exit */
+            OPAL_THREAD_UNLOCK(&(buff_block->ctl_buffs_mgmt[memory_bank].mutex));
+        } else {
+            sm_nbbar_desc_t *p_sm_nb_desc = NULL;
+            /* initiate the freeing of resources.  Need to make sure the other
+             * ranks in the group are also done with their resources before this
+             * block is made available for use again.
+             * No one else will try to allocate from this block or free back to
+             * this block until the next generation counter has been incremented,
+             * so will just reset the number of freed buffers to 0, so no one else
+             * will try to also initialize the recycling of these resources
+             */
+            buff_block->ctl_buffs_mgmt[memory_bank].n_buffs_freed=0;
-                opal_list_t *list=&(cs->nb_admin_barriers);
-                opal_list_item_t *append_item;
+            /* Start the nonblocking barrier */
+            p_sm_nb_desc = &(buff_block->ctl_buffs_mgmt[memory_bank].nb_barrier_desc);
+            p_sm_nb_desc->coll_buff = buff_block;
+            bcol_basesmuma_rd_nb_barrier_init_admin(p_sm_nb_desc);
-                /* put this onto the progression list */
-                OPAL_THREAD_LOCK(&(cs->nb_admin_barriers_mutex));
-                append_item=(opal_list_item_t *)
-                    &(buff_block->ctl_buffs_mgmt[memory_bank].nb_barrier_desc);
-                opal_list_append(list,append_item);
-                OPAL_THREAD_UNLOCK(&(cs->nb_admin_barriers_mutex));
-                /* progress communications so that resources can be freed up */
-                opal_progress();
-            } else {
-                /* mark the block as available */
-                (buff_block->ctl_buffs_mgmt[memory_bank].bank_gen_counter)++;
-            }
-
-            /* get out of here */
-            OPAL_THREAD_UNLOCK(&(buff_block->ctl_buffs_mgmt[memory_bank].mutex));
-        }
+        if( NB_BARRIER_DONE !=
+            buff_block->ctl_buffs_mgmt[memory_bank].
+ nb_barrier_desc.collective_phase) { - } + opal_list_t *list=&(cs->nb_admin_barriers); + opal_list_item_t *append_item; + + /* put this onto the progression list */ + OPAL_THREAD_LOCK(&(cs->nb_admin_barriers_mutex)); + append_item=(opal_list_item_t *) + &(buff_block->ctl_buffs_mgmt[memory_bank].nb_barrier_desc); + opal_list_append(list,append_item); + OPAL_THREAD_UNLOCK(&(cs->nb_admin_barriers_mutex)); + /* progress communications so that resources can be freed up */ + opal_progress(); + } else { + /* mark the block as available */ + (buff_block->ctl_buffs_mgmt[memory_bank].bank_gen_counter)++; + } + + /* get out of here */ + OPAL_THREAD_UNLOCK(&(buff_block->ctl_buffs_mgmt[memory_bank].mutex)); + } + + } /* return */ return ret; } -#if 0 -/* Basesmuma interface function used for buffer bank resource recycling and +#if 0 +/* Basesmuma interface function used for buffer bank resource recycling and bcol specific registration information - */ +*/ int bcol_basesmuma_bank_init(struct mca_coll_ml_module_t *ml_module, - mca_bcol_base_module_t *bcol_module, - void *reg_data) + mca_bcol_base_module_t *bcol_module, + void *reg_data) { - /* assumption here is that the block has been registered with - * sm bcol hence has been mapped by each process, need to be - * sure that memory is mapped amongst sm peers - */ - - /* local variables */ - int ret = OMPI_SUCCESS, i; + /* assumption here is that the block has been registered with + * sm bcol hence has been mapped by each process, need to be + * sure that memory is mapped amongst sm peers + */ + + /* local variables */ + int ret = OMPI_SUCCESS, i; uint32_t j; - sm_buffer_mgmt *pload_mgmt; - mca_bcol_basesmuma_component_t *cs = &mca_bcol_basesmuma_component; - bcol_basesmuma_registration_data_t *sm_reg_data = - (bcol_basesmuma_registration_data_t *) reg_data; - mca_bcol_basesmuma_module_t *sm_bcol = - (mca_bcol_basesmuma_module_t *) bcol_module; - ml_memory_block_desc_t *ml_block = - ml_module->payload_block; - size_t malloc_size; - ompi_common_sm_file_t input_file; + sm_buffer_mgmt *pload_mgmt; + mca_bcol_basesmuma_component_t *cs = &mca_bcol_basesmuma_component; + bcol_basesmuma_registration_data_t *sm_reg_data = + (bcol_basesmuma_registration_data_t *) reg_data; + mca_bcol_basesmuma_module_t *sm_bcol = + (mca_bcol_basesmuma_module_t *) bcol_module; + ml_memory_block_desc_t *ml_block = + ml_module->payload_block; + size_t malloc_size; + ompi_common_sm_file_t input_file; uint64_t mem_offset; int leading_dim,loop_limit,buf_id; unsigned char *base_ptr; @@ -199,56 +200,56 @@ int bcol_basesmuma_bank_init(struct mca_coll_ml_module_t *ml_module, fprintf(stderr,"test opti test\n"); - /* first, we get a pointer to the payload buffer management struct */ - pload_mgmt = &(sm_bcol->colls_with_user_data); + /* first, we get a pointer to the payload buffer management struct */ + pload_mgmt = &(sm_bcol->colls_with_user_data); - /* allocate memory for pointers to mine and my peers' payload buffers - */ - malloc_size = ml_block->num_banks*ml_block->num_buffers_per_bank* - pload_mgmt->size_of_group *sizeof(void *); - pload_mgmt->data_buffs = malloc(malloc_size); - if( !pload_mgmt->data_buffs) { - ret = OMPI_ERR_OUT_OF_RESOURCE; - goto exit_ERROR; - } + /* allocate memory for pointers to mine and my peers' payload buffers + */ + malloc_size = ml_block->num_banks*ml_block->num_buffers_per_bank* + pload_mgmt->size_of_group *sizeof(void *); + pload_mgmt->data_buffs = malloc(malloc_size); + if( !pload_mgmt->data_buffs) { + ret = OMPI_ERR_OUT_OF_RESOURCE; + goto exit_ERROR; + 
} - /* setup the input file for the shared memory connection manager */ - input_file.file_name = sm_reg_data->file_name; - input_file.size = sm_reg_data->size; - input_file.size_ctl_structure = 0; - input_file.data_seg_alignment = CACHE_LINE_SIZE; - input_file.mpool_size = sm_reg_data->size; + /* setup the input file for the shared memory connection manager */ + input_file.file_name = sm_reg_data->file_name; + input_file.size = sm_reg_data->size; + input_file.size_ctl_structure = 0; + input_file.data_seg_alignment = BASESMUMA_CACHE_LINE_SIZE; + input_file.mpool_size = sm_reg_data->size; - /* call the connection manager and map my shared memory peers' file - */ - ret = ompi_common_smcm_allgather_connection( - sm_bcol, - sm_bcol->super.sbgp_partner_module, - &(cs->sm_connections_list), - &(sm_bcol->payload_backing_files_info), - sm_bcol->super.sbgp_partner_module->group_comm, - input_file, - false); - if( OMPI_SUCCESS != ret ) { - goto exit_ERROR; - } + /* call the connection manager and map my shared memory peers' file + */ + ret = ompi_common_smcm_allgather_connection( + sm_bcol, + sm_bcol->super.sbgp_partner_module, + &(cs->sm_connections_list), + &(sm_bcol->payload_backing_files_info), + sm_bcol->super.sbgp_partner_module->group_comm, + input_file, + false); + if( OMPI_SUCCESS != ret ) { + goto exit_ERROR; + } - /* now we exchange offset info - don't assume symmetric virtual memory - */ - mem_offset = (uint64_t)(ml_block->block->base_addr) - - (uint64_t)(cs->sm_payload_structs->data_addr); + /* now we exchange offset info - don't assume symmetric virtual memory + */ + mem_offset = (uint64_t)(ml_block->block->base_addr) - + (uint64_t)(cs->sm_payload_structs->data_addr); - /* call into the exchange offsets function */ - ret = base_bcol_basesmuma_exchange_offsets(sm_bcol_module, - (void **)pload_mgmt->data_buffs, mem_offset, 0, - pload_mgmt->size_of_group); - if( OMPI_SUCCESS != ret ) { - goto exit_ERROR; - } - - /* convert memory offset to virtual address in current rank */ - leading_dim = pload_mgmt->size_of_group; - loop_limit = ml_block->num_banks*ml_block->num_buffers_per_bank; + /* call into the exchange offsets function */ + ret = base_bcol_basesmuma_exchange_offsets(sm_bcol_module, + (void **)pload_mgmt->data_buffs, mem_offset, 0, + pload_mgmt->size_of_group); + if( OMPI_SUCCESS != ret ) { + goto exit_ERROR; + } + + /* convert memory offset to virtual address in current rank */ + leading_dim = pload_mgmt->size_of_group; + loop_limit = ml_block->num_banks*ml_block->num_buffers_per_bank; for (i=0;i< sm_bcol_module->super.sbgp_partner_module->group_size;i++) { /* get the base pointer */ @@ -266,7 +267,7 @@ int bcol_basesmuma_bank_init(struct mca_coll_ml_module_t *ml_module, int array_id_m1=SM_ARRAY_INDEX(leading_dim,(buf_id-1),i); array_id=SM_ARRAY_INDEX(leading_dim,buf_id,i); pload_mgmt->data_buffs[array_id]=(void *) ((uint64_t)(pload_mgmt->data_buffs[array_id_m1])+ - (uint64_t)ml_block->size_buffer); + (uint64_t)ml_block->size_buffer); } } @@ -287,17 +288,17 @@ exit_ERROR: #endif /* - * Allocate buffers for storing non-blocking collective descriptions, required + * Allocate buffers for storing non-blocking collective descriptions, required * for making code re-entrant * */ -static int init_nb_coll_buff_desc(mca_bcol_basesmuma_nb_coll_buff_desc_t **desc, - void *base_addr, uint32_t num_banks, - uint32_t num_buffers_per_bank, - uint32_t size_buffer, - uint32_t header_size, - int group_size, - int pow_k) +static int init_nb_coll_buff_desc(mca_bcol_basesmuma_nb_coll_buff_desc_t **desc, + 
void *base_addr, uint32_t num_banks, + uint32_t num_buffers_per_bank, + uint32_t size_buffer, + uint32_t header_size, + int group_size, + int pow_k) { uint32_t i, j, ci; mca_bcol_basesmuma_nb_coll_buff_desc_t *tmp_desc = NULL; @@ -333,104 +334,103 @@ static int init_nb_coll_buff_desc(mca_bcol_basesmuma_nb_coll_buff_desc_t **desc, #if 1 -/* New init function used for new control scheme where we put the control - * struct at the top of the payload buffer +/* New init function used for new control scheme where we put the control + * struct at the top of the payload buffer */ int bcol_basesmuma_bank_init_opti(struct mca_coll_ml_module_t *ml_module, - mca_bcol_base_module_t *bcol_module, - void *reg_data) + mca_bcol_base_module_t *bcol_module, + void *reg_data) { - /* assumption here is that the block has been registered with - * sm bcol hence has been mapped by each process, need to be - * sure that memory is mapped amongst sm peers - */ - - /* local variables */ - int ret = OMPI_SUCCESS, i, j; - sm_buffer_mgmt *pload_mgmt; - mca_bcol_basesmuma_component_t *cs = &mca_bcol_basesmuma_component; - bcol_basesmuma_registration_data_t *sm_reg_data = - (bcol_basesmuma_registration_data_t *) reg_data; - mca_bcol_basesmuma_module_t *sm_bcol = - (mca_bcol_basesmuma_module_t *) bcol_module; - ml_memory_block_desc_t *ml_block = - ml_module->payload_block; - size_t malloc_size; - bcol_basesmuma_smcm_file_t input_file; + /* assumption here is that the block has been registered with + * sm bcol hence has been mapped by each process, need to be + * sure that memory is mapped amongst sm peers + */ + + /* local variables */ + int ret = OMPI_SUCCESS, i, j, k, l; + sm_buffer_mgmt *pload_mgmt; + mca_bcol_basesmuma_component_t *cs = &mca_bcol_basesmuma_component; + bcol_basesmuma_registration_data_t *sm_reg_data = + (bcol_basesmuma_registration_data_t *) reg_data; + mca_bcol_basesmuma_module_t *sm_bcol = + (mca_bcol_basesmuma_module_t *) bcol_module; + ml_memory_block_desc_t *ml_block = + ml_module->payload_block; + size_t malloc_size; + bcol_basesmuma_smcm_file_t input_file; uint64_t mem_offset; int leading_dim,loop_limit,buf_id; unsigned char *base_ptr; mca_bcol_basesmuma_module_t *sm_bcol_module= - (mca_bcol_basesmuma_module_t *)bcol_module; + (mca_bcol_basesmuma_module_t *)bcol_module; int my_idx, array_id; mca_bcol_basesmuma_header_t *ctl_ptr; void **results_array; - mca_bcol_basesmuma_local_mlmem_desc_t *ml_mem = &sm_bcol_module->ml_mem; + mca_bcol_basesmuma_local_mlmem_desc_t *ml_mem = &sm_bcol_module->ml_mem; - /* first, we get a pointer to the payload buffer management struct */ - pload_mgmt = &(sm_bcol->colls_with_user_data); + /* first, we get a pointer to the payload buffer management struct */ + pload_mgmt = &(sm_bcol->colls_with_user_data); /* go ahead and get the header size that is cached on the payload block */ sm_bcol->total_header_size = ml_module->data_offset; - /* allocate memory for pointers to mine and my peers' payload buffers - * difference here is that now we use our new data struct - */ - malloc_size = ml_block->num_banks*ml_block->num_buffers_per_bank* - pload_mgmt->size_of_group *sizeof(mca_bcol_basesmuma_payload_t); - pload_mgmt->data_buffs = (mca_bcol_basesmuma_payload_t *) malloc(malloc_size); - if( !pload_mgmt->data_buffs) { - ret = OMPI_ERR_OUT_OF_RESOURCE; - goto exit_ERROR; - } + /* allocate memory for pointers to mine and my peers' payload buffers + * difference here is that now we use our new data struct + */ + malloc_size = ml_block->num_banks*ml_block->num_buffers_per_bank* + 
pload_mgmt->size_of_group *sizeof(mca_bcol_basesmuma_payload_t); + pload_mgmt->data_buffs = (mca_bcol_basesmuma_payload_t *) malloc(malloc_size); + if( !pload_mgmt->data_buffs) { + ret = OMPI_ERR_OUT_OF_RESOURCE; + goto exit_ERROR; + } /* allocate some memory to hold the offsets */ results_array = (void **) malloc(pload_mgmt->size_of_group*sizeof(void *)); - /* setup the input file for the shared memory connection manager */ - input_file.file_name = sm_reg_data->file_name; - input_file.size = sm_reg_data->size; - input_file.size_ctl_structure = 0; - input_file.data_seg_alignment = BASESMUMA_CACHE_LINE_SIZE; - input_file.mpool_size = sm_reg_data->size; + /* setup the input file for the shared memory connection manager */ + input_file.file_name = sm_reg_data->file_name; + input_file.size = sm_reg_data->size; + input_file.size_ctl_structure = 0; + input_file.data_seg_alignment = BASESMUMA_CACHE_LINE_SIZE; + input_file.mpool_size = sm_reg_data->size; - /* call the connection manager and map my shared memory peers' file - */ - ret = bcol_basesmuma_smcm_allgather_connection( - sm_bcol, - sm_bcol->super.sbgp_partner_module, - &(cs->sm_connections_list), - &(sm_bcol->payload_backing_files_info), - sm_bcol->super.sbgp_partner_module->group_comm, - input_file,cs->payload_base_fname, - false); - if( OMPI_SUCCESS != ret ) { - goto exit_ERROR; - } + /* call the connection manager and map my shared memory peers' file + */ + ret = bcol_basesmuma_smcm_allgather_connection( + sm_bcol, + sm_bcol->super.sbgp_partner_module, + &(cs->sm_connections_list), + &(sm_bcol->payload_backing_files_info), + sm_bcol->super.sbgp_partner_module->group_comm, + input_file,cs->payload_base_fname, + false); + if( OMPI_SUCCESS != ret ) { + goto exit_ERROR; + } - /* now we exchange offset info - don't assume symmetric virtual memory - */ + /* now we exchange offset info - don't assume symmetric virtual memory + */ - mem_offset = (uint64_t)(uintptr_t)(ml_block->block->base_addr) - - (uint64_t)(uintptr_t)(cs->sm_payload_structs->data_addr); + mem_offset = (uint64_t)(uintptr_t)(ml_block->block->base_addr) - + (uint64_t)(uintptr_t)(cs->sm_payload_structs->data_addr); - /* call into the exchange offsets function */ - ret=comm_allgather_pml(&mem_offset,results_array,1, - MPI_LONG_LONG_INT, - sm_bcol_module->super.sbgp_partner_module->my_index, - sm_bcol_module->super.sbgp_partner_module->group_size, - sm_bcol_module->super.sbgp_partner_module->group_list, - sm_bcol_module->super.sbgp_partner_module->group_comm); - if( OMPI_SUCCESS != ret ) { - goto exit_ERROR; - } + /* call into the exchange offsets function */ + ret=comm_allgather_pml(&mem_offset,results_array,1,MPI_LONG_LONG_INT, + sm_bcol_module->super.sbgp_partner_module->my_index, + sm_bcol_module->super.sbgp_partner_module->group_size, + sm_bcol_module->super.sbgp_partner_module->group_list, + sm_bcol_module->super.sbgp_partner_module->group_comm); + if( OMPI_SUCCESS != ret ) { + goto exit_ERROR; + } - /* convert memory offset to virtual address in current rank */ - leading_dim = pload_mgmt->size_of_group; - loop_limit = ml_block->num_banks*ml_block->num_buffers_per_bank; + /* convert memory offset to virtual address in current rank */ + leading_dim = pload_mgmt->size_of_group; + loop_limit = ml_block->num_banks*ml_block->num_buffers_per_bank; for (i=0;i< sm_bcol_module->super.sbgp_partner_module->group_size;i++) { /* get the base pointer */ @@ -447,29 +447,31 @@ int bcol_basesmuma_bank_init_opti(struct mca_coll_ml_module_t *ml_module, 
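The hunk below turns the gathered offsets into usable local pointers: each rank published, via comm_allgather_pml() above, where its ML payload block sits inside its own shared-memory backing file, and every peer adds that offset to wherever this process happened to map that file (the mappings are not assumed to land at the same virtual address). A minimal sketch of the resulting indexing, with hypothetical names (rebuild_peer_pointers, peer_mapped_base, offsets) standing in for the data_buffs / results_array / base_ptr plumbing, and assuming the buffer-major SM_ARRAY_INDEX layout used here:

    #include <stddef.h>

    /* sketch only: ctl[] and payload[] play the role of pload_mgmt->data_buffs;
     * peer_mapped_base[i] is this process' mapping of peer i's backing file and
     * offsets[i] is the offset peer i contributed to the allgather */
    static void rebuild_peer_pointers(void **ctl, void **payload,
                                      unsigned char **peer_mapped_base,
                                      const size_t *offsets, int group_size,
                                      int n_buffers, size_t size_buffer,
                                      size_t data_offset)
    {
        for (int i = 0; i < group_size; i++) {
            for (int buf = 0; buf < n_buffers; buf++) {
                int idx = buf * group_size + i;  /* SM_ARRAY_INDEX(group_size, buf, i) */
                /* peer buffer = local mapping base + exchanged offset + buffer stride */
                ctl[idx] = peer_mapped_base[i] + offsets[i] + (size_t) buf * size_buffer;
                /* user payload sits data_offset bytes past the control header */
                payload[idx] = (unsigned char *) ctl[idx] + data_offset;
            }
        }
    }

The patched code performs the same arithmetic with explicit uintptr_t casts and derives each buffer from the previous one by adding ml_block->size_buffer instead of multiplying, but the layout is the same.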
pload_mgmt->data_buffs[array_id].ctl_struct=(mca_bcol_basesmuma_header_t *) (uintptr_t)(((uint64_t)(uintptr_t)results_array[array_id])+(uint64_t)(uintptr_t)base_ptr); /* second, calculate where to set the data pointer */ - pload_mgmt->data_buffs[array_id].payload=(void *) - (uintptr_t)((uint64_t)(uintptr_t) pload_mgmt->data_buffs[array_id].ctl_struct + - (uint64_t)(uintptr_t) ml_module->data_offset); - + pload_mgmt->data_buffs[array_id].payload=(void *) + (uintptr_t)((uint64_t)(uintptr_t) pload_mgmt->data_buffs[array_id].ctl_struct + + (uint64_t)(uintptr_t) ml_module->data_offset); for( buf_id = 1 ; buf_id < loop_limit ; buf_id++ ) { int array_id_m1=SM_ARRAY_INDEX(leading_dim,(buf_id-1),i); array_id=SM_ARRAY_INDEX(leading_dim,buf_id,i); - /* now, play the same game as above + /* now, play the same game as above * * first, set the control struct's position */ - pload_mgmt->data_buffs[array_id].ctl_struct=(mca_bcol_basesmuma_header_t *) + pload_mgmt->data_buffs[array_id].ctl_struct=(mca_bcol_basesmuma_header_t *) (uintptr_t)(((uint64_t)(uintptr_t)(pload_mgmt->data_buffs[array_id_m1].ctl_struct) + - (uint64_t)(uintptr_t)ml_block->size_buffer)); + (uint64_t)(uintptr_t)ml_block->size_buffer)); /* second, set the payload pointer */ pload_mgmt->data_buffs[array_id].payload =(void *) (uintptr_t)((uint64_t)(uintptr_t) pload_mgmt->data_buffs[array_id].ctl_struct + - (uint64_t)(uintptr_t) ml_module->data_offset); + (uint64_t)(uintptr_t) ml_module->data_offset); } } + /* done with the index array */ + free (results_array); + /* initialize my control structures!! */ my_idx = sm_bcol_module->super.sbgp_partner_module->my_index; leading_dim = sm_bcol_module->super.sbgp_partner_module->group_size; @@ -499,30 +501,30 @@ int bcol_basesmuma_bank_init_opti(struct mca_coll_ml_module_t *ml_module, ml_block; } - ml_mem->num_banks = ml_block->num_banks; + ml_mem->num_banks = ml_block->num_banks; ml_mem->bank_release_counter = calloc(ml_block->num_banks, sizeof(uint32_t)); - ml_mem->num_buffers_per_bank = ml_block->num_buffers_per_bank; - ml_mem->size_buffer = ml_block->size_buffer; + ml_mem->num_buffers_per_bank = ml_block->num_buffers_per_bank; + ml_mem->size_buffer = ml_block->size_buffer; /* pointer to ml level descriptor */ ml_mem->ml_mem_desc = ml_block; - if (OMPI_SUCCESS != init_nb_coll_buff_desc(&ml_mem->nb_coll_desc, - ml_block->block->base_addr, - ml_mem->num_banks, - ml_mem->num_buffers_per_bank, - ml_mem->size_buffer, - ml_module->data_offset, - sm_bcol_module->super.sbgp_partner_module->group_size, - sm_bcol_module->pow_k)) { + if (OMPI_SUCCESS != init_nb_coll_buff_desc(&ml_mem->nb_coll_desc, + ml_block->block->base_addr, + ml_mem->num_banks, + ml_mem->num_buffers_per_bank, + ml_mem->size_buffer, + ml_module->data_offset, + sm_bcol_module->super.sbgp_partner_module->group_size, + sm_bcol_module->pow_k)) { - BASESMUMA_VERBOSE(10, ("Failed to allocate memory descriptors for storing state of non-blocking collectives\n")); - return OMPI_ERROR; - } + BASESMUMA_VERBOSE(10, ("Failed to allocate memory descriptors for storing state of non-blocking collectives\n")); + return OMPI_ERROR; + } - return OMPI_SUCCESS; + return OMPI_SUCCESS; exit_ERROR: - return ret; + return ret; } #endif @@ -531,48 +533,45 @@ exit_ERROR: /* Basesmuma interface function used for buffer release */ #if 0 -/* gvm - * A collective operation calls this routine to release the payload buffer. +/* gvm + * A collective operation calls this routine to release the payload buffer. 
* All processes in the shared memory sub-group of a bcol should call the non-blocking - * barrier on the last payload buffer of a memory bank. On the completion - * of the non-blocking barrier, the ML callback is called which is responsible - * for recycling the memory bank. + * barrier on the last payload buffer of a memory bank. On the completion + * of the non-blocking barrier, the ML callback is called which is responsible + * for recycling the memory bank. */ mca_bcol_basesmuma_module_t *sm_bcol_module int bcol_basesmuma_free_payload_buff( - struct ml_memory_block_desc_t *block, - sm_buffer_mgmt *ctl_mgmt, - uint64_t buff_id) + struct ml_memory_block_desc_t *block, + sm_buffer_mgmt *ctl_mgmt, + uint64_t buff_id) { - /* local variables */ - int ret = OMPI_SUCCESS; + /* local variables */ + int ret = OMPI_SUCCESS; - memory_bank = BANK_FROM_BUFFER_IDX(buff_id); - ctl_mgmt->ctl_buffs_mgmt[memory_bank].n_buffs_freed++; + memory_bank = BANK_FROM_BUFFER_IDX(buff_id); + ctl_mgmt->ctl_buffs_mgmt[memory_bank].n_buffs_freed++; - OPAL_THREAD_ADD32(&(ctl_mgmt->ctl_buffs_mgmt[memory_bank].n_buffs_freed),1); + OPAL_THREAD_ADD32(&(ctl_mgmt->ctl_buffs_mgmt[memory_bank].n_buffs_freed),1); - if (ctl_mgmt->ctl_buffs_mgmt[memory_bank].n_buffs_freed == block->size_buffers_bank){ - - /* start non-blocking barrier */ + if (ctl_mgmt->ctl_buffs_mgmt[memory_bank].n_buffs_freed == block->size_buffers_bank){ + + /* start non-blocking barrier */ bcol_basesmuma_rd_nb_barrier_init_admin( - &(ctl_mgmt->ctl_buffs_mgmt[memory_bank].nb_barrier_desc)); - - if (NB_BARRIER_DONE != - ctl_mgmt->ctl_buffs_mgmt[memory_bank]. - nb_barrier_desc.collective_phase){ + &(ctl_mgmt->ctl_buffs_mgmt[memory_bank].nb_barrier_desc)); - /* progress the barrier */ - opal_progress(); - } - else{ - /* free the buffer - i.e. initiate callback to ml level */ - block->ml_release_cb(block,memory_bank); - } - } - return ret; + if (NB_BARRIER_DONE != + ctl_mgmt->ctl_buffs_mgmt[memory_bank]. + nb_barrier_desc.collective_phase){ + + /* progress the barrier */ + opal_progress(); + } + else{ + /* free the buffer - i.e. initiate callback to ml level */ + block->ml_release_cb(block,memory_bank); + } + } + return ret; } #endif - - - diff --git a/ompi/mca/bcol/basesmuma/bcol_basesmuma_gather.c b/ompi/mca/bcol/basesmuma/bcol_basesmuma_gather.c new file mode 100644 index 0000000000..feed1d0c96 --- /dev/null +++ b/ompi/mca/bcol/basesmuma/bcol_basesmuma_gather.c @@ -0,0 +1,1107 @@ +/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */ +/* + * Copyright (c) 2009-2012 Oak Ridge National Laboratory. All rights reserved. + * Copyright (c) 2009-2012 Mellanox Technologies. All rights reserved. + * Copyright (c) 2013 Los Alamos National Security, LLC. All rights + * reserved. 
+ * $COPYRIGHT$ + * + * Additional copyrights may follow + * + * $HEADER$ + */ + +#include "ompi_config.h" +#include "ompi/mca/bcol/basesmuma/bcol_basesmuma.h" +#include "ompi/constants.h" +#include "ompi/datatype/ompi_datatype.h" +#include "ompi/communicator/communicator.h" + +/* debug + * #include "opal/sys/timer.h" + * + * extern uint64_t timers[7]; + * end debug */ + +/* debug */ +#include +/* end debug */ + +/* non-blocking gather routines: init and progress functions */ +int bcol_basesmuma_gather_init(mca_bcol_base_module_t *super) +{ + mca_bcol_base_coll_fn_comm_attributes_t comm_attribs; + mca_bcol_base_coll_fn_invoke_attributes_t inv_attribs; + + comm_attribs.bcoll_type = BCOL_GATHER; + comm_attribs.comm_size_min = 0; + comm_attribs.comm_size_max = 16; + comm_attribs.data_src = DATA_SRC_KNOWN; + comm_attribs.waiting_semantics = BLOCKING; + + inv_attribs.bcol_msg_min = 0; + inv_attribs.bcol_msg_max = 20000; + inv_attribs.datatype_bitmap = 0x11111111; + inv_attribs.op_types_bitmap = 0x11111111; + + /* Set attributes for fanin fanout algorithm */ + mca_bcol_base_set_attributes(super, &comm_attribs, &inv_attribs, + bcol_basesmuma_k_nomial_gather_init, + bcol_basesmuma_k_nomial_gather_progress); + + return OMPI_SUCCESS; +} + +int bcol_basesmuma_k_nomial_gather_init(bcol_function_args_t *input_args, + coll_ml_function_t *c_input_args) +{ + /* local variables */ + int group_size; + int leading_dim, buff_idx, idx; + int src, i, j, k_temp1, k_temp2; + int pseudo_root, proxy_root, pseudo_base_adj; + volatile int8_t ready_flag; + int count=input_args->count; + struct ompi_datatype_t* dtype=input_args->dtype; + int root=input_args->root; + int base_adj, base; + int total_peers, my_pow_k=0; + int64_t sequence_number=input_args->sequence_num; + mca_bcol_basesmuma_module_t* bcol_module= + (mca_bcol_basesmuma_module_t *)c_input_args->bcol_module; + int bcol_id = (int) bcol_module->super.bcol_id; + int my_rank = bcol_module->super.sbgp_partner_module->my_index; + netpatterns_k_exchange_node_t *exchange_node = + &bcol_module->knomial_allgather_tree; + uint32_t buffer_index = input_args->buffer_index; + int *active_requests = + &(bcol_module->ml_mem.nb_coll_desc[buffer_index].active_requests); + + int *iteration = &bcol_module->ml_mem.nb_coll_desc[buffer_index].iteration; + int *status = &bcol_module->ml_mem.nb_coll_desc[buffer_index].status; + + int buff_offset = bcol_module->super.hier_scather_offset; + + /* "indirectors" */ + int *inv_map = exchange_node->inv_reindex_map; + int *reindex_map = exchange_node->reindex_map; + int stray = exchange_node->k_nomial_stray; + + /* tree radix */ + int tree_order = exchange_node->tree_order; + /* tree depth */ + int pow_k = exchange_node->log_tree_order; + /* largest power of k less than or equal to np */ + int cnt = exchange_node->n_largest_pow_tree_order; + + /* payload structures */ + volatile mca_bcol_basesmuma_payload_t *data_buffs; + + /* control structures */ + volatile mca_bcol_basesmuma_header_t *my_ctl_pointer; + + size_t pack_len = 0, dt_size; + +#if 0 + fprintf(stderr,"Entering sm gather input_args->sbuf_offset %d \n",input_args->sbuf_offset); + fflush(stderr); +#endif + + + /* we will work only on packed data - so compute the length*/ + /* this is the size of my data, this is not gatherv so it's the same + * for all ranks in the communicator. 
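+     * pack_len below is that per-rank contribution in bytes (count * dt_size),
+     * and buff_offset is rescaled from the hierarchy's scatter offset to a byte
+     * offset using the same pack_len.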
+ */ + ompi_datatype_type_size(dtype, &dt_size); + pack_len=count*dt_size; + /* now set the "real" offset */ + buff_offset = buff_offset*pack_len; + + buff_idx = input_args->src_desc->buffer_index; + + /* Get addressing information */ + my_rank = bcol_module->super.sbgp_partner_module->my_index; + + group_size = bcol_module->colls_no_user_data.size_of_group; + leading_dim=bcol_module->colls_no_user_data.size_of_group; + idx=SM_ARRAY_INDEX(leading_dim,buff_idx,0); + data_buffs=(volatile mca_bcol_basesmuma_payload_t *) + bcol_module->colls_with_user_data.data_buffs+idx; + + /* Set pointer to current proc ctrl region */ + my_ctl_pointer = data_buffs[my_rank].ctl_struct; + + /* init the header */ + BASESMUMA_HEADER_INIT(my_ctl_pointer, ready_flag, sequence_number, bcol_id); + + /* init active requests, iteration, and status */ + *iteration = 0; + *active_requests = 0; + *status = -1; + /* calculate the number of steps necessary for this collective */ + + /* first thing we do is figure out where the root is in our new indexing */ + /* find root in new indexing */ + pseudo_root = inv_map[root]; + /* see if this is larger than the stray */ + if (pseudo_root >= stray) { + /* then we need to define the proxy root, everyone can do this */ + proxy_root = pseudo_root - cnt; + } else { + proxy_root = pseudo_root; + } + + /* do some figuring */ + if (EXCHANGE_NODE == exchange_node->node_type) { + total_peers = 0; + my_pow_k = pow_k; + k_temp1 = tree_order; + k_temp2 = 1; + for( i = 0; i < pow_k; i++) { + /* then find the base */ + FIND_BASE(base,exchange_node->reindex_myid,i+1,tree_order); + /* now find the adjusted base */ + base_adj = base + (base + proxy_root)%k_temp1; + /* ok, now find out WHO is occupying this slot */ + pseudo_base_adj = reindex_map[base_adj]; + + if(my_rank == pseudo_base_adj ) { + /* then go ahead and poll for children's data */ + for( j = 0; j < (tree_order - 1); j++ ) { + /* send phase + */ + /* get communication partner */ + + src = exchange_node->rank_exchanges[i][j]; + /* remember, if we have extra ranks, then we won't participate + * with a least one peer. 
Make a check + */ + if( src < 0 ){ + continue; + }else{ + + /* flip a bit to represent this request */ + *active_requests ^= (1<node_type || 0 == exchange_node->n_extra_sources) { + if (0 == my_pow_k || EXTRA_NODE == exchange_node->node_type) { + opal_atomic_rmb (); + + my_ctl_pointer->flags[GATHER_FLAG][bcol_id] = ready_flag; + } + + if ((EXTRA_NODE == exchange_node->node_type && root != my_rank) || 0 == my_pow_k) { + /* nothing more to do */ + my_ctl_pointer->starting_flag_value[bcol_id]++; + + return BCOL_FN_COMPLETE; + } + } + + return BCOL_FN_STARTED; +} + + +int bcol_basesmuma_k_nomial_gather_progress(bcol_function_args_t *input_args, + coll_ml_function_t *c_input_args) +{ + /* local variables */ + int group_size; + int flag_offset; + int leading_dim, buff_idx, idx; + int src, knt, i, j, k_temp1, k_temp2; + volatile int8_t ready_flag; + int count=input_args->count; + struct ompi_datatype_t* dtype=input_args->dtype; + int root=input_args->root; + int probe; + int matched; + int64_t sequence_number=input_args->sequence_num; + mca_bcol_basesmuma_module_t* bcol_module= + (mca_bcol_basesmuma_module_t *)c_input_args->bcol_module; + int bcol_id = (int) bcol_module->super.bcol_id; + int my_rank = bcol_module->super.sbgp_partner_module->my_index; + mca_bcol_basesmuma_component_t *cm = &mca_bcol_basesmuma_component; + netpatterns_k_exchange_node_t *exchange_node = + &bcol_module->knomial_allgather_tree; + uint32_t buffer_index = input_args->buffer_index; + int *active_requests = + &(bcol_module->ml_mem.nb_coll_desc[buffer_index].active_requests); + int *iteration = &bcol_module->ml_mem.nb_coll_desc[buffer_index].iteration; + int *status = &bcol_module->ml_mem.nb_coll_desc[buffer_index].status; + int buff_offset = bcol_module->super.hier_scather_offset; + /* "indirectors" */ + int *list_connected = bcol_module->super.list_n_connected; + /* tree radix */ + int tree_order = exchange_node->tree_order; + /* payload structures */ + volatile mca_bcol_basesmuma_payload_t *data_buffs; + volatile char *child_data_pointer; + /* control structures */ + volatile mca_bcol_basesmuma_header_t *my_ctl_pointer; + volatile mca_bcol_basesmuma_header_t *child_ctl_pointer; + /*volatile mca_bcol_basesmuma_ctl_struct_t* parent_ctl_pointer; */ + + size_t pack_len = 0, dt_size; + void *data_addr = (void *)((unsigned char *)input_args->src_desc->data_addr); + + +#if 0 + fprintf(stderr,"Entering sm gather input_args->sbuf_offset %d \n",input_args->sbuf_offset); + fflush(stderr); +#endif + + + /* we will work only on packed data - so compute the length*/ + /* this is the size of my data, this is not gatherv so it's the same + * for all ranks in the communicator. 
+ */ + ompi_datatype_type_size(dtype, &dt_size); + pack_len=count*dt_size; + /* now set the "real" offset */ + buff_offset = buff_offset*pack_len; + + buff_idx = input_args->src_desc->buffer_index; + + /* Get addressing information */ + my_rank = bcol_module->super.sbgp_partner_module->my_index; + + group_size = bcol_module->colls_no_user_data.size_of_group; + leading_dim=bcol_module->colls_no_user_data.size_of_group; + idx=SM_ARRAY_INDEX(leading_dim,buff_idx,0); + data_buffs=(volatile mca_bcol_basesmuma_payload_t *) + bcol_module->colls_with_user_data.data_buffs+idx; + + /* Set pointer to current proc ctrl region */ + my_ctl_pointer = data_buffs[my_rank].ctl_struct; + /* restart the ready_flag state */ + flag_offset = my_ctl_pointer->starting_flag_value[bcol_id]; + ready_flag = flag_offset + 1; + + /* calculate the number of steps necessary for this collective */ + + /* first thing we do is figure out where the root is in our new indexing */ + /* find root in new indexing */ + if( EXTRA_NODE == exchange_node->node_type ) { + + /* poll for data from proxy */ + src = exchange_node->rank_extra_sources_array[0]; + /* get src data buffer */ + child_data_pointer = data_buffs[src].payload; + child_ctl_pointer = data_buffs[src].ctl_struct; + /* remember to bump your flag */ + ready_flag++; + + /* in this case, you must block */ + for (i = 0 ; i < cm->num_to_probe ; ++i) { + if (IS_PEER_READY(child_ctl_pointer,ready_flag,sequence_number, GATHER_FLAG, bcol_id)){ + /* receive the data from the proxy, aka pseudo-root */ + memcpy((void *) ((unsigned char *) data_addr + buff_offset), + (void *) ((unsigned char *) child_data_pointer+buff_offset), + pack_len * group_size); + + goto FINISHED; + } + } + + return BCOL_FN_STARTED; + } + + + if (0 < exchange_node->n_extra_sources && (-1 == (*status))) { + /* am a proxy, poll for pack_len data from extra */ + src = exchange_node->rank_extra_sources_array[0]; + /* get src data buffer */ + child_data_pointer = data_buffs[src].payload; + child_ctl_pointer = data_buffs[src].ctl_struct; + knt = 0; + for( i = 0; i < src; i++){ + knt += list_connected[i]; + } + /* must block here also */ + matched = 0; + for (i = 0, matched = 0 ; i < cm->num_to_probe && (0 == matched) ; ++i) { + if(IS_PEER_READY(child_ctl_pointer,ready_flag,sequence_number, GATHER_FLAG, bcol_id)){ + matched = 1; + memcpy((void *) ((unsigned char *) data_addr + buff_offset + pack_len*knt), + (void *) ((unsigned char *) child_data_pointer + buff_offset + + pack_len*knt), pack_len*list_connected[src]); + *status = 0; + if( 0 == *active_requests ){ + goto LAST_STEP; + } + + break; + } + } + if( 0 == matched ){ + return BCOL_FN_STARTED; + } + } + + /* start the k-nomial gather phase */ + /* only "active ranks participate, once a rank has forwarded its data, it becomes inactive */ + for (probe = 0 ; probe < cm->num_to_probe ; ++probe) { + k_temp1 = tree_order; + k_temp2 = 1; + for (i = 0 ; i < *(iteration) ; ++i) { + + /* then go ahead and poll for children's data */ + for (j = 0 ; j < (tree_order - 1) ; ++j) { + /* send phase + */ + /* get communication partner */ + + src = exchange_node->rank_exchanges[i][j]; + /* remember, if we have extra ranks, then we won't participate + * with a least one peer. 
Make a check + */ + /* if the bit that corresponds to this child has been set to zero, + * then it has already checked in and data received + */ + if (src < 0 || 1 != ((*active_requests >> ((tree_order - 1)*i + j))&1)){ + continue; + } + child_data_pointer = data_buffs[src].payload; + child_ctl_pointer = data_buffs[src].ctl_struct; + + if(IS_PEER_READY(child_ctl_pointer,ready_flag,sequence_number, GATHER_FLAG, bcol_id)){ + /* copy the data */ + memcpy((void *) ((unsigned char *) data_addr + buff_offset + + exchange_node->payload_info[i][j].r_offset*pack_len), + (void *) ((unsigned char *) child_data_pointer + buff_offset + + exchange_node->payload_info[i][j].r_offset*pack_len), + exchange_node->payload_info[i][j].r_len*pack_len); + /* flip the bit to zero */ + *active_requests ^= (1<<((tree_order - 1)*i + j)); + if(0 == (*active_requests)) { + goto LAST_STEP; + } + } + } + } + + k_temp1 = k_temp1*tree_order; + k_temp2 = k_temp2*tree_order; + } + + + return BCOL_FN_STARTED; + +LAST_STEP: + /* last step, proxies send full data back to the extra ranks */ + if( 0 < exchange_node->n_extra_sources && + root == exchange_node->rank_extra_sources_array[0]) { + /* regardless, I will bump the ready flag and set it in case someone is watching */ + /* announce that data is ready */ + ready_flag++; + } + + /* signal that data is ready */ + opal_atomic_wmb (); + my_ctl_pointer->flags[GATHER_FLAG][bcol_id] = ready_flag; + +FINISHED: + + + my_ctl_pointer->starting_flag_value[bcol_id]++; + + return BCOL_FN_COMPLETE; +} + + +/* Blocking routines, used to prototype and test signaling, + * as well as debug hierarchical algorithm + */ +#if 0 +int bcol_basesmuma_gather_init(mca_bcol_base_module_t *super) +{ + mca_bcol_base_coll_fn_comm_attributes_t comm_attribs; + mca_bcol_base_coll_fn_invoke_attributes_t inv_attribs; + + comm_attribs.bcoll_type = BCOL_GATHER; + comm_attribs.comm_size_min = 0; + comm_attribs.comm_size_max = 16; + comm_attribs.data_src = DATA_SRC_KNOWN; + comm_attribs.waiting_semantics = BLOCKING; + + inv_attribs.bcol_msg_min = 0; + inv_attribs.bcol_msg_max = 20000; + inv_attribs.datatype_bitmap = 0x11111111; + inv_attribs.op_types_bitmap = 0x11111111; + + + /* Set attributes for fanin fanout algorithm */ + mca_bcol_base_set_attributes(super, &comm_attribs, &inv_attribs, bcol_basesmuma_k_nomial_gather, + bcol_basesmuma_k_nomial_gather); + + return OMPI_SUCCESS; +} +#endif + + +/* original, fully blocking, fully synchronous gather - should result in worst performance when used */ +#if 0 +int bcol_basesmuma_k_nomial_gather(bcol_function_args_t *input_args, + coll_ml_function_t *c_input_args) +{ + /* local variables */ + int group_size; + int first_instance=0, flag_offset; + int rc = OMPI_SUCCESS; + int leading_dim, buff_idx, idx; + int *group_list; + int src, comm_src, knt, i, k, j, k_temp1, k_temp2; + int pseudo_root, proxy_root, pseudo_base_adj; + volatile int64_t ready_flag; + int count=input_args->count; + struct ompi_datatype_t* dtype=input_args->dtype; + int root=input_args->root; + int base_adj, base; + int64_t sequence_number=input_args->sequence_num; + mca_bcol_basesmuma_module_t* bcol_module= + (mca_bcol_basesmuma_module_t *)c_input_args->bcol_module; + int my_rank = bcol_module->super.sbgp_partner_module->my_index; + mca_bcol_basesmuma_component_t *cs = &mca_bcol_basesmuma_component; + netpatterns_k_exchange_node_t *exchange_node = + &bcol_module->knomial_allgather_tree; + + int buff_offset = bcol_module->super.hier_scather_offset; + + /* "indirectors" */ + int *list_connected = 
bcol_module->super.list_n_connected; + int *inv_map = exchange_node->inv_reindex_map; + int *reindex_map = exchange_node->reindex_map; + /*int *reindex_map = exchange_node->reindex_map;*/ + /* stray rank == first rank in the extra set */ + int stray = exchange_node->k_nomial_stray; + + /* tree radix */ + int tree_order = exchange_node->tree_order; + /* tree depth */ + int pow_k = exchange_node->log_tree_order; + /* largest power of k less than or equal to np */ + int cnt = exchange_node->n_largest_pow_tree_order; + + /*fprintf(stderr,"tree order %d pow_k %d stray %d root %d\n",tree_order, pow_k, stray, root);*/ + /* payload structures */ + volatile mca_bcol_basesmuma_payload_t *data_buffs; + volatile char *child_data_pointer; + + /* control structures */ + volatile mca_bcol_basesmuma_header_t *my_ctl_pointer; + volatile mca_bcol_basesmuma_header_t *child_ctl_pointer; + /*volatile mca_bcol_basesmuma_ctl_struct_t* parent_ctl_pointer; */ + + size_t pack_len = 0, dt_size; + void *data_addr = (void *)((unsigned char *)input_args->src_desc->data_addr); + + /* active in the algorithm */ + bool active = true; + +#if 0 + fprintf(stderr,"Entering sm gather input_args->sbuf_offset %d \n",input_args->sbuf_offset); + fflush(stderr); +#endif + + + /* we will work only on packed data - so compute the length*/ + /* this is the size of my data, this is not gatherv so it's the same + * for all ranks in the communicator. + */ + ompi_datatype_type_size(dtype, &dt_size); + pack_len=count*dt_size; + /* now set the "real" offset */ + buff_offset = buff_offset*pack_len; + + buff_idx = input_args->src_desc->buffer_index; + + /* Get addressing information */ + my_rank = bcol_module->super.sbgp_partner_module->my_index; + /* I have a feeling that I'll need this */ + group_list = bcol_module->super.sbgp_partner_module->group_list; + + group_size = bcol_module->colls_no_user_data.size_of_group; + leading_dim=bcol_module->colls_no_user_data.size_of_group; + idx=SM_ARRAY_INDEX(leading_dim,buff_idx,0); + /*ctl_structs=(mca_bcol_basesmuma_ctl_struct_t **) + bcol_module->colls_with_user_data.ctl_buffs+idx; + */ + data_buffs=(volatile mca_bcol_basesmuma_payload_t *) + bcol_module->colls_with_user_data.data_buffs+idx; + + /* Set pointer to current proc ctrl region */ + /*my_ctl_pointer = ctl_structs[my_rank]; */ + my_ctl_pointer = data_buffs[my_rank].ctl_struct; + + /* setup resource recycling */ + if( my_ctl_pointer->sequence_number < sequence_number ) { + first_instance=1; + } + + if( first_instance ) { + /* Signal arrival */ + my_ctl_pointer->flag = -1; + my_ctl_pointer->gflag = -1; + my_ctl_pointer->index=1; + /* this does not need to use any flag values , so only need to + * set the value for subsequent values that may need this */ + my_ctl_pointer->starting_flag_value=0; + flag_offset=0; + + } else { + /* only one thread at a time will be making progress on this + * collective, so no need to make this atomic */ + my_ctl_pointer->index++; + } + + + /* increment the starting flag by one and return */ + flag_offset = my_ctl_pointer->starting_flag_value; + ready_flag = flag_offset + sequence_number + 1; + my_ctl_pointer->sequence_number = sequence_number; + +/* debug + fprintf(stderr," sequence_number %lld flag_offset %d starting flag val %d\n",sequence_number,flag_offset, my_ctl_pointer->starting_flag_value); + fflush(stderr); + end debug */ + + + /* + * Fan out from root + */ + /* don't need this either */ + /* root is the local leader */ + /* calculate the number of steps necessary for this collective */ + + /* first 
thing we do is figure out where the root is in our new indexing */ + /* find root in new indexing */ + pseudo_root = inv_map[root]; + /* see if this is larger than the stray */ + if( pseudo_root >= stray ) { + /* then we need to define the proxy root, everyone can do this */ + proxy_root = pseudo_root - cnt; + }else { + proxy_root = pseudo_root; + } + + + + if( EXTRA_NODE == exchange_node->node_type ) { + + /* signal arrival */ + my_ctl_pointer->gflag = ready_flag; + + /* send is done */ + + /* poll for data only if I am the root */ + /* bump the ready flag */ + ready_flag++; + if( root == my_rank ){ + /* poll for data from proxy */ + src = exchange_node->rank_extra_sources_array[0]; + /* get src data buffer */ + child_data_pointer = data_buffs[src].payload; + child_ctl_pointer = data_buffs[src].ctl_struct; + while(!IS_GDATA_READY(child_ctl_pointer,ready_flag,sequence_number)){ + opal_progress(); + } + /* receive the data from the proxy, aka pseudo-root */ + + memcpy((void *) ((unsigned char *) data_addr + buff_offset),(void *) ((unsigned char *) child_data_pointer+buff_offset) + ,pack_len*group_size); + } + goto FINISHED; + + + } else if( 0 < exchange_node->n_extra_sources ) { + + /* am a proxy, poll for pack_len data from extra */ + src = exchange_node->rank_extra_sources_array[0]; + /* get src data buffer */ + child_data_pointer = data_buffs[src].payload; + child_ctl_pointer = data_buffs[src].ctl_struct; + knt = 0; + for( i = 0; i < src; i++){ + knt += list_connected[i]; + } + while(!IS_GDATA_READY(child_ctl_pointer,ready_flag,sequence_number)){ + opal_progress(); + } + memcpy((void *) ((unsigned char *) data_addr + buff_offset + pack_len*knt), + (void *) ((unsigned char *) child_data_pointer + buff_offset + + pack_len*knt), pack_len*list_connected[src]); + /*fprintf(stderr,"999 proxy received data from %d at offset %d of length %d\n",src, + buff_offset+pack_len*knt,pack_len*list_connected[src]); + */ + } + + /* start the k-nomial gather phase */ + /* only "active ranks participate, once a rank has forwarded its data, it becomes inactive */ + knt = 0; + while(active){ + k_temp1 = tree_order; + k_temp2 = 1; + for( i = 0; i < pow_k; i++) { + /* then find the base */ + /*FIND_BASE(base,my_rank,i+1,tree_order);*/ + FIND_BASE(base,exchange_node->reindex_myid,i+1,tree_order); + /* now find the adjusted base */ + base_adj = base + (base + proxy_root)%k_temp1; + /* ok, now find out WHO is occupying this slot */ + /*pseudo_base_adj = inv_map[base_adj];*/ + pseudo_base_adj = reindex_map[base_adj]; + + if(my_rank == pseudo_base_adj ) { + /* then go ahead and poll for children's data */ + for( j = 0; j < (tree_order - 1); j++ ) { + /* send phase + */ + /* get communication partner */ + + src = exchange_node->rank_exchanges[i][j]; + /*fprintf(stderr,"comm_src %d\n",comm_src);*/ + /* remember, if we have extra ranks, then we won't participate + * with a least one peer. 
Make a check + */ + if( src < 0 ){ + continue; + } + + /*fprintf(stderr,"src %d\n",src);*/ + child_data_pointer = data_buffs[src].payload; + child_ctl_pointer = data_buffs[src].ctl_struct; + while(!IS_GDATA_READY(child_ctl_pointer,ready_flag,sequence_number)){ + opal_progress(); + } + memcpy((void *) ((unsigned char *) data_addr + buff_offset + + exchange_node->payload_info[i][j].r_offset*pack_len), + (void *) ((unsigned char *) child_data_pointer + buff_offset + + exchange_node->payload_info[i][j].r_offset*pack_len), + exchange_node->payload_info[i][j].r_len*pack_len); + /* + fprintf(stderr,"999 receiving data from %d at offset %d of length %d\n", + exchange_node->rank_exchanges[i][j], buff_offset + exchange_node->payload_info[i][j].r_offset, + exchange_node->payload_info[i][j].r_len*pack_len); + */ + MB(); + knt++; + if(knt == exchange_node->n_actual_exchanges) { + /* this is the trick to break the root out, + * only the root should be able to satisfy this + */ + /* + fprintf(stderr,"hello n_actual is %d \n",knt); + fprintf(stderr,"hello n_actual_exch is %d \n", + exchange_node->n_actual_exchanges); + */ + goto LAST_STEP; + } + } + } else { + /* announce my arrival */ + my_ctl_pointer->gflag = ready_flag; + active = false; + break; + } + + k_temp1 = k_temp1*tree_order; + k_temp2 = k_temp2*tree_order; + } + } +LAST_STEP: + /* last step, proxies send full data back to the extra ranks */ + if( 0 < exchange_node->n_extra_sources && + root == exchange_node->rank_extra_sources_array[0]) { + /* regardless, I will bump the ready flag and set it in case someone is watching */ + /* announce that data is ready */ + ready_flag++; + my_ctl_pointer->gflag = ready_flag; + } + + +FINISHED: + +/* debug + fprintf(stderr," my_ctl_pointer->index %d n of this type %d %u \n", + my_ctl_pointer->index,c_input_args->n_of_this_type_in_collective,getpid()); + fflush(stderr); + end debug */ + + my_ctl_pointer->starting_flag_value+=1; + + return BCOL_FN_COMPLETE; +} + +#endif + + +#if 0 +/* blocking, asynchronous polling gather routine */ +int bcol_basesmuma_k_nomial_gather(bcol_function_args_t *input_args, + coll_ml_function_t *c_input_args) +{ + /* local variables */ + int group_size; + int first_instance=0, flag_offset; + int rc = OMPI_SUCCESS; + int leading_dim, buff_idx, idx; + int *group_list; + int src, comm_src, knt, i, k, j, k_temp1, k_temp2; + int pseudo_root, proxy_root, pseudo_base_adj; + volatile int64_t ready_flag; + int count=input_args->count; + struct ompi_datatype_t* dtype=input_args->dtype; + int root=input_args->root; + int base_adj, base; + int total_peers, my_pow_k; + int probe; + int matched; + int64_t sequence_number=input_args->sequence_num; + mca_bcol_basesmuma_module_t* bcol_module= + (mca_bcol_basesmuma_module_t *)c_input_args->bcol_module; + int my_rank = bcol_module->super.sbgp_partner_module->my_index; + mca_bcol_basesmuma_component_t *cm = &mca_bcol_basesmuma_component; + netpatterns_k_exchange_node_t *exchange_node = + &bcol_module->knomial_allgather_tree; + + int buff_offset = bcol_module->super.hier_scather_offset; + + /* "indirectors" */ + int *list_connected = bcol_module->super.list_n_connected; + int *inv_map = exchange_node->inv_reindex_map; + int *reindex_map = exchange_node->reindex_map; + /*int *reindex_map = exchange_node->reindex_map;*/ + /* stray rank == first rank in the extra set */ + int stray = exchange_node->k_nomial_stray; + + /* tree radix */ + int tree_order = exchange_node->tree_order; + /* tree depth */ + int pow_k = exchange_node->log_tree_order; + /* largest 
power of k less than or equal to np */ + int cnt = exchange_node->n_largest_pow_tree_order; + + /*fprintf(stderr,"tree order %d pow_k %d stray %d root %d\n",tree_order, pow_k, stray, root);*/ + /* payload structures */ + volatile mca_bcol_basesmuma_payload_t *data_buffs; + volatile char *child_data_pointer; + + /* control structures */ + volatile mca_bcol_basesmuma_header_t *my_ctl_pointer; + volatile mca_bcol_basesmuma_header_t *child_ctl_pointer; + /*volatile mca_bcol_basesmuma_ctl_struct_t* parent_ctl_pointer; */ + + size_t pack_len = 0, dt_size; + void *data_addr = (void *)((unsigned char *)input_args->src_desc->data_addr); + + /* active in the algorithm */ + bool active = true; + +#if 0 + fprintf(stderr,"Entering sm gather root %d \n",root); + fflush(stderr); +#endif + + + /* we will work only on packed data - so compute the length*/ + /* this is the size of my data, this is not gatherv so it's the same + * for all ranks in the communicator. + */ + ompi_datatype_type_size(dtype, &dt_size); + pack_len=count*dt_size; + /* now set the "real" offset */ + buff_offset = buff_offset*pack_len; + + buff_idx = input_args->src_desc->buffer_index; + + /* Get addressing information */ + my_rank = bcol_module->super.sbgp_partner_module->my_index; + /* I have a feeling that I'll need this */ + group_list = bcol_module->super.sbgp_partner_module->group_list; + + group_size = bcol_module->colls_no_user_data.size_of_group; + leading_dim=bcol_module->colls_no_user_data.size_of_group; + idx=SM_ARRAY_INDEX(leading_dim,buff_idx,0); + /*ctl_structs=(mca_bcol_basesmuma_ctl_struct_t **) + bcol_module->colls_with_user_data.ctl_buffs+idx; + */ + data_buffs=(volatile mca_bcol_basesmuma_payload_t *) + bcol_module->colls_with_user_data.data_buffs+idx; + + /* Set pointer to current proc ctrl region */ + /*my_ctl_pointer = ctl_structs[my_rank]; */ + my_ctl_pointer = data_buffs[my_rank].ctl_struct; + + /* setup resource recycling */ + if( my_ctl_pointer->sequence_number < sequence_number ) { + first_instance=1; + } + + if( first_instance ) { + /* Signal arrival */ + my_ctl_pointer->flag = -1; + my_ctl_pointer->gflag = -1; + my_ctl_pointer->index=1; + /* this does not need to use any flag values , so only need to + * set the value for subsequent values that may need this */ + my_ctl_pointer->starting_flag_value=0; + flag_offset=0; + + } else { + /* only one thread at a time will be making progress on this + * collective, so no need to make this atomic */ + my_ctl_pointer->index++; + } + + + /* increment the starting flag by one and return */ + flag_offset = my_ctl_pointer->starting_flag_value; + ready_flag = flag_offset + sequence_number + 1; + my_ctl_pointer->sequence_number = sequence_number; + +/* debug + fprintf(stderr," sequence_number %lld flag_offset %d starting flag val %d\n",sequence_number,flag_offset, my_ctl_pointer->starting_flag_value); + fflush(stderr); + end debug */ + + + /* + * Fan out from root + */ + /* don't need this either */ + /* root is the local leader */ + /* calculate the number of steps necessary for this collective */ + + /* first thing we do is figure out where the root is in our new indexing */ + /* find root in new indexing */ + pseudo_root = inv_map[root]; + /* see if this is larger than the stray */ + if( pseudo_root >= stray ) { + /* then we need to define the proxy root, everyone can do this */ + proxy_root = pseudo_root - cnt; + }else { + proxy_root = pseudo_root; + } + if( EXTRA_NODE == exchange_node->node_type ) { + + /* signal arrival */ + my_ctl_pointer->gflag = ready_flag; + + 
/* send is done */ + + /* poll for data only if I am the root */ + /* bump the ready flag */ + ready_flag++; + if( root == my_rank ){ + /* poll for data from proxy */ + src = exchange_node->rank_extra_sources_array[0]; + /* get src data buffer */ + child_data_pointer = data_buffs[src].payload; + child_ctl_pointer = data_buffs[src].ctl_struct; + /* in this case, you must block */ + while(!IS_GDATA_READY(child_ctl_pointer,ready_flag,sequence_number)){ + opal_progress(); + } + /* receive the data from the proxy, aka pseudo-root */ + + memcpy((void *) ((unsigned char *) data_addr + buff_offset), + (void *) ((unsigned char *) child_data_pointer+buff_offset) + ,pack_len*group_size); + } + goto FINISHED; + + + } else if( 0 < exchange_node->n_extra_sources ) { + + /* am a proxy, poll for pack_len data from extra */ + src = exchange_node->rank_extra_sources_array[0]; + /* get src data buffer */ + child_data_pointer = data_buffs[src].payload; + child_ctl_pointer = data_buffs[src].ctl_struct; + knt = 0; + for( i = 0; i < src; i++){ + knt += list_connected[i]; + } + /* must block here also */ + while(!IS_GDATA_READY(child_ctl_pointer,ready_flag,sequence_number)){ + opal_progress(); + } + memcpy((void *) ((unsigned char *) data_addr + buff_offset + pack_len*knt), + (void *) ((unsigned char *) child_data_pointer + buff_offset + + pack_len*knt), pack_len*list_connected[src]); + /*fprintf(stderr,"999 proxy received data from %d at offset %d of length %d\n",src, + buff_offset+pack_len*knt,pack_len*list_connected[src]); + */ + } + /* do some figuring */ + + total_peers = 0; + my_pow_k = pow_k; + k_temp1 = tree_order; + k_temp2 = 1; + for( i = 0; i < pow_k; i++) { + /* then find the base */ + /*FIND_BASE(base,my_rank,i+1,tree_order);*/ + FIND_BASE(base,exchange_node->reindex_myid,i+1,tree_order); + /* now find the adjusted base */ + base_adj = base + (base + proxy_root)%k_temp1; + /* ok, now find out WHO is occupying this slot */ + /*pseudo_base_adj = inv_map[base_adj];*/ + pseudo_base_adj = reindex_map[base_adj]; + + if(my_rank == pseudo_base_adj ) { + /* then go ahead and poll for children's data */ + for( j = 0; j < (tree_order - 1); j++ ) { + /* send phase + */ + /* get communication partner */ + + src = exchange_node->rank_exchanges[i][j]; + /*fprintf(stderr,"comm_src %d\n",comm_src);*/ + /* remember, if we have extra ranks, then we won't participate + * with a least one peer. Make a check + */ + if( src < 0 ){ + continue; + }else{ + total_peers++; + } + + + } + } else { + /* announce my arrival */ + my_pow_k = i; + break; + } + + k_temp1 = k_temp1*tree_order; + k_temp2 = k_temp2*tree_order; + } + + if( 0 == my_pow_k ){ + /* signal arrival */ + my_ctl_pointer->gflag = ready_flag; + + goto FINISHED; + } + + + + /* start the k-nomial gather phase */ + /* only "active ranks participate, once a rank has forwarded its data, it becomes inactive */ + knt = 0; + while(active){ + k_temp1 = tree_order; + k_temp2 = 1; + for( i = 0; i < my_pow_k; i++) { + + /* then go ahead and poll for children's data */ + for( j = 0; j < (tree_order - 1); j++ ) { + matched = 0; + /* send phase + */ + /* get communication partner */ + + src = exchange_node->rank_exchanges[i][j]; + /*fprintf(stderr,"comm_src %d\n",comm_src);*/ + /* remember, if we have extra ranks, then we won't participate + * with a least one peer. 
Make a check + */ + if( src < 0 ){ + continue; + } + + /*fprintf(stderr,"src %d\n",src);*/ + child_data_pointer = data_buffs[src].payload; + child_ctl_pointer = data_buffs[src].ctl_struct; + + /* if child has been marked, then skip */ + if( sequence_number == child_ctl_pointer->mark ){ + continue; + } + + + for( probe = 0; probe < cm->num_to_probe && (0 == matched); probe++){ + if(IS_GDATA_READY(child_ctl_pointer,ready_flag,sequence_number)){ + /* mark the child's pointer */ + child_ctl_pointer->mark = sequence_number; + /* copy the data */ + + memcpy((void *) ((unsigned char *) data_addr + buff_offset + + exchange_node->payload_info[i][j].r_offset*pack_len), + (void *) ((unsigned char *) child_data_pointer + buff_offset + + exchange_node->payload_info[i][j].r_offset*pack_len), + exchange_node->payload_info[i][j].r_len*pack_len); + /* + fprintf(stderr,"999 receiving data from %d at offset %d of length %d\n", + exchange_node->rank_exchanges[i][j], buff_offset + exchange_node->payload_info[i][j].r_offset, + exchange_node->payload_info[i][j].r_len*pack_len); + */ + knt++; + if(knt == total_peers) { + /* this is the trick to break the root out, + * only the root should be able to satisfy this + */ + /* + fprintf(stderr,"hello n_actual is %d \n",knt); + fprintf(stderr,"hello n_actual_exch is %d \n", + exchange_node->n_actual_exchanges); + */ + MB(); + my_ctl_pointer->gflag = ready_flag; + + goto LAST_STEP; + } + matched = 1; + }else{ + opal_progress(); + } + } + } + } + + k_temp1 = k_temp1*tree_order; + k_temp2 = k_temp2*tree_order; + } +LAST_STEP: + /* last step, proxies send full data back to the extra ranks */ + if( 0 < exchange_node->n_extra_sources && + root == exchange_node->rank_extra_sources_array[0]) { + /* regardless, I will bump the ready flag and set it in case someone is watching */ + /* announce that data is ready */ + ready_flag++; + my_ctl_pointer->gflag = ready_flag; + } + + +FINISHED: + +/* debug + fprintf(stderr," my_ctl_pointer->index %d n of this type %d %u \n", + my_ctl_pointer->index,c_input_args->n_of_this_type_in_collective,getpid()); + fflush(stderr); + end debug */ + + my_ctl_pointer->starting_flag_value+=1; + + return BCOL_FN_COMPLETE; +} +#endif diff --git a/ompi/mca/bcol/basesmuma/bcol_basesmuma_lmsg_bcast.c b/ompi/mca/bcol/basesmuma/bcol_basesmuma_lmsg_bcast.c new file mode 100644 index 0000000000..8510258eca --- /dev/null +++ b/ompi/mca/bcol/basesmuma/bcol_basesmuma_lmsg_bcast.c @@ -0,0 +1,1878 @@ +/* + * Copyright (c) 2009-2012 Oak Ridge National Laboratory. All rights reserved. + * Copyright (c) 2009-2012 Mellanox Technologies. All rights reserved. + * $COPYRIGHT$ + * + * Additional copyrights may follow + * + * $HEADER$ + */ + +#include "ompi_config.h" + +#ifdef __PORTALS_AVAIL__ +#define __PORTALS_ENABLE__ + +#include "ompi/constants.h" +#include "ompi/datatype/ompi_datatype.h" +#include "ompi/communicator/communicator.h" + +#include "bcol_basesmuma.h" +#include "bcol_basesmuma_portals.h" +#include "bcol_basesmuma_lmsg_bcast.h" +#include "bcol_basesmuma_utils.h" + + + +/* + * Scatter/Gather Broadcast algorithm + * + * Algorithm highlights: + * + * Uses portals for data transfer + * + * All processes participating in the broadcast are arranged in a + * binmoial tree. + * + * Phase1: Scatter the broadcast data to all the children + * Phase2: All processes in the tree participates in recursive doubling + * algorithm to obtain the missing data. 
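+ *
+ * Flow implemented below (summary of this file's code): the root posts its
+ * buffer and each child in the binomial tree pulls a sub-range of its
+ * parent's range with a portals get, so the scatter leaves every rank below
+ * the power-of-two cutoff holding one fragment of the broadcast buffer.
+ * The allgather then runs recursive-doubling rounds in which a rank pairs
+ * with (rank ^ (1 << level)), pulls the partner's block, and doubles the
+ * exchanged length each round.  Ranks at or above the power-of-two cutoff
+ * ("extra" ranks) hand their data off to, and read the full result back
+ * from, a partner rank below the cutoff.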
+ */ + + +static int completed_scatter = 0; +#if 0 +int bcol_basesmuma_lmsg_scatter_allgather_portals_bcast_old(bcol_function_args_t *input_args, + coll_ml_function_t *c_input_args) +{ + + /* local variables */ + int i; + uint64_t length; + int my_rank, parent_rank, src =-1, matched = 0; + int *src_list = NULL; + int group_size = -1, dummy_group_size; + int first_instance=0; + int rc = OMPI_SUCCESS; + int leading_dim, buff_idx, idx; + int count=input_args->count; + size_t pack_len = 0, dt_size =0 ; + int64_t ready_flag; + int flag_offset; + int pow_2, pow_2_levels; + int src_list_index = -1; + uint64_t fragment_size; /* user buffer size */ + int sg_matchbits = 0; + /* Input argument variables */ + void *my_userbuf = (void*)((unsigned char*)input_args->userbuf); + int64_t sequence_number=input_args->sequence_num; + struct ompi_datatype_t* dtype=input_args->dtype; + + /* Extra source variables */ + bool secondary_root = false; + int partner = -1, extra_partner = -1; + + /* Scatter Allgather offsets */ + uint64_t local_sg_offset = 0, global_sg_offset = 0, partner_offset = 0; + + /* Portals messaging relevant variables */ + mca_bcol_basesmuma_portal_proc_info_t *portals_info; + ptl_handle_eq_t allgather_eq_h; + ptl_event_t allgather_event; + bool blocked_post = false; + bool msg_posted = false; + int total_msg_posts = -1, scatter_posts = -1, allgather_posts = -1, extra_src_posts = -1; + + /* OMPI module and component variables */ + mca_bcol_basesmuma_component_t *cs = &mca_bcol_basesmuma_component; + mca_bcol_basesmuma_module_t *bcol_module = + (mca_bcol_basesmuma_module_t *) c_input_args->bcol_module; + + /* Control structure and payload variables */ + volatile mca_bcol_basesmuma_payload_t *data_buffs; + volatile mca_bcol_basesmuma_header_t *my_ctl_pointer = NULL; + volatile mca_bcol_basesmuma_header_t *parent_ctl_pointer = NULL; + volatile mca_bcol_basesmuma_header_t *partner_ctl_pointer = NULL; + + struct mca_bcol_basesmuma_portal_buf_addr_t *my_lmsg_ctl_pointer = NULL; + struct mca_bcol_basesmuma_portal_buf_addr_t *parent_lmsg_ctl_pointer = NULL; + struct mca_bcol_basesmuma_portal_buf_addr_t *partner_lmsg_ctl_pointer = NULL; + + /* Make sure there userbuffer is not null */ + assert(my_userbuf != NULL); + + /* Get portals info*/ + portals_info = (mca_bcol_basesmuma_portal_proc_info_t*)cs->portals_info; + + /* Get addresing information */ + buff_idx = input_args->src_desc->buffer_index; + group_size = bcol_module->colls_no_user_data.size_of_group; + leading_dim=bcol_module->colls_no_user_data.size_of_group; + my_rank = bcol_module->super.sbgp_partner_module->my_index; + idx=SM_ARRAY_INDEX(leading_dim,buff_idx,0); + + /* calculate the largest power of two that is smaller than + * or equal to the group size + */ + pow_2_levels = pow_sm_k(2,group_size, &(dummy_group_size)); + if( group_size < (1<colls_with_user_data.data_buffs+idx; + + my_ctl_pointer = data_buffs[my_rank].ctl_struct; + my_lmsg_ctl_pointer = (mca_bcol_basesmuma_portal_buf_addr_t*) + data_buffs[my_rank].payload; + + if(my_ctl_pointer->sequence_number < sequence_number) { + first_instance = 1; + } + + if(first_instance) { + my_ctl_pointer->flag = -1; + my_ctl_pointer->index = 1; + + my_ctl_pointer->starting_flag_value = 0; + flag_offset = 0; + + } else { + my_ctl_pointer->index++; + } + + assert( -1 == my_ctl_pointer->flag); + + /* increment the starting flag by one and return */ + flag_offset = my_ctl_pointer->starting_flag_value; + ready_flag = flag_offset + sequence_number + 1; + + my_ctl_pointer->sequence_number = 
sequence_number; + sg_matchbits = sequence_number ; + + /* Construct my portal buffer address and copy to payload buffer */ + mca_bcol_basesmuma_construct_portal_address(my_lmsg_ctl_pointer, + portals_info->portal_id.nid, + portals_info->portal_id.pid, + sg_matchbits, + bcol_module->super.sbgp_partner_module->group_comm->c_contextid); + + my_lmsg_ctl_pointer->userbuf = my_userbuf; + my_lmsg_ctl_pointer->userbuf_length = fragment_size; + + + /* + * If I am the root of bcast, scatter the data to my children + */ + if (input_args->root_flag) { + BASESMUMA_VERBOSE(10,("I am the root of the data")); + my_lmsg_ctl_pointer->offset = 0; + my_lmsg_ctl_pointer->n_sends = pow_2_levels; + my_lmsg_ctl_pointer->length = fragment_size; + + rc = PtlEQAlloc(((mca_bcol_basesmuma_portal_proc_info_t*) + cs->portals_info)->ni_h, MAX_PORTAL_EVENTS_IN_Q, + PTL_EQ_HANDLER_NONE, &allgather_eq_h); + + if (rc != PTL_OK) { + BASESMUMA_VERBOSE(10,( "PtlEQAlloc() failed: %d \n",rc)); + goto Release; + } + + /* Compute number of posts required + * We post the data buffer for both scatter and allgather phase at once so to avoid + * posting overhead + */ + if (my_rank >= pow_2) { + /* I am root and my rank is greater than pow_2, I will hand + * over to rank (that is < pow_2) to act as secondary root + */ + total_msg_posts = 1; + } + else { + + extra_src_posts = (my_rank + pow_2 < group_size ) ? 1: 0; + scatter_posts = my_lmsg_ctl_pointer->n_sends; + allgather_posts = pow_2_levels - 1; + + total_msg_posts = scatter_posts + allgather_posts + extra_src_posts ; + } + + mca_bcol_basesmuma_portals_post_msg(cs, my_lmsg_ctl_pointer, + my_userbuf, fragment_size, allgather_eq_h, + total_msg_posts, + blocked_post, + PTL_MD_EVENT_START_DISABLE| PTL_MD_EVENT_END_DISABLE | PTL_MD_OP_GET | PTL_MD_MANAGE_REMOTE | PTL_MD_TRUNCATE | PTL_MD_EVENT_AUTO_UNLINK_ENABLE); + msg_posted = true ; + /* important that these be set before my children + * see the ready flag raised + */ + MB(); + my_ctl_pointer->flag = ready_flag; + + /* Wait for my scatter partner */ + if (my_rank >= pow_2) { + int scatter_partner = -1; + volatile mca_bcol_basesmuma_header_t *scatter_partner_ctl_pointer = NULL; + + scatter_partner = my_rank - pow_2; + scatter_partner_ctl_pointer = + data_buffs[scatter_partner].ctl_struct; + + while(!IS_SG_DATA_READY(scatter_partner_ctl_pointer, ready_flag, + sequence_number)){ + opal_progress(); + } + + goto Release; + } + else { + wait_for_peers(my_rank, my_lmsg_ctl_pointer->n_sends, data_buffs, + ready_flag, sequence_number); + } + + goto Allgather; + } + + +Extra : + if( my_rank >= pow_2 ) { + parent_rank = my_rank & (pow_2-1); + parent_ctl_pointer = data_buffs[parent_rank].ctl_struct; + parent_lmsg_ctl_pointer = + (mca_bcol_basesmuma_portal_buf_addr_t*)data_buffs[parent_rank].payload; + + ready_flag = ready_flag + pow_2_levels; + + while(!IS_SG_DATA_READY(parent_ctl_pointer, ready_flag, sequence_number)) { + + opal_progress(); + + } + + + mca_bcol_basesmuma_portals_get_msg_fragment_no_eq_h(cs, my_lmsg_ctl_pointer, + parent_lmsg_ctl_pointer, 0, + 0, fragment_size); + + my_ctl_pointer->flag = ready_flag; + + goto Release; + } + +Scatter: + + /* I am not root of bcast compute the list of possible + * where I will receive bcast data from. 
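+ * The candidate sources are my binomial-tree partners, my_rank ^ (1 << i)
+ * for each tree level i, plus the extra rank my_rank + pow_2 when that
+ * rank exists, since any one of them may be the process that forwards
+ * the scatter fragment to me.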
+ */ + src_list = (int *) malloc(sizeof(int) * (pow_2_levels + 1)); + for( i = 0; i < pow_2_levels; i++) { + src_list[i] = my_rank ^ (1< pow_2 */ + if ((my_rank + pow_2) < group_size) { + src_list[i] = my_rank + pow_2; + } else { + src_list[i] = -1; + } + +Probe: + + /* If I am not the root, then poll on possible "senders'" control structs */ + /* For portals we block for now */ + while (!matched) { + /* Shared memory iprobe */ + SG_LARGE_MSG_PROBE(src_list, pow_2_levels + 1, + src_list_index, matched, src, data_buffs, parent_ctl_pointer, + parent_lmsg_ctl_pointer,ready_flag, sequence_number); + } + + /* If I am a secondary root + * Secondary root acts as root of bcast data when real root of data + * is process with group rank greater than pow_2 */ + if ((matched) && (src == pow_2 + my_rank)) { + volatile mca_bcol_basesmuma_header_t *extra_src_ctl_pointer = NULL; + struct mca_bcol_basesmuma_portal_buf_addr_t *extra_src_lmsg_ctl_pointer = NULL; + + secondary_root = true; + BASESMUMA_VERBOSE(10,("I am the secondary root for the data")); + my_lmsg_ctl_pointer->offset = 0; + my_lmsg_ctl_pointer->n_sends = pow_2_levels; + my_lmsg_ctl_pointer->length = fragment_size; + + extra_src_ctl_pointer = data_buffs[src].ctl_struct; + extra_src_lmsg_ctl_pointer = (mca_bcol_basesmuma_portal_buf_addr_t*)data_buffs[src].payload; + + /* create an event queue for the incoming buffer */ + rc = PtlEQAlloc(((mca_bcol_basesmuma_portal_proc_info_t*) + cs->portals_info)->ni_h, MAX_PORTAL_EVENTS_IN_Q, + PTL_EQ_HANDLER_NONE, &allgather_eq_h); + + if (rc != PTL_OK) { + BASESMUMA_VERBOSE(10,( "PtlEQAlloc() failed: %d \n",rc)); + goto Release; + } + + mca_bcol_basesmuma_portals_get_msg_fragment_no_eq_h(cs, my_lmsg_ctl_pointer, + extra_src_lmsg_ctl_pointer, 0, + 0, fragment_size); + + + extra_src_posts = 0; + scatter_posts = my_lmsg_ctl_pointer->n_sends; + allgather_posts = pow_2_levels - 1; + + total_msg_posts = scatter_posts + allgather_posts + extra_src_posts ; + + mca_bcol_basesmuma_portals_post_msg(cs, my_lmsg_ctl_pointer, + my_userbuf, fragment_size, allgather_eq_h, + total_msg_posts, + blocked_post, + PTL_MD_EVENT_START_DISABLE| PTL_MD_EVENT_END_DISABLE | PTL_MD_OP_GET | PTL_MD_MANAGE_REMOTE | PTL_MD_TRUNCATE | PTL_MD_EVENT_AUTO_UNLINK_ENABLE); + msg_posted = true ; + /* important that these be set before my children + * see the ready flag raised + */ + MB(); + my_ctl_pointer->flag = ready_flag; + + wait_for_peers(my_rank, my_lmsg_ctl_pointer->n_sends, data_buffs, + ready_flag, sequence_number); + goto Allgather; + } + + /* Verify whether we got the right + * source of the data, by computing the source's intended + * destinations + */ + for( i = 0; i < parent_lmsg_ctl_pointer->n_sends; i++) { + uint64_t local_offset = 0; + uint64_t remote_offset = 0; + + BASESMUMA_VERBOSE(5,("%d found it from %d \n",my_rank,src)); + + if( my_rank == (src^(1<n_sends = i; + + /* Am I source for other process during scatter phase */ + if ( i > 0) { + + /* compute the size of the chunk to copy */ + length = (parent_lmsg_ctl_pointer->length)/ + (1<<(parent_lmsg_ctl_pointer->n_sends - my_lmsg_ctl_pointer->n_sends)); + my_lmsg_ctl_pointer->length = length; + my_lmsg_ctl_pointer->offset = + parent_lmsg_ctl_pointer->offset + length; + + + local_offset = my_lmsg_ctl_pointer->offset; + remote_offset = parent_lmsg_ctl_pointer->offset + length; + + mca_bcol_basesmuma_portals_get_msg_fragment_no_eq_h(cs, my_lmsg_ctl_pointer, + parent_lmsg_ctl_pointer,local_offset, + remote_offset, length); + rc = 
PtlEQAlloc(((mca_bcol_basesmuma_portal_proc_info_t*) + cs->portals_info)->ni_h, MAX_PORTAL_EVENTS_IN_Q, + PTL_EQ_HANDLER_NONE, + &allgather_eq_h); + + if (rc != PTL_OK) { + BASESMUMA_VERBOSE(10,( "PtlEQAlloc() failed: %d \n",rc)); + goto Release; + } + + /* Now post the message for other children to read */ + extra_src_posts = (my_rank + pow_2 < group_size ) ? 1: 0; + scatter_posts = my_lmsg_ctl_pointer->n_sends; + allgather_posts = pow_2_levels - 1; + + total_msg_posts = scatter_posts + allgather_posts + extra_src_posts ; + + + mca_bcol_basesmuma_portals_post_msg(cs, my_lmsg_ctl_pointer, + my_userbuf, my_lmsg_ctl_pointer->userbuf_length, + allgather_eq_h, + total_msg_posts, + blocked_post, + PTL_MD_EVENT_START_DISABLE| PTL_MD_EVENT_END_DISABLE + | PTL_MD_OP_GET | PTL_MD_MANAGE_REMOTE | PTL_MD_TRUNCATE | PTL_MD_EVENT_AUTO_UNLINK_ENABLE + ); + msg_posted = true; + /* set the memory barrier to ensure completion + * and signal I am done getting scatter data*/ + MB(); + my_ctl_pointer->flag = ready_flag; + + wait_for_peers(my_rank, my_lmsg_ctl_pointer->n_sends, data_buffs, + ready_flag, sequence_number); + + } else { + /* takes care of first level recurssive double */ + length = parent_lmsg_ctl_pointer->length/ + (1<<(parent_lmsg_ctl_pointer->n_sends - 1)); + my_lmsg_ctl_pointer->length = length; + my_lmsg_ctl_pointer->offset = parent_lmsg_ctl_pointer->offset; + + local_offset = my_lmsg_ctl_pointer->offset; + remote_offset = my_lmsg_ctl_pointer->offset; + + + while(!IS_SG_DATA_READY(parent_ctl_pointer, ready_flag, sequence_number)) { + opal_progress(); + } + + mca_bcol_basesmuma_portals_get_msg_fragment_no_eq_h(cs, my_lmsg_ctl_pointer, + parent_lmsg_ctl_pointer,local_offset, + remote_offset, length); + + /* signal that I am done reading data from parent */ + MB(); + my_ctl_pointer->flag = ready_flag; + } + + /* time for allgather phase */ + input_args->status = ALLGATHER; + + BASESMUMA_VERBOSE(5,("Completed %d found it from %d \n",my_rank,src)); + + while(ready_flag > parent_ctl_pointer->flag); + + goto Allgather; + } + } + + { + /* this is not who we are looking for, + * mark as false positive so we don't + * poll here again + */ + src_list[src_list_index] = -1; + matched = 0; + goto Probe; + } + +Allgather: + + BASESMUMA_VERBOSE(5,(" %d Completed Scatter %d times \n", my_rank, completed_scatter)); + + /* zip it back up - we have already taken care of first level */ + global_sg_offset = my_lmsg_ctl_pointer->offset; + + /* first level of zip up */ + length = 2 * fragment_size/pow_2; + + + if (!msg_posted) { + rc = PtlEQAlloc(((mca_bcol_basesmuma_portal_proc_info_t*) + cs->portals_info)->ni_h, MAX_PORTAL_EVENTS_IN_Q, + PTL_EQ_HANDLER_NONE, &allgather_eq_h); + + /* Posting for all phases of recursive doubling */ + extra_src_posts = (my_rank + pow_2 < group_size ) ? 
1: 0; + allgather_posts = pow_2_levels - 1; + total_msg_posts = allgather_posts + extra_src_posts ; + + + mca_bcol_basesmuma_portals_post_msg(cs, my_lmsg_ctl_pointer, + my_userbuf, my_lmsg_ctl_pointer->userbuf_length, + allgather_eq_h, total_msg_posts , blocked_post, + PTL_MD_EVENT_START_DISABLE| PTL_MD_EVENT_END_DISABLE + | PTL_MD_OP_GET | PTL_MD_MANAGE_REMOTE | PTL_MD_TRUNCATE | PTL_MD_EVENT_AUTO_UNLINK_ENABLE + ); + msg_posted = true; + } + + + ready_flag++; + MB(); + my_ctl_pointer->flag = ready_flag; + + /* + * Recursive doubling allgather implementation + */ + for( i = 1; i < pow_2_levels; i++) { + /* get my partner for this level */ + partner = my_rank^(1<flag >= ready_flag); + + if (partner_lmsg_ctl_pointer->offset < my_lmsg_ctl_pointer->offset) { + global_sg_offset -= length; + local_sg_offset = global_sg_offset; + } else { + local_sg_offset = global_sg_offset + length; + } + + + BASESMUMA_VERBOSE(10,("Allgather Phase: Get message from process %d, length %d", partner, length)); + mca_bcol_basesmuma_portals_get_msg_fragment_no_eq_h(cs, my_lmsg_ctl_pointer, + partner_lmsg_ctl_pointer,local_sg_offset, + local_sg_offset, length); + + ready_flag++; + MB(); + my_ctl_pointer->flag = ready_flag; + + /* Block until partner completed this level of recursive-doubling stage */ + while(!IS_SG_DATA_READY(partner_ctl_pointer, ready_flag, sequence_number)) { + opal_progress(); + } + + /* + * Compute length for next recursive doubling phase + */ + length *= 2; + } + + + /* If I am source for non-power 2 children wait for them */ + /* If I am secondary root then my partner would be real root + * so no need for exchange of data with the extra partner */ + extra_partner = my_rank + pow_2 ; + if ((extra_partner < group_size) && (!secondary_root)) { + volatile mca_bcol_basesmuma_header_t *extra_partner_ctl_pointer = NULL; + + extra_partner_ctl_pointer = data_buffs[extra_partner].ctl_struct; + /* Block until extra partner has copied data */ + while(!IS_SG_DATA_READY(extra_partner_ctl_pointer, ready_flag, sequence_number)) { + opal_progress(); + } + + } + +Release: + + /* free the event queue */ + rc = PtlEQFree(allgather_eq_h); + if (rc != PTL_OK) { + BASESMUMA_VERBOSE(10,("PtlEQFree() failed: %d )\n",rc)); + } + + my_ctl_pointer->starting_flag_value++; + input_args->status = FINISHED; + + return BCOL_FN_COMPLETE; + +} +#endif + +/* + * Blocking Portals Scatter Allgather + * + * + * + * + * + */ + +int bcol_basesmuma_lmsg_scatter_allgather_portals_bcast(bcol_function_args_t *input_args, + coll_ml_function_t *c_input_args) +{ + + /* local variables */ + int i; + uint64_t length; + int my_rank, parent_rank, src =-1, matched = 0; + int *src_list = NULL; + int group_size = -1, dummy_group_size; + int first_instance=0; + int rc = OMPI_SUCCESS; + int leading_dim, buff_idx, idx; + int count=input_args->count; + size_t pack_len = 0, dt_size =0 ; + volatile int8_t ready_flag; + int flag_offset; + int pow_2, pow_2_levels; + int src_list_index = -1; + uint64_t fragment_size; /* user buffer size */ + int sg_matchbits; + + /* Input argument variables */ + void *my_userbuf = (void*)((unsigned char*)input_args->userbuf); + int64_t sequence_number=input_args->sequence_num; + struct ompi_datatype_t* dtype=input_args->dtype; + + /* Extra source variables */ + bool secondary_root = false; + int partner = -1, extra_partner = -1; + + /* Scatter Allgather offsets */ + uint64_t local_sg_offset = 0, global_sg_offset = 0, partner_offset = 0; + + /* Portals messaging relevant variables */ + mca_bcol_basesmuma_portal_proc_info_t 
*portals_info; + ptl_handle_eq_t allgather_eq_h; + ptl_event_t allgather_event; + bool blocked_post = false; + bool msg_posted = false; + int total_msg_posts = -1, scatter_posts = -1, allgather_posts = -1, extra_src_posts = -1; + + /* OMPI module and component variables */ + mca_bcol_basesmuma_component_t *cs = &mca_bcol_basesmuma_component; + mca_bcol_basesmuma_module_t *bcol_module = + (mca_bcol_basesmuma_module_t *) c_input_args->bcol_module; + + /* Control structure and payload variables */ + volatile mca_bcol_basesmuma_ctl_struct_t **ctl_structs; + volatile mca_bcol_basesmuma_ctl_struct_t *my_ctl_pointer = NULL; + volatile mca_bcol_basesmuma_ctl_struct_t *parent_ctl_pointer = NULL; /* binomial fanout */ + volatile mca_bcol_basesmuma_ctl_struct_t *partner_ctl_pointer = NULL; /* recursive double */ + + /* Make sure there userbuffer is not null */ + assert(my_userbuf != NULL); + + /* Get portals info*/ + portals_info = (mca_bcol_basesmuma_portal_proc_info_t*)cs->portals_info; + + /* Get addresing information */ + buff_idx = input_args->src_desc->buffer_index; + group_size = bcol_module->colls_no_user_data.size_of_group; + leading_dim=bcol_module->colls_no_user_data.size_of_group; + my_rank = bcol_module->super.sbgp_partner_module->my_index; + idx=SM_ARRAY_INDEX(leading_dim,buff_idx,0); + + /* calculate the largest power of two that is smaller than + * or equal to the group size + */ + pow_2_levels = pow_sm_k(2,group_size, &(dummy_group_size)); + if( group_size < (1<colls_with_user_data.ctl_buffs+idx; + + + my_ctl_pointer = ctl_structs[my_rank]; + if(my_ctl_pointer->sequence_number < sequence_number) { + first_instance = 1; + } + + if(first_instance) { + for( i = 0; i < NUM_SIGNAL_FLAGS; i++){ + my_ctl_pointer->flags[i] = -1; + } + my_ctl_pointer->index = 1; + + my_ctl_pointer->starting_flag_value = 0; + flag_offset = 0; + + } else { + my_ctl_pointer->index++; + } + + + /* increment the starting flag by one and return */ + flag_offset = my_ctl_pointer->starting_flag_value; + /*ready_flag = flag_offset + sequence_number + 1;*/ + ready_flag = flag_offset + 1; + + my_ctl_pointer->sequence_number = sequence_number; + sg_matchbits = sequence_number ; + + /* Construct my portal buffer address and copy to payload buffer */ + mca_bcol_basesmuma_construct_portal_address(&my_ctl_pointer->portals_buf_addr, + portals_info->portal_id.nid, + portals_info->portal_id.pid, + sg_matchbits, + bcol_module->super.sbgp_partner_module->group_comm->c_contextid); + + my_ctl_pointer->portals_buf_addr.userbuf = my_userbuf; + my_ctl_pointer->portals_buf_addr.userbuf_length = fragment_size; + + + if (input_args->root_flag) { + my_ctl_pointer->offset = 0; + my_ctl_pointer->n_sends = pow_2_levels; + my_ctl_pointer->length = fragment_size; + + rc = PtlEQAlloc(((mca_bcol_basesmuma_portal_proc_info_t*) + cs->portals_info)->ni_h, MAX_PORTAL_EVENTS_IN_Q, + PTL_EQ_HANDLER_NONE, &allgather_eq_h); + + if (rc != PTL_OK) { + BASESMUMA_VERBOSE(10,( "PtlEQAlloc() failed: %d \n",rc)); + goto Release; + } + + /* Compute number of posts required */ + if (my_rank >= pow_2) { + /* I am root and my rank is greater than pow_2, I will hand + * over to rank (that is < pow_2) to act as secondary root + */ + total_msg_posts = 1; + } + else { + + extra_src_posts = (my_rank + pow_2 < group_size ) ? 
1: 0; + scatter_posts = my_ctl_pointer->n_sends; + allgather_posts = pow_2_levels - 1; + + total_msg_posts = scatter_posts + allgather_posts + extra_src_posts ; + } + + mca_bcol_basesmuma_portals_post_msg(cs, + &my_ctl_pointer->portals_buf_addr, + my_userbuf, fragment_size, allgather_eq_h, + total_msg_posts, + blocked_post, + PTL_MD_EVENT_START_DISABLE| PTL_MD_EVENT_END_DISABLE | PTL_MD_OP_GET | PTL_MD_MANAGE_REMOTE | PTL_MD_TRUNCATE | PTL_MD_EVENT_AUTO_UNLINK_ENABLE); + msg_posted = true ; + + /* important that these be set before my children + * see the ready flag raised + */ + MB(); + my_ctl_pointer->flags[BCAST_FLAGS] = ready_flag; + BASESMUMA_VERBOSE(1,("I am the root(ctl_pointer %x) of the data flag value %d",my_ctl_pointer, my_ctl_pointer->flag)); + /* Wait for my scatter partner */ + if (my_rank >= pow_2) { + int scatter_partner = -1; + volatile mca_bcol_basesmuma_ctl_struct_t *scatter_partner_ctl_pointer = NULL; + + scatter_partner = my_rank - pow_2; + scatter_partner_ctl_pointer = + ctl_structs[scatter_partner]; + + while(!IS_SG_DATA_READY(scatter_partner_ctl_pointer, ready_flag, + sequence_number)){ +SCATTER_WAIT_FOR_EXTRA: + opal_progress(); + } + + goto Release; + } + else { + + wait_for_peers_nb(my_rank, my_ctl_pointer->n_sends, ctl_structs, + ready_flag, sequence_number); + } + + goto Allgather; + } + + +Extra : + if( my_rank >= pow_2 ) { + parent_rank = my_rank & (pow_2-1); + parent_ctl_pointer = ctl_structs[parent_rank]; + + ready_flag = ready_flag + pow_2_levels; + + while(!IS_SG_DATA_READY(parent_ctl_pointer, ready_flag, sequence_number)) { + + opal_progress(); + + } + + + mca_bcol_basesmuma_portals_get_msg_fragment_no_eq_h(cs, &my_ctl_pointer->portals_buf_addr, + &parent_ctl_pointer->portals_buf_addr, 0, + 0, fragment_size); + + my_ctl_pointer->flags[BCAST_FLAG] = ready_flag; + + goto Release; + } + +Scatter: + + /* compute the list of possible sources */ + src_list = (int *) malloc(sizeof(int) * (pow_2_levels + 1)); + for( i = 0; i < pow_2_levels; i++) { + src_list[i] = my_rank ^ (1< pow_2 */ + if ((my_rank + pow_2) < group_size) { + src_list[i] = my_rank + pow_2; + } else { + src_list[i] = -1; + } + +Probe: + + /* If I am not the root, then poll on possible "senders'" control structs */ + /* For portals we block for now */ + while (!matched) { + /* Shared memory iprobe */ + SG_LARGE_MSG_NB_PROBE(src_list, pow_2_levels + 1, + src_list_index, matched, src, ctl_structs, + parent_ctl_pointer, ready_flag, sequence_number); + } + + BASESMUMA_VERBOSE(1,("Scatter : Im non-root match received")); + /* If I am a secondary root */ + if ((matched) && (src == pow_2 + my_rank)) { + volatile mca_bcol_basesmuma_ctl_struct_t *extra_src_ctl_pointer = NULL; + + secondary_root = true; + BASESMUMA_VERBOSE(10,("I am the secondary root for the data")); + my_ctl_pointer->offset = 0; + my_ctl_pointer->n_sends = pow_2_levels; + my_ctl_pointer->length = fragment_size; + + extra_src_ctl_pointer = ctl_structs[src]; + + /* create an event queue for the incoming buffer */ + rc = PtlEQAlloc(((mca_bcol_basesmuma_portal_proc_info_t*) + cs->portals_info)->ni_h, MAX_PORTAL_EVENTS_IN_Q, + PTL_EQ_HANDLER_NONE, &allgather_eq_h); + + if (rc != PTL_OK) { + BASESMUMA_VERBOSE(10,( "PtlEQAlloc() failed: %d \n",rc)); + goto Release; + } + + mca_bcol_basesmuma_portals_get_msg_fragment_no_eq_h(cs, + &my_ctl_pointer->portals_buf_addr, + &extra_src_ctl_pointer->portals_buf_addr, 0, + 0, fragment_size); + + + extra_src_posts = 0; + scatter_posts = my_ctl_pointer->n_sends; + allgather_posts = pow_2_levels - 1; + + 
total_msg_posts = scatter_posts + allgather_posts + extra_src_posts ; + + mca_bcol_basesmuma_portals_post_msg(cs, + &my_ctl_pointer->portals_buf_addr, + my_userbuf, fragment_size, allgather_eq_h, + total_msg_posts, + blocked_post, + PTL_MD_EVENT_START_DISABLE| PTL_MD_EVENT_END_DISABLE | PTL_MD_OP_GET + | PTL_MD_MANAGE_REMOTE | PTL_MD_TRUNCATE | PTL_MD_EVENT_AUTO_UNLINK_ENABLE); + msg_posted = true ; + + /* important that these be set before my children + * see the ready flag raised + */ + MB(); + my_ctl_pointer->flags[BCAST_FLAG] = ready_flag; + + wait_for_peers_nb(my_rank, my_ctl_pointer->n_sends, ctl_structs, + ready_flag, sequence_number); + goto Allgather; + } + + /* we need to see whether this is really + * who we are looking for + */ + for( i = 0; i < parent_ctl_pointer->n_sends; i++) { + uint64_t local_offset = 0; + uint64_t remote_offset = 0; + + BASESMUMA_VERBOSE(1,("%d found it from %d \n",my_rank,src)); + + if( my_rank == (src^(1<n_sends = i; + + /* Am I source for other process during scatter phase */ + if ( i > 0) { + + /* compute the size of the chunk to copy */ + length = (parent_ctl_pointer->length)/ + (1<<(parent_ctl_pointer->n_sends - my_ctl_pointer->n_sends)); + my_ctl_pointer->length = length; + my_ctl_pointer->offset = + parent_ctl_pointer->offset + length; + + + local_offset = my_ctl_pointer->offset; + remote_offset = parent_ctl_pointer->offset + length; + + mca_bcol_basesmuma_portals_get_msg_fragment_no_eq_h(cs, &my_ctl_pointer->portals_buf_addr, + &parent_ctl_pointer->portals_buf_addr,local_offset, + remote_offset, length); + rc = PtlEQAlloc(((mca_bcol_basesmuma_portal_proc_info_t*) + cs->portals_info)->ni_h, MAX_PORTAL_EVENTS_IN_Q, + PTL_EQ_HANDLER_NONE, + &allgather_eq_h); + + if (rc != PTL_OK) { + BASESMUMA_VERBOSE(10,( "PtlEQAlloc() failed: %d \n",rc)); + goto Release; + } + + /* Now post the message for other children to read */ + extra_src_posts = (my_rank + pow_2 < group_size ) ? 
1: 0; + scatter_posts = my_ctl_pointer->n_sends; + allgather_posts = pow_2_levels - 1; + + total_msg_posts = scatter_posts + allgather_posts + extra_src_posts ; + + + mca_bcol_basesmuma_portals_post_msg(cs, &my_ctl_pointer->portals_buf_addr, + my_userbuf, my_ctl_pointer->portals_buf_addr.userbuf_length, + allgather_eq_h, + total_msg_posts, + blocked_post, + PTL_MD_EVENT_START_DISABLE| PTL_MD_EVENT_END_DISABLE + | PTL_MD_OP_GET | PTL_MD_MANAGE_REMOTE | PTL_MD_TRUNCATE | PTL_MD_EVENT_AUTO_UNLINK_ENABLE + ); + msg_posted = true; + /* set the memory barrier to ensure completion */ + MB(); + /* signal that I am done */ + my_ctl_pointer->flags[BCAST_FLAG] = ready_flag; + + wait_for_peers_nb(my_rank, my_ctl_pointer->n_sends, ctl_structs, + ready_flag, sequence_number); + + } else { + /* takes care of first level recurssive double */ + length = parent_ctl_pointer->length/ + (1<<(parent_ctl_pointer->n_sends - 1)); + my_ctl_pointer->length = length; + my_ctl_pointer->offset = parent_ctl_pointer->offset; + + local_offset = my_ctl_pointer->offset; + remote_offset = my_ctl_pointer->offset; + + + while(!IS_SG_DATA_READY(parent_ctl_pointer, ready_flag, sequence_number)) { + opal_progress(); + } + + mca_bcol_basesmuma_portals_get_msg_fragment_no_eq_h(cs, + &my_ctl_pointer->portals_buf_addr, + &parent_ctl_pointer->portals_buf_addr, local_offset, + remote_offset, length); + + /* signal that I am done reading data from parent */ + MB(); + my_ctl_pointer->flags[BCAST_FLAG] = ready_flag; + } + + /* time for allgather phase */ + input_args->status = ALLGATHER; + + BASESMUMA_VERBOSE(5,("Completed %d found it from %d \n",my_rank,src)); + + while(ready_flag > parent_ctl_pointer->flags[BCAST_FLAG]); + + goto Allgather; + } + } + + { + /* this is not who we are looking for, + * mark as false positive so we don't + * poll here again + */ + src_list[src_list_index] = -1; + matched = 0; + goto Probe; + } + +Allgather: + + BASESMUMA_VERBOSE(5,(" %d Completed Scatter %d times \n", my_rank, completed_scatter)); + + /* zip it back up - we have already taken care of first level */ + global_sg_offset = my_ctl_pointer->offset; + + /* first level of zip up */ + length = 2 * fragment_size/pow_2; + + + if (!msg_posted) { + rc = PtlEQAlloc(((mca_bcol_basesmuma_portal_proc_info_t*) + cs->portals_info)->ni_h, MAX_PORTAL_EVENTS_IN_Q, + PTL_EQ_HANDLER_NONE, &allgather_eq_h); + + /* Posting for all phases of recursive doubling */ + extra_src_posts = (my_rank + pow_2 < group_size ) ? 
1: 0; + allgather_posts = pow_2_levels - 1; + total_msg_posts = allgather_posts + extra_src_posts ; + + + mca_bcol_basesmuma_portals_post_msg(cs, &my_ctl_pointer->portals_buf_addr, + my_userbuf, my_ctl_pointer->portals_buf_addr.userbuf_length, + allgather_eq_h, total_msg_posts , blocked_post, + PTL_MD_EVENT_START_DISABLE| PTL_MD_EVENT_END_DISABLE + | PTL_MD_OP_GET | PTL_MD_MANAGE_REMOTE | PTL_MD_TRUNCATE | PTL_MD_EVENT_AUTO_UNLINK_ENABLE + ); + msg_posted = true; + } + + ready_flag++; + MB(); + my_ctl_pointer->flags[BCAST_FLAG] = ready_flag; + + for( i = 1; i < pow_2_levels; i++) { + /* get my partner for this level */ + partner = my_rank^(1<flags[BCAST_FLAG] >= ready_flag); + + if (partner_ctl_pointer->offset < my_ctl_pointer->offset) { + global_sg_offset -= length; + local_sg_offset = global_sg_offset; + } else { + local_sg_offset = global_sg_offset + length; + } + + + BASESMUMA_VERBOSE(10,("Allgather Phase: Get message from process %d, length %d", partner, length)); + mca_bcol_basesmuma_portals_get_msg_fragment_no_eq_h(cs, + &my_ctl_pointer->portals_buf_addr, + &partner_ctl_pointer->portals_buf_addr,local_sg_offset, + local_sg_offset, length); + + ready_flag++; + MB(); + my_ctl_pointer->flags[BCAST_FLAG] = ready_flag; + + /* Block until partner is at this level of recursive-doubling stage */ + while(!IS_SG_DATA_READY(partner_ctl_pointer, ready_flag, sequence_number)) { + opal_progress(); + } + + /* double the length */ + length *= 2; + } + + + /* If I am source for non-power 2 children wait for them */ + /* If I am secondary root then my partner would be real root + * so no need for exchange of data with the extra partner */ + extra_partner = my_rank + pow_2 ; + if ((extra_partner < group_size) && (!secondary_root)) { + volatile mca_bcol_basesmuma_ctl_struct_t *extra_partner_ctl_pointer = NULL; + + extra_partner_ctl_pointer = ctl_structs[extra_partner]; + /* Block until extra partner has copied data */ + while(!IS_SG_DATA_READY(extra_partner_ctl_pointer, ready_flag, sequence_number)) { + opal_progress(); + } + + } + +Release: + + /* free the event queue */ + rc = PtlEQFree(allgather_eq_h); + if (rc != PTL_OK) { + BASESMUMA_VERBOSE(10,("PtlEQFree() failed: %d )\n",rc)); + } + + my_ctl_pointer->starting_flag_value++; + input_args->status = FINISHED; + + return BCOL_FN_COMPLETE; + +} + + +/* + * static sg_state_t *sg_state = NULL; + */ + +int bcol_basesmuma_lmsg_scatter_allgather_portals_nb_bcast(bcol_function_args_t *input_args, + coll_ml_function_t *c_input_args) +{ + int i; + mca_bcol_basesmuma_portal_proc_info_t *portals_info; + int dummy_group_size; + int rc = OMPI_SUCCESS; + int buff_idx; + int count=input_args->count; + size_t pack_len = 0, dt_size =0 ; + struct ompi_datatype_t* dtype=input_args->dtype; + int completed_posts = 0; + sg_state_t *sg_state = NULL; + mca_bcol_basesmuma_module_t *bcol_module = NULL; + int extra_src_posts = -1,allgather_posts = -1, total_msg_posts = -1; + + bcol_module = (mca_bcol_basesmuma_module_t *) c_input_args->bcol_module; + /* + sg_state = (sg_state_t*)bcol_module->sg_state; + */ + sg_state = (sg_state_t*)&(bcol_module->sg_state); + /* Re-entering the algorithm */ + switch (sg_state->phase) { + case PROBE: + if (input_args->root_flag) { + /* I became a root for this group */ + sg_state->phase = START; + goto Start; + } + goto Probe; + break; + + case SCATTER_ROOT_WAIT: + goto Scatter_root_wait; + + case SCATTER_EXTRA_ROOT_WAIT: + goto Scatter_extra_root_wait; + + case SCATTER_PARENT_WAIT: + goto Scatter_parent_wait; + + default: + break; + } + + 
sg_state->phase = INIT; + + BASESMUMA_VERBOSE(1,("Im entering portals_nb_bcast Unknown root ")); + /* Allocate space for algorithm state */ + /* + sg_state = (sg_state_t *) malloc(sizeof(sg_state_t)); + bcol_module->sg_state = (void *)sg_state; + + assert(NULL != sg_state); + */ + + sg_state->secondary_root = false; + sg_state->msg_posted = false; + sg_state->matched = 0; + sg_state->phase = SCATTER; + /* Copy input args to local variables */ + sg_state->my_userbuf = (void*)((unsigned char*)input_args->userbuf); + assert(sg_state->my_userbuf != NULL); + sg_state->sequence_number=input_args->sequence_num; + sg_state->cs = &mca_bcol_basesmuma_component; + sg_state->bcol_module = (mca_bcol_basesmuma_module_t *) c_input_args->bcol_module; + /* Should this be buffer index (ML) or control buffer index ? */ + buff_idx = input_args->src_desc->buffer_index; + + /* Initialize SM group info used for control signaling */ + init_sm_group_info(sg_state, buff_idx); + + /* calculate the largest power of two that is smaller than + * or equal to the group size + */ + sg_state->pow_2_levels = pow_sm_k(2, sg_state->group_size, &(dummy_group_size)); + if( sg_state->group_size < (1 << sg_state->pow_2_levels)) { + sg_state->pow_2_levels--; + } + /* power-of-two group size */ + sg_state->pow_2 = 1 << sg_state->pow_2_levels; + + + /* we will work only on packed data - so compute the length*/ + ompi_datatype_type_size(dtype, &dt_size); + sg_state->fragment_size = count*dt_size; + + + /* Init portals scatter allgather info */ + rc = init_sm_portals_sg_info(sg_state); + + if (rc != OMPI_SUCCESS) { + goto Release; + } + +Start : +Extra : + /* + * My rank > pow2 groupsize + */ + if( sg_state->my_rank >= sg_state->pow_2 ) { + + if (input_args->root_flag){ + + rc = sm_portals_extra_root_scatter(sg_state); + if (rc != OMPI_SUCCESS) { + goto Release; + } + + } else { + /* + * Wait for my partner to receive bcast data, and copy from it + */ + int extra_parent_rank; + volatile mca_bcol_basesmuma_ctl_struct_t *extra_parent_ctl_pointer = NULL; /* binomial fanout */ + extra_parent_rank = sg_state->my_rank & (sg_state->pow_2-1); + extra_parent_ctl_pointer = sg_state->ctl_structs[extra_parent_rank]; + + sg_state->ready_flag = sg_state->ready_flag + sg_state->pow_2_levels; + + while(!IS_SG_DATA_READY(extra_parent_ctl_pointer, sg_state->ready_flag, + sg_state->sequence_number)) { + opal_progress(); + + } + + mca_bcol_basesmuma_portals_get_msg_fragment(sg_state->cs, + sg_state->read_eq, + &sg_state->my_ctl_pointer->portals_buf_addr, + &extra_parent_ctl_pointer->portals_buf_addr, 0, + 0, sg_state->fragment_size); + + sg_state->my_ctl_pointer->flag = sg_state->ready_flag; + } + + goto Release; + } + + if (input_args->root_flag) { + + BASESMUMA_VERBOSE(1,("Scatter : Im root (bcol_module %x,ctl_pointer %x) my ready flag %d \n", + sg_state->bcol_module, sg_state->my_ctl_pointer, sg_state->ready_flag)); + rc = sm_portals_root_scatter(sg_state); + + /* gvm Fix: Redudant + MB(); + */ + + sg_state->my_ctl_pointer->flag = sg_state->ready_flag; + + if (rc != OMPI_SUCCESS) { + goto Release; + } + +Scatter_root_wait: + + BASESMUMA_VERBOSE(5,("Scatter: Im root waiting for children to complete my flag %d", + sg_state->my_ctl_pointer->flag)); + + for( i = 0; i < sg_state->cs->num_to_probe && completed_posts < sg_state->my_ctl_pointer->n_sends; + i++) { + + completed_posts = wait_for_post_complete_nb(sg_state->my_rank, + sg_state->my_ctl_pointer->n_sends, sg_state->ctl_structs, + sg_state->ready_flag, sg_state->sequence_number); + + } + + if 
(completed_posts < sg_state->my_ctl_pointer->n_sends) { + sg_state->phase = SCATTER_ROOT_WAIT; + return BCOL_FN_STARTED; + } + + goto Allgather; + } + + +Scatter: + + BASESMUMA_VERBOSE(1,("Scatter : Im non-root probing for data ")); + /* compute the list of possible sources */ + /* + sg_state->src_list = (int *) malloc(sizeof(int) * (sg_state->pow_2_levels + 1)); + */ + assert(MAX_SM_GROUP_SIZE > sg_state->pow_2_levels+1); + + for( i = 0; i < sg_state->pow_2_levels; i++) { + sg_state->src_list[i] = sg_state->my_rank ^ (1< pow_2 */ + + if ((sg_state->my_rank + sg_state->pow_2) < sg_state->group_size) { + sg_state->src_list[i] = sg_state->my_rank + sg_state->pow_2; + } else { + sg_state->src_list[i] = -1; + } + + + BASESMUMA_VERBOSE(1,("Scatter : Ready flag %d Im non-root probing for %d procs %d:%d \n", + sg_state->ready_flag,sg_state->pow_2_levels,sg_state->src_list[0],sg_state->src_list[1])); +Probe: + /* If I am not the root, then poll on possible "senders'" control structs */ + /* For portals we block for now */ + /* Shared memory iprobe */ + + + /* + SG_LARGE_MSG_NB_PROBE(sg_state->src_list, sg_state->pow_2_levels + 1, + sg_state->src_list_index, sg_state->matched, sg_state->src, + sg_state->ctl_structs, + sg_state->parent_ctl_pointer, sg_state->ready_flag, sg_state->sequence_number); + */ + + for( i = 0; i < sg_state->cs->num_to_probe && 0 == sg_state->matched; + i++) { + sg_large_msg_probe(sg_state); + } + + if (!sg_state->matched) { + sg_state->phase = PROBE; + return BCOL_FN_STARTED; + } + + BASESMUMA_VERBOSE(1,("Scatter : Im non-root match received")); + /* If I am a secondary root */ + if ((sg_state->matched) && (sg_state->src == sg_state->pow_2 + sg_state->my_rank)) { + + BASESMUMA_VERBOSE(5,("Scatter : Im secondary root \n")); + + rc = sm_portals_secondary_root_scatter(sg_state); + if (rc != OMPI_SUCCESS) { + goto Release; + } + +Scatter_extra_root_wait: + + for( i = 0; i < sg_state->cs->num_to_probe && completed_posts < sg_state->my_ctl_pointer->n_sends; + i++) { + + completed_posts = wait_for_post_complete_nb(sg_state->my_rank, sg_state->my_ctl_pointer->n_sends, + sg_state->ctl_structs, sg_state->ready_flag, sg_state->sequence_number); + + } + + if (completed_posts < sg_state->my_ctl_pointer->n_sends) { + sg_state->phase = SCATTER_EXTRA_ROOT_WAIT; + return BCOL_FN_STARTED; + } + + goto Allgather; + } + + /* we need to see whether this is really + * who we are looking for + */ + for( i = 0; i < sg_state->parent_ctl_pointer->n_sends; i++) { + uint64_t local_offset = 0; + uint64_t remote_offset = 0; + + BASESMUMA_VERBOSE(5,("%d found it from %d \n",sg_state->my_rank,sg_state->src)); + + if( sg_state->my_rank == (sg_state->src^(1<parent_ctl_pointer = sg_state->ctl_structs[sg_state->src]; + + /* we found our root within the group ... 
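+ * i.e. this sender's level-i partner is me.  With i > 0 I act as an
+ * internal node of the tree and re-post the data for my own children;
+ * with i == 0 I am a leaf and only pull the first-level block from the
+ * parent.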
*/ + BASESMUMA_VERBOSE(5,("Shared memory probe was matched, the root is %d ",sg_state->src)); + + sg_state->my_ctl_pointer->n_sends = i; + + /* Am I source for other process during scatter phase */ + if ( i > 0) { + BASESMUMA_VERBOSE(1,("Scatter : Im Internal node \n")); + + rc = sm_portals_internode_scatter(sg_state); + + if (rc != OMPI_SUCCESS) { + goto Release; + } + +Scatter_parent_wait: + + for( i = 0; i < sg_state->cs->num_to_probe && completed_posts < sg_state->my_ctl_pointer->n_sends; + i++) { + + completed_posts = wait_for_post_complete_nb(sg_state->my_rank, + sg_state->my_ctl_pointer->n_sends, + sg_state->ctl_structs, + sg_state->ready_flag, sg_state->sequence_number); + } + + if (completed_posts < sg_state->my_ctl_pointer->n_sends) { + sg_state->phase = SCATTER_PARENT_WAIT; + return BCOL_FN_STARTED; + } + + } else { + + BASESMUMA_VERBOSE(1,("Scatter : Im leaf node \n")); + + /* takes care of first level recurssive double */ + sg_state->length = sg_state->parent_ctl_pointer->length/ + (1<<(sg_state->parent_ctl_pointer->n_sends - 1)); + sg_state->my_ctl_pointer->length = sg_state->length; + sg_state->my_ctl_pointer->offset = sg_state->parent_ctl_pointer->offset; + + + while(!IS_SG_DATA_READY(sg_state->parent_ctl_pointer, + sg_state->ready_flag, sg_state->sequence_number)) { + opal_progress(); + } + + mca_bcol_basesmuma_portals_get_msg_fragment(sg_state->cs, + sg_state->read_eq, + &sg_state->my_ctl_pointer->portals_buf_addr, + &sg_state->parent_ctl_pointer->portals_buf_addr, + sg_state->my_ctl_pointer->offset, + sg_state->my_ctl_pointer->offset, sg_state->length); + + /* signal that I am done reading data from parent */ + /* + MB(); + */ + sg_state->my_ctl_pointer->flag = sg_state->ready_flag; + } + + BASESMUMA_VERBOSE(1,("Completed %d found it from %d \n", + sg_state->my_rank, sg_state->src)); + + while(sg_state->ready_flag > sg_state->parent_ctl_pointer->flag); + + goto Allgather; + } + } + + { + /* this is not who we are looking for, + * mark as false positive so we don't + * poll here again + */ + sg_state->src_list[sg_state->src_list_index] = -1; + sg_state->matched = 0; + goto Probe; + } + +Allgather: + + BASESMUMA_VERBOSE(5,("Completed Scatter phase")); + + /* zip it back up - we have already taken care of first level */ + sg_state->global_sg_offset = sg_state->my_ctl_pointer->offset; + + /* first level of zip up */ + sg_state->length = 2 * sg_state->fragment_size/sg_state->pow_2; + + + /* Posting for all phases of recursive doubling */ + extra_src_posts = (sg_state->my_rank + sg_state->pow_2 < sg_state->group_size ) ? 
1: 0; + allgather_posts = sg_state->pow_2_levels - 1; + total_msg_posts = allgather_posts + extra_src_posts ; + + if ((!sg_state->msg_posted) && (total_msg_posts > 0)){ + + mca_bcol_basesmuma_portals_post_msg(sg_state->cs, &sg_state->my_ctl_pointer->portals_buf_addr, + sg_state->my_userbuf, sg_state->my_ctl_pointer->portals_buf_addr.userbuf_length, + PTL_EQ_NONE, total_msg_posts, blocked_post, + PTL_MD_EVENT_START_DISABLE| PTL_MD_EVENT_END_DISABLE + | PTL_MD_OP_GET | PTL_MD_MANAGE_REMOTE | PTL_MD_TRUNCATE | PTL_MD_EVENT_AUTO_UNLINK_ENABLE + ); + sg_state->msg_posted = true; + } + + BASESMUMA_VERBOSE(5,("Done with allgather phase")); + /* I reached an allgather phase */ + sg_state->ready_flag++; + MB(); + sg_state->my_ctl_pointer->flag = sg_state->ready_flag; + + rc = sm_portals_bcasts_allgather_phase(sg_state); + + if (rc != OMPI_SUCCESS) { + BASESMUMA_VERBOSE(10,("Error in Bcast's allgather phase ")); + goto Release; + } + + /* If I am source for non-power 2 children wait for them */ + /* If I am secondary root then my partner would be real root + * so no need for exchange of data with the extra partner */ + sg_state->extra_partner = sg_state->my_rank + sg_state->pow_2 ; + if ((sg_state->extra_partner < sg_state->group_size) && (!sg_state->secondary_root)) { + + sg_state->extra_partner_ctl_pointer = sg_state->ctl_structs[sg_state->extra_partner]; + /* Block until extra partner has copied data */ + while(!IS_SG_DATA_READY(sg_state->extra_partner_ctl_pointer, + sg_state->ready_flag, sg_state->sequence_number)) { + opal_progress(); + } + + } + +Release: + + BASESMUMA_VERBOSE(1,("Im done ")); + + sg_state->my_ctl_pointer->starting_flag_value++; + sg_state->phase = FINISHED; + + + return BCOL_FN_COMPLETE; + +} + + +int bcol_basesmuma_lmsg_scatter_allgather_portals_nb_knownroot_bcast(bcol_function_args_t *input_args, + coll_ml_function_t *c_input_args) +{ + + int i; + mca_bcol_basesmuma_portal_proc_info_t *portals_info; + int dummy_group_size; + int rc = OMPI_SUCCESS; + int buff_idx; + int count=input_args->count; + size_t pack_len = 0, dt_size =0 ; + struct ompi_datatype_t* dtype=input_args->dtype; + int completed_posts = 0; + sg_state_t *sg_state = NULL; + mca_bcol_basesmuma_module_t *bcol_module=NULL; + int extra_src_posts = -1,allgather_posts = -1, total_msg_posts = -1; + bcol_module = (mca_bcol_basesmuma_module_t *) c_input_args->bcol_module; + + sg_state = (sg_state_t*)(&bcol_module->sg_state); + + BASESMUMA_VERBOSE(1,("Im entering nb_knownroot_bcast bcol = %x ", + c_input_args->bcol_module)); + /* Re-entering the algorithm */ + switch (sg_state->phase) { + case PROBE: + if (input_args->root_flag) { + /* I became a root for this group */ + sg_state->phase = START; + goto Start; + } + goto Probe; + break; + + case SCATTER_ROOT_WAIT: + goto Scatter_root_wait; + + case SCATTER_EXTRA_ROOT_WAIT: + goto Scatter_extra_root_wait; + + case SCATTER_PARENT_WAIT: + goto Scatter_parent_wait; + + default: + break; + } + + /* Allocate space for algorithm state */ + /* + sg_state = (sg_state_t *) malloc(sizeof(sg_state_t)); + bcol_module->sg_state = (void*) sg_state; + */ + + /* Make sure there userbuffer is not null */ + + sg_state->phase = INIT; + sg_state->secondary_root = false; + sg_state->msg_posted = false; + sg_state->matched = 0; + /* Copy input args to local variables */ + sg_state->my_userbuf = (void*)((unsigned char*)input_args->userbuf); + assert(sg_state->my_userbuf != NULL); + sg_state->sequence_number=input_args->sequence_num; + sg_state->cs = &mca_bcol_basesmuma_component; + 
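/* State initialization (see the lines that follow): init_sm_group_info() resolves this rank's control structures for the given buffer index, pow_sm_k() gives the largest power of two not exceeding the group size (pow_2 = 1 << pow_2_levels), fragment_size is the packed length count * dt_size, and init_sm_portals_sg_info() publishes this rank's portals buffer address before the algorithm starts. */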
sg_state->bcol_module = bcol_module; + buff_idx = input_args->src_desc->buffer_index; + + /* Initialize SM group info used for control signaling */ + init_sm_group_info(sg_state, buff_idx); + + /* calculate the largest power of two that is smaller than + * or equal to the group size + */ + sg_state->pow_2_levels = pow_sm_k(2, sg_state->group_size, &(dummy_group_size)); + if( sg_state->group_size < (1 << sg_state->pow_2_levels)) { + sg_state->pow_2_levels--; + } + /* power-of-two group size */ + sg_state->pow_2 = 1 << sg_state->pow_2_levels; + + + /* we will work only on packed data - so compute the length*/ + ompi_datatype_type_size(dtype, &dt_size); + sg_state->fragment_size = count*dt_size; + + + /* Init portals scatter allgather info */ + rc = init_sm_portals_sg_info(sg_state); + + if (rc != OMPI_SUCCESS) { + goto Release; + } +Start: +Extra : + /* + * My rank > pow2 groupsize + */ + if( sg_state->my_rank >= sg_state->pow_2 ) { + + if (input_args->root_flag){ + + rc = sm_portals_extra_root_scatter(sg_state); + if (rc != OMPI_SUCCESS) { + goto Release; + } + + } else { + /* + * Wait for my partner to receive bcast data, and copy from it + */ + int extra_parent_rank; + volatile mca_bcol_basesmuma_ctl_struct_t *extra_parent_ctl_pointer = NULL; /* binomial fanout */ + extra_parent_rank = sg_state->my_rank & (sg_state->pow_2-1); + extra_parent_ctl_pointer = sg_state->ctl_structs[extra_parent_rank]; + + sg_state->ready_flag = sg_state->ready_flag + sg_state->pow_2_levels; + + while(!IS_SG_DATA_READY(extra_parent_ctl_pointer, sg_state->ready_flag, + sg_state->sequence_number)) { + opal_progress(); + + } + + mca_bcol_basesmuma_portals_get_msg_fragment(sg_state->cs, + sg_state->read_eq, + &sg_state->my_ctl_pointer->portals_buf_addr, + &extra_parent_ctl_pointer->portals_buf_addr, 0, + 0, sg_state->fragment_size); + + sg_state->my_ctl_pointer->flag = sg_state->ready_flag; + } + + goto Release; + } + + if (input_args->root_flag) { + + BASESMUMA_VERBOSE(1,("Scatter : Im root (bcol_module %x,ctl_pointer %x) my ready flag %d \n", + bcol_module, sg_state->my_ctl_pointer, sg_state->ready_flag)); + rc = sm_portals_root_scatter(sg_state); + + sg_state->my_ctl_pointer->flag = sg_state->ready_flag; + + if (rc != OMPI_SUCCESS) { + goto Release; + } + +Scatter_root_wait: + + BASESMUMA_VERBOSE(5,("Scatter: Im root waiting for children to complete my flag %d", + sg_state->my_ctl_pointer->flag)); + for( i = 0; i < sg_state->cs->num_to_probe && completed_posts < sg_state->my_ctl_pointer->n_sends; + i++) { + completed_posts = wait_for_post_complete_nb(sg_state->my_rank, + sg_state->my_ctl_pointer->n_sends, sg_state->ctl_structs, + sg_state->ready_flag, sg_state->sequence_number); + } + + if (completed_posts < sg_state->my_ctl_pointer->n_sends) { + sg_state->phase = SCATTER_ROOT_WAIT; + return BCOL_FN_STARTED; + } + + goto Allgather; + } + + +Probe: + + sg_state->src = compute_src_from_root(input_args->root_route->rank, sg_state->my_rank, + sg_state->pow_2, sg_state->group_size); + + sg_state->parent_ctl_pointer = sg_state->ctl_structs[sg_state->src]; + + while(!IS_SG_DATA_READY(sg_state->parent_ctl_pointer, sg_state->ready_flag, + sg_state->sequence_number)) { + opal_progress(); + + } + sg_state->matched = true; + + /* If I am a secondary root */ + if ((sg_state->matched) && (sg_state->src == sg_state->pow_2 + sg_state->my_rank)) { + + rc = sm_portals_secondary_root_scatter(sg_state); + if (rc != OMPI_SUCCESS) { + goto Release; + } +Scatter_extra_root_wait: + + for( i = 0; i < sg_state->cs->num_to_probe && 
completed_posts < sg_state->my_ctl_pointer->n_sends; + i++) { + + completed_posts = wait_for_post_complete_nb(sg_state->my_rank, sg_state->my_ctl_pointer->n_sends, + sg_state->ctl_structs, sg_state->ready_flag, sg_state->sequence_number); + + } + + if (completed_posts < sg_state->my_ctl_pointer->n_sends) { + sg_state->phase = SCATTER_EXTRA_ROOT_WAIT; + return BCOL_FN_STARTED; + } + + goto Allgather; + } + + /* we need to see whether this is really + * who we are looking for + */ + for( i = 0; i < sg_state->parent_ctl_pointer->n_sends; i++) { + uint64_t local_offset = 0; + uint64_t remote_offset = 0; + + BASESMUMA_VERBOSE(5,("%d found it from %d \n",sg_state->my_rank,sg_state->src)); + + if( sg_state->my_rank == (sg_state->src^(1<<i))) { + sg_state->parent_ctl_pointer = sg_state->ctl_structs[sg_state->src]; + + /* we found our root within the group ... */ + BASESMUMA_VERBOSE(10,("Shared memory probe was matched, the root is %d ",sg_state->src)); + + sg_state->my_ctl_pointer->n_sends = i; + + /* Am I source for other process during scatter phase */ + if ( i > 0) { + + rc = sm_portals_internode_scatter(sg_state); + + if (rc != OMPI_SUCCESS) { + goto Release; + } +Scatter_parent_wait: + + for( i = 0; i < sg_state->cs->num_to_probe && completed_posts < sg_state->my_ctl_pointer->n_sends; + i++) { + + completed_posts = wait_for_post_complete_nb(sg_state->my_rank, + sg_state->my_ctl_pointer->n_sends, + sg_state->ctl_structs, + sg_state->ready_flag, sg_state->sequence_number); + } + + if (completed_posts < sg_state->my_ctl_pointer->n_sends) { + sg_state->phase = SCATTER_PARENT_WAIT; + return BCOL_FN_STARTED; + } + + } else { + + /* takes care of first level recursive double */ + sg_state->length = sg_state->parent_ctl_pointer->length/ + (1<<(sg_state->parent_ctl_pointer->n_sends - 1)); + sg_state->my_ctl_pointer->length = sg_state->length; + sg_state->my_ctl_pointer->offset = sg_state->parent_ctl_pointer->offset; + + + while(!IS_SG_DATA_READY(sg_state->parent_ctl_pointer, + sg_state->ready_flag, sg_state->sequence_number)) { + opal_progress(); + } + + mca_bcol_basesmuma_portals_get_msg_fragment(sg_state->cs, + sg_state->read_eq, + &sg_state->my_ctl_pointer->portals_buf_addr, + &sg_state->parent_ctl_pointer->portals_buf_addr, + sg_state->my_ctl_pointer->offset, + sg_state->my_ctl_pointer->offset, sg_state->length); + + /* signal that I am done reading data from parent */ + sg_state->my_ctl_pointer->flag = sg_state->ready_flag; + } + + BASESMUMA_VERBOSE(5,("Completed %d found it from %d \n", + sg_state->my_rank, sg_state->src)); + + while(sg_state->ready_flag > sg_state->parent_ctl_pointer->flag); + + goto Allgather; + } + } + + { + /* this is not who we are looking for, + * mark as false positive so we don't + * poll here again + */ + sg_state->src_list[sg_state->src_list_index] = -1; + sg_state->matched = 0; + goto Probe; + } + +Allgather: + + /* zip it back up - we have already taken care of first level */ + sg_state->global_sg_offset = sg_state->my_ctl_pointer->offset; + + /* first level of zip up */ + sg_state->length = 2 * sg_state->fragment_size/sg_state->pow_2; + + /* Posting for all phases of recursive doubling */ + extra_src_posts = (sg_state->my_rank + sg_state->pow_2 < sg_state->group_size ) ? 
1: 0; + allgather_posts = sg_state->pow_2_levels - 1; + total_msg_posts = allgather_posts + extra_src_posts ; + + if ((!sg_state->msg_posted) && (total_msg_posts > 0)){ + + mca_bcol_basesmuma_portals_post_msg(sg_state->cs, &sg_state->my_ctl_pointer->portals_buf_addr, + sg_state->my_userbuf, sg_state->my_ctl_pointer->portals_buf_addr.userbuf_length, + PTL_EQ_NONE, total_msg_posts, blocked_post, + PTL_MD_EVENT_START_DISABLE| PTL_MD_EVENT_END_DISABLE + | PTL_MD_OP_GET | PTL_MD_MANAGE_REMOTE | PTL_MD_TRUNCATE | PTL_MD_EVENT_AUTO_UNLINK_ENABLE + ); + sg_state->msg_posted = true; + } + + sg_state->ready_flag++; + MB(); + sg_state->my_ctl_pointer->flag = sg_state->ready_flag; + + rc = sm_portals_bcasts_allgather_phase(sg_state); + + if (rc != OMPI_SUCCESS) { + BASESMUMA_VERBOSE(10,("Error in Bcast's allgather phase ")); + goto Release; + } + + /* If I am source for non-power 2 children wait for them */ + /* If I am secondary root then my partner would be real root + * so no need for exchange of data with the extra partner */ + sg_state->extra_partner = sg_state->my_rank + sg_state->pow_2 ; + if ((sg_state->extra_partner < sg_state->group_size) && (!sg_state->secondary_root)) { + + sg_state->extra_partner_ctl_pointer = sg_state->ctl_structs[sg_state->extra_partner]; + /* Block until extra partner has copied data */ + while(!IS_SG_DATA_READY(sg_state->extra_partner_ctl_pointer, + sg_state->ready_flag, sg_state->sequence_number)) { + opal_progress(); + } + + } + +Release: + + BASESMUMA_VERBOSE(1,("Im done ")); + + sg_state->my_ctl_pointer->starting_flag_value++; + sg_state->phase = FINISHED; + + return BCOL_FN_COMPLETE; + +} +#endif /* __PORTALS_AVAIL__ */ diff --git a/ompi/mca/bcol/basesmuma/bcol_basesmuma_lmsg_bcast.h b/ompi/mca/bcol/basesmuma/bcol_basesmuma_lmsg_bcast.h new file mode 100644 index 0000000000..f929f2f7bf --- /dev/null +++ b/ompi/mca/bcol/basesmuma/bcol_basesmuma_lmsg_bcast.h @@ -0,0 +1,626 @@ +#ifdef __PORTALS_AVAIL__ +#define __PORTALS_ENABLE__ + +#include + +#include "ompi_config.h" +#include "ompi/constants.h" +#include "ompi/datatype/ompi_datatype.h" +#include "ompi/communicator/communicator.h" + +#include "bcol_basesmuma_utils.h" +#include "bcol_basesmuma_portals.h" +#include "bcol_basesmuma.h" + +#if 0 +struct scatter_allgather_nb_bcast_state_t +{ + /* local variables */ + uint64_t length; + int my_rank, src, matched; + int *src_list; + int group_size; + int64_t ready_flag; + int pow_2, pow_2_levels; + int src_list_index; + uint64_t fragment_size; /* user buffer size */ + + /* Input argument variables */ + void *my_userbuf; + int64_t sequence_number; + + /* Extra source variables */ + bool secondary_root; + int partner , extra_partner; + + /* Scatter Allgather offsets */ + uint64_t local_sg_offset , global_sg_offset , partner_offset ; + + /* Portals messaging relevant variables */ + ptl_handle_eq_t allgather_eq_h; + ptl_handle_eq_t read_eq; + ptl_event_t allgather_event; + bool msg_posted; + + /* OMPI module and component variables */ + mca_bcol_basesmuma_component_t *cs; + mca_bcol_basesmuma_module_t *bcol_module; + + /* Control structure and payload variables */ + volatile mca_bcol_basesmuma_ctl_struct_t **ctl_structs; + volatile mca_bcol_basesmuma_ctl_struct_t *my_ctl_pointer; + volatile mca_bcol_basesmuma_ctl_struct_t *parent_ctl_pointer; /* scatter source */ + volatile mca_bcol_basesmuma_ctl_struct_t *extra_partner_ctl_pointer; /* scatter source */ + + int phase; +}; + +typedef struct scatter_allgather_nb_bcast_state_t sg_state_t; +#endif + +bool blocked_post = false; + 
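The non-blocking entry points above all follow the same re-entry contract: when a wait cannot complete, the routine records its position in sg_state->phase and returns BCOL_FN_STARTED so the progress engine can call it again and resume at the matching label. A minimal self-contained sketch of that pattern is shown below; every name in it (nb_state, poll_one, FN_STARTED) is illustrative only and not part of this patch.

#include <stdbool.h>

enum nb_phase { NB_START, NB_WAIT_CHILDREN, NB_DONE };
enum nb_ret   { FN_STARTED, FN_COMPLETE };

struct nb_state {
    enum nb_phase phase;
    int posts_done;
    int posts_needed;
};

/* Called repeatedly from a progress loop; resumes where it left off. */
static enum nb_ret nb_progress(struct nb_state *s, bool (*poll_one)(void))
{
    if (s->phase == NB_WAIT_CHILDREN) {
        goto wait_children;                /* resume mid-algorithm */
    }

    /* ... work that precedes the wait would go here ... */

wait_children:
    if (poll_one()) {
        s->posts_done++;
    }
    if (s->posts_done < s->posts_needed) {
        s->phase = NB_WAIT_CHILDREN;       /* remember the resume point */
        return FN_STARTED;                 /* hand control back to the caller */
    }
    s->phase = NB_DONE;
    return FN_COMPLETE;
}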
+#define IS_SG_DATA_READY(peer, my_flag, my_sequence_number) \ + (((peer)->sequence_number == (my_sequence_number) && \ + (peer)->flags[BCAST_FLAG] >= (my_flag) \ + )? true : false ) + + + +#define SG_LARGE_MSG_PROBE(src_list, n_src, src_list_index, matched, \ + src, data_buffs, data_src_ctl_pointer, \ + data_src_lmsg_ctl_pointer, ready_flag, \ + sequence_number) \ +do { \ + int j; \ + for( j = 0; j < n_src; j++) { \ + if(src_list[j] != -1) { \ + data_src_ctl_pointer = data_buffs[src_list[j]].ctl_struct; \ + data_src_lmsg_ctl_pointer = (mca_bcol_basesmuma_portal_buf_addr_t*) \ + data_buffs[src_list[j]].payload; \ + if( IS_SG_DATA_READY(data_src_ctl_pointer,ready_flag,sequence_number)) { \ + src = src_list[j]; \ + matched = 1; \ + src_list_index = j; \ + break; \ + } \ + } \ + } \ +} while(0) + +#define SG_LARGE_MSG_NB_PROBE(src_list, n_src, src_list_index, matched, \ + src, ctl_structs, data_src_ctl_pointer, \ + ready_flag, sequence_number) \ +do { \ + int j; \ + for( j = 0; j < n_src; j++) { \ + if(src_list[j] != -1) { \ + data_src_ctl_pointer = ctl_structs[src_list[j]]; \ + if( IS_SG_DATA_READY(data_src_ctl_pointer,ready_flag,sequence_number)) { \ + src = src_list[j]; \ + matched = 1; \ + src_list_index = j; \ + break; \ + } \ + } \ + } \ +} while(0) + + + + + +static inline __opal_attribute_always_inline__ +int wait_for_peers(int my_rank, int npeers, volatile mca_bcol_basesmuma_payload_t *data_buffs, + int flag_value, int sn) +{ + int *peers_list = NULL; + int counter = 0, diter = 0; + volatile mca_bcol_basesmuma_header_t *peer_ctl_pointer = NULL; + + peers_list = (int *)malloc(sizeof(int) * npeers); + + for (diter = 0; diter < npeers; diter++ ){ + peers_list[diter] = my_rank ^ (1<pow_2_levels+1; + + + for( j = 0; j < n_src; j++) { + if(sg_state->src_list[j] != -1) { + sg_state->parent_ctl_pointer = sg_state->ctl_structs[sg_state->src_list[j]]; + + BASESMUMA_VERBOSE(5,("Parent %d ctl pointer (parent=%x, my ctl=%x) flag %d", + sg_state->src_list[j],sg_state->parent_ctl_pointer, + sg_state->my_ctl_pointer, + sg_state->parent_ctl_pointer->flag)); + + if (IS_SG_DATA_READY(sg_state->parent_ctl_pointer, + sg_state->ready_flag, sg_state->sequence_number)) { + sg_state->src = sg_state->src_list[j]; + sg_state->matched = 1; + sg_state->src_list_index = j; + break; + } + } + } + + return 0; +} +/* + * I will post message for all the my children + */ +static inline __opal_attribute_always_inline__ +int sm_portals_root_scatter(sg_state_t *sg_state) +{ + int extra_src_posts = -1, scatter_posts = -1, allgather_posts = -1, + total_msg_posts = -1; + + BASESMUMA_VERBOSE(10,("I am the root of the data")); + sg_state->my_ctl_pointer->offset = 0; + sg_state->my_ctl_pointer->n_sends = sg_state->pow_2_levels; + sg_state->my_ctl_pointer->length = sg_state->fragment_size; + + + + extra_src_posts = (sg_state->my_rank + sg_state->pow_2 < sg_state->group_size ) ? 
1: 0; + scatter_posts = sg_state->my_ctl_pointer->n_sends; + allgather_posts = sg_state->pow_2_levels - 1; + + total_msg_posts = scatter_posts + allgather_posts + extra_src_posts ; + + if ( total_msg_posts <= 0) { + BASESMUMA_VERBOSE(10,("No need to post the data ")); + return OMPI_SUCCESS; + } + + mca_bcol_basesmuma_portals_post_msg(sg_state->cs, + &sg_state->my_ctl_pointer->portals_buf_addr, + sg_state->my_userbuf, sg_state->fragment_size, + PTL_EQ_NONE, + total_msg_posts, + blocked_post, + PTL_MD_EVENT_START_DISABLE| PTL_MD_EVENT_END_DISABLE | + PTL_MD_OP_GET | PTL_MD_MANAGE_REMOTE | PTL_MD_TRUNCATE | PTL_MD_EVENT_AUTO_UNLINK_ENABLE); + + /* + mca_bcol_basesmuma_portals_post_msg(sg_state->cs, + &sg_state->my_ctl_pointer->portals_buf_addr, + sg_state->my_userbuf, sg_state->fragment_size, + sg_state->allgather_eq_h, + total_msg_posts, + blocked_post, + PTL_MD_EVENT_START_DISABLE| PTL_MD_EVENT_END_DISABLE | + PTL_MD_OP_GET | PTL_MD_MANAGE_REMOTE | PTL_MD_TRUNCATE | PTL_MD_EVENT_AUTO_UNLINK_ENABLE); + */ + + sg_state->msg_posted = true ; + + /* + MB(); + */ + sg_state->my_ctl_pointer->flag = sg_state->ready_flag; + + return OMPI_SUCCESS; +} + +/* + * Im root but my rank > pow2_groupsize, so will copy to partner who + * will act as root (secondary) + */ +static inline __opal_attribute_always_inline__ +int sm_portals_extra_root_scatter(sg_state_t *sg_state) +{ + int scatter_partner = -1; + volatile mca_bcol_basesmuma_ctl_struct_t *scatter_partner_ctl_pointer = NULL; + + int total_msg_posts = 1; + + if ( total_msg_posts <= 0) { + BASESMUMA_VERBOSE(10,("No need to post the data ")); + } + else { + mca_bcol_basesmuma_portals_post_msg(sg_state->cs, + &sg_state->my_ctl_pointer->portals_buf_addr, + sg_state->my_userbuf, sg_state->fragment_size, + PTL_EQ_NONE, + total_msg_posts, + blocked_post, + PTL_MD_EVENT_START_DISABLE| PTL_MD_EVENT_END_DISABLE | PTL_MD_OP_GET + | PTL_MD_MANAGE_REMOTE | PTL_MD_TRUNCATE | PTL_MD_EVENT_AUTO_UNLINK_ENABLE); + sg_state->msg_posted = true ; + + } + + MB(); + sg_state->my_ctl_pointer->flag = sg_state->ready_flag; + + + + scatter_partner = sg_state->my_rank - sg_state->pow_2; + scatter_partner_ctl_pointer = + sg_state->ctl_structs[scatter_partner]; + + while(!IS_SG_DATA_READY(scatter_partner_ctl_pointer, sg_state->ready_flag, + sg_state->sequence_number)){ + opal_progress(); + } + + return OMPI_SUCCESS; +} + +/* + * Gets msg from the partner (> pow2_groupsize) and posts the + * message acting as root + */ +static inline __opal_attribute_always_inline__ +int sm_portals_secondary_root_scatter(sg_state_t *sg_state) +{ + + volatile mca_bcol_basesmuma_ctl_struct_t *extra_src_ctl_pointer = NULL; + int scatter_posts, allgather_posts, extra_src_posts, total_msg_posts; + + sg_state->secondary_root = true; + BASESMUMA_VERBOSE(10,("I am the secondary root for the data")); + sg_state->my_ctl_pointer->offset = 0; + sg_state->my_ctl_pointer->n_sends = sg_state->pow_2_levels; + sg_state->my_ctl_pointer->length = sg_state->fragment_size; + + extra_src_ctl_pointer = sg_state->ctl_structs[sg_state->src]; + + mca_bcol_basesmuma_portals_get_msg_fragment(sg_state->cs, + sg_state->read_eq, + &sg_state->my_ctl_pointer->portals_buf_addr, + &extra_src_ctl_pointer->portals_buf_addr, 0, + 0, sg_state->fragment_size); + + + extra_src_posts = 0; + scatter_posts = sg_state->my_ctl_pointer->n_sends; + allgather_posts = sg_state->pow_2_levels - 1; + + total_msg_posts = scatter_posts + allgather_posts + extra_src_posts ; + + if (total_msg_posts > 0) { + 
mca_bcol_basesmuma_portals_post_msg(sg_state->cs, + &sg_state->my_ctl_pointer->portals_buf_addr, + sg_state->my_userbuf, sg_state->fragment_size, + PTL_EQ_NONE, + total_msg_posts, + blocked_post, + PTL_MD_EVENT_START_DISABLE| PTL_MD_EVENT_END_DISABLE | PTL_MD_OP_GET + | PTL_MD_MANAGE_REMOTE | PTL_MD_TRUNCATE | PTL_MD_EVENT_AUTO_UNLINK_ENABLE); + sg_state->msg_posted = true ; + } + MB(); + sg_state->my_ctl_pointer->flag = sg_state->ready_flag; + + return OMPI_SUCCESS; +} + +/* + * Internode Scatter: Get data from my parent and post for my children + */ + +static inline __opal_attribute_always_inline__ +int sm_portals_internode_scatter(sg_state_t *sg_state) +{ + + int scatter_posts, allgather_posts, extra_src_posts, + total_msg_posts; + uint64_t local_offset, remote_offset; + + /* compute the size of the chunk to copy */ + sg_state->length = (sg_state->parent_ctl_pointer->length)/ + (1<<(sg_state->parent_ctl_pointer->n_sends - sg_state->my_ctl_pointer->n_sends)); + sg_state->my_ctl_pointer->length = sg_state->length; + sg_state->my_ctl_pointer->offset = + sg_state->parent_ctl_pointer->offset + sg_state->length; + + + local_offset = sg_state->my_ctl_pointer->offset; + remote_offset = sg_state->parent_ctl_pointer->offset + + sg_state->length; + + mca_bcol_basesmuma_portals_get_msg_fragment(sg_state->cs, + sg_state->read_eq, + &sg_state->my_ctl_pointer->portals_buf_addr, + &sg_state->parent_ctl_pointer->portals_buf_addr,local_offset, + remote_offset,sg_state->length); + + /* Now post the message for other children to read */ + extra_src_posts = (sg_state->my_rank + sg_state->pow_2 < + sg_state->group_size ) ? 1: 0; + scatter_posts = sg_state->my_ctl_pointer->n_sends; + allgather_posts = sg_state->pow_2_levels - 1; + + total_msg_posts = scatter_posts + allgather_posts + extra_src_posts ; + + if (total_msg_posts > 0) { + mca_bcol_basesmuma_portals_post_msg(sg_state->cs, &sg_state->my_ctl_pointer->portals_buf_addr, + sg_state->my_userbuf, sg_state->my_ctl_pointer->portals_buf_addr.userbuf_length, + PTL_EQ_NONE, + total_msg_posts, + blocked_post, + PTL_MD_EVENT_START_DISABLE| PTL_MD_EVENT_END_DISABLE + | PTL_MD_OP_GET | PTL_MD_MANAGE_REMOTE | PTL_MD_TRUNCATE | PTL_MD_EVENT_AUTO_UNLINK_ENABLE); + + sg_state->msg_posted = true; + } + /* + MB(); + */ + sg_state->my_ctl_pointer->flag = sg_state->ready_flag; + + return OMPI_SUCCESS; +} + +/* + * Bcast's Allgather Phase: + * Combines data from all processes using recursive doubling algorithm + */ +static inline __opal_attribute_always_inline__ +int sm_portals_bcasts_allgather_phase(sg_state_t *sg_state) +{ + int ag_loop, partner; + volatile mca_bcol_basesmuma_ctl_struct_t *partner_ctl_pointer = NULL; /* recursive double */ + + + for( ag_loop = 1; ag_loop < sg_state->pow_2_levels; ag_loop++) { + /* get my partner for this level */ + partner = sg_state->my_rank^(1<ctl_structs[partner]; + + + /* Block until partner is at this level of recursive-doubling stage */ + while(!IS_SG_DATA_READY(partner_ctl_pointer, sg_state->ready_flag, + sg_state->sequence_number)) { + opal_progress(); + } + assert(partner_ctl_pointer->flag >= sg_state->ready_flag); + + if (partner_ctl_pointer->offset < sg_state->my_ctl_pointer->offset) { + sg_state->global_sg_offset -= sg_state->length; + sg_state->local_sg_offset = sg_state->global_sg_offset; + } else { + sg_state->local_sg_offset = sg_state->global_sg_offset + sg_state->length; + } + + + BASESMUMA_VERBOSE(10,("Allgather Phase: Get message from process %d, length %d", + partner, sg_state->length)); + 
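/* Worked example, assuming pow_2 = 4 (pow_2_levels = 2) and fragment_size = 4096 bytes: each rank enters this loop already holding length = 2 * 4096 / 4 = 2048 bytes from the scatter; the single exchange at ag_loop = 1 pulls the partner's 2048 bytes (extending downward when the partner's offset is lower, otherwise reading upward from global_sg_offset + length), and the trailing length *= 2 leaves the full 4096-byte fragment assembled. */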
mca_bcol_basesmuma_portals_get_msg_fragment(sg_state->cs, + sg_state->read_eq, + &sg_state->my_ctl_pointer->portals_buf_addr, + &partner_ctl_pointer->portals_buf_addr,sg_state->local_sg_offset, + sg_state->local_sg_offset, sg_state->length); + + sg_state->ready_flag++; + MB(); + sg_state->my_ctl_pointer->flag = sg_state->ready_flag; + + /* Block until partner is at this level of recursive-doubling stage */ + while(!IS_SG_DATA_READY(partner_ctl_pointer, sg_state->ready_flag, + sg_state->sequence_number)) { + opal_progress(); + } + + /* double the length */ + sg_state->length *= 2; + } + + return OMPI_SUCCESS; + +} + + +static inline __opal_attribute_always_inline__ +int init_sm_group_info(sg_state_t *sg_state, int buff_idx) +{ + int idx, leading_dim; + int first_instance=0; + int flag_offset; + + /* Get addresing information */ + sg_state->group_size = sg_state->bcol_module->colls_no_user_data.size_of_group; + leading_dim = sg_state->bcol_module->colls_no_user_data.size_of_group; + idx=SM_ARRAY_INDEX(leading_dim,buff_idx,0); + + BASESMUMA_VERBOSE(1,("My buffer idx %d group size %d, leading dim %d, idx %d", + buff_idx,sg_state->group_size,leading_dim,idx)); + /* grab the ctl buffs */ + sg_state->ctl_structs = (volatile mca_bcol_basesmuma_ctl_struct_t **) + sg_state->bcol_module->colls_with_user_data.ctl_buffs+idx; + + sg_state->my_rank = sg_state->bcol_module->super.sbgp_partner_module->my_index; + sg_state->my_ctl_pointer = sg_state->ctl_structs[sg_state->my_rank]; + + if (sg_state->my_ctl_pointer->sequence_number < sg_state->sequence_number) { + first_instance = 1; + } + + if(first_instance) { + sg_state->my_ctl_pointer->flag = -1; + sg_state->my_ctl_pointer->index = 1; + + sg_state->my_ctl_pointer->starting_flag_value = 0; + flag_offset = 0; + + } else { + sg_state->my_ctl_pointer->index++; + } + + /* For bcast we shud have only entry to this bcol + assert(sg_state->my_ctl_pointer->flag == -1); + */ + + /* increment the starting flag by one and return */ + flag_offset = sg_state->my_ctl_pointer->starting_flag_value; + sg_state->ready_flag = flag_offset + sg_state->sequence_number + 1; + + sg_state->my_ctl_pointer->sequence_number = sg_state->sequence_number; + + return OMPI_SUCCESS; + +} + +static inline __opal_attribute_always_inline__ +int init_sm_portals_sg_info(sg_state_t *sg_state) +{ +/* Get portals info*/ + mca_bcol_basesmuma_portal_proc_info_t *portals_info; + int rc = OMPI_SUCCESS; + int sg_matchbits; + + portals_info = (mca_bcol_basesmuma_portal_proc_info_t*)sg_state->cs->portals_info; + + sg_matchbits = sg_state->sequence_number ; + + /* Construct my portal buffer address and copy to payload buffer */ + mca_bcol_basesmuma_construct_portal_address(&sg_state->my_ctl_pointer->portals_buf_addr, + portals_info->portal_id.nid, + portals_info->portal_id.pid, + sg_matchbits, + sg_state->bcol_module->super.sbgp_partner_module->group_comm->c_contextid); + + sg_state->my_ctl_pointer->portals_buf_addr.userbuf = sg_state->my_userbuf; + sg_state->my_ctl_pointer->portals_buf_addr.userbuf_length = sg_state->fragment_size; + + return OMPI_SUCCESS; +} + +static inline __opal_attribute_always_inline__ +int compute_src_from_root(int group_root, int my_group_rank, int pow2, int + group_size) +{ + + int root, relative_rank, src, i; + + if (group_root < pow2) { + root = group_root; + } else { + /* the source of the data is extra node, + the real root it represented by some rank from + pow2 group */ + root = group_root - pow2; + /* shortcut for the case when my rank is root for the group */ + if 
(my_group_rank == root) { + return group_root; + } + } + + relative_rank = (my_group_rank - root) < 0 ? my_group_rank - root + pow2 : + my_group_rank - root; + + for (i = 1; i < pow2; i<<=1) { + if (relative_rank & i) { + src = my_group_rank ^ i; + if (src >= pow2) + src -= pow2; + + return src; + } + } + + return -1; +} + +int bcol_basesmuma_lmsg_scatter_allgather_portals_bcast(bcol_function_args_t *input_args, + coll_ml_function_t *c_input_args); + +int bcol_basesmuma_lmsg_scatter_allgather_portals_nb_bcast(bcol_function_args_t *input_args, + coll_ml_function_t *c_input_args); + +int bcol_basesmuma_lmsg_scatter_allgather_portals_nb_knownroot_bcast(bcol_function_args_t *input_args, + coll_ml_function_t *c_input_args); + +#endif diff --git a/ompi/mca/bcol/basesmuma/bcol_basesmuma_lmsg_knomial_bcast.c b/ompi/mca/bcol/basesmuma/bcol_basesmuma_lmsg_knomial_bcast.c new file mode 100644 index 0000000000..af9315bc39 --- /dev/null +++ b/ompi/mca/bcol/basesmuma/bcol_basesmuma_lmsg_knomial_bcast.c @@ -0,0 +1,450 @@ +/* + * Copyright (c) 2009-2012 Oak Ridge National Laboratory. All rights reserved. + * Copyright (c) 2009-2012 Mellanox Technologies. All rights reserved. + * $COPYRIGHT$ + * + * Additional copyrights may follow + * + * $HEADER$ + */ + +#include "ompi_config.h" + +/* #define __PORTALS_AVAIL__ */ +#ifdef __PORTALS_AVAIL__ + +#define __PORTALS_ENABLE__ +#include "ompi/mca/bcol/basesmuma/bcol_basesmuma.h" +#include "ompi/constants.h" +#include "ompi/datatype/ompi_datatype.h" +#include "ompi/communicator/communicator.h" +#include "bcol_basesmuma_utils.h" + +#include "bcol_basesmuma_portals.h" + +/* debug */ +#include +/* end debug */ + + +/** + * Shared memory non-blocking Broadcast - K-nomial fan-out for small data buffers. + * This routine assumes that buf (the input buffer) is a single writer + * multi reader (SWMR) shared memory buffer owned by the calling rank + * which is the only rank that can write to this buffers. + * It is also assumed that the buffers are registered and fragmented + * at the ML level and that buf is sufficiently large to hold the data. + * + * + * @param buf - SWMR shared buffer within a sbgp that the + * executing rank can write to. + * @param count - the number of elements in the shared buffer. + * @param dtype - the datatype of a shared buffer element. + * @param root - the index within the sbgp of the root. + * @param module - basesmuma module. 
+ */ +int bcol_basesmuma_lmsg_bcast_k_nomial_anyroot(bcol_function_args_t *input_args, + coll_ml_function_t *c_input_args) +{ +#if 0 + /* local variables */ + mca_bcol_basesmuma_module_t* bcol_module= + (mca_bcol_basesmuma_module_t *)c_input_args->bcol_module; + mca_bcol_basesmuma_component_t *cs = &mca_bcol_basesmuma_component; + int i, matched = 0; + int src=-1; + int group_size; + int my_rank, first_instance=0, flag_offset; + int rc = OMPI_SUCCESS; + int leading_dim, buff_idx, idx; + int count=input_args->count; + struct ompi_datatype_t* dtype=input_args->dtype; + int64_t sequence_number=input_args->sequence_num; + + volatile int64_t ready_flag; + volatile mca_bcol_basesmuma_payload_t *data_buffs; + volatile char* parent_data_pointer; + volatile mca_bcol_basesmuma_header_t *parent_ctl_pointer; + volatile mca_bcol_basesmuma_header_t *my_ctl_pointer; + void *userbuf = (void *)((unsigned char *)input_args->userbuf); + + size_t pack_len = 0, dt_size; + + struct mca_bcol_basesmuma_portal_buf_addr_t *my_lmsg_ctl_pointer = NULL; + struct mca_bcol_basesmuma_portal_buf_addr_t *parent_lmsg_ctl_pointer = NULL; + mca_bcol_basesmuma_portal_proc_info_t *portals_info; + portals_info = (mca_bcol_basesmuma_portal_proc_info_t*)cs->portals_info; + + /* we will work only on packed data - so compute the length*/ + ompi_datatype_type_size(dtype, &dt_size); + pack_len=count*dt_size; + buff_idx = input_args->src_desc->buffer_index; + + /* Get addressing information */ + my_rank = bcol_module->super.sbgp_partner_module->my_index; + group_size = bcol_module->colls_no_user_data.size_of_group; + leading_dim=bcol_module->colls_no_user_data.size_of_group; + idx=SM_ARRAY_INDEX(leading_dim,buff_idx,0); + + data_buffs=(volatile mca_bcol_basesmuma_payload_t *) + bcol_module->colls_with_user_data.data_buffs+idx; + + /* Set pointer to current proc ctrl region */ + my_ctl_pointer = data_buffs[my_rank].ctl_struct; + my_lmsg_ctl_pointer = (mca_bcol_basesmuma_portal_buf_addr_t*) data_buffs[my_rank].payload; + + /* setup resource recycling */ + if( my_ctl_pointer->sequence_number < sequence_number ) { + first_instance=1; + } + + if( first_instance ) { + /* Signal arrival */ + my_ctl_pointer->flag = -1; + my_ctl_pointer->index=1; + /* this does not need to use any flag values , so only need to + * set the value for subsequent values that may need this */ + my_ctl_pointer->starting_flag_value=0; + flag_offset=0; + + } else { + /* only one thread at a time will be making progress on this + * collective, so no need to make this atomic */ + my_ctl_pointer->index++; + } + + + /* increment the starting flag by one and return */ + flag_offset = my_ctl_pointer->starting_flag_value; + ready_flag = flag_offset + sequence_number + 1; + my_ctl_pointer->sequence_number = sequence_number; + + + /* Construct my portal buffer address and copy to payload buffer */ + mca_bcol_basesmuma_construct_portal_address(my_lmsg_ctl_pointer, + portals_info->portal_id.nid, + portals_info->portal_id.pid, + sequence_number, + bcol_module->super.sbgp_partner_module->group_comm->c_contextid); + + /* non-blocking broadcast algorithm */ + + /* If I am the root, then signal ready flag */ + if(input_args->root_flag) { + ptl_handle_eq_t eq_h; + ptl_event_t event; + int ret; + + BASESMUMA_VERBOSE(10,("I am the root of the data")); + + /* create an event queue for the incoming buffer */ + ret = PtlEQAlloc(((mca_bcol_basesmuma_portal_proc_info_t*) + cs->portals_info)->ni_h, MAX_PORTAL_EVENTS_IN_Q, PTL_EQ_HANDLER_NONE, &eq_h); + + if (ret != PTL_OK) { + fprintf(stderr, 
"PtlEQAlloc() failed: %d \n",ret); + return OMPI_ERR_OUT_OF_RESOURCE; + } + + /* Post the message using portal copy */ + + mca_bcol_basesmuma_portals_post_msg_nb_nopers(cs, my_lmsg_ctl_pointer, userbuf, + pack_len, eq_h, my_lmsg_ctl_pointer->nsends); + + /* + * signal ready flag + */ + my_ctl_pointer->flag = ready_flag; + + /* wait for a response from the client */ + mca_bcol_basesmuma_portals_wait_event_nopers(eq_h, POST_MSG_EVENT, + &event, my_lmsg_ctl_pointer->nsends); + + /* free the event queue */ + ret = PtlEQFree(eq_h); + if (ret != PTL_OK) { + fprintf(stderr, "PtlEQFree() failed: %d )\n",ret); + } + + /* root is finished */ + goto Release; + } + + /* If I am not the root, then poll on possible "senders'" control structs */ + for( i = 0; i < cs->num_to_probe && 0 == matched; i++) { + + /* Shared memory iprobe */ + /* + BCOL_BASESMUMA_SM_PROBE(bcol_module->src, bcol_module->src_size, + my_rank, matched, src); + */ + do { + int j, n_src, my_index; + n_src = bcol_module->src_size; + + for( j = 0; j < n_src; j++) { + parent_ctl_pointer = data_buffs[bcol_module->src[j]].ctl_struct; + parent_lmsg_ctl_pointer = (mca_bcol_basesmuma_portal_buf_addr_t *) + data_buffs[bcol_module->src[j]].payload; + if (IS_DATA_READY(parent_ctl_pointer,ready_flag,sequence_number)) { + + src = bcol_module->src[j]; + matched = 1; + break; + } + } + } while(0); + + } + + /* If not matched, then hop out and put me on progress list */ + if(0 == matched ) { + BASESMUMA_VERBOSE(10,("Shared memory probe didn't find a match")); + return BCOL_FN_NOT_STARTED; + } + + /* else, we found our root within the group ... */ + BASESMUMA_VERBOSE(10,("Shared memory probe was matched, the root is %d", src)); + + /* receive the data from sender */ + /* get the data buff */ + /* taken care of in the macro */ + /*parent_data_pointer = data_buffs[src].payload;*/ + /* copy the data */ + mca_bcol_basesmuma_portals_get_msg(cs, parent_lmsg_ctl_pointer, userbuf, pack_len); + + /* set the memory barrier to ensure completion */ + MB(); + /* signal that I am done */ + my_ctl_pointer->flag = ready_flag; + + /* am I the last one? 
If so, release buffer */ + +Release: + my_ctl_pointer->starting_flag_value++; + + return BCOL_FN_COMPLETE; +#endif +} + +#if 0 + +#define BASESMUMA_K_NOMIAL_SEND_SIGNAL(radix_mask, radix, my_relative_index, \ + my_group_index, group_size,sm_data_buffs,sender_ready_flag, \ + num_pending_sends) \ +{ \ + int k, rc; \ + int dst; \ + int comm_dst; \ + volatile mca_bcol_basesmuma_header_t *recv_ctl_pointer = NULL; \ + volatile mca_bcol_basesmuma_portal_buf_addr_t *recv_lmsg_ctl_pointer = NULL; \ + \ + num_pending_sends = 0; \ + while(radix_mask > 0) { \ + /* For each level of tree, do sends */ \ + for (k = 1; \ + k < radix && my_relative_index + radix_mask * k < group_size; \ + ++k) { \ + \ + dst = my_group_index + radix_mask * k; \ + if (dst >= group_size) { \ + dst -= group_size; \ + } \ + /* Signal the children to get data */ \ + recv_ctl_pointer = data_buffs[dst].ctl; \ + recv_lmsg_ctl_pointer = (mca_bcol_basesmuma_portal_buf_addr_t *) \ + data_buffs[dst].payload; \ + recv_lmsg_ctl_pointer->src_index = my_group_index; \ + recv_lmsg_ctl_pointer->flag = sender_ready_flag; \ + ++num_pending_sends; \ + } \ + radix_mask /= radix; \ + } \ + \ +} + + + +int bcol_basesmuma_lmsg_bcast_k_nomial_anyroot(bcol_function_args_t *input_args, + coll_ml_function_t *c_input_args) +{ + /* local variables */ + mca_bcol_basesmuma_module_t* bcol_module= + (mca_bcol_basesmuma_module_t *)c_input_args->bcol_module; + mca_bcol_basesmuma_component_t *cs = &mca_bcol_basesmuma_component; + int i, matched = 0; + int src=-1; + int group_size; + int my_rank, first_instance=0, flag_offset; + int rc = OMPI_SUCCESS; + int leading_dim, buff_idx, idx; + int count=input_args->count; + struct ompi_datatype_t* dtype=input_args->dtype; + int64_t sequence_number=input_args->sequence_num; + + volatile int64_t ready_flag; + volatile mca_bcol_basesmuma_payload_t *data_buffs; + volatile char* parent_data_pointer; + volatile mca_bcol_basesmuma_header_t *parent_ctl_pointer; + volatile mca_bcol_basesmuma_header_t *my_ctl_pointer; + void *userbuf = (void *)((unsigned char *)input_args->userbuf); + + size_t pack_len = 0, dt_size; + + struct mca_bcol_basesmuma_portal_buf_addr_t *my_lmsg_ctl_pointer = NULL; + struct mca_bcol_basesmuma_portal_buf_addr_t *parent_lmsg_ctl_pointer = NULL; + mca_bcol_basesmuma_portal_proc_info_t *portals_info; + portals_info = (mca_bcol_basesmuma_portal_proc_info_t*)cs->portals_info; + + /* we will work only on packed data - so compute the length*/ + ompi_datatype_type_size(dtype, &dt_size); + pack_len=count*dt_size; + buff_idx = input_args->src_desc->buffer_index; + + /* Get addressing information */ + my_rank = bcol_module->super.sbgp_partner_module->my_index; + group_size = bcol_module->colls_no_user_data.size_of_group; + leading_dim=bcol_module->colls_no_user_data.size_of_group; + idx=SM_ARRAY_INDEX(leading_dim,buff_idx,0); + + data_buffs=(volatile mca_bcol_basesmuma_payload_t *) + bcol_module->colls_with_user_data.data_buffs+idx; + + /* Set pointer to current proc ctrl region */ + my_ctl_pointer = data_buffs[my_rank].ctl_struct; + my_lmsg_ctl_pointer = (mca_bcol_basesmuma_portal_buf_addr_t*) data_buffs[my_rank].payload; + + /* setup resource recycling */ + if( my_ctl_pointer->sequence_number < sequence_number ) { + first_instance=1; + } + + if( first_instance ) { + /* Signal arrival */ + my_ctl_pointer->flag = -1; + my_ctl_pointer->index=1; + /* this does not need to use any flag values , so only need to + * set the value for subsequent values that may need this */ + my_ctl_pointer->starting_flag_value=0; + 
flag_offset=0; + + } else { + /* only one thread at a time will be making progress on this + * collective, so no need to make this atomic */ + my_ctl_pointer->index++; + } + + + /* increment the starting flag by one and return */ + flag_offset = my_ctl_pointer->starting_flag_value; + ready_flag = flag_offset + sequence_number + 1; + my_ctl_pointer->sequence_number = sequence_number; + + + /* Construct my portal buffer address and copy to payload buffer */ + mca_bcol_basesmuma_construct_portal_address(my_lmsg_ctl_pointer, + portals_info->portal_id.nid, + portals_info->portal_id.pid, + sequence_number, + bcol_module->super.sbgp_partner_module->group_comm->c_contextid); + + my_lmsg_ctl_pointer->userbuf = userbuff; + my_lsmg_ctl_pointer->userbuf_length = fragment_length; + /* create an event queue */ + ret = PtlEQAlloc(((mca_bcol_basesmuma_portal_proc_info_t*) + cs->portals_info)->ni_h, MAX_PORTAL_EVENTS_IN_Q, PTL_EQ_HANDLER_NONE, &eq_h); + + /* non-blocking broadcast algorithm */ + + /* If I am the root, then signal ready flag */ + if(input_args->root_flag) { + ptl_handle_eq_t eq_h; + ptl_event_t event; + int ret; + int root_radix_mask = sm_module->pow_knum; + + BASESMUMA_VERBOSE(10,("I am the root of the data")); + + + if (ret != PTL_OK) { + fprintf(stderr, "PtlEQAlloc() failed: %d \n",ret); + return OMPI_ERR_OUT_OF_RESOURCE; + } + + BASESMUMA_K_NOMIAL_SEND_SIGNAL(root_radix_mask, radix, 0, + my_rank, group_size, data_buffs, ready_flag, nsends) ; + + mca_bcol_basesmuma_portals_post_msg_nb_nopers(cs, my_lmsg_ctl_pointer, userbuf, + pack_len, eq_h, nsends); + + /* wait for a response from the client */ + mca_bcol_basesmuma_portals_wait_event_nopers(eq_h, POST_MSG_EVENT, + &event, nsends); + + /* root is finished */ + goto Release; + } + + /* Im not a root so wait until someone puts data and + * compute where to get data from */ + + while (my_ctl_pointer->flag != ready_flag) ; + + my_data_source_index = lmsg_ctl_pointer->src_index; + + parent_lmsg_ctl_pointer = (mca_bcol_basesmuma_portal_buf_addr_t *) + data_buffs[my_data_source_index].payload; + + mca_bcol_basesmuma_portals_get_msg(cs, parent_lmsg_ctl_pointer, userbuf, pack_len); + + + + + /* I am done getting data, should I send the data to someone */ + + my_relative_index = (my_rank - my_data_source_index) < 0 ? my_rank - + my_data_source_index + group_size : my_rank - my_data_source_index; + + /* + * 2. 
Locate myself in the tree: + * calculate number of radix steps that we should to take + */ + radix_mask = 1; + while (radix_mask < group_size) { + if (0 != my_relative_index % (radix * radix_mask)) { + /* I found my level in tree */ + break; + } + radix_mask *= radix; + } + + /* go one step back */ + radix_mask /=radix; + + BASESMUMA_K_NOMIAL_SEND_SIGNAL(radix_mask, radix, my_relative_index, + my_rank, group_size,data_buffs,ready_flag,nsends) + + mca_bcol_basesmuma_portals_post_msg_nb_nopers(cs, my_lmsg_ctl_pointer, userbuf, + pack_len, eq_h, nsends); + + /* wait for childrens to read */ + mca_bcol_basesmuma_portals_wait_event_nopers(eq_h, POST_MSG_EVENT, + &event, nsends); + + + +Release: + /* free the event queue */ + ret = PtlEQFree(eq_h); + if (ret != PTL_OK) { + fprintf(stderr, "PtlEQFree() failed: %d )\n",ret); + } + + + my_ctl_pointer->starting_flag_value++; + + return BCOL_FN_COMPLETE; +} + +#endif +#endif diff --git a/ompi/mca/bcol/basesmuma/bcol_basesmuma_module.c b/ompi/mca/bcol/basesmuma/bcol_basesmuma_module.c index b0eff310d7..fa9acef388 100644 --- a/ompi/mca/bcol/basesmuma/bcol_basesmuma_module.c +++ b/ompi/mca/bcol/basesmuma/bcol_basesmuma_module.c @@ -1,8 +1,9 @@ +/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */ /* * Copyright (c) 2009-2012 Oak Ridge National Laboratory. All rights reserved. * Copyright (c) 2009-2012 Mellanox Technologies. All rights reserved. - * Copyright (c) 2012 Los Alamos National Security, LLC. - * All rights reserved. + * Copyright (c) 2012-2013 Los Alamos National Security, LLC. All rights + * reserved. * $COPYRIGHT$ * * Additional copyrights may follow @@ -25,39 +26,473 @@ #include "opal/util/show_help.h" #include "opal/align.h" +#include "ompi/mca/bcol/basesmuma/bcol_basesmuma_reduce.h" #include "bcol_basesmuma.h" #include "bcol_basesmuma_utils.h" +#ifdef __PORTALS_AVAIL__ +#include "bcol_basesmuma_portals.h" +#endif +#if 0 -static void +/* debug */ +static mca_bcol_basesmuma_module_t *test_sm_module; + +int test_addressing(int seq_num, + mca_bcol_base_module_t *b_module) +{ + + /* local variables */ + int ret=OMPI_SUCCESS, idx, leading_dim, loop_cnt, exchange, flag_to_set, i; + int pair_rank, flag_offset; + mca_bcol_basesmuma_ctl_struct_t **ctl_structs; + netpatterns_pair_exchange_node_t *my_exchange_node; + int extra_rank, my_rank, pow_2; + mca_bcol_basesmuma_ctl_struct_t volatile *partner_ctl; + mca_bcol_basesmuma_ctl_struct_t volatile *my_ctl; + int64_t sequence_number,seq_actual,seq_exp,flag_actual,flag_exp ; + bool found; + int buff_index, first_instance=0; + mca_bcol_basesmuma_module_t *bcol_module= + (mca_bcol_basesmuma_module_t *)b_module; + + /* get the pointer to the segment of control structures */ + my_exchange_node=&(bcol_module->recursive_doubling_tree); + my_rank=bcol_module->super.sbgp_partner_module->my_index; + pow_2=bcol_module->super.sbgp_partner_module->pow_2; + + /* figure out what instance of the basesmuma bcol I am */ + leading_dim=bcol_module->colls_no_user_data.size_of_group; + sequence_number=seq_num; + buff_index=sequence_number & (bcol_module->colls_no_user_data.mask); + idx=SM_ARRAY_INDEX(leading_dim,buff_index,0); + ctl_structs=(mca_bcol_basesmuma_ctl_struct_t **) + bcol_module->colls_no_user_data.ctl_buffs+idx; + my_ctl=ctl_structs[my_rank]; + for(i=0 ; i < leading_dim ; i++ ) { + partner_ctl=ctl_structs[i]; + seq_actual=partner_ctl->sequence_number; + flag_actual=partner_ctl->flag; + seq_exp=i; + flag_exp=sequence_number; + if( seq_actual != seq_exp ) { + fprintf(stderr," Error sn expected %ld got 
%ld \n", + seq_exp,seq_actual); + fflush(stderr); + } + if( flag_actual != flag_exp ) { + fprintf(stderr," Error flag expected %ld got %ld \n", + flag_exp,flag_actual); + fflush(stderr); + } + } +} +static void test_resrouce_recycle() +{ + int i,j, grp_size, my_rank; + int idx,sum; + mca_bcol_basesmuma_component_t *cs = &mca_bcol_basesmuma_component; + int num_bufs= cs->basesmuma_num_mem_banks* + cs->basesmuma_num_regions_per_bank; + int64_t sn=100; + bcol_function_args_t fun_args; + mca_bcol_basesmuma_ctl_struct_t **ctl_structs; + + fprintf(stderr," num buffs %d \n",num_bufs); + fflush(stderr); + + sm_nbbar_desc_t *sm_desc= + &((test_sm_module->colls_no_user_data.ctl_buffs_mgmt[0].nb_barrier_desc)); + + grp_size=sm_desc->sm_module->super.sbgp_partner_module->group_size; + my_rank=sm_desc->sm_module->super.sbgp_partner_module->my_index; + + /* need to hack the sequence number offset, as this is set after this + * routine is called */ + test_sm_module->super.squence_number_offset=100; + + for( i=0 ; i < num_bufs +2 ; i++ ) { + /* get my ctl structure index in this buffer set */ + idx=SM_ARRAY_INDEX(grp_size,i,0); + ctl_structs=(mca_bcol_basesmuma_ctl_struct_t **) + test_sm_module->colls_no_user_data.ctl_buffs+idx; + ctl_structs[my_rank]->sequence_number=(int64_t)my_rank; + ctl_structs[my_rank]->flag=(int64_t)i; + } + + /* mark that I am here */ + idx=SM_ARRAY_INDEX(grp_size,(num_bufs+1),0); + ctl_structs=(mca_bcol_basesmuma_ctl_struct_t **) + test_sm_module->colls_no_user_data.ctl_buffs+idx; + ctl_structs[my_rank]->index=(int64_t)1; + + /* make sure all have arrived */ + + sum=0; + while(sum < grp_size ) { + sum=0; + idx=SM_ARRAY_INDEX(grp_size,(num_bufs+1),0); + ctl_structs=(mca_bcol_basesmuma_ctl_struct_t **) + test_sm_module->colls_no_user_data.ctl_buffs+idx; + for( i=0 ; i < grp_size ; i++ ) { + sum+=ctl_structs[i]->index; + } + + } + + /* check to see what values we see */ + for( i=0 ; i < num_bufs +2 ; i++ ) { + idx=SM_ARRAY_INDEX(grp_size,i,0); + ctl_structs=(mca_bcol_basesmuma_ctl_struct_t **) + test_sm_module->colls_no_user_data.ctl_buffs+idx; + for( j=0 ; j < grp_size ; j++ ) { + if( ctl_structs[j]->sequence_number != j ) { + fprintf(stderr," Error sn %ld expected %ld i %d j %d my %d \n", + ctl_structs[i]->sequence_number,j, i,j,my_rank); + fflush(stderr); + } + if( ctl_structs[j]->flag != i ) { + fprintf(stderr," Error sn %ld expected %ld \n", + ctl_structs[i]->flag,i); + fflush(stderr); + } + + } + } + + for( i=0 ; i < num_bufs ; i++ ) { + test_addressing((int64_t)i,test_sm_module); + } + /* + fun_args.n_of_this_type_in_collective=2; + for( i=0 ; i < 2*grp_size ; i++ ) { + + if( my_rank == (i%grp_size) ) { + sleep(3); + } + + fun_args.sequence_num=sn; + sn++; + bcol_basesmuma_fanin(&fun_args, + (mca_bcol_base_module_t *)test_sm_module); + bcol_basesmuma_fanout(&fun_args, + (mca_bcol_base_module_t *)test_sm_module); + + if( my_rank == (i%grp_size) ) { + fprintf(stderr," hi from %d \n",my_rank); + fflush(stderr); + } + + } + for( i=0 ; i < 2*grp_size ; i++ ) { + + if( my_rank == (i%grp_size) ) { + sleep(3); + } + bcol_basesmuma_rd_nb_barrier_init_admin(sm_desc); + while( sm_desc->collective_phase != NB_BARRIER_DONE ) { + bcol_basesmuma_rd_nb_barrier_progress_admin(sm_desc); + } + sm_desc->sm_module->colls_no_user_data.ctl_buffs_mgmt[0].bank_gen_counter++; + if( my_rank == (i%grp_size) ) { + fprintf(stderr," hi from %d \n",my_rank); + fflush(stderr); + } + + } + */ + + + /* + for( i = 0 ; i < num_bufs+1 ; i++ ) { + 
indx=bcol_basesmuma_get_buff_index(&(test_sm_module->colls_no_user_data), + (uint64_t)i); + fprintf(stderr," indx %d \n ",indx); + fflush(stderr); + opal_progress(); + if( -1 != indx ) { + ret=bcol_basesmuma_free_buff(&(test_sm_module->colls_no_user_data), + (uint64_t)i); + if( OMPI_SUCCESS != ret ) { + fprintf(stderr," Error when returning %d \n",i); + fflush(stderr); + } + } + } + */ + +} +/* end debug */ +#endif + +#if 0 /* this routine (with appropriate bug fixes) is correct when + * the subgroup is a single socket. However, it is not correct + * if used over several sockets - the data is correct, but dermining + * the tree radix based on that is not. + */ +/* + * Local function + * Get the maximal number of processors sharing the + * same socket within a sub-group. + * @param my_rank - the rank of the executing process. + * @param group_comm - pointer to the sbgp's communicator. + * @return - maximal number of procs sharing a socket. + */ +static int get_num_of_ranks_sharing_socket(int my_rank, struct ompi_communicator_t *group_comm) +{ + int i; + int rc; + int ret; + int proc; + int local; + bool bound; + int cnt = 0; + int socket_tmp; + int group_size; + + int32_t pcnt = 1; + int num_sockets; + opal_list_t peers; + int n_local_peers; + int num_processors; + int core_index = -1; + int my_socket_index = -1; + int* num_proc_per_socket; + + orte_namelist_t *peer; + struct ompi_proc_t** procs = NULL; + opal_paffinity_base_cpu_set_t my_cpu_set; + opal_buffer_t* sbuffer = OBJ_NEW(opal_buffer_t); + opal_buffer_t* rbuffer = OBJ_NEW(opal_buffer_t); + + + /* Set proc info */ + procs = ompi_comm_local_group_procs(group_comm); + group_size = ompi_comm_size(group_comm); + + /* Get the number of processors on this node */ + ret = opal_paffinity_base_get_processor_info(&num_processors); + if((OMPI_SUCCESS != ret) || (OMPI_SUCCESS != rc)){ + return 0; + } + + + /* Get process affinity mask */ + OPAL_PAFFINITY_CPU_ZERO(my_cpu_set); + ret = opal_paffinity_base_get(&my_cpu_set); + OPAL_PAFFINITY_PROCESS_IS_BOUND(my_cpu_set, &bound); + if(!bound) { + return 0; + }else{ + /* figure out how many sockets are in this system */ + rc = opal_paffinity_base_get_socket_info(&num_sockets); + if((OMPI_SUCCESS != ret) || (OMPI_SUCCESS != rc)){ + return 0; + } + + /* No sockets*/ + if(num_sockets == 0){ + return 0; + } + + /* All subgroup members share the same socket on the node */ + if(num_sockets == 1){ + return group_size; + } + + /* Allocate a buffer that counts the number of sockets on a node */ + num_proc_per_socket = (int*)calloc(num_sockets, sizeof(int)); + if(NULL == num_proc_per_socket){ + return 0; + } + /* Loop over processors on this node */ + /* for (proc = 0 ; proc < num_processors ; proc++) { + if (OPAL_PAFFINITY_CPU_ISSET(proc, my_cpu_set)) { + ret = opal_paffinity_base_get_map_to_socket_core(proc, + &socket_tmp, &core_index); + + if(my_socket_index != socket_tmp) { + my_socket_index = socket_tmp; + break; + } + } + }*/ + } + + + /* + * The subgroup devided according to node hierarchy, + * therefore several sockets might be shared by the groups procs. 
+ */ + + /* Construct peers list */ + n_local_peers = 0; + OBJ_CONSTRUCT(&peers, opal_list_t); + for(proc = 0; proc < group_size; proc++){ + local = OPAL_PROC_ON_LOCAL_NODE(procs[proc]->proc_flags); + if (local) { + peer = OBJ_NEW(orte_namelist_t); + peer->name.jobid = group_comm->c_local_group->grp_proc_pointers[proc]->proc_name.jobid; + peer->name.vpid = group_comm->c_local_group->grp_proc_pointers[proc]->proc_name.vpid; + opal_list_append(&peers, &peer->item); + n_local_peers++; + } + } + + + /*Pack socket index */ + ret = opal_dss.pack(sbuffer, &my_socket_index, 1, OPAL_INT32); + if (ORTE_SUCCESS != ret){ + fprintf(stderr, " pack returned error %d for my_socket_index \n",ret); + fflush(stderr); + return -1; /*ret;*/ + } + + /*Allgather data over the communicator */ + if (ORTE_SUCCESS != (ret = orte_grpcomm.allgather_list(&peers, sbuffer, rbuffer))) { + fprintf(stderr," orte_grpcomm.allgather_list returned error %d \n", ret); + fflush(stderr); + return -1; + } + + /* Is every socket being shared by the same number of procs? + * If not we should go over each proc index and find maximal socket */ + + /* TODO: Need to find maximal number of procs + * sharing the same socket on the node in case + * there is more than one socket on the node + * and that the sbgp type is according to node + * and not socket */ + /*Unpack the data and find fellow socketeers*/ + cnt = 0; + for (proc = 0; proc < n_local_peers; proc++) { + + int rem_socket_index; + + /*Unpack socket_index*/ + ret = opal_dss.unpack(rbuffer, &rem_socket_index, &pcnt, OPAL_INT32); + if (ORTE_SUCCESS != ret) { + fprintf(stderr," Unpack returned error %d for rem_socket_index\n", ret); + fflush(stderr); + return -1; + } + + /*Populate the list*/ + /* if (rem_socket_index == my_socket_index) { + cnt++; + }*/ + num_proc_per_socket[rem_socket_index]++; + } + + /* Find the maximal number of procs sharing the same socket */ + for(i = 0; i < num_sockets; i++){ + if(cnt < num_proc_per_socket[i]){ + cnt = num_proc_per_socket[i]; + } + } + + + /*Free resources*/ + OBJ_DESTRUCT(&peers); + OBJ_RELEASE(sbuffer); + OBJ_RELEASE(rbuffer); + free(num_proc_per_socket); + + return cnt; +} +#endif + +#if 0 +/* + * Local functions + */ +static int basesmuma_module_enable(mca_bcol_base_module_t *module, + struct ompi_communicator_t *comm); +#endif + +/* + * Local functions + */ +static int alloc_lmsg_reduce_offsets_array(mca_bcol_basesmuma_module_t *sm_module) +{ + int rc = OMPI_SUCCESS, i = 0; + netpatterns_k_exchange_node_t *k_node = &sm_module->knomial_exchange_tree; + int n_exchanges = k_node->n_exchanges; + + /* Precalculate the allreduce offsets */ + if (0 < k_node->n_exchanges) { + sm_module->reduce_offsets = (int **)malloc(n_exchanges * sizeof(int*)); + + if (!sm_module->reduce_offsets) { + rc = OMPI_ERROR; + return rc; + } + + for (i=0; i < n_exchanges ; i++) { + sm_module->reduce_offsets[i] = (int *)malloc (sizeof(int) * NOFFSETS); + + if (!sm_module->reduce_offsets[i]){ + rc = OMPI_ERROR; + return rc; + } + } + } + return rc; +} + +static int free_lmsg_reduce_offsets_array(mca_bcol_basesmuma_module_t *sm_module) +{ + int rc = OMPI_SUCCESS, i = 0; + netpatterns_k_exchange_node_t *k_node = &sm_module->knomial_exchange_tree; + int n_exchanges = k_node->n_exchanges; + + if (sm_module->reduce_offsets) { + for (i=0; i < n_exchanges; i++) { + free (sm_module->reduce_offsets[i]); + } + + free(sm_module->reduce_offsets); + } + return rc; +} + +static void mca_bcol_basesmuma_module_construct(mca_bcol_basesmuma_module_t *module) { 
module->super.bcol_component = (mca_bcol_base_component_t *) &mca_bcol_basesmuma_component; module->super.list_n_connected = NULL; module->super.hier_scather_offset = 0; - + } -static void +static void mca_bcol_basesmuma_module_destruct(mca_bcol_basesmuma_module_t *sm_module) { /* local variables */ int i; mca_bcol_basesmuma_component_t *cs = &mca_bcol_basesmuma_component; - /* - * release allocated resrouces + /* + * release allocated resrouces */ - /* ...but not until you're sure you have no outstanding collectives */ - while(0 != opal_list_get_size(&(cs->nb_admin_barriers))) { - opal_progress(); - } + /* ...but not until you're sure you have no outstanding collectives */ + while(0 != opal_list_get_size(&(cs->nb_admin_barriers))) { + opal_progress(); + } - +#ifdef __PORTALS_AVAIL__ + /* Remove portals bcast specific resources */ + if ( PTL_OK != PtlEQFree(sm_module->sg_state.read_eq)) { + BASESMUMA_VERBOSE(10,("PtlEQFree() failed: )")); + } +#endif + + /* Remove Lmsg Reduce Offsets Array */ + free_lmsg_reduce_offsets_array(sm_module); /* collective topology data */ @@ -71,11 +506,11 @@ mca_bcol_basesmuma_module_destruct(mca_bcol_basesmuma_module_t *sm_module) free(sm_module->fanout_read_tree); sm_module->fanout_read_tree=NULL; } - - /* gvm Leak FIX Reduction_tree[].children_ranks has - * to be removed. I don't how to get the size (which is - * size of subgroup) of array reduction_tree - */ + + /* gvm Leak FIX Reduction_tree[].children_ranks has + * to be removed. I don't how to get the size (which is + * size of subgroup) of array reduction_tree + */ if( sm_module->reduction_tree) { for(i=0 ; i < sm_module->super.size_of_subgroup ; i++ ) { if(0 < sm_module->reduction_tree[i].n_children ) { @@ -87,16 +522,16 @@ mca_bcol_basesmuma_module_destruct(mca_bcol_basesmuma_module_t *sm_module) sm_module->reduction_tree=NULL; } - /* gvm Leak FIX */ - if (sm_module->fanout_node.children_ranks){ - free(sm_module->fanout_node.children_ranks); - sm_module->fanout_node.children_ranks = NULL; - } - - if (sm_module->fanin_node.children_ranks){ - free(sm_module->fanin_node.children_ranks); - sm_module->fanin_node.children_ranks = NULL; - } + /* gvm Leak FIX */ + if (sm_module->fanout_node.children_ranks){ + free(sm_module->fanout_node.children_ranks); + sm_module->fanout_node.children_ranks = NULL; + } + + if (sm_module->fanin_node.children_ranks){ + free(sm_module->fanin_node.children_ranks); + sm_module->fanin_node.children_ranks = NULL; + } /* colls_no_user_data resrouces */ if(sm_module->colls_no_user_data.ctl_buffs_mgmt){ @@ -109,7 +544,14 @@ mca_bcol_basesmuma_module_destruct(mca_bcol_basesmuma_module_t *sm_module) } /* colls_with_user_data resrouces */ - + /* + *debug print */ + /* + fprintf(stderr,"AAA colls_with_user_data.ctl_buffs %p \n", + sm_module->colls_with_user_data.ctl_buffs_mgmt); + fflush(stderr); + end debug */ + if(sm_module->colls_with_user_data.ctl_buffs_mgmt){ free(sm_module->colls_with_user_data.ctl_buffs_mgmt); sm_module->colls_with_user_data.ctl_buffs_mgmt=NULL; @@ -123,8 +565,8 @@ mca_bcol_basesmuma_module_destruct(mca_bcol_basesmuma_module_t *sm_module) free(sm_module->shared_memory_scratch_space); sm_module->shared_memory_scratch_space=NULL; } - -#if 1 + +#if 1 if(sm_module->scatter_kary_tree) { for(i=0 ; i < sm_module->super.size_of_subgroup ; i++ ) { if(0 < sm_module->scatter_kary_tree[i].n_children) { @@ -150,44 +592,83 @@ mca_bcol_basesmuma_module_destruct(mca_bcol_basesmuma_module_t *sm_module) static void bcol_basesmuma_set_small_msg_thresholds(struct 
mca_bcol_base_module_t *super) { - mca_bcol_basesmuma_module_t *basesmuma_module = - (mca_bcol_basesmuma_module_t *) super; + mca_bcol_basesmuma_module_t *basesmuma_module = + (mca_bcol_basesmuma_module_t *) super; size_t basesmuma_offset = bcol_basesmuma_data_offset_calc(basesmuma_module); /* Set the Allreduce threshold, for Basesmuma it equals to ML buffer size - data offset */ super->small_message_thresholds[BCOL_ALLREDUCE] = - basesmuma_module->ml_mem.ml_mem_desc->size_buffer - basesmuma_offset; + basesmuma_module->ml_mem.ml_mem_desc->size_buffer - basesmuma_offset; /* Set the Bcast threshold, for Basesmuma it equals to ML buffer size - data offset */ super->small_message_thresholds[BCOL_BCAST] = - basesmuma_module->ml_mem.ml_mem_desc->size_buffer - basesmuma_offset; + basesmuma_module->ml_mem.ml_mem_desc->size_buffer - basesmuma_offset; /* Set the Gather threshold, for Basesmuma it equals to ML buffer size - data offset */ super->small_message_thresholds[BCOL_GATHER] = - (basesmuma_module->ml_mem.ml_mem_desc->size_buffer - basesmuma_offset) / - ompi_comm_size(basesmuma_module->super.sbgp_partner_module->group_comm); + (basesmuma_module->ml_mem.ml_mem_desc->size_buffer - basesmuma_offset) / + ompi_comm_size(basesmuma_module->super.sbgp_partner_module->group_comm); /* Set the ALLgather threshold, for Basesmuma it equals to ML buffer size - data offset */ super->small_message_thresholds[BCOL_ALLGATHER] = - (basesmuma_module->ml_mem.ml_mem_desc->size_buffer - basesmuma_offset) / - ompi_comm_size(basesmuma_module->super.sbgp_partner_module->group_comm); + (basesmuma_module->ml_mem.ml_mem_desc->size_buffer - basesmuma_offset) / + ompi_comm_size(basesmuma_module->super.sbgp_partner_module->group_comm); /* Set the Reduce threshold, for Basesmuma it equals to ML buffer size - data offset */ super->small_message_thresholds[BCOL_REDUCE] = - basesmuma_module->ml_mem.ml_mem_desc->size_buffer - basesmuma_offset; + basesmuma_module->ml_mem.ml_mem_desc->size_buffer - basesmuma_offset; /* Set the Scatter threshold, for Basesmuma it equals to ML buffer size - data offset */ super->small_message_thresholds[BCOL_SCATTER] = - basesmuma_module->ml_mem.ml_mem_desc->size_buffer - basesmuma_offset; + basesmuma_module->ml_mem.ml_mem_desc->size_buffer - basesmuma_offset; } +/* setup memory management and collective routines */ -static void load_func_with_choices(mca_bcol_base_module_t *super) +static void load_func(mca_bcol_base_module_t *super) { int fnc; - /* Loading memory management and collective functions */ + /* Loading memory management and collective functions */ + + for (fnc = 0; fnc < BCOL_NUM_OF_FUNCTIONS; fnc++) { + super->bcol_function_table[fnc] = NULL; + } + + /*super->bcol_function_table[BCOL_BARRIER] = bcol_basesmuma_recursive_double_barrier;*/ + +#ifdef __PORTALS_AVAIL__ + super->bcol_function_table[BCOL_BCAST] = bcol_basesmuma_lmsg_scatter_allgather_portals_bcast; + /* super->bcol_function_table[BCOL_BCAST] = + bcol_basesmuma_lmsg_bcast_k_nomial_anyroot; */ +#endif + + /*super->bcol_function_table[BCOL_BCAST] = bcol_basesmuma_bcast;*/ + /*super->bcol_function_table[BCOL_BCAST] = bcol_basesmuma_binary_scatter_allgather_segment;*/ + /*super->bcol_function_table[BCOL_BCAST] = bcol_basesmuma_bcast_k_nomial_anyroot;*/ + super->bcol_function_table[BCOL_BCAST] = bcol_basesmuma_bcast; +#ifdef __PORTALS_AVAIL__ + super->bcol_function_table[BCOL_BCAST] = + bcol_basesmuma_lmsg_scatter_allgather_portals_bcast; +#endif + /* super->bcol_function_table[BCOL_ALLREDUCE] = 
bcol_basesmuma_allreduce_intra_fanin_fanout; */ + super->bcol_function_table[BCOL_ALLREDUCE] = bcol_basesmuma_allreduce_intra_recursive_doubling; + super->bcol_function_table[BCOL_REDUCE] = bcol_basesmuma_reduce_intra_fanin_old; + /* memory management */ + super->bcol_memory_init = bcol_basesmuma_bank_init_opti; + + super->k_nomial_tree = bcol_basesmuma_setup_knomial_tree; + + /* Set thresholds */ + super->set_small_msg_thresholds = bcol_basesmuma_set_small_msg_thresholds; +} + +static void load_func_with_choices(mca_bcol_base_module_t *super) +{ + int fnc; + + /* Loading memory management and collective functions */ for (fnc=0; fnc < BCOL_NUM_OF_FUNCTIONS; fnc++) { super->bcol_function_init_table[fnc] = NULL; @@ -197,45 +678,40 @@ static void load_func_with_choices(mca_bcol_base_module_t *super) super->bcol_function_init_table[BCOL_FANOUT] = bcol_basesmuma_fanout_init; super->bcol_function_init_table[BCOL_BARRIER] = bcol_basesmuma_barrier_init; - super->bcol_function_init_table[BCOL_BCAST] = bcol_basesmuma_bcast_init; - super->bcol_function_init_table[BCOL_ALLREDUCE] = NULL; - super->bcol_function_init_table[BCOL_REDUCE] = NULL; - super->bcol_function_init_table[BCOL_GATHER] = NULL; - super->bcol_function_init_table[BCOL_ALLGATHER] = NULL; - super->bcol_function_init_table[BCOL_SYNC] = bcol_basesmuma_memsync_init; + super->bcol_function_init_table[BCOL_BCAST] = bcol_basesmuma_bcast_init; + super->bcol_function_init_table[BCOL_ALLREDUCE] = bcol_basesmuma_allreduce_init; + super->bcol_function_init_table[BCOL_REDUCE] = bcol_basesmuma_reduce_init; + super->bcol_function_init_table[BCOL_GATHER] = bcol_basesmuma_gather_init; + super->bcol_function_init_table[BCOL_ALLGATHER] = bcol_basesmuma_allgather_init; + super->bcol_function_init_table[BCOL_SYNC] = bcol_basesmuma_memsync_init; /* memory management */ super->bcol_memory_init = bcol_basesmuma_bank_init_opti; - + super->k_nomial_tree = bcol_basesmuma_setup_knomial_tree; - - /* Set thresholds */ - super->set_small_msg_thresholds = bcol_basesmuma_set_small_msg_thresholds; } static int load_recursive_knomial_info(mca_bcol_basesmuma_module_t - *sm_module) + *sm_module) { - int rc = OMPI_SUCCESS; - rc = netpatterns_setup_recursive_knomial_tree_node( - sm_module->super.sbgp_partner_module->group_size, - sm_module->super.sbgp_partner_module->my_index, - mca_bcol_basesmuma_component.k_nomial_radix, - &sm_module->knomial_exchange_tree); - return rc; + int rc = OMPI_SUCCESS; + rc = netpatterns_setup_recursive_knomial_tree_node(sm_module->super.sbgp_partner_module->group_size, + sm_module->super.sbgp_partner_module->my_index, + mca_bcol_basesmuma_component.k_nomial_radix, + &sm_module->knomial_exchange_tree); + return rc; } int bcol_basesmuma_setup_knomial_tree(mca_bcol_base_module_t *super) { mca_bcol_basesmuma_module_t *sm_module = (mca_bcol_basesmuma_module_t *) super; - - return netpatterns_setup_recursive_knomial_allgather_tree_node( - sm_module->super.sbgp_partner_module->group_size, - sm_module->super.sbgp_partner_module->my_index, - mca_bcol_basesmuma_component.k_nomial_radix, - super->list_n_connected, - &sm_module->knomial_allgather_tree); + + return netpatterns_setup_recursive_knomial_allgather_tree_node(sm_module->super.sbgp_partner_module->group_size, + sm_module->super.sbgp_partner_module->my_index, + mca_bcol_basesmuma_component.k_nomial_radix, + super->list_n_connected, + &sm_module->knomial_allgather_tree); } @@ -262,12 +738,12 @@ mca_bcol_basesmuma_comm_query(mca_sbgp_base_module_t *module, int *num_modules) 
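load_recursive_knomial_info() only records the radix-k exchange schedule; the size of that schedule follows from the largest power of the radix that fits in the group. Below is a rough stand-alone model of that calculation, analogous to the pow_sm_k() helper used later in comm_query; it is an illustration, not the netpatterns implementation.

/* Model only: number of recursive k-ing exchange rounds for a group.
 * Ranks at or beyond the largest power of `radix` <= group_size act as
 * "extra" nodes that hand their data to a proxy, mirroring the
 * EXTRA_NODE / EXCHANGE_NODE split used by the barrier and allreduce code. */
static int knomial_rounds(int group_size, int radix, int *largest_power)
{
    int pow = 1, rounds = 0;

    while (pow * radix <= group_size) {
        pow *= radix;
        rounds++;
    }
    if (largest_power) {
        *largest_power = pow;   /* group_size - pow ranks are the extras */
    }
    return rounds;
}

For example, 10 ranks with radix 2 gives 3 exchange rounds over 8 exchanging ranks plus 2 extras.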
/*mca_base_component_list_item_t *hdl_cli = NULL;*/ /*int hdl_num;*/ - /* at this point I think there is only a sinle shared + /* at this point I think there is only a sinle shared memory bcol that we need to be concerned with */ /* No group, no modules */ if (OPAL_UNLIKELY(NULL == module)) { - return NULL; + return NULL; } /* allocate and initialize an sm_bcol module */ @@ -279,110 +755,108 @@ mca_bcol_basesmuma_comm_query(mca_sbgp_base_module_t *module, int *num_modules) (*num_modules)=1; cs->super.n_net_contexts = *num_modules; sm_modules = (mca_bcol_base_module_t **) malloc((cs->super.n_net_contexts)* - sizeof(mca_bcol_base_module_t *)); - + sizeof(mca_bcol_base_module_t *)); + if( !sm_modules ) { - fprintf(stderr,"In base_bcol_masesmuma_setup_library_buffers failed to allocate memory for sm_modules\n"); - fflush(stderr); - return NULL; + fprintf(stderr,"In base_bcol_masesmuma_setup_library_buffers failed to allocate memory for sm_modules\n"); + fflush(stderr); + return NULL; } sm_module->reduction_tree = NULL; sm_module->fanout_read_tree = NULL; ret=netpatterns_setup_recursive_doubling_tree_node( - module->group_size,module->my_index, - &(sm_module->recursive_doubling_tree)); + module->group_size,module->my_index, + &(sm_module->recursive_doubling_tree)); if(OMPI_SUCCESS != ret) { - fprintf(stderr,"Error setting up recursive_doubling_tree \n"); - fflush(stderr); - return NULL; + fprintf(stderr,"Error setting up recursive_doubling_tree \n"); + fflush(stderr); + return NULL; } /* setup the fanin tree - this is used only as part of a hierarchical * barrier, so will set this up with rank 0 as the root */ my_rank=module->my_index; ret=netpatterns_setup_narray_tree(cs->radix_fanin, - my_rank,module->group_size,&(sm_module->fanin_node)); + my_rank,module->group_size,&(sm_module->fanin_node)); if(OMPI_SUCCESS != ret) { - fprintf(stderr,"Error setting up fanin tree \n"); - fflush(stderr); - return NULL; + fprintf(stderr,"Error setting up fanin tree \n"); + fflush(stderr); + return NULL; } /* setup the fanout tree - this is used only as part of a hierarchical * barrier, so will set this up with rank 0 as the root */ ret=netpatterns_setup_narray_tree(cs->radix_fanout, - my_rank,module->group_size,&(sm_module->fanout_node)); + my_rank,module->group_size,&(sm_module->fanout_node)); if(OMPI_SUCCESS != ret) { - fprintf(stderr,"Error setting up fanout tree \n"); - fflush(stderr); - return NULL; + fprintf(stderr,"Error setting up fanout tree \n"); + fflush(stderr); + return NULL; } - /* + /* * Setup the broadcast tree - this is used only as part of a hierarchical * bcast, so will set this up with rank 0 as the root. 
*/ - /* set the radix of the bcast tree */ - bcast_radix = cs->radix_read_tree; + /* set the radix of the bcast tree */ + bcast_radix = cs->radix_read_tree; - /* initialize fan-out read tree */ - sm_module->fanout_read_tree=(netpatterns_tree_node_t*) malloc( - sizeof(netpatterns_tree_node_t)*module->group_size); - if( NULL == sm_module->fanout_read_tree ) { - goto Error; - } + /* initialize fan-out read tree */ + sm_module->fanout_read_tree=(netpatterns_tree_node_t*) malloc( + sizeof(netpatterns_tree_node_t)*module->group_size); + if( NULL == sm_module->fanout_read_tree ) { + goto Error; + } - for(i = 0; i < module->group_size; i++){ - ret = netpatterns_setup_narray_tree(bcast_radix, - i, module->group_size, &(sm_module->fanout_read_tree[i])); - if(OMPI_SUCCESS != ret) { - goto Error; - } - } + for(i = 0; i < module->group_size; i++){ + ret = netpatterns_setup_narray_tree(bcast_radix, + i, module->group_size, &(sm_module->fanout_read_tree[i])); + if(OMPI_SUCCESS != ret) { + goto Error; + } + } - ret = load_recursive_knomial_info(sm_module); + ret = load_recursive_knomial_info(sm_module); if (OMPI_SUCCESS != ret) { - BASESMUMA_VERBOSE(10, ("Failed to load recursive knomial tree")); + BASESMUMA_VERBOSE(10, ("Failed to load recursive knomial tree")); goto Error; - } + } - /* Allocate offsets array for lmsg reduce */ - /* - ret = alloc_lmsg_reduce_offsets_array(sm_module); - if (OMPI_SUCCESS != ret) { - BASESMUMA_VERBOSE(10, ("Failed to allocate reduce offsets array")); + /* Allocate offsets array for lmsg reduce */ + ret = alloc_lmsg_reduce_offsets_array(sm_module); + if (OMPI_SUCCESS != ret) { + BASESMUMA_VERBOSE(10, ("Failed to allocate reduce offsets array")); goto Error; - } - */ + } /* initialize reduction tree */ sm_module->reduction_tree=(netpatterns_tree_node_t *) malloc( - sizeof(netpatterns_tree_node_t )*module->group_size); - if( NULL == sm_module->reduction_tree ) { + sizeof(netpatterns_tree_node_t )*module->group_size); + if( NULL == sm_module->reduction_tree ) { goto Error; } - + ret=netpatterns_setup_multinomial_tree( - cs->order_reduction_tree,module->group_size, - sm_module->reduction_tree); + cs->order_reduction_tree,module->group_size, + sm_module->reduction_tree); if( MPI_SUCCESS != ret ) { goto Error; } /* get largest power of k for given group size */ sm_module->pow_k_levels = pow_sm_k(cs->k_nomial_radix, - sm_module->super.sbgp_partner_module->group_size, - &(sm_module->pow_k)); + sm_module->super.sbgp_partner_module->group_size, + &(sm_module->pow_k)); /* get largest power of 2 for a given group size - * used in scatter allgather + * used in scatter allgather */ sm_module->pow_2_levels = pow_sm_k(2, - sm_module->super.sbgp_partner_module->group_size, - &(sm_module->pow_2)); + sm_module->super.sbgp_partner_module->group_size, + &(sm_module->pow_2)); /* * setup scatter data @@ -390,22 +864,22 @@ mca_bcol_basesmuma_comm_query(mca_sbgp_base_module_t *module, int *num_modules) sm_module->scatter_kary_radix=cs->scatter_kary_radix; sm_module->scatter_kary_tree=NULL; ret=netpatterns_setup_narray_tree_contigous_ranks( - sm_module->scatter_kary_radix, - sm_module->super.sbgp_partner_module->group_size, - &(sm_module->scatter_kary_tree)); + sm_module->scatter_kary_radix, + sm_module->super.sbgp_partner_module->group_size, + &(sm_module->scatter_kary_tree)); if(OMPI_SUCCESS != ret) { - fprintf(stderr,"In base_bcol_masesmuma_setup_library_buffers and scatter k-ary tree setup failed \n"); - fflush(stderr); - return NULL; + fprintf(stderr,"In base_bcol_masesmuma_setup_library_buffers 
and scatter k-ary tree setup failed \n"); + fflush(stderr); + return NULL; } /* setup the module shared memory management */ ret=base_bcol_basesmuma_setup_library_buffers(sm_module, cs); - + if(OMPI_SUCCESS != ret) { - fprintf(stderr,"In base_bcol_masesmuma_setup_library_buffers and mpool was not successfully setup!\n"); - fflush(stderr); - return NULL; + fprintf(stderr,"In base_bcol_masesmuma_setup_library_buffers and mpool was not successfully setup!\n"); + fflush(stderr); + return NULL; } /* setup the collectives and memory management */ @@ -413,70 +887,89 @@ mca_bcol_basesmuma_comm_query(mca_sbgp_base_module_t *module, int *num_modules) /* check to see whether or not the mpool has been inited */ /* allocate some space for the network contexts */ if(!cs->mpool_inited) { - /* if it's empty, then fill it for first time */ - cs->super.network_contexts = (bcol_base_network_context_t **) - malloc((cs->super.n_net_contexts)* - sizeof(bcol_base_network_context_t *)); - /* you need to do some basic setup - define the file name, - * set data seg alignment and size of cntl structure in sm - * file. - */ - /* give the payload sm file a name */ - name_length=asprintf(&name, - "%s"OPAL_PATH_SEP"0%s%0d", - ompi_process_info.job_session_dir, - cs->payload_base_fname, - (int)getpid()); - if( 0 > name_length ) { - fprintf(stderr,"Failed to assign the shared memory payload file a name\n"); - fflush(stderr); - return NULL; - } - /* make sure name is not too long */ - if ( OPAL_PATH_MAX < (name_length-1) ) { - fprintf(stderr,"Shared memory file name is too long!\n"); - fflush(stderr); - return NULL; - } - /* set the name and alignment characteristics */ - sm_reg_data = (bcol_basesmuma_registration_data_t *) malloc( - sizeof(bcol_basesmuma_registration_data_t)); - sm_reg_data->file_name = name; + /* if it's empty, then fill it for first time */ + cs->super.network_contexts = (bcol_base_network_context_t **) + malloc((cs->super.n_net_contexts)* + sizeof(bcol_base_network_context_t *)); + /* you need to do some basic setup - define the file name, + * set data seg alignment and size of cntl structure in sm + * file. 
+ */ + /* give the payload sm file a name */ + name_length=asprintf(&name, + "%s"OPAL_PATH_SEP"0%s%0d", + ompi_process_info.job_session_dir, + cs->payload_base_fname, + (int)getpid()); + if( 0 > name_length ) { + fprintf(stderr,"Failed to assign the shared memory payload file a name\n"); + fflush(stderr); + return NULL; + } + /* make sure name is not too long */ + if ( OPAL_PATH_MAX < (name_length-1) ) { + fprintf(stderr,"Shared memory file name is too long!\n"); + fflush(stderr); + return NULL; + } + /* set the name and alignment characteristics */ + sm_reg_data = (bcol_basesmuma_registration_data_t *) malloc( + sizeof(bcol_basesmuma_registration_data_t)); + sm_reg_data->file_name = name; - sm_reg_data->data_seg_alignment = getpagesize(); - sm_reg_data->size_ctl_structure = 0; - cs->super.network_contexts[0] = (bcol_base_network_context_t *) - malloc(sizeof(bcol_base_network_context_t)); - cs->super.network_contexts[0]->context_data = - (void *) sm_reg_data; - cs->super.network_contexts[0]-> - register_memory_fn = mca_bcol_basesmuma_register_sm; - cs->super.network_contexts[0]-> - deregister_memory_fn = mca_bcol_basesmuma_deregister_sm; - sm_module->super.network_context = cs->super.network_contexts[0]; + sm_reg_data->data_seg_alignment = getpagesize(); + sm_reg_data->size_ctl_structure = 0; + cs->super.network_contexts[0] = (bcol_base_network_context_t *) + malloc(sizeof(bcol_base_network_context_t)); + cs->super.network_contexts[0]->context_data = + (void *) sm_reg_data; + cs->super.network_contexts[0]-> + register_memory_fn = mca_bcol_basesmuma_register_sm; + cs->super.network_contexts[0]-> + deregister_memory_fn = mca_bcol_basesmuma_deregister_sm; + sm_module->super.network_context = cs->super.network_contexts[0]; } else { - - sm_module->super.network_context = cs->super.network_contexts[0]; + + sm_module->super.network_context = cs->super.network_contexts[0]; } /* Set the header size */ sm_module->super.header_size = sizeof(mca_bcol_basesmuma_header_t); + /*initialize the hdl module if it's to be enabled*/ +#if 0 + if (module->use_hdl) { + sm_module->super.use_hdl = module->use_hdl; + hdl_cli = (mca_base_component_list_item_t *) + opal_list_get_first(&mca_hdl_base_components_in_use); + sm_module->hdl_module = ((mca_hdl_base_component_t*) + hdl_cli->cli_component)->hdl_comm_query(sm_module, &hdl_num); + if (1 != hdl_num || sm_module->hdl_module == NULL) { + ML_ERROR(("hdl modules are not successfully initialized!\n")); + goto Error; + } + } else { + sm_module->hdl_module = NULL; + } +#else + sm_module->hdl_module = NULL; +#endif + - /* collective setup */ + load_func(&(sm_module->super)); load_func_with_choices(&(sm_module->super)); - /* - * This initializes all collective algorithms - */ - - ret = mca_bcol_base_bcol_fns_table_init(&(sm_module->super)); - - if (OMPI_SUCCESS != ret) { - - goto Error; - } + /* + * This initializes all collective algorithms + */ + + ret = mca_bcol_base_bcol_fns_table_init(&(sm_module->super)); + + if (OMPI_SUCCESS != ret) { + + goto Error; + } sm_module->super.supported_mode = 0; @@ -487,61 +980,61 @@ mca_bcol_basesmuma_comm_query(mca_sbgp_base_module_t *module, int *num_modules) } #endif - /* Initializes portals library required for basesmuma large message */ + /* Initializes portals library required for basesmuma large message */ #ifdef __PORTALS_AVAIL__ /* Enable zero copy mode */ sm_module->super.supported_mode = MCA_BCOL_BASE_ZERO_COPY; - ret = mca_bcol_basesmuma_portals_init(cs); - if (OMPI_SUCCESS != ret) { - return NULL; - } - - 
sm_module->sg_state.phase = INIT; - - ret = PtlEQAlloc(((mca_bcol_basesmuma_portal_proc_info_t*) - cs->portals_info)->ni_h, MAX_PORTAL_EVENTS_IN_Q, - PTL_EQ_HANDLER_NONE, &sm_module->sg_state.read_eq); - - if (ret != PTL_OK) { - BASESMUMA_VERBOSE(10,( "PtlEQAlloc() failed: %d",ret)); - return NULL; + ret = mca_bcol_basesmuma_portals_init(cs); + if (OMPI_SUCCESS != ret) { + return NULL; } -#endif + sm_module->sg_state.phase = INIT; + + ret = PtlEQAlloc(((mca_bcol_basesmuma_portal_proc_info_t*) + cs->portals_info)->ni_h, MAX_PORTAL_EVENTS_IN_Q, + PTL_EQ_HANDLER_NONE, &sm_module->sg_state.read_eq); + + if (ret != PTL_OK) { + BASESMUMA_VERBOSE(10,( "PtlEQAlloc() failed: %d",ret)); + return NULL; + } + +#endif /* blocking recursive double barrier test */ /* - { - fprintf(stderr,"BBB About to hit the barrier test\n"); - fflush(stderr); - int rc; - bcol_function_args_t bogus; - rc = bcol_basesmuma_rd_barrier_init(&(sm_module->super)); - rc = bcol_basesmuma_recursive_double_barrier( - &bogus, &(sm_module->super)); - } + { + fprintf(stderr,"BBB About to hit the barrier test\n"); + fflush(stderr); + int rc; + bcol_function_args_t bogus; + rc = bcol_basesmuma_rd_barrier_init(&(sm_module->super)); + rc = bcol_basesmuma_recursive_double_barrier( + &bogus, &(sm_module->super)); + } */ - /* in this case we only expect a single network context. + /* in this case we only expect a single network context. in the future we should loop around this */ sm_modules[0] = &(sm_module->super); #if 0 -/* debug */ -/* test resource recycling */ -test_sm_module=sm_module; -/* debug */ -fprintf(stderr," ZZZZ sn %lld \n",sm_module->squence_number_offset); -fflush(stderr); -/* end debug */ -test_resrouce_recycle(); + /* debug */ + /* test resource recycling */ + test_sm_module=sm_module; + /* debug */ + fprintf(stderr," ZZZZ sn %lld \n",sm_module->squence_number_offset); + fflush(stderr); + /* end debug */ + test_resrouce_recycle(); -/* end debug */ + /* end debug */ #endif return sm_modules; -Error: + Error: /* cleanup */ if( sm_module->reduction_tree ) { @@ -560,16 +1053,16 @@ Error: */ static int basesmuma_module_enable(mca_bcol_base_module_t *module, - struct ompi_communicator_t *comm) + struct ompi_communicator_t *comm) { /* local variables */ char output_buffer[2*MPI_MAX_OBJECT_NAME]; memset(&output_buffer[0],0,sizeof(output_buffer)); snprintf(output_buffer,sizeof(output_buffer),"%s (cid %d)", comm->c_name, - comm->c_contextid); + comm->c_contextid); opal_output_verbose(10, ompi_bcol_base_framework.framework_output, - "bcol:basesmuma:enable: new communicator: %s", output_buffer); + "bcol:basesmuma:enable: new communicator: %s", output_buffer); /* All done */ return OMPI_SUCCESS; @@ -577,8 +1070,6 @@ basesmuma_module_enable(mca_bcol_base_module_t *module, #endif OBJ_CLASS_INSTANCE(mca_bcol_basesmuma_module_t, - mca_bcol_base_module_t, - mca_bcol_basesmuma_module_construct, - mca_bcol_basesmuma_module_destruct); - - + mca_bcol_base_module_t, + mca_bcol_basesmuma_module_construct, + mca_bcol_basesmuma_module_destruct); diff --git a/ompi/mca/bcol/basesmuma/bcol_basesmuma_rd_barrier.c b/ompi/mca/bcol/basesmuma/bcol_basesmuma_rd_barrier.c new file mode 100644 index 0000000000..4b95fec384 --- /dev/null +++ b/ompi/mca/bcol/basesmuma/bcol_basesmuma_rd_barrier.c @@ -0,0 +1,218 @@ +/* + * Copyright (c) 2009-2013 Oak Ridge National Laboratory. All rights reserved. + * Copyright (c) 2009-2012 Mellanox Technologies. All rights reserved. + * Copyright (c) 2013 Los Alamos National Security, LLC. All rights + * reserved. 
+ * $COPYRIGHT$ + * + * Additional copyrights may follow + * + * $HEADER$ + */ + +/* Recursive doubling blocking barrier */ + +#include "ompi_config.h" +#include "ompi/constants.h" +#include "ompi/communicator/communicator.h" +#include "ompi/mca/bcol/bcol.h" +#include "ompi/patterns/net/netpatterns.h" + +#include "opal/sys/atomic.h" + +#include "bcol_basesmuma.h" + +#if 0 +int bcol_basesmuma_recursive_double_barrier(bcol_function_args_t *input_args, + coll_ml_function_t *c_input_args) +{ + + /* local variables */ + int ret=OMPI_SUCCESS, idx, leading_dim, loop_cnt, exchange, flag_to_set; + int pair_rank, flag_offset; + mca_bcol_basesmuma_ctl_struct_t **ctl_structs; + netpatterns_pair_exchange_node_t *my_exchange_node; + int extra_rank, my_rank, pow_2; + volatile mca_bcol_basesmuma_ctl_struct_t *partner_ctl; + volatile mca_bcol_basesmuma_ctl_struct_t *my_ctl; + int64_t sequence_number; + bool found; + int buff_index, first_instance=0; + mca_bcol_basesmuma_module_t* bcol_module = + (mca_bcol_basesmuma_module_t *)c_input_args->bcol_module; +#if 0 + fprintf(stderr,"Entering the sm rd barrier\n"); + fflush(stderr); +#endif + + /* get the pointer to the segment of control structures */ + my_exchange_node=&(bcol_module->recursive_doubling_tree); + my_rank=bcol_module->super.sbgp_partner_module->my_index; + pow_2=bcol_module->super.sbgp_partner_module->pow_2; + + /* figure out what instance of the basesmuma bcol I am */ + leading_dim=bcol_module->colls_no_user_data.size_of_group; + sequence_number=input_args->sequence_num - c_input_args->bcol_module->squence_number_offset; + + buff_index=sequence_number & (bcol_module->colls_no_user_data.mask); + + idx=SM_ARRAY_INDEX(leading_dim,buff_index,0); + ctl_structs=(mca_bcol_basesmuma_ctl_struct_t **) + bcol_module->colls_no_user_data.ctl_buffs+idx; + my_ctl=ctl_structs[my_rank]; + if( my_ctl->sequence_number < sequence_number ) { + first_instance=1; + } + + /* get the pool index */ + if( first_instance ) { + idx = -1; + while( idx == -1 ) { + + idx=bcol_basesmuma_get_buff_index( + &(bcol_module->colls_no_user_data),sequence_number); + } + if( -1 == idx ){ + return ORTE_ERR_TEMP_OUT_OF_RESOURCE; + } + my_ctl->index=1; + /* this does not need to use any flag values , so only need to + * set the value for subsequent values that may need this */ + my_ctl->starting_flag_value=0; + flag_offset=0; + } else { + /* only one thread at a time will be making progress on this + * collective, so no need to make this atomic */ + my_ctl->index++; + flag_offset=my_ctl->starting_flag_value; + } + + /* signal that I have arrived */ + my_ctl->flag = -1; + /* don't need to set this flag anymore */ + my_ctl->sequence_number = sequence_number; + /* MB();*/ + + if(0 < my_exchange_node->n_extra_sources) { + if (EXCHANGE_NODE == my_exchange_node->node_type) { + volatile int64_t *partner_sn; + int cnt=0; + + /* I will participate in the exchange - wait for signal from extra + ** process */ + extra_rank = my_exchange_node->rank_extra_source; + partner_ctl=(volatile mca_bcol_basesmuma_ctl_struct_t *)ctl_structs[extra_rank]; + + /*partner_ctl=ctl_structs[extra_rank];*/ + partner_sn=(volatile int64_t *)&(partner_ctl->sequence_number); + + /* spin n iterations until partner registers */ + loop_cnt=0; + found=false; + while( !found ) + { + if( *partner_sn >= sequence_number ) { + found=true; + } + cnt++; + if( cnt == 1000 ) { + opal_progress(); + cnt=0; + } + } + + } else { + + /* Nothing to do, already registared that I am here */ + } + } + + for(exchange = 0; exchange < 
my_exchange_node->n_exchanges; exchange++) { + + volatile int64_t *partner_sn; + volatile int *partner_flag; + int cnt=0; + + /* rank of exchange partner */ + pair_rank = my_rank ^ ( 1 SHIFT_UP exchange ); + partner_ctl=ctl_structs[pair_rank]; + partner_sn=(volatile int64_t *)&(partner_ctl->sequence_number); + partner_flag=(volatile int *)&(partner_ctl->flag); + + /* signal that I am at iteration exchange of the algorithm */ + flag_to_set=flag_offset+exchange; + my_ctl->flag = flag_to_set; + + /* check to see if the partner has arrived */ + + /* spin n iterations until partner registers */ + found=false; + while( !found ) + { + if( (*partner_sn > sequence_number) || + ( *partner_sn == sequence_number && + *partner_flag >= flag_to_set ) ) { + found=true; + } else { + cnt++; + if( cnt == 1000 ) { + opal_progress(); + cnt=0; + } + } + } + } + + if(0 < my_exchange_node->n_extra_sources) { + if ( EXTRA_NODE == my_exchange_node->node_type ) { + int cnt=0; + + /* I will not participate in the exchange - + * wait for signal from extra partner */ + extra_rank = my_exchange_node->rank_extra_source; + partner_ctl=ctl_structs[extra_rank]; + flag_to_set=flag_offset+my_exchange_node->log_2; + + /* spin n iterations until partner registers */ + found=false; + while( !found ) + { + if (IS_PEER_READY(partner_ctl, flag_to_set, sequence_number)){ + found=true; + } else { + cnt++; + if( cnt == 1000 ) { + opal_progress(); + cnt=0; + } + } + } + + } else { + + /* signal the extra rank that I am done with the recursive + * doubling phase. + */ + flag_to_set=flag_offset+my_exchange_node->log_2; + my_ctl->flag = flag_to_set; + + } + } + + /* if I am the last instance of a basesmuma function in this collectie, + * release the resrouces */ + if (IS_LAST_BCOL_FUNC(c_input_args)){ + idx=bcol_basesmuma_free_buff( + &(bcol_module->colls_no_user_data), + sequence_number); + } else { + /* increment flag value - so next sm collective in the hierarchy + * will not collide with the current one, as they share the + * control structure */ + my_ctl->starting_flag_value+=(my_exchange_node->log_2+1); + } + + /* return */ + return ret; +} +#endif diff --git a/ompi/mca/bcol/basesmuma/bcol_basesmuma_rd_nb_barrier.c b/ompi/mca/bcol/basesmuma/bcol_basesmuma_rd_nb_barrier.c index 8004f077ff..4cb66f3cd8 100644 --- a/ompi/mca/bcol/basesmuma/bcol_basesmuma_rd_nb_barrier.c +++ b/ompi/mca/bcol/basesmuma/bcol_basesmuma_rd_nb_barrier.c @@ -1,6 +1,6 @@ /* - * Copyright (c) 2009-2010 UT-Battelle, LLC. All rights reserved. - * Copyright (c) 2009-2010 Mellanox Technologies. All rights reserved. + * Copyright (c) 2009-2012 UT-Battelle, LLC. All rights reserved. + * Copyright (c) 2009-2012 Mellanox Technologies. All rights reserved. * Copyright (c) 2013 Cisco Systems, Inc. All rights reserved. * $COPYRIGHT$ * diff --git a/ompi/mca/bcol/basesmuma/bcol_basesmuma_reduce.c b/ompi/mca/bcol/basesmuma/bcol_basesmuma_reduce.c new file mode 100644 index 0000000000..9dc6feca52 --- /dev/null +++ b/ompi/mca/bcol/basesmuma/bcol_basesmuma_reduce.c @@ -0,0 +1,387 @@ +/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */ +/* + * Copyright (c) 2009-2013 Oak Ridge National Laboratory. All rights reserved. + * Copyright (c) 2009-2012 Mellanox Technologies. All rights reserved. + * Copyright (c) 2013 Los Alamos National Security, LLC. All rights + * reserved. 
+ * $COPYRIGHT$ + * + * Additional copyrights may follow + * + * $HEADER$ + */ + +#include "ompi_config.h" + +#include "ompi/constants.h" +#include "ompi/op/op.h" +#include "ompi/datatype/ompi_datatype.h" +#include "ompi/communicator/communicator.h" +#include "ompi/mca/bcol/base/base.h" +#include "ompi/mca/bcol/bcol.h" + +#include "opal/include/opal_stdint.h" + +#include "bcol_basesmuma.h" +#include "bcol_basesmuma_reduce.h" +/** + * gvm - Shared memory reduce + */ + +static int bcol_basesmuma_reduce_intra_fanin_progress(bcol_function_args_t *input_args, + coll_ml_function_t *c_input_args); + +int bcol_basesmuma_reduce_init(mca_bcol_base_module_t *super) +{ + mca_bcol_base_coll_fn_comm_attributes_t comm_attribs; + mca_bcol_base_coll_fn_invoke_attributes_t inv_attribs; + + mca_bcol_basesmuma_module_t *basesmuma_module = + (mca_bcol_basesmuma_module_t *) super; + + int group_size = basesmuma_module->colls_no_user_data.size_of_group; + + comm_attribs.bcoll_type = BCOL_REDUCE; + comm_attribs.comm_size_min = 0; + comm_attribs.comm_size_max = 16; + comm_attribs.data_src = DATA_SRC_KNOWN; + comm_attribs.waiting_semantics = NON_BLOCKING; + + inv_attribs.bcol_msg_min = 0; + inv_attribs.bcol_msg_max = 20000; + inv_attribs.datatype_bitmap = 0x11111111; + inv_attribs.op_types_bitmap = 0x11111111; + + + /* Set attributes for fanin fanout algorithm */ + mca_bcol_base_set_attributes(super, &comm_attribs, &inv_attribs, bcol_basesmuma_reduce_intra_fanin, + bcol_basesmuma_reduce_intra_fanin_progress); + + inv_attribs.bcol_msg_min = 10000000; + inv_attribs.bcol_msg_max = 10485760; /* range 4 */ + + mca_bcol_base_set_attributes(super, &comm_attribs, &inv_attribs, NULL, NULL); + + return OMPI_SUCCESS; +} + +/* + * Small data fanin reduce + * ML buffers are used for both payload and control structures + * This functions works with hierarchical allreduce and + * progress engine + */ +static inline int reduce_children (mca_bcol_basesmuma_module_t *bcol_module, volatile void *rbuf, netpatterns_tree_node_t *my_reduction_node, + int *iteration, volatile mca_bcol_basesmuma_header_t *my_ctl_pointer, ompi_datatype_t *dtype, + volatile mca_bcol_basesmuma_payload_t *data_buffs, int count, struct ompi_op_t *op, int process_shift) { + volatile mca_bcol_basesmuma_header_t * child_ctl_pointer; + int bcol_id = (int) bcol_module->super.bcol_id; + int64_t sequence_number = my_ctl_pointer->sequence_number; + int8_t ready_flag = my_ctl_pointer->ready_flag; + int group_size = bcol_module->colls_no_user_data.size_of_group; + + if (LEAF_NODE != my_reduction_node->my_node_type) { + volatile char *child_data_pointer; + volatile void *child_rbuf; + + /* for each child */ + /* my_result_data = child_result_data (op) my_source_data */ + + for (int child = *iteration ; child < my_reduction_node->n_children ; ++child) { + int child_rank = my_reduction_node->children_ranks[child] + process_shift; + + if (group_size <= child_rank){ + child_rank -= group_size; + } + + child_ctl_pointer = data_buffs[child_rank].ctl_struct; + child_data_pointer = data_buffs[child_rank].payload; + + if (!IS_PEER_READY(child_ctl_pointer, ready_flag, sequence_number, REDUCE_FLAG, bcol_id)) { + *iteration = child; + return BCOL_FN_STARTED; + } + + child_rbuf = child_data_pointer + child_ctl_pointer->roffsets[bcol_id]; + + ompi_op_reduce(op,(void *)child_rbuf,(void *)rbuf, count, dtype); + } /* end child loop */ + } + + if (ROOT_NODE != my_reduction_node->my_node_type) { + opal_atomic_wmb (); + my_ctl_pointer->flags[REDUCE_FLAG][bcol_id] = ready_flag; + } + + return 
BCOL_FN_COMPLETE; +} + +static int bcol_basesmuma_reduce_intra_fanin_progress(bcol_function_args_t *input_args, + coll_ml_function_t *c_input_args) +{ + mca_bcol_basesmuma_module_t* bcol_module = + (mca_bcol_basesmuma_module_t *)c_input_args->bcol_module; + + netpatterns_tree_node_t *my_reduction_node; + int my_rank, my_node_index; + struct ompi_datatype_t *dtype = input_args->dtype; + int leading_dim, idx; + + /* Buffer index */ + int buff_idx = input_args->src_desc->buffer_index; + + int *iteration = &bcol_module->ml_mem.nb_coll_desc[buff_idx].iteration; + + volatile mca_bcol_basesmuma_payload_t *data_buffs; + volatile mca_bcol_basesmuma_header_t *my_ctl_pointer; + void *data_addr = (void *)input_args->src_desc->data_addr; + volatile void *rbuf; + + /* get addressing information */ + my_rank = bcol_module->super.sbgp_partner_module->my_index; + leading_dim = bcol_module->colls_no_user_data.size_of_group; + idx = SM_ARRAY_INDEX(leading_dim, buff_idx, 0); + + data_buffs = (volatile mca_bcol_basesmuma_payload_t *) + bcol_module->colls_with_user_data.data_buffs + idx; + + /* Get control structure and payload buffer */ + my_ctl_pointer = data_buffs[my_rank].ctl_struct; + + my_node_index = my_rank - input_args->root; + if (0 > my_node_index) { + int group_size = bcol_module->colls_no_user_data.size_of_group; + my_node_index += group_size; + } + + my_reduction_node = bcol_module->reduction_tree + my_node_index; + rbuf = (volatile void *)((uintptr_t) data_addr + input_args->rbuf_offset); + + return reduce_children (bcol_module, rbuf, my_reduction_node, iteration, my_ctl_pointer, dtype, + data_buffs, input_args->count, input_args->op, input_args->root); +} + +int bcol_basesmuma_reduce_intra_fanin(bcol_function_args_t *input_args, + coll_ml_function_t *c_input_args) +{ + /* local variables */ + int rc=BCOL_FN_COMPLETE; + int my_rank,group_size,my_node_index; + mca_bcol_basesmuma_module_t* bcol_module = + (mca_bcol_basesmuma_module_t *)c_input_args->bcol_module; + + netpatterns_tree_node_t *my_reduction_node; + volatile int8_t ready_flag; + int bcol_id = (int) bcol_module->super.bcol_id; + volatile void *sbuf,*rbuf; + int sbuf_offset,rbuf_offset; + int root,count; + int64_t sequence_number=input_args->sequence_num; + struct ompi_datatype_t *dtype; + int leading_dim,idx; + + /* Buffer index */ + int buff_idx = input_args->src_desc->buffer_index; + + int *iteration = &bcol_module->ml_mem.nb_coll_desc[buff_idx].iteration; + + volatile mca_bcol_basesmuma_payload_t *data_buffs; + volatile char * my_data_pointer; + volatile mca_bcol_basesmuma_header_t *my_ctl_pointer; + void *data_addr = (void *)input_args->src_desc->data_addr; + +#if 0 + fprintf(stderr,"777 entering sm reduce \n"); +#endif + + /* get addressing information */ + my_rank=bcol_module->super.sbgp_partner_module->my_index; + group_size=bcol_module->colls_no_user_data.size_of_group; + leading_dim=bcol_module->colls_no_user_data.size_of_group; + idx=SM_ARRAY_INDEX(leading_dim,buff_idx,0); + + data_buffs = (volatile mca_bcol_basesmuma_payload_t *) + bcol_module->colls_with_user_data.data_buffs+idx; + /* fprintf(stderr,"AAA the devil!!\n"); */ + /* Get control structure and payload buffer */ + my_ctl_pointer = data_buffs[my_rank].ctl_struct; + my_data_pointer = (volatile char *)data_addr; + + /* Align node index to around sbgp root */ + root = input_args->root; + my_node_index = my_rank - root; + if (0 > my_node_index) { + my_node_index += group_size; + } + + /* get arguments */ + sbuf_offset = input_args->sbuf_offset; + rbuf_offset = 
input_args->rbuf_offset; + sbuf = (volatile void *)(my_data_pointer + sbuf_offset); + data_buffs[my_rank].payload = (void*)sbuf; + rbuf = (volatile void *)(my_data_pointer + rbuf_offset); + count = input_args->count; + dtype = input_args->dtype; + + /* Cache my rbuf_offset */ + my_ctl_pointer->roffsets[bcol_id] = rbuf_offset; + + /* get my node for the reduction tree */ + my_reduction_node=&(bcol_module->reduction_tree[my_node_index]); + + /* init the header */ + BASESMUMA_HEADER_INIT(my_ctl_pointer, ready_flag, sequence_number, bcol_id); + + input_args->result_in_rbuf = (ROOT_NODE == my_reduction_node->my_node_type); + + /* set starting point for progress loop */ + *iteration = 0; + my_ctl_pointer->ready_flag = ready_flag; + + if (sbuf != rbuf) { + rc = ompi_datatype_copy_content_same_ddt(dtype, count, (char *)rbuf, + (char *)sbuf); + if( 0 != rc ) { + return OMPI_ERROR; + } + } + + rc = reduce_children (bcol_module, rbuf, my_reduction_node, iteration, my_ctl_pointer, dtype, + data_buffs, count, input_args->op, root); + + /* Flag value if other bcols are called */ + my_ctl_pointer->starting_flag_value[bcol_id]++; + + /* Recycle payload buffers */ + + return rc; +} + +/* Small data fanin reduce + * Uses SM buffer (backed by SM file) for both control structures and + * payload + * + * NTH: How does this differ from the new one? Can we replace this + * with a call to the new init then a call the new progress until + * complete? + */ +int bcol_basesmuma_reduce_intra_fanin_old(bcol_function_args_t *input_args, + coll_ml_function_t *c_input_args) +{ + /* local variables */ + int rc=OMPI_SUCCESS; + int my_rank,group_size,process_shift,my_node_index; + int n_children,child; + mca_bcol_basesmuma_module_t* bcol_module = + (mca_bcol_basesmuma_module_t *)c_input_args->bcol_module; + + netpatterns_tree_node_t *my_reduction_node; + volatile int8_t ready_flag; + volatile void *sbuf,*rbuf; + int sbuf_offset,rbuf_offset; + int root,count; + struct ompi_op_t *op; + int64_t sequence_number=input_args->sequence_num; + struct ompi_datatype_t *dtype; + int leading_dim,idx; + int buff_idx; + int child_rank; + int bcol_id = (int) bcol_module->super.bcol_id; + + volatile mca_bcol_basesmuma_payload_t *data_buffs; + volatile char * my_data_pointer; + volatile char * child_data_pointer; + volatile mca_bcol_basesmuma_header_t *my_ctl_pointer; + volatile mca_bcol_basesmuma_header_t * child_ctl_pointer; + +#if 0 + fprintf(stderr,"Entering fanin reduce \n"); +#endif + + /* Buffer index */ + buff_idx = input_args->src_desc->buffer_index; + /* get addressing information */ + my_rank=bcol_module->super.sbgp_partner_module->my_index; + group_size=bcol_module->colls_no_user_data.size_of_group; + leading_dim=bcol_module->colls_no_user_data.size_of_group; + idx=SM_ARRAY_INDEX(leading_dim,buff_idx,0); + + /*ctl_structs=(mca_bcol_basesmuma_ctl_struct_t **) + bcol_module->colls_with_user_data.ctl_buffs+idx;*/ + data_buffs = (volatile mca_bcol_basesmuma_payload_t *) + bcol_module->colls_with_user_data.data_buffs+idx; + + /* Get control structure and payload buffer */ + my_ctl_pointer = data_buffs[my_rank].ctl_struct; + my_data_pointer = (volatile char *) data_buffs[my_rank].payload; + + /* Align node index to around sbgp root */ + root = input_args->root; + process_shift = root; + my_node_index = my_rank - root; + if (0 > my_node_index ) { + my_node_index += group_size; + } + + /* get arguments */ + sbuf_offset = input_args->sbuf_offset; + rbuf_offset = input_args->rbuf_offset; + sbuf = (volatile void *)(my_data_pointer + 
sbuf_offset); + rbuf = (volatile void *)(my_data_pointer + rbuf_offset); + op = input_args->op; + count = input_args->count; + dtype = input_args->dtype; + + /* get my node for the reduction tree */ + my_reduction_node=&(bcol_module->reduction_tree[my_node_index]); + n_children=my_reduction_node->n_children; + + /* init the header */ + BASESMUMA_HEADER_INIT(my_ctl_pointer, ready_flag, sequence_number, bcol_id); + + input_args->result_in_rbuf = (ROOT_NODE == my_reduction_node->my_node_type); + + rc = ompi_datatype_copy_content_same_ddt(dtype, count, (char *)rbuf, + (char *)sbuf); + if (0 != rc) { + return OMPI_ERROR; + } + + if (LEAF_NODE != my_reduction_node->my_node_type) { + volatile void *child_rbuf; + /* for each child */ + /* my_result_data = child_result_data (op) my_source_data */ + + for (child = 0 ; child < n_children ; ++child) { + child_rank = my_reduction_node->children_ranks[child]; + child_rank += process_shift; + + /* wrap around */ + if( group_size <= child_rank ){ + child_rank-=group_size; + } + + /*child_ctl_pointer = ctl_structs[child_rank];*/ + child_ctl_pointer = data_buffs[child_rank].ctl_struct; + child_data_pointer = data_buffs[child_rank].payload; + + child_rbuf = child_data_pointer + rbuf_offset; + /* wait until child child's data is ready for use */ + while (!IS_PEER_READY(child_ctl_pointer, ready_flag, sequence_number, REDUCE_FLAG, bcol_id)) { + opal_progress(); + } + + /* apply collective operation */ + ompi_op_reduce(op,(void *)child_rbuf,(void *)rbuf, count,dtype); + } /* end child loop */ + } + + if (ROOT_NODE != my_reduction_node->my_node_type) { + opal_atomic_wmb (); + my_ctl_pointer->flags[REDUCE_FLAG][bcol_id] = ready_flag; + } + + my_ctl_pointer->starting_flag_value[bcol_id]++; + + return rc; +} diff --git a/ompi/mca/bcol/basesmuma/bcol_basesmuma_reduce.h b/ompi/mca/bcol/basesmuma/bcol_basesmuma_reduce.h new file mode 100644 index 0000000000..0788415dfb --- /dev/null +++ b/ompi/mca/bcol/basesmuma/bcol_basesmuma_reduce.h @@ -0,0 +1,92 @@ +#ifndef __BASESMUMA_REDUCE_H_ + +#define __BASESMUMA_REDUCE_H_ + +#include "ompi_config.h" +#include "ompi/mca/bcol/basesmuma/bcol_basesmuma.h" +#include "ompi/constants.h" +#include "ompi/datatype/ompi_datatype.h" +#include "ompi/communicator/communicator.h" +#include "bcol_basesmuma_utils.h" +#include + +enum { + BLOCK_OFFSET = 0, + LOCAL_REDUCE_SEG_OFFSET, + BLOCK_COUNT, + SEG_SIZE, + NOFFSETS +}; + +int compute_knomial_reduce_offsets(int group_index, int count, struct + ompi_datatype_t *dtype,int k_radix,int n_exchanges, + int **offsets); + +int compute_knomial_reduce_offsets_reverse(int group_index, int count, struct + ompi_datatype_t *dtype,int k_radix,int n_exchanges, + int **offsets); + +int bcol_basesmuma_lmsg_reduce_recursivek_scatter_reduce(mca_bcol_basesmuma_module_t *sm_module, + const int buffer_index, void *sbuf, + void *rbuf, + struct ompi_op_t *op, + const int count, struct ompi_datatype_t *dtype, + const int relative_group_index, + const int padded_start_byte, + volatile int8_t ready_flag, + volatile mca_bcol_basesmuma_payload_t *data_buffs); + +int bcol_basesmuma_lmsg_reduce_knomial_gather(mca_bcol_basesmuma_module_t *basesmuma_module, + const int buffer_index, + void *sbuf,void *rbuf, int count, struct + ompi_datatype_t *dtype, + const int my_group_index, + const int padded_start_byte, + volatile int8_t rflag, + volatile mca_bcol_basesmuma_payload_t *data_buffs); + +int bcol_basesmuma_lmsg_reduce_extra_root(mca_bcol_basesmuma_module_t *sm_module, + const int buffer_index, void *sbuf, + void *rbuf, + 
struct ompi_op_t *op, + const int count, struct ompi_datatype_t *dtype, + const int relative_group_index, + const int padded_start_byte, + volatile int8_t rflag, + volatile mca_bcol_basesmuma_payload_t *data_buffs); + + + +int bcol_basesmuma_lmsg_reduce_extra_non_root(mca_bcol_basesmuma_module_t *sm_module, + const int buffer_index, void *sbuf, + void *rbuf, + int root, + struct ompi_op_t *op, + const int count, struct ompi_datatype_t *dtype, + const int relative_group_index, + const int group_size, + const int padded_start_byte, + volatile int8_t rflag, + volatile mca_bcol_basesmuma_payload_t *data_buffs); + +int bcol_basesmuma_lmsg_reduce(bcol_function_args_t *input_args, + coll_ml_function_t *c_input_args); + +int bcol_basesmuma_lmsg_reduce_extra(bcol_function_args_t *input_args, + coll_ml_function_t *c_input_args); + +void basesmuma_reduce_recv(int my_group_index, int peer, + void *recv_buffer, + int recv_size, + volatile int8_t ready_flag_val, + volatile mca_bcol_basesmuma_payload_t *data_buffs); + +void basesmuma_reduce_send(int my_group_index, + int peer, + void *send_buffer, + int snd_size, + int send_offset, + volatile int8_t ready_flag_val, + volatile mca_bcol_basesmuma_payload_t *data_buffs); + +#endif diff --git a/ompi/mca/bcol/basesmuma/bcol_basesmuma_setup.c b/ompi/mca/bcol/basesmuma/bcol_basesmuma_setup.c index fd11b10463..0322c642a0 100644 --- a/ompi/mca/bcol/basesmuma/bcol_basesmuma_setup.c +++ b/ompi/mca/bcol/basesmuma/bcol_basesmuma_setup.c @@ -1,7 +1,8 @@ +/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */ /* * Copyright (c) 2009-2012 Oak Ridge National Laboratory. All rights reserved. * Copyright (c) 2009-2012 Mellanox Technologies. All rights reserved. - * Copyright (c) 2012 Los Alamos National Security, LLC. + * Copyright (c) 2013-2014 Los Alamos National Security, LLC. * All rights reserved. * Copyright (c) 2014 Cisco Systems, Inc. All rights reserved. * $COPYRIGHT$ @@ -31,14 +32,14 @@ #include "bcol_basesmuma.h" int base_bcol_basesmuma_setup_ctl_struct( - mca_bcol_basesmuma_module_t *sm_bcol_module, + mca_bcol_basesmuma_module_t *sm_bcol_module, mca_bcol_basesmuma_component_t *cs, sm_buffer_mgmt *ctl_mgmt); /* this is the new one, uses the pml allgather */ int base_bcol_basesmuma_exchange_offsets( - mca_bcol_basesmuma_module_t *sm_bcol_module, - void **result_array, uint64_t mem_offset, int loop_limit, + mca_bcol_basesmuma_module_t *sm_bcol_module, + void **result_array, uint64_t mem_offset, int loop_limit, int leading_dim) { int ret=OMPI_SUCCESS,i; @@ -53,7 +54,7 @@ int base_bcol_basesmuma_exchange_offsets( send_buff = (char *) malloc(count); recv_buff = (char *) malloc(count * sm_bcol_module->super.sbgp_partner_module->group_size); - /* exchange the base pointer for the controls structures - gather + /* exchange the base pointer for the controls structures - gather * every one else's infromation. */ @@ -65,7 +66,7 @@ int base_bcol_basesmuma_exchange_offsets( /* get the offsets from all procs, so can setup the control data * structures. 
*/ - + ret=comm_allgather_pml((void *) send_buff,(void *) recv_buff,count, MPI_BYTE, sm_bcol_module->super.sbgp_partner_module->my_index, @@ -113,8 +114,8 @@ exit_ERROR: #if 0 int base_bcol_basesmuma_exchange_offsets( - mca_bcol_basesmuma_module_t *sm_bcol_module, - void **result_array, uint64_t mem_offset, int loop_limit, + mca_bcol_basesmuma_module_t *sm_bcol_module, + void **result_array, uint64_t mem_offset, int loop_limit, int leading_dim) { int ret=OMPI_SUCCESS,i,dummy; @@ -126,7 +127,7 @@ int base_bcol_basesmuma_exchange_offsets( opal_buffer_t *recv_buffer = OBJ_NEW(opal_buffer_t); uint64_t rem_mem_offset; - /* exchange the base pointer for the controls structures - gather + /* exchange the base pointer for the controls structures - gather * every one else's infromation. */ /* get list of procs that will participate in the communication */ @@ -157,8 +158,8 @@ int base_bcol_basesmuma_exchange_offsets( &(sm_bcol_module->super.sbgp_partner_module->my_index),1,OPAL_UINT32); if (OMPI_SUCCESS != ret) { - fprintf(stderr,"Error packing my_index!!\n"); - fflush(stderr); + fprintf(stderr,"Error packing my_index!!\n"); + fflush(stderr); goto exit_ERROR; } @@ -253,7 +254,7 @@ exit_ERROR: static int base_bcol_basesmuma_exchange_ctl_params( - mca_bcol_basesmuma_module_t *sm_bcol_module, + mca_bcol_basesmuma_module_t *sm_bcol_module, mca_bcol_basesmuma_component_t *cs, sm_buffer_mgmt *ctl_mgmt, list_data_t *data_blk) { @@ -281,7 +282,7 @@ static int base_bcol_basesmuma_exchange_ctl_params( } #if 0 - ret=base_bcol_basesmuma_exchange_offsets( sm_bcol_module, + ret=base_bcol_basesmuma_exchange_offsets( sm_bcol_module, (void **)ctl_mgmt->ctl_buffs, mem_offset, loop_limit, leading_dim); if( OMPI_SUCCESS != ret ) { goto exit_ERROR; @@ -332,7 +333,7 @@ exit_ERROR: } int base_bcol_basesmuma_setup_ctl_struct( - mca_bcol_basesmuma_module_t *sm_bcol_module, + mca_bcol_basesmuma_module_t *sm_bcol_module, mca_bcol_basesmuma_component_t *cs, sm_buffer_mgmt *ctl_mgmt) { @@ -351,8 +352,8 @@ int base_bcol_basesmuma_setup_ctl_struct( /* initialize the control structure management struct - * for collectives without user data - *--------------------------------------------------------------- - */ + *--------------------------------------------------------------- + */ ctl_mgmt->number_of_buffs=n_ctl_structs; ctl_mgmt->num_mem_banks= @@ -391,7 +392,7 @@ int base_bcol_basesmuma_setup_ctl_struct( sm_bcol_module, sm_bcol_module->super.sbgp_partner_module, &(cs->sm_connections_list), - &(sm_bcol_module->ctl_backing_files_info), + &(sm_bcol_module->ctl_backing_files_info), sm_bcol_module->super.sbgp_partner_module->group_comm, input_file, cs->clt_base_fname, false); @@ -406,23 +407,21 @@ int base_bcol_basesmuma_setup_ctl_struct( ret = OMPI_ERR_OUT_OF_RESOURCE; goto exit_ERROR; } - for(i=0 ; i < sm_bcol_module->super.sbgp_partner_module->group_size ; i++ ) - { - if(i == - sm_bcol_module->super.sbgp_partner_module->my_index) { - /* local file data is not cached in thi slist */ - continue; - } - sm_bcol_module->shared_memory_scratch_space[i]=(void *)( - (char *)(sm_bcol_module->ctl_backing_files_info[i]->sm_mmap)+ - cs->scratch_offset_from_base_ctl_file); - } + for(i=0 ; i < sm_bcol_module->super.sbgp_partner_module->group_size ; i++ ) { + if(i == sm_bcol_module->super.sbgp_partner_module->my_index) { + /* local file data is not cached in thi slist */ + continue; + } + sm_bcol_module->shared_memory_scratch_space[i]=(void *)( + (char *)(sm_bcol_module->ctl_backing_files_info[i]->sm_mmap)+ + 
cs->scratch_offset_from_base_ctl_file); + } i=sm_bcol_module->super.sbgp_partner_module->my_index; sm_bcol_module->shared_memory_scratch_space[i]=(void *)( (char *)(cs->sm_ctl_structs->map_addr)+cs->scratch_offset_from_base_ctl_file); /* - * setup the no-data buffer managment data + * setup the no-data buffer managment data */ n_ctl=ctl_mgmt->num_mem_banks; ctl_mgmt->ctl_buffs_mgmt=(mem_bank_management_t *) @@ -448,13 +447,13 @@ int base_bcol_basesmuma_setup_ctl_struct( mutex_ptr= &(ctl_mgmt->ctl_buffs_mgmt[i].mutex); OBJ_CONSTRUCT(mutex_ptr, opal_mutex_t); ctl_mgmt->ctl_buffs_mgmt[i].index_shared_mem_ctl_structs=i; - + item=(opal_list_item_t *)&(ctl_mgmt->ctl_buffs_mgmt[i].nb_barrier_desc); OBJ_CONSTRUCT(item,opal_list_item_t); ctl_mgmt->ctl_buffs_mgmt[i].nb_barrier_desc.sm_module= sm_bcol_module; ctl_mgmt->ctl_buffs_mgmt[i].nb_barrier_desc.pool_index= i; - /* get the sm_buffer_mgmt pointer for the control structures */ + /* get the sm_buffer_mgmt pointer for the control structures */ ctl_mgmt->ctl_buffs_mgmt[i].nb_barrier_desc.coll_buff=ctl_mgmt; ctl_mgmt->ctl_buffs_mgmt[i].nb_barrier_desc.ml_memory_block_descriptor= NULL; @@ -473,10 +472,10 @@ exit_ERROR: /* * this function initializes the internal scratch buffers and control * structures that will be used by the module. It also intitializes - * the payload buffer management structures. + * the payload buffer management structures. */ int base_bcol_basesmuma_setup_library_buffers( - mca_bcol_basesmuma_module_t *sm_bcol_module, + mca_bcol_basesmuma_module_t *sm_bcol_module, mca_bcol_basesmuma_component_t *cs) { int ret=OMPI_SUCCESS,i; diff --git a/ompi/mca/bcol/basesmuma/bcol_basesmuma_smcm.c b/ompi/mca/bcol/basesmuma/bcol_basesmuma_smcm.c index 276303131e..bc895b58a4 100644 --- a/ompi/mca/bcol/basesmuma/bcol_basesmuma_smcm.c +++ b/ompi/mca/bcol/basesmuma/bcol_basesmuma_smcm.c @@ -179,7 +179,7 @@ int bcol_basesmuma_smcm_allgather_connection( if( !backing_files ) { rc=OMPI_ERR_OUT_OF_RESOURCE; goto Error; - } + } *back_files=backing_files; /* check to see if we have already mapped all the files, if we have diff --git a/ompi/mca/bcol/bcol.h b/ompi/mca/bcol/bcol.h index e11105f45e..2ebb043f42 100644 --- a/ompi/mca/bcol/bcol.h +++ b/ompi/mca/bcol/bcol.h @@ -1,6 +1,9 @@ +/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */ /* * Copyright (c) 2009-2012 Oak Ridge National Laboratory. All rights reserved. * Copyright (c) 2009-2012 Mellanox Technologies. All rights reserved. + * Copyright (c) 2013 Los Alamos National Security, LLC. All rights + * reserved. * $COPYRIGHT$ * * Additional copyrights may follow @@ -43,6 +46,10 @@ struct mca_bcol_base_coll_fn_desc_t; #define MSG_RANGE_INITIAL (1024)*12 #define MSG_RANGE_INC 10 #define BCOL_THRESHOLD_UNLIMITED (INT_MAX) +/* Maximum size of a bcol's header. This allows us to correctly calculate the message + * thresholds. If the header of any bcol exceeds this value then increase this one + * to match. */ +#define BCOL_HEADER_MAX 96 #define BCOL_HEAD_ALIGN 32 /* will turn into an MCA parameter after debug */ @@ -115,30 +122,6 @@ enum { BCOL_FN_COMPLETE = (OMPI_ERR_MAX - 3) }; -/* Originally this enum was placed in ompi/op/op.h file. It should be moved back - * when we are ready to lobby for its inclusion. Since we are releasing only the - * bcast and barrier initially and this struct supports the allreduce, we are not - * going to worry about it now. 
Note that in the same h-file, op.h, the struct "ompi_op_t" - * also has a field that we introduced called "enum ompi_op_type op_type" that this needs to - * be resolved also. - */ -enum ompi_op_type { - OMPI_OP_NULL, - OMPI_OP_MAX, - OMPI_OP_MIN, - OMPI_OP_SUM, - OMPI_OP_PROD, - OMPI_OP_LAND, - OMPI_OP_BAND, - OMPI_OP_LOR, - OMPI_OP_BOR, - OMPI_OP_LXOR, - OMPI_OP_BXOR, - OMPI_OP_MAXLOC, - OMPI_OP_MINLOC, - OMPI_OP_REPLACE, - OMPI_OP_NUM_OF_TYPES -}; /** @@ -344,6 +327,24 @@ typedef struct { int n_fns_need_ordering; /* The number of functions are called for bcols need ordering */ } mca_bcol_base_order_info_t; +/* structure that encapsultes information propagated amongst multiple + * fragments whereby completing the entire ensemble of fragments is + * necessary in order to complete the entire collective + */ +struct bcol_fragment_descriptor_t { + /* start iterator */ + int head; + /* end iterator */ + int tail; + /* current iteration */ + int start_iter; + /* number of full iterations this frag */ + int num_iter; + /* end iter */ + int end_iter; +}; +typedef struct bcol_fragment_descriptor_t bcol_fragment_descriptor_t; + struct bcol_function_args_t { /* full message sequence number */ int64_t sequence_num; @@ -373,16 +374,19 @@ struct bcol_function_args_t { int rbuf_offset; /* for bcol opaque data */ void *bcol_opaque_data; - /* An output argument that will be used by BCOL funstion to tell ML that the result of the BCOL is in rbuf */ + /* An output argument that will be used by BCOL function to tell ML that the result of the BCOL is in rbuf */ bool result_in_rbuf; - bool root_flag; /* True if the rank is root of operation */ - int status; /* Used for non-blocking collective completion */ - uint32_t frag_size; /* fragment size for large messages */ - int hier_factor; /* factor used when bcast is invoked as a service function back down - * the tree in allgather for example, the pacl_len is not the actual - * len of the data needing bcasting - */ + bool root_flag; /* True if the rank is root of operation */ + bool need_dt_support; /* will trigger alternate code path for some colls */ + int status; /* Used for non-blocking collective completion */ + uint32_t frag_size; /* fragment size for large messages */ + int hier_factor; /* factor used when bcast is invoked as a service function back down + * the tree in allgather for example, the pacl_len is not the actual + * len of the data needing bcasting + */ mca_bcol_base_order_info_t order_info; + bcol_fragment_descriptor_t frag_info; + }; typedef struct bcol_function_args_t bcol_function_args_t; @@ -658,6 +662,15 @@ struct mca_bcol_base_descriptor_t { }; typedef struct mca_bcol_base_descriptor_t mca_bcol_base_descriptor_t; +static inline __opal_attribute_always_inline__ size_t + mca_bcol_base_get_buff_length(ompi_datatype_t *dtype, int count) +{ + ptrdiff_t lb, extent; + ompi_datatype_get_extent(dtype, &lb, &extent); + + return (size_t) (extent * count); +} + #define MCA_BCOL_CHECK_ORDER(module, bcol_function_args) \ do { \ if (*((module)->next_inorder) != \ diff --git a/ompi/mca/bcol/iboffload/Makefile.am b/ompi/mca/bcol/iboffload/Makefile.am index 3a80932cb0..3d1fc78ee0 100644 --- a/ompi/mca/bcol/iboffload/Makefile.am +++ b/ompi/mca/bcol/iboffload/Makefile.am @@ -31,12 +31,14 @@ sources = \ bcol_iboffload_barrier.c \ bcol_iboffload_bcast.h \ bcol_iboffload_bcast.c \ + bcol_iboffload_allgather.c \ bcol_iboffload_collreq.h \ bcol_iboffload_collreq.c \ bcol_iboffload_qp_info.c \ bcol_iboffload_qp_info.h \ bcol_iboffload_fanin.c \ - 
bcol_iboffload_fanout.c + bcol_iboffload_fanout.c \ + bcol_iboffload_allreduce.c # Make the output library in this directory, and name it either # mca__.la (for DSO builds) or libmca__.la diff --git a/ompi/mca/bcol/iboffload/bcol_iboffload_allgather.c b/ompi/mca/bcol/iboffload/bcol_iboffload_allgather.c new file mode 100644 index 0000000000..bd45246b28 --- /dev/null +++ b/ompi/mca/bcol/iboffload/bcol_iboffload_allgather.c @@ -0,0 +1,1385 @@ +/* + * Copyright (c) 2009-2012 Oak Ridge National Laboratory. All rights reserved. + * Copyright (c) 2009-2012 Mellanox Technologies. All rights reserved. + * $COPYRIGHT$ + * + * Additional copyrights may follow + * + * $HEADER$ + */ + +#include "ompi_config.h" + +#include +#include +#include +#include +#include +#include + +#include "bcol_iboffload.h" +#include "bcol_iboffload_alltoall.h" +#include "bcol_iboffload_bcast.h" +#include "bcol_iboffload_frag.h" +#include "bcol_iboffload_task.h" +#include "bcol_iboffload_collreq.h" +#include "bcol_iboffload_collfrag.h" +#include "bcol_iboffload_endpoint.h" + +#include "opal/include/opal/types.h" + +static int mca_bcol_iboffload_allgather_init( + bcol_function_args_t *fn_arguments, + mca_bcol_iboffload_module_t *iboffload_module, + mca_bcol_iboffload_collreq_t **coll_request, + bool if_bcol_last, int mq_credits, + collective_message_progress_function progress_fn) +{ + int rc; + + ompi_free_list_item_t *item; + mca_bcol_iboffload_collfrag_t *coll_fragment; + mca_bcol_iboffload_component_t *cm = &mca_bcol_iboffload_component; + + OMPI_FREE_LIST_WAIT(&cm->collreqs_free, item, rc); + if (OPAL_UNLIKELY(OMPI_SUCCESS != rc)) { + IBOFFLOAD_ERROR(("Wait for free list failed.\n")); + return rc; + } + /* setup call request */ + (*coll_request) = (mca_bcol_iboffload_collreq_t *) item; + + (*coll_request)->n_fragments = 0; + (*coll_request)->n_frags_sent = 0; + (*coll_request)->n_frag_mpi_complete = 0; + (*coll_request)->n_frag_net_complete = 0; + (*coll_request)->if_bcol_last = if_bcol_last; + (*coll_request)->ml_buffer_index = fn_arguments->buffer_index; + (*coll_request)->completion_cb_fn = NULL; + (*coll_request)->buffer_info[SBUF].buf = (void *) ( + (unsigned char *)fn_arguments->sbuf + + fn_arguments->sbuf_offset); + (*coll_request)->buffer_info[RBUF].buf = (void *) ( + (unsigned char *)fn_arguments->rbuf + + fn_arguments->rbuf_offset); + (*coll_request)->buffer_info[SBUF].offset = fn_arguments->sbuf_offset; + (*coll_request)->buffer_info[RBUF].offset = fn_arguments->rbuf_offset; + /* seems like we should initialize the memory registration pointer to NULL here */ + (*coll_request)->buffer_info[SBUF].iboffload_reg = NULL; + (*coll_request)->buffer_info[RBUF].iboffload_reg = NULL; + (*coll_request)->dtype = fn_arguments->dtype; + (*coll_request)->count = fn_arguments->count; + (*coll_request)->module = iboffload_module; + /* TODO Pasha: we need it for pending quque. Set it later. */ + (*coll_request)->progress_fn = progress_fn; + /* TODO Pasha: fix it later */ + (*coll_request)->qp_index = MCA_BCOL_IBOFFLOAD_QP_BARRIER; + + (*coll_request)->order_info = &fn_arguments->order_info; + + coll_fragment = &((*coll_request)->first_collfrag); + mca_bcol_iboffload_collfrag_init(coll_fragment); + + /** Vasily ????? 
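
The init routine resolves the user's source and receive pointers by adding the ML-supplied byte offsets up front, and every later offset used by the algorithm is relative to these resolved bases. A one-function sketch of that address fix-up with stand-in names; resolve_buffer is hypothetical and only mirrors the (unsigned char *)buf + offset arithmetic done for buffer_info[SBUF] and buffer_info[RBUF].

#include <stdint.h>

/* Stand-in for the sbuf/rbuf fix-up in the init routine: the ML layer hands
 * down a base pointer plus a byte offset, and the bcol works with the resolved
 * address from then on. */
static void *resolve_buffer(void *base, uint64_t offset_bytes)
{
    return (void *)((unsigned char *)base + offset_bytes);
}
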
*/ + /* mq_credits = (*coll_request)->total_tasks_num; */ + coll_fragment->mq_credits = mq_credits; + coll_fragment->mq_index = COLL_MQ; + /* pasha: just set it to zero */ + coll_fragment->last_wait_num = 0; + coll_fragment->alg = -2; /* used only for debug */ + /* + if (my_rank == algthm_ptr->root) { + coll_fragment->last_wait_num = 0; + } else { + coll_fragment->last_wait_num = algth_lst->last_wait_num; + } + */ + /* Pasha: we have nothing to unpack */ + coll_fragment->unpack_size = 0; + /* coll_fragment->unpack_size = pack_len; */ + /* coll_fragment->alg = RECURSIVE_DOUBLING_TREE_BCAST; */ + + /* set pointers for (coll frag) <-> (coll full request) */ + (*coll_request)->user_handle_freed = false; + + fn_arguments->bcol_opaque_data = (void *) (*coll_request); + /* We don't have root.. + if (true == fn_arguments->root_flag) { + (*coll_request)->root = my_group_index; + } else { + (*coll_request)->root = fn_arguments->root_route->rank; + } + */ + + MCA_BCOL_IBOFFLOAD_SET_COLL_REQ_LINKS((*coll_request), coll_fragment); + return OMPI_SUCCESS; +} + +#if 1 +static inline void bcol_iboffload_setup_allgather_endpoints_connection(mca_bcol_iboffload_module_t *iboffload) +{ + int i, j; + /*Seems that we don't require this*/ + netpatterns_k_exchange_node_t *exchange_node = &iboffload->knomial_allgather_tree; + + mca_bcol_iboffload_endpoint_t *ep; + + IBOFFLOAD_VERBOSE(10, ("Open connections.\n")); +#if 0 + fprintf(stderr,"Entering Open Connections\n"); +#endif + + /* start with extras and proxy connections */ + if(exchange_node->n_extra_sources > 0) { + /* connect to endpoint */ + /*ep = iboffload->endpoints[comm_to_ibnet[exchange_node->rank_extra_sources_array[0]]];*/ + ep = iboffload->endpoints[exchange_node->rank_extra_sources_array[0]]; + while (OMPI_SUCCESS != + check_endpoint_state(ep, NULL, NULL)) { + opal_progress(); + } + } + /* now move through the recursive k-ing exchanges */ + if(NULL != exchange_node->rank_exchanges) { + for( i = 0; i < exchange_node->log_tree_order; i++) { + for( j = 0; j < ( exchange_node->tree_order - 1 ); j++) { + if( exchange_node->rank_exchanges[i][j] < 0 ){ + continue; + } + /* connect to endpoint */ + /*ep = iboffload->endpoints[comm_to_ibnet[exchange_node->rank_exchanges[i][j]]];*/ + ep = iboffload->endpoints[exchange_node->rank_exchanges[i][j]]; + if (iboffload->ibnet->super.my_index < ep->index) { + while(0 == (ep)->remote_zero_rdma_addr.addr) { + opal_progress(); + } + } else { + IBOFFLOAD_VERBOSE(10, ("Trying to connect - %d", ep->index)); + while (OMPI_SUCCESS != + check_endpoint_state(ep, NULL, NULL)) { + opal_progress(); + } + } + + } + } + } + + /* set the connection status to connected */ + iboffload->connection_status[ALLGATHER_KNOMIAL_ALG] = true; +} +#endif + + +static inline void bcol_iboffload_setup_allgather_ring_endpoints_connection(mca_bcol_iboffload_module_t *iboffload) +{ + int i; + const int group_size = iboffload->ibnet->super.group_size; + mca_bcol_iboffload_endpoint_t *ep; + + IBOFFLOAD_VERBOSE(10, ("Open connections.\n")); + + /* this is algorithm specific - need to move through the algorithm here basically to set up connections, should be + * + */ + + /* I'm going to leave this alone for now, because I'm + * not sure how these endpoints map back to ibnet. Is it mapped to ibnet ids or to communicator ids? 
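
Both connection-setup helpers use the same lazy-connect idiom: for every peer the algorithm will touch, spin on check_endpoint_state() while driving opal_progress(), then latch a per-algorithm connection_status flag so the cost is paid only on first use. A generic sketch of that idiom with hypothetical names (try_connect and make_progress stand in for check_endpoint_state and opal_progress).

#include <stdbool.h>

/* Hypothetical stand-ins for check_endpoint_state() and opal_progress(). */
extern int  try_connect(int peer);      /* returns 0 once the endpoint is ready */
extern void make_progress(void);

static bool connected_once = false;

/* Lazily connect every peer this algorithm will use, exactly once, and
 * remember that the connections are in place (cf. connection_status[]). */
static void lazy_connect(const int *peers, int npeers)
{
    if (connected_once) {
        return;
    }
    for (int i = 0; i < npeers; i++) {
        while (0 != try_connect(peers[i])) {
            make_progress();            /* let the network layer advance */
        }
    }
    connected_once = true;
}
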
+ */ + for (i = 0; i < group_size; i++) { + ep = iboffload->endpoints[i]; + while (OMPI_SUCCESS != + check_endpoint_state(ep, NULL, NULL)) { + opal_progress(); + } + } + + /* set the connection status to connected */ + + /*JSL - change this macro */ + iboffload->connection_status[ALLGATHER_NEIGHBOR_ALG] = true; +} + +#if 0 +/* allgather neighbor exchange algorithm N/2 communication steps, 2 connections */ +static int mca_bcol_iboffload_neighbor_allgather_userbuffer_exec(mca_bcol_iboffload_module_t *iboffload_module, + mca_bcol_iboffload_collreq_t *coll_request) +{ + int rc, + src, dst; + + uint32_t pack_len; + int my_group_index = iboffload_module->super.sbgp_partner_module->my_index; + int group_size = iboffload_module->group_size; + int step, roffset, soffset; + int neighbor[2], offset_at_step[2], recv_data_from[2], send_data_from; + int even_rank; + int parity; + + struct mqe_task *last_send = NULL, + *last_wait = NULL; + mca_bcol_iboffload_collfrag_t *coll_fragment = &coll_request->first_collfrag; + +#if 0 + fprintf(stderr,"entering large msg neighbor exchange allgather\n"); +#endif + IBOFFLOAD_VERBOSE(10,("Entering large msg iboffload allgather")); + if (OPAL_UNLIKELY(!iboffload_module->connection_status[ALLGATHER_NEIGHBOR_ALG])) { + IBOFFLOAD_VERBOSE(10,("Allgather open new connection ")); + bcol_iboffload_setup_allgather_ring_endpoints_connection(iboffload_module); + } + + pack_len = coll_request->count * coll_request->dtype->super.size; + IBOFFLOAD_VERBOSE(10,("My packet length %d pack_len frag_count %d dtype size %d ", + pack_len, + coll_request->count, + coll_request->dtype->super.size)); + + /* register send and receive sides */ + /* send side, only sending pack_len data */ + + /* I think that probably I will only register the rbuf */ + /* on receive side I need to register pack_len*group_size data */ + rc = mca_bcol_iboffload_prepare_buffer(coll_request->buffer_info[RBUF].buf, pack_len * group_size, + &coll_request->buffer_info[RBUF].iboffload_reg, iboffload_module); + if (OPAL_UNLIKELY(OMPI_SUCCESS != rc)) { + IBOFFLOAD_ERROR(("Cannot register memory: " + "addr - %p, %d bytes.\n", + coll_request->buffer_info[RBUF].buf, pack_len)); + return OMPI_ERROR; + } + coll_request->buffer_info[RBUF].lkey = coll_request->buffer_info[RBUF].iboffload_reg->mr->lkey; + + /* it is estimated mq consumption... 
*/ + if (OPAL_UNLIKELY(false == BCOL_IBOFFLOAD_MQ_HAVE_CREDITS( + iboffload_module, coll_fragment->mq_index, coll_fragment->mq_credits))) { + IBOFFLOAD_VERBOSE(10, ("There are not enough credits on MQ.\n")); + goto out_of_resources; + } + + coll_fragment->tail_next = &coll_fragment->to_post; + + + /* start the neighbor exchange */ + + even_rank = !(my_group_index % 2); + if (even_rank) { + neighbor[0] = (my_group_index + 1) % group_size; + neighbor[1] = (my_group_index - 1 + group_size) % group_size; + recv_data_from[0] = my_group_index; + recv_data_from[1] = my_group_index; + offset_at_step[0] = (+2); + offset_at_step[1] = (-2); + } else { + neighbor[0] = (my_group_index - 1 + group_size) % group_size; + neighbor[1] = (my_group_index + 1) % group_size; + recv_data_from[0] = neighbor[0]; + recv_data_from[1] = neighbor[0]; + offset_at_step[0] = (-2); + offset_at_step[1] = (+2); + } + + /* first step is special step, only send one block */ + roffset = neighbor[0]*pack_len; + soffset = my_group_index*pack_len; + /* send receive this */ + + dst = neighbor[0]; + src = neighbor[0]; + + rc = mca_bcol_iboffload_send_rtr_setup(&last_send, + src, iboffload_module, + coll_fragment); + if (OPAL_UNLIKELY(OMPI_SUCCESS != rc)) { + IBOFFLOAD_VERBOSE(10, ("Failed to mca_bcol_iboffload_send_rtr_setup")); + if (OMPI_ERR_TEMP_OUT_OF_RESOURCE == rc){ + goto out_of_resources; + } + return OMPI_ERROR; + } + + + rc = mca_bcol_iboffload_recv_rtr_setup( + &last_wait, dst, iboffload_module, coll_fragment); + /* send the data */ + if (OPAL_UNLIKELY(OMPI_SUCCESS != rc)) { + IBOFFLOAD_VERBOSE(10, ("Failed to" + "mca_bcol_iboffload_recv_rtr_setup")); + if (OMPI_ERR_TEMP_OUT_OF_RESOURCE == rc){ + goto out_of_resources; + } + return OMPI_ERROR; + } + + rc = mca_bcol_iboffload_send_large_buff_setup( + &last_send, RBUF, + coll_request->buffer_info[RBUF].offset + + soffset/* offset calc */ , + pack_len, dst, + iboffload_module, coll_fragment); + + if (OPAL_UNLIKELY(OMPI_SUCCESS != rc)) { + IBOFFLOAD_VERBOSE(10, ("Failed to" + "mca_bcol_iboffload_send_large_buff_setup")); + if (OMPI_ERR_TEMP_OUT_OF_RESOURCE == rc){ + goto out_of_resources; + } + return OMPI_ERROR; + } + /* send is done */ + + + + rc = mca_bcol_iboffload_recv_large_buff_setup(&last_wait, RBUF, + coll_request->buffer_info[RBUF].offset + + roffset, + pack_len, src, + iboffload_module, coll_fragment); + if (OPAL_UNLIKELY(OMPI_SUCCESS != rc)) { + IBOFFLOAD_VERBOSE(10, ("Failed to mca_bcol_iboffload_recv_large_buff_setup")); + if (OMPI_ERR_TEMP_OUT_OF_RESOURCE == rc){ + goto out_of_resources; + } + return OMPI_ERROR; + } + + /* now for the actual neighbor exchange algorithm */ + + + /* determine initial send location */ + if(even_rank) { + send_data_from = my_group_index; + }else { + send_data_from = recv_data_from[0]; + } + for( step = 1; step < (group_size/2); step++) { + + parity = step % 2; + recv_data_from[parity] = + (recv_data_from[parity] + offset_at_step[parity] + group_size) % group_size; + src = neighbor[parity]; + dst = src; + + roffset = recv_data_from[parity] * pack_len; + soffset = send_data_from * pack_len; + + /* post send rtr and recev rtr together */ + if( 1 == step ){ + rc = mca_bcol_iboffload_send_rtr_setup(&last_send, + src, iboffload_module, + coll_fragment); + if (OPAL_UNLIKELY(OMPI_SUCCESS != rc)) { + IBOFFLOAD_VERBOSE(10, ("Failed to mca_bcol_iboffload_send_rtr_setup")); + if (OMPI_ERR_TEMP_OUT_OF_RESOURCE == rc){ + goto out_of_resources; + } + return OMPI_ERROR; + } + + rc = mca_bcol_iboffload_recv_rtr_setup( + &last_wait, dst, 
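
The (currently disabled) neighbor-exchange allgather pairs each even rank with rank+1 and each odd rank with rank-1, ships a single block on the special first step, and thereafter moves two blocks per step while alternating between the two neighbors, so after N/2 steps every rank holds all N blocks. A self-contained sketch of just the schedule (which blocks arrive from which neighbor at each step), using the same neighbor[], recv_data_from[] and offset_at_step[] initialization as the code above; the printing is purely illustrative.

#include <stdio.h>

static void print_neighbor_exchange_schedule(int rank, int size)
{
    int even = !(rank % 2);
    int neighbor[2], recv_from[2], offset[2];

    if (even) {
        neighbor[0] = (rank + 1) % size;
        neighbor[1] = (rank - 1 + size) % size;
        recv_from[0] = recv_from[1] = rank;
        offset[0] = +2;  offset[1] = -2;
    } else {
        neighbor[0] = (rank - 1 + size) % size;
        neighbor[1] = (rank + 1) % size;
        recv_from[0] = recv_from[1] = neighbor[0];
        offset[0] = -2;  offset[1] = +2;
    }

    /* first step: one block, from/for neighbor[0] */
    printf("step 0: recv block %d from rank %d\n", neighbor[0], neighbor[0]);

    /* remaining steps: two consecutive blocks per step, alternating neighbors */
    for (int step = 1; step < size / 2; step++) {
        int parity = step % 2;
        recv_from[parity] = (recv_from[parity] + offset[parity] + size) % size;
        printf("step %d: recv 2 blocks starting at block %d from rank %d\n",
               step, recv_from[parity], neighbor[parity]);
    }
}
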
iboffload_module, coll_fragment); + /* send the data */ + if (OPAL_UNLIKELY(OMPI_SUCCESS != rc)) { + IBOFFLOAD_VERBOSE(10, ("Failed to" + "mca_bcol_iboffload_recv_rtr_setup")); + if (OMPI_ERR_TEMP_OUT_OF_RESOURCE == rc){ + goto out_of_resources; + } + return OMPI_ERROR; + } + } + + + /* I'm using the hierarchy offset used in the k-nomial allgather */ + /* this won't work...*/ + rc = mca_bcol_iboffload_send_large_buff_setup( + &last_send, RBUF, + coll_request->buffer_info[RBUF].offset + + soffset/* offset calc */ , + 2 * pack_len, dst, + iboffload_module, coll_fragment); + + if (OPAL_UNLIKELY(OMPI_SUCCESS != rc)) { + IBOFFLOAD_VERBOSE(10, ("Failed to" + "mca_bcol_iboffload_send_large_buff_setup")); + if (OMPI_ERR_TEMP_OUT_OF_RESOURCE == rc){ + goto out_of_resources; + } + return OMPI_ERROR; + } + /* send is done */ + + + rc = mca_bcol_iboffload_recv_large_buff_setup(&last_wait, RBUF, + coll_request->buffer_info[RBUF].offset + + roffset, + 2 * pack_len, src, + iboffload_module, coll_fragment); + if (OPAL_UNLIKELY(OMPI_SUCCESS != rc)) { + IBOFFLOAD_VERBOSE(10, ("Failed to mca_bcol_iboffload_recv_large_buff_setup")); + if (OMPI_ERR_TEMP_OUT_OF_RESOURCE == rc){ + goto out_of_resources; + } + return OMPI_ERROR; + } + send_data_from = recv_data_from[parity]; + + } + + /* end of list */ + *coll_fragment->tail_next = NULL; + + /* finish initializing full message descriptor */ + (coll_request)->n_fragments = 1; + (coll_request)->n_frags_sent = 1; + + assert(NULL != last_wait); + last_wait->flags |= MQE_WR_FLAG_SIGNAL; + coll_fragment->signal_task_wr_id = last_wait->wr_id; + last_wait->wr_id = (uint64_t) (uintptr_t) coll_fragment; + + assert(MCA_COLL_ML_NO_BUFFER == coll_request->ml_buffer_index); + /* post the mwr */ + rc = mca_bcol_iboffload_post_mqe_tasks(iboffload_module, coll_fragment->to_post); + if (OPAL_UNLIKELY(OMPI_SUCCESS != rc)) { + IBOFFLOAD_VERBOSE(10, ("MQE task posting failing.\n")); + /* Note: need to clean up */ + return rc; + } + + MCA_BCOL_UPDATE_ORDER_COUNTER(&iboffload_module->super, coll_request->order_info); + + IBOFFLOAD_VERBOSE(10, ("Return success.\n")); + return BCOL_FN_STARTED; + +out_of_resources: + /* Release all resources */ + IBOFFLOAD_VERBOSE(10, ("Allgather, adding collfrag to collfrag_pending.\n")); + rc = + mca_bcol_iboffload_free_resources_and_move_to_pending(coll_fragment, iboffload_module); + return (OMPI_SUCCESS != rc) ? BCOL_FN_NOT_STARTED : BCOL_FN_STARTED; +} +#endif + +#if 0 +/* debug connection routine */ +static inline void bcol_iboffload_setup_allgather_endpoints_connection(mca_bcol_iboffload_module_t *iboffload) +{ + int i; + const int group_size = iboffload->ibnet->super.group_size; + mca_bcol_iboffload_endpoint_t *ep; + + IBOFFLOAD_VERBOSE(10, ("Open connections.\n")); + + /* this is algorithm specific - need to move through the algorithm here basically to set up connections, should be + * + */ + + /* I'm going to leave this alone for now, because I'm + * not sure how these endpoints map back to ibnet. Is it mapped to ibnet ids or to communicator ids? 
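
Every large-message transfer in these paths is gated by an explicit ready-to-receive (RTR) handshake: the receiving side announces with a small RTR message that its large receive is in place, and the sender's large send is chained behind a wait for that RTR, so data never targets an unprepared buffer (the commented-out proxy path above even documents the corruption seen when this ordering is bypassed). A reduced sketch of the ordering for one peer pair, with hypothetical helpers standing in for the send_rtr/recv_rtr/large_buff setup calls; the real code builds MQE task chains rather than calling anything directly.

extern void post_large_recv(int peer, void *buf, unsigned len);
extern void send_rtr(int peer);
extern void wait_rtr(int peer);
extern void send_large(int peer, const void *buf, unsigned len);

/* Receiver side of one RTR-gated exchange: make the buffer ready, then tell
 * the peer it may send. */
static void rtr_receive(int peer, void *buf, unsigned len)
{
    post_large_recv(peer, buf, len);
    send_rtr(peer);
}

/* Sender side: the large send must not start before the peer's RTR arrives. */
static void rtr_send(int peer, const void *buf, unsigned len)
{
    wait_rtr(peer);
    send_large(peer, buf, len);
}
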
+ */ + for (i = 0; i < group_size; i++) { + ep = iboffload->endpoints[i]; + while (OMPI_SUCCESS != + check_endpoint_state(ep, NULL, NULL)) { + opal_progress(); + } + } + + /* set the connection status to connected */ + + /*JSL - change this macro */ + iboffload->connection_status[ALLGATHER_KNOMIAL_ALG] = true; +} +#endif + +static int mca_bcol_iboffload_k_nomial_allgather_userbuffer_exec(mca_bcol_iboffload_module_t *iboffload_module, + mca_bcol_iboffload_collreq_t *coll_request) +{ + int rc, + src, dst, comm_dst, comm_src; + int tree_order, pow_k, i, j; + + uint32_t pack_len; + int my_group_index = iboffload_module->super.sbgp_partner_module->my_index; + int group_size = iboffload_module->group_size; + int *group_list = iboffload_module->super.sbgp_partner_module->group_list; + int my_comm_index = group_list[my_group_index]; + + netpatterns_k_exchange_node_t *exchange_node = &iboffload_module->knomial_allgather_tree; + + struct mqe_task *last_send = NULL, + *last_wait = NULL; + mca_bcol_iboffload_collfrag_t *coll_fragment = &coll_request->first_collfrag; + +#if 0 + fprintf(stderr,"entering large msg allgather\n"); +#endif + IBOFFLOAD_VERBOSE(10,("Entering large msg iboffload allgather")); + if (OPAL_UNLIKELY(!iboffload_module->connection_status[ALLGATHER_KNOMIAL_ALG])) { + IBOFFLOAD_VERBOSE(10,("Allgather open new connection ")); + bcol_iboffload_setup_allgather_endpoints_connection(iboffload_module); + } + + pack_len = coll_request->count * coll_request->dtype->super.size; + IBOFFLOAD_VERBOSE(10,("My packet length %d pack_len frag_count %d dtype size %d ", + pack_len, + coll_request->count, + coll_request->dtype->super.size)); + + /* register send and receive sides */ + /* send side, only sending pack_len data */ + + /* I think that probably I will only register the rbuf */ + /* on receive side I need to register pack_len*group_size data */ + + rc = mca_bcol_iboffload_prepare_buffer(coll_request->buffer_info[RBUF].buf, pack_len * group_size, + &coll_request->buffer_info[RBUF].iboffload_reg, iboffload_module); + if (OPAL_UNLIKELY(OMPI_SUCCESS != rc)) { + IBOFFLOAD_ERROR(("Cannot register memory: " + "addr - %p, %d bytes.\n", + coll_request->buffer_info[RBUF].buf, pack_len)); + return OMPI_ERROR; + } + coll_request->buffer_info[RBUF].lkey = coll_request->buffer_info[RBUF].iboffload_reg->mr->lkey; + + /* it is estimated mq consumption... 
*/ + if (OPAL_UNLIKELY(false == BCOL_IBOFFLOAD_MQ_HAVE_CREDITS( + iboffload_module, coll_fragment->mq_index, coll_fragment->mq_credits))) { + IBOFFLOAD_VERBOSE(10, ("There are not enough credits on MQ.\n")); + goto out_of_resources; + } + + coll_fragment->tail_next = &coll_fragment->to_post; + + /* start with the extra / proxy phase */ + if( EXTRA_NODE == exchange_node->node_type ) { + + + /* send pack_len data to proxy */ + comm_dst = exchange_node->rank_extra_sources_array[0]; + /* get ib subnet id */ + dst = comm_dst; /* comm_to_ibnet[comm_dst];*/ + /* post ready-to-receive receive on sender's side */ + rc = mca_bcol_iboffload_recv_rtr_setup( + &last_wait, dst, iboffload_module, coll_fragment); + + /* send the data */ + if (OPAL_UNLIKELY(OMPI_SUCCESS != rc)) { + IBOFFLOAD_VERBOSE(10, ("Failed to" + "mca_bcol_iboffload_recv_rtr_setup")); + if (OMPI_ERR_TEMP_OUT_OF_RESOURCE == rc){ + goto out_of_resources; + } + return OMPI_ERROR; + } + + rc = mca_bcol_iboffload_send_large_buff_setup( + &last_send, RBUF, coll_request->buffer_info[RBUF].offset + my_comm_index*pack_len, + pack_len, dst, + iboffload_module, coll_fragment); + + if (OPAL_UNLIKELY(OMPI_SUCCESS != rc)) { + IBOFFLOAD_VERBOSE(10, ("Failed to" + "mca_bcol_iboffload_send_large_buff_setup")); + if (OMPI_ERR_TEMP_OUT_OF_RESOURCE == rc){ + goto out_of_resources; + } + return OMPI_ERROR; + } + /* send is done */ + + /* post the receive */ + comm_src = comm_dst; + src = dst; + /* Sending this results in a race condition where if the rtr send bypasses + the large msg receive on proxy's side, then it triggers the start of the + recurssive k-ing phase prematurely causing random data corruption. + */ + /* + rc = mca_bcol_iboffload_send_rtr_setup(&last_send, + src, iboffload_module, + coll_fragment); + if (OPAL_UNLIKELY(OMPI_SUCCESS != rc)) { + IBOFFLOAD_VERBOSE(10, ("Failed to mca_bcol_iboffload_send_rtr_setup")); + if (OMPI_ERR_TEMP_OUT_OF_RESOURCE == rc){ + goto out_of_resources; + } + return OMPI_ERROR; + } + */ + rc = mca_bcol_iboffload_recv_large_buff_setup(&last_wait, + RBUF, coll_request->buffer_info[RBUF].offset, + pack_len*group_size, src, + iboffload_module, coll_fragment); + if (OPAL_UNLIKELY(OMPI_SUCCESS != rc)) { + IBOFFLOAD_VERBOSE(10, ("Failed to mca_bcol_iboffload_recv_large_buff_setup")); + if (OMPI_ERR_TEMP_OUT_OF_RESOURCE == rc){ + goto out_of_resources; + } + return OMPI_ERROR; + } + + goto FINISHED; + + + } else if( 0 < exchange_node->n_extra_sources ) { + + /* am a proxy, receive pack_len data from extra */ + comm_src = exchange_node->rank_extra_sources_array[0]; + /* get ib subnet */ + src = comm_src; /*comm_to_ibnet[comm_src];*/ + + rc = mca_bcol_iboffload_send_rtr_setup(&last_send, + src, iboffload_module, + coll_fragment); + if (OPAL_UNLIKELY(OMPI_SUCCESS != rc)) { + IBOFFLOAD_VERBOSE(10, ("Failed to mca_bcol_iboffload_send_rtr_setup")); + if (OMPI_ERR_TEMP_OUT_OF_RESOURCE == rc){ + goto out_of_resources; + } + return OMPI_ERROR; + } + + + rc = mca_bcol_iboffload_recv_large_buff_setup(&last_wait, + RBUF, coll_request->buffer_info[RBUF].offset + pack_len*comm_src, + pack_len, src, + iboffload_module, coll_fragment); + if (OPAL_UNLIKELY(OMPI_SUCCESS != rc)) { + IBOFFLOAD_VERBOSE(10, ("Failed to mca_bcol_iboffload_recv_large_buff_setup")); + if (OMPI_ERR_TEMP_OUT_OF_RESOURCE == rc){ + goto out_of_resources; + } + return OMPI_ERROR; + } + + } + + /* start recursive k - ing */ + tree_order = exchange_node->tree_order; + pow_k = exchange_node->log_tree_order; + for( i = 0; i < pow_k; i++) { + + + /* Post ready-to-recv 
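
When the group size is not a power of the tree order, the k-nomial exchange designates the leftover ranks as EXTRA_NODEs and their partners as proxies: an extra rank contributes its single block to its proxy before the recursive phase, sits out the k-ing exchanges entirely, and receives the fully assembled result back from the proxy at the end. A small sketch of the role split and the block counts involved; the proxy pairing shown (rank - n_extra) is purely for illustration, since the real pairing comes from netpatterns_k_exchange_node_t.

#include <stdio.h>

/* Classify a rank for a radix-k exchange over 'size' ranks, each owning one
 * block: ranks beyond the largest power of k are "extra" and hand their block
 * to a proxy inside that power. */
static void classify(int rank, int size, int k)
{
    int pow_k = 1;
    while (pow_k * k <= size) {
        pow_k *= k;
    }
    int n_extra = size - pow_k;

    if (rank >= pow_k) {
        printf("rank %d: extra, sends 1 block to proxy %d, receives %d blocks back\n",
               rank, rank - n_extra, size);
    } else if (rank >= pow_k - n_extra) {
        printf("rank %d: proxy, also forwards on behalf of extra rank %d\n",
               rank, rank + n_extra);
    } else {
        printf("rank %d: participates only in the recursive phase\n", rank);
    }
}
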
messages - I am here */ + for( j = 0; j <( tree_order - 1); j++) { + comm_src = exchange_node->rank_exchanges[i][j]; + if( comm_src < 0 ){ + continue; + } + /* get ib subnet */ + src = comm_src; /*comm_to_ibnet[comm_src];*/ + + rc = mca_bcol_iboffload_send_rtr_setup(&last_send, + src, iboffload_module, + coll_fragment); + if (OPAL_UNLIKELY(OMPI_SUCCESS != rc)) { + IBOFFLOAD_VERBOSE(10, ("Failed to mca_bcol_iboffload_send_rtr_setup")); + if (OMPI_ERR_TEMP_OUT_OF_RESOURCE == rc){ + goto out_of_resources; + } + return OMPI_ERROR; + } + } + + /* Post receive ready-to-recev message - I can send to you */ + for( j = 0; j < (tree_order - 1); j++) { + /* recev ready-to-receive message */ + comm_dst = exchange_node->rank_exchanges[i][j]; + /* remember, if we have extra ranks, then we won't participate + * with a least one peer. Make a check: + */ + if( comm_dst < 0 ){ + continue; + } + + /* get ib subnet id */ + dst = comm_dst; /*comm_to_ibnet[comm_dst];*/ + /* post ready-to-receive receive on sender's side */ + rc = mca_bcol_iboffload_recv_rtr_setup( + &last_wait, dst, iboffload_module, coll_fragment); + /* send the data */ + if (OPAL_UNLIKELY(OMPI_SUCCESS != rc)) { + IBOFFLOAD_VERBOSE(10, ("Failed to" + "mca_bcol_iboffload_recv_rtr_setup")); + if (OMPI_ERR_TEMP_OUT_OF_RESOURCE == rc){ + goto out_of_resources; + } + return OMPI_ERROR; + } + } + + + /* (k-1) sends */ + for( j = 0; j < (tree_order - 1); j++ ) { + + /* send phase + */ + comm_dst = exchange_node->rank_exchanges[i][j]; + /* remember, if we have extra ranks, then we won't participate + * with a least one peer. Make a check + */ + if( comm_dst < 0 ){ + continue; + } + + /* get ib subnet id */ + dst = comm_dst; /*comm_to_ibnet[comm_dst];*/ + rc = mca_bcol_iboffload_send_large_buff_setup( + &last_send, RBUF, + coll_request->buffer_info[RBUF].offset + pack_len*exchange_node->payload_info[i][j].s_offset/* offset calc */ , + exchange_node->payload_info[i][j].s_len*pack_len, dst, + iboffload_module, coll_fragment); + + if (OPAL_UNLIKELY(OMPI_SUCCESS != rc)) { + IBOFFLOAD_VERBOSE(10, ("Failed to" + "mca_bcol_iboffload_send_large_buff_setup")); + if (OMPI_ERR_TEMP_OUT_OF_RESOURCE == rc){ + goto out_of_resources; + } + return OMPI_ERROR; + } + /* send is done */ + + } + + /* we post receives after all sends in order to achieve concurrent + * sends as well as assuring blocking until completely receiving + * all data at level k before starting level k+1 sends + */ + /* (k-1) receives - these are blocking */ + for( j = 0; j < (tree_order - 1); j++) { + /*recv phase */ + comm_src = exchange_node->rank_exchanges[i][j]; + if( comm_src < 0 ){ + continue; + } + /* get ib subnet */ + src = comm_src; /*comm_to_ibnet[comm_src];*/ + + rc = mca_bcol_iboffload_recv_large_buff_setup(&last_wait, RBUF, + coll_request->buffer_info[RBUF].offset + pack_len*exchange_node->payload_info[i][j].r_offset, + exchange_node->payload_info[i][j].r_len*pack_len, src, + iboffload_module, coll_fragment); + if (OPAL_UNLIKELY(OMPI_SUCCESS != rc)) { + IBOFFLOAD_VERBOSE(10, ("Failed to mca_bcol_iboffload_recv_large_buff_setup")); + if (OMPI_ERR_TEMP_OUT_OF_RESOURCE == rc){ + goto out_of_resources; + } + return OMPI_ERROR; + } + + + + } + + + } + + /* last step, just send it back to the extra if I have one */ + if( 0 < exchange_node->n_extra_sources ) { + + comm_dst = exchange_node->rank_extra_sources_array[0]; + + /* get ib subnet id */ + dst = comm_dst; /*comm_to_ibnet[comm_dst];*/ + /* + rc = mca_bcol_iboffload_recv_rtr_setup( + &last_wait, dst, iboffload_module, coll_fragment); + 
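
In the recursive k-ing phase each rank talks to (k-1) partners per level for log_k(N) levels, and the amount of data exchanged with each partner grows by a factor of k per level; that growth is what the payload_info[i][j].s_len / r_len fields encode in units of one block. A short illustrative calculation of the per-level sizes, assuming every rank starts with one block of pack_len bytes; the real per-partner lengths and offsets come from payload_info[][].

#include <stdio.h>

/* Per-level payload in a radix-k allgather over 'size' ranks: at level i every
 * rank already owns k^i blocks and trades them with each of its (k-1) partners,
 * so after log_k(size) levels it owns all of them. */
static void print_levels(int size, int k, unsigned pack_len)
{
    int owned = 1;
    for (int level = 0; owned < size; level++) {
        printf("level %d: send/recv %d block(s) = %u bytes per partner\n",
               level, owned, (unsigned)(owned * pack_len));
        owned *= k;
    }
}
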
+ // send the data + we are already guaranteed that extra rank is waiting + if (OPAL_UNLIKELY(OMPI_SUCCESS != rc)) { + IBOFFLOAD_VERBOSE(10, ("Failed to" + "mca_bcol_iboffload_recv_rtr_setup")); + if (OMPI_ERR_TEMP_OUT_OF_RESOURCE == rc){ + goto out_of_resources; + } + return OMPI_ERROR; + } + */ + + rc = mca_bcol_iboffload_send_large_buff_setup( + &last_send, RBUF, coll_request->buffer_info[RBUF].offset, + pack_len*group_size, dst, + iboffload_module, coll_fragment); + + if (OPAL_UNLIKELY(OMPI_SUCCESS != rc)) { + IBOFFLOAD_VERBOSE(10, ("Failed to" + "mca_bcol_iboffload_send_large_buff_setup")); + if (OMPI_ERR_TEMP_OUT_OF_RESOURCE == rc){ + goto out_of_resources; + } + return OMPI_ERROR; + } + /* send is done */ + + } + +FINISHED: + + /* end of list */ + *coll_fragment->tail_next = NULL; + + /* finish initializing full message descriptor */ + (coll_request)->n_fragments = 1; + (coll_request)->n_frags_sent = 1; + + assert(NULL != last_wait); + last_wait->flags |= MQE_WR_FLAG_SIGNAL; + coll_fragment->signal_task_wr_id = last_wait->wr_id; + last_wait->wr_id = (uint64_t) (uintptr_t) coll_fragment; + + assert(MCA_COLL_ML_NO_BUFFER == coll_request->ml_buffer_index); + /* post the mwr */ + rc = mca_bcol_iboffload_post_mqe_tasks(iboffload_module, coll_fragment->to_post); + if (OPAL_UNLIKELY(OMPI_SUCCESS != rc)) { + IBOFFLOAD_VERBOSE(10, ("MQE task posting failing.\n")); + /* Note: need to clean up */ + return rc; + } + + MCA_BCOL_UPDATE_ORDER_COUNTER(&iboffload_module->super, coll_request->order_info); + + IBOFFLOAD_VERBOSE(10, ("Return success.\n")); + return BCOL_FN_STARTED; + +out_of_resources: + /* Release all resources */ + IBOFFLOAD_VERBOSE(10, ("Allgather, adding collfrag to collfrag_pending.\n")); + rc = + mca_bcol_iboffload_free_resources_and_move_to_pending(coll_fragment, iboffload_module); + return (OMPI_SUCCESS != rc) ? BCOL_FN_NOT_STARTED : BCOL_FN_STARTED; +} + +static int mca_bcol_iboffload_k_nomial_allgather_mlbuffer_exec(mca_bcol_iboffload_module_t *iboffload_module, + mca_bcol_iboffload_collreq_t *coll_request) +{ + int rc, + src, dst, comm_dst, comm_src, i, j; + int tree_order, pow_k, knt; + uint32_t pack_len; + int my_group_index = iboffload_module->super.sbgp_partner_module->my_index; + int group_size = iboffload_module->group_size; + netpatterns_k_exchange_node_t *exchange_node = + &iboffload_module->knomial_allgather_tree; + + struct mqe_task *last_send = NULL, + *last_wait = NULL; + mca_bcol_iboffload_collfrag_t *coll_fragment = &coll_request->first_collfrag; + int *list_connected = iboffload_module->super.list_n_connected; + + /* test test */ + int buff_offset = iboffload_module->super.hier_scather_offset; + + IBOFFLOAD_VERBOSE(10,("Entering small msg iboffload bcast")); + + + if (OPAL_UNLIKELY(!iboffload_module->connection_status[ALLGATHER_KNOMIAL_ALG])) { + IBOFFLOAD_VERBOSE(10,("Bcast open new connection ")); + bcol_iboffload_setup_allgather_endpoints_connection(iboffload_module); + } + + pack_len = coll_request->count * coll_request->dtype->super.size; + IBOFFLOAD_VERBOSE(10,("My packet length %d pack_len frag_count %d dtype size %d ", + pack_len, + coll_request->count, + coll_request->dtype->super.size)); + + /* now we calculate the actual buff_offset */ + buff_offset = buff_offset*pack_len; + + /* it is estimated mq consumption... 
*/ + if (OPAL_UNLIKELY(false == BCOL_IBOFFLOAD_MQ_HAVE_CREDITS( + iboffload_module, coll_fragment->mq_index, coll_fragment->mq_credits))) { + IBOFFLOAD_VERBOSE(10, ("There are not enough credits on MQ.\n")); + goto out_of_resources; + } + + coll_fragment->tail_next = &coll_fragment->to_post; + /* we put this in to propagate the lkey into this local data structure */ + coll_request->buffer_info[SBUF].lkey = iboffload_module->rdma_block.ib_info.lkey; + /* end hack */ + if( EXTRA_NODE == exchange_node->node_type ) { + /* setup the rdma "send" pack_len data to proxy rank */ + comm_dst = exchange_node->rank_extra_sources_array[0]; + /* get ib subnet id */ + dst = comm_dst; + /* now I need to calculate my own offset info */ + knt = 0; + for( i = 0; i < my_group_index; i++){ + knt += list_connected[i]; + } + + rc = mca_bcol_iboffload_rdma_write_imm_small_buff_setup( + &last_send, pack_len*list_connected[my_group_index], pack_len*knt /* source offset */, + pack_len*knt /* destination offset */, dst, + iboffload_module, coll_fragment); +#if 0 + rc = mca_bcol_iboffload_rdma_write_imm_small_buff_setup( + &last_send, pack_len, pack_len*group_list[my_group_index] /* source offset */, + pack_len*group_list[my_group_index] /* destination offset */, dst, + iboffload_module, coll_fragment); +#endif + /* old flow with ml offset */ +#if 0 + rc = mca_bcol_iboffload_rdma_write_imm_small_buff_setup( + &last_send, pack_len, pack_len*group_list[my_group_index] /* source offset */, + coll_request->buffer_info[RBUF].offset + pack_len*group_list[my_group_index] /* destination offset */, dst, + iboffload_module, coll_fragment); +#endif + if (OPAL_UNLIKELY(OMPI_SUCCESS != rc)) { + IBOFFLOAD_VERBOSE(10, ("Failed to" + " mca_bcol_iboffload_send_small_buff_setup")); + if (OMPI_ERR_TEMP_OUT_OF_RESOURCE == rc){ + goto out_of_resources; + } + return OMPI_ERROR; + } + /* send is done */ + + /* setup the rdma "receive" from proxy */ + comm_src = comm_dst; + src = dst; + /* more general is the number connected */ + knt = 0; + for( i = 0; i < group_size; i++) { + knt += list_connected[i]; + } + + + rc = mca_bcol_iboffload_recv_small_buff_setup(&last_wait, + pack_len*knt, src, + iboffload_module, coll_fragment); + + /* + rc = mca_bcol_iboffload_recv_small_buff_setup(&last_wait, + pack_len*group_size, src, + iboffload_module, coll_fragment); + */ + if (OPAL_UNLIKELY(OMPI_SUCCESS != rc)) { + IBOFFLOAD_VERBOSE(10, ("Failed to setup data receive")); + if (OMPI_ERR_TEMP_OUT_OF_RESOURCE == rc){ + goto out_of_resources; + } + return OMPI_ERROR; + } + + goto FINISHED; + } else if( 0 < exchange_node->n_extra_sources ) { + + /* am a proxy, receive pack_len data from extra */ + comm_src = exchange_node->rank_extra_sources_array[0]; + /* get ib subnet */ + src = comm_src; + rc = mca_bcol_iboffload_recv_small_buff_setup(&last_wait, + pack_len*list_connected[src], src, + iboffload_module, coll_fragment); + /* + rc = mca_bcol_iboffload_recv_small_buff_setup(&last_wait, + pack_len, src, + iboffload_module, coll_fragment); + */ + if (OPAL_UNLIKELY(OMPI_SUCCESS != rc)) { + IBOFFLOAD_VERBOSE(10, ("Failed to setup data receive")); + if (OMPI_ERR_TEMP_OUT_OF_RESOURCE == rc){ + goto out_of_resources; + } + return OMPI_ERROR; + } + + + } + + /* start recursive k - ing */ + tree_order = exchange_node->tree_order; + pow_k = exchange_node->log_tree_order; + /*fprintf(stderr,"tree order %d pow_k %d\n",tree_order,pow_k);*/ + for( i = 0; i < pow_k; i++) { + for( j = 0; j < (tree_order - 1); j++ ) { + /* send phase + */ + comm_dst = 
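
The ML-buffer (small message) path works in units of "connected ranks" rather than group ranks: list_n_connected[i] records how many original ranks sit behind group member i at this hierarchy level, so a member's byte offset into the shared ML buffer is pack_len times the sum of list_n_connected over all lower-indexed members. A minimal sketch of that prefix-sum offset, the same quantity the code above accumulates into knt; ml_offset is a hypothetical name.

#include <stddef.h>

/* Byte offset of group member 'my_index' in the ML destination buffer when
 * each member contributes data for list_n_connected[i] original ranks. */
static size_t ml_offset(const int *list_n_connected, int my_index, size_t pack_len)
{
    size_t ranks_before = 0;
    for (int i = 0; i < my_index; i++) {
        ranks_before += (size_t) list_n_connected[i];
    }
    return ranks_before * pack_len;
}
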
exchange_node->rank_exchanges[i][j]; + /* remember, if we have extra ranks, then we won't participate + * with a least one peer. Make a check + */ + /*fprintf(stderr,"AAA my index %d comm_dst %d\n",my_group_index,comm_dst);*/ + if( comm_dst < 0 ){ + continue; + } + + /* get ib subnet id */ + /* again, don't think we need this */ + /*dst = ibnet_map[comm_dst];*/ + dst = comm_dst; + /* + fprintf(stderr,"BBB my index %d dst %d pack len %d s_len %d src offset %d r_len %d \n",my_group_index,dst, + pack_len,exchange_node->payload_info[i][j].s_len,exchange_node->payload_info[i][j].s_offset, + exchange_node->payload_info[i][j].r_len); + */ + /* rdma "send" setup */ + + + rc = mca_bcol_iboffload_rdma_write_imm_small_buff_setup( + &last_send, exchange_node->payload_info[i][j].s_len * pack_len, + exchange_node->payload_info[i][j].s_offset * pack_len /* source offset */, + exchange_node->payload_info[i][j].s_offset * pack_len /* destination offset */, dst, + iboffload_module, coll_fragment); + +#if 0 + rc = mca_bcol_iboffload_rdma_write_imm_small_buff_setup( + &last_send, exchange_node->payload_info[i][j].s_len * pack_len, + exchange_node->payload_info[i][j].s_offset * exchange_node->payload_info[i][j].s_len*pack_len /* source offset */, + exchange_node->payload_info[i][j].s_offset * exchange_node->payload_info[i][j].s_len*pack_len /* destination offset */, dst, + iboffload_module, coll_fragment); +#endif + +#if 0 + rc = mca_bcol_iboffload_rdma_write_imm_small_buff_setup( + &last_send, exchange_node->payload_info[i][j].s_len * pack_len, + exchange_node->payload_info[i][j].s_offset * pack_len /* source offset */, + exchange_node->payload_info[i][j].s_offset * pack_len /* destination offset */, dst, + iboffload_module, coll_fragment); +#endif +#if 0 + rc = mca_bcol_iboffload_rdma_write_imm_small_buff_setup( + &last_send, exchange_node->payload_info[i][j].s_len * pack_len, + coll_request->buffer_info[SBUF].offset + exchange_node->payload_info[i][j].s_offset * pack_len /* source offset */, + coll_request->buffer_info[SBUF].offset + exchange_node->payload_info[i][j].s_offset * pack_len /* destination offset */, dst, + iboffload_module, coll_fragment); +#endif + if (OPAL_UNLIKELY(OMPI_SUCCESS != rc)) { + IBOFFLOAD_VERBOSE(10, ("Failed to" + " mca_bcol_iboffload_send_small_buff_setup")); + if (OMPI_ERR_TEMP_OUT_OF_RESOURCE == rc){ + goto out_of_resources; + } + return OMPI_ERROR; + } + + /* send is done */ + } + + for( j = 0; j < (tree_order - 1); j++) { + + /* rdma "recv" phase */ + comm_src = exchange_node->rank_exchanges[i][j]; + /* remember, if we have extra ranks, then we won't participate + * with a least one peer. 
Make a check + */ + if( comm_src < 0 ){ + continue; + } + + /* get ib subnet id */ + /* shouldn't need this */ + src = comm_src; + + rc = mca_bcol_iboffload_recv_small_buff_setup(&last_wait, + exchange_node->payload_info[i][j].r_len * pack_len, src, + iboffload_module, coll_fragment); + if (OPAL_UNLIKELY(OMPI_SUCCESS != rc)) { + IBOFFLOAD_VERBOSE(10, ("Failed to setup data receive")); + if (OMPI_ERR_TEMP_OUT_OF_RESOURCE == rc){ + goto out_of_resources; + } + return OMPI_ERROR; + } + + } + } + + /* last step, proxies send full data back to the extra ranks */ + if( 0 < exchange_node->n_extra_sources ) { + /* send pack_len data to proxy */ + comm_dst = exchange_node->rank_extra_sources_array[0]; + /* get ibnet id */ + dst = comm_dst; + + knt = 0; + for( i = 0; i < group_size; i++){ + knt += list_connected[i]; + } + + rc = mca_bcol_iboffload_rdma_write_imm_small_buff_setup( + &last_send, pack_len*knt, 0 /* source offset */, + 0 /* destination offset */, dst, + iboffload_module, coll_fragment); +#if 0 + rc = mca_bcol_iboffload_rdma_write_imm_small_buff_setup( + &last_send, pack_len*group_size, 0 /* source offset */, + 0 /* destination offset */, dst, + iboffload_module, coll_fragment); +#endif +#if 0 + rc = mca_bcol_iboffload_rdma_write_imm_small_buff_setup( + &last_send, pack_len*group_size, coll_request->buffer_info[RBUF].offset /* source offset */, + coll_request->buffer_info[SBUF].offset /* destination offset */, dst, + iboffload_module, coll_fragment); +#endif + if (OPAL_UNLIKELY(OMPI_SUCCESS != rc)) { + IBOFFLOAD_VERBOSE(10, ("Failed to" + " mca_bcol_iboffload_send_small_buff_setup")); + if (OMPI_ERR_TEMP_OUT_OF_RESOURCE == rc){ + goto out_of_resources; + fprintf(stderr,"I'm out of resources \n"); + } + return OMPI_ERROR; + } + /* send is done */ + + } + +FINISHED: + + /* end of list */ + *coll_fragment->tail_next = NULL; + + /* finish initializing full message descriptor */ + (coll_request)->n_fragments = 1; + (coll_request)->n_frags_sent = 1; + + assert(NULL != last_wait); + last_wait->flags |= MQE_WR_FLAG_SIGNAL; + coll_fragment->signal_task_wr_id = last_wait->wr_id; + last_wait->wr_id = (uint64_t) (uintptr_t) coll_fragment; + + assert(MCA_COLL_ML_NO_BUFFER != coll_request->ml_buffer_index); + /* post the mwr */ + rc = mca_bcol_iboffload_post_mqe_tasks(iboffload_module, coll_fragment->to_post); + if (OPAL_UNLIKELY(OMPI_SUCCESS != rc)) { + IBOFFLOAD_VERBOSE(10, ("MQE task posting failing.\n")); + /* Note: need to clean up */ + return rc; + } + + MCA_BCOL_UPDATE_ORDER_COUNTER(&iboffload_module->super, coll_request->order_info); + + IBOFFLOAD_VERBOSE(10, ("Return success.\n")); + return BCOL_FN_STARTED; + +out_of_resources: + /* Release all resources */ + IBOFFLOAD_VERBOSE(10, ("Allgather, adding collfrag to collfrag_pending.\n")); + rc = + mca_bcol_iboffload_free_resources_and_move_to_pending(coll_fragment, iboffload_module); + return (OMPI_SUCCESS != rc) ? 
BCOL_FN_NOT_STARTED : BCOL_FN_STARTED; +} + +#if 0 +static int mca_bcol_iboffload_neighbor_allgather_userbuffer_intra( + bcol_function_args_t *fn_arguments, + struct coll_ml_function_t *const_args) +{ + mca_bcol_iboffload_module_t *iboffload_module = + (mca_bcol_iboffload_module_t *)const_args->bcol_module; + + int rc; + int mq_credits = iboffload_module->group_size * 2 * 2; /* large message protocol consumes + * twice as many mq credits + */ + + bool if_bcol_last = BCOL_IBOFFLOAD_IS_LAST_CALL(const_args); + mca_bcol_iboffload_collreq_t *coll_request; + + MCA_BCOL_CHECK_ORDER(const_args->bcol_module, fn_arguments); + + rc = mca_bcol_iboffload_allgather_init(fn_arguments, iboffload_module, + &coll_request, if_bcol_last, mq_credits, + mca_bcol_iboffload_neighbor_allgather_userbuffer_exec); + if (OPAL_UNLIKELY(OMPI_SUCCESS != rc)) { + return rc; + } + + rc = coll_request->progress_fn(iboffload_module, coll_request); + + IBOFFLOAD_VERBOSE(10, ("mca_bcol_iboffload_k_nomial_allgather_userbuffer_intra was started [%d]\n", rc)); + return rc; +} +#endif + +#if 1 +static int mca_bcol_iboffload_k_nomial_allgather_userbuffer_intra(bcol_function_args_t *fn_arguments, + struct coll_ml_function_t *const_args) +{ + mca_bcol_iboffload_module_t *iboffload_module = + (mca_bcol_iboffload_module_t *)const_args->bcol_module; + + int rc; + int mq_credits = ((iboffload_module->knomial_allgather_tree.tree_order - 1)* + iboffload_module->knomial_allgather_tree.log_tree_order + 1) * 2 * 2; /* large message protocol + * consumes twice as much + */ + + bool if_bcol_last = BCOL_IBOFFLOAD_IS_LAST_CALL(const_args); + mca_bcol_iboffload_collreq_t *coll_request; + + MCA_BCOL_CHECK_ORDER(const_args->bcol_module, fn_arguments); + + rc = mca_bcol_iboffload_allgather_init(fn_arguments, iboffload_module, + &coll_request, if_bcol_last, mq_credits, + mca_bcol_iboffload_k_nomial_allgather_userbuffer_exec); + if (OPAL_UNLIKELY(OMPI_SUCCESS != rc)) { + return rc; + } + + rc = coll_request->progress_fn(iboffload_module, coll_request); + + IBOFFLOAD_VERBOSE(10, ("mca_bcol_iboffload_k_nomial_allgather_userbuffer_intra was started [%d]\n", rc)); + return rc; +} +#endif + +static int mca_bcol_iboffload_k_nomial_allgather_mlbuffer_intra(bcol_function_args_t *fn_arguments, + struct coll_ml_function_t *const_args) +{ + mca_bcol_iboffload_module_t *iboffload_module = + (mca_bcol_iboffload_module_t *)const_args->bcol_module; + + int rc; + + /* I'll add one for everyone, since nobody wants to feel left out */ + int mq_credits = ((iboffload_module->knomial_allgather_tree.tree_order - 1)* + iboffload_module->knomial_allgather_tree.log_tree_order + 1) * 2 ; + bool if_bcol_last = BCOL_IBOFFLOAD_IS_LAST_CALL(const_args); + mca_bcol_iboffload_collreq_t *coll_request; + + MCA_BCOL_CHECK_ORDER(const_args->bcol_module, fn_arguments); + + rc = mca_bcol_iboffload_allgather_init(fn_arguments, iboffload_module, + &coll_request, if_bcol_last, mq_credits, + mca_bcol_iboffload_k_nomial_allgather_mlbuffer_exec); + if (OPAL_UNLIKELY(OMPI_SUCCESS != rc)) { + return rc; + } + + rc = coll_request->progress_fn(iboffload_module, coll_request); + + IBOFFLOAD_VERBOSE(10, ("mca_bcol_iboffload_small_msg_bcast_intra was started [%d]\n", rc)); + return rc; +} + + +/* these progress engines are shared between alltoall and allgather and exist in both files, + * should be moved to a common .h file + */ +static int mca_bcol_iboffload_collreq_mlbuffer_progress( + bcol_function_args_t *input_args, + struct coll_ml_function_t *const_args) +{ + int i; + 
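
Each entry point reserves management-queue (MQ) credits up front so BCOL_IBOFFLOAD_MQ_HAVE_CREDITS can fail fast and defer the whole fragment instead of stalling mid-chain: the k-nomial schedule needs roughly one send and one wait task per partner per level plus one for the extra/proxy step, and the zero-copy (user-buffer) variant doubles that again because every transfer also carries an RTR exchange. A small sketch of the same arithmetic the *_intra wrappers perform; allgather_mq_credits is a hypothetical name.

/* Credit estimate mirroring the expressions in the *_intra wrappers:
 * (k - 1) partners per level, log_k(N) levels, +1 for the extra/proxy step;
 * x2 for send+wait tasks, and x2 again for the large-message (RTR-gated) path. */
static int allgather_mq_credits(int tree_order, int log_tree_order, int large_msg)
{
    int per_chain = ((tree_order - 1) * log_tree_order + 1) * 2;
    return large_msg ? per_chain * 2 : per_chain;
}
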
mca_bcol_iboffload_collreq_t *coll_request = + (mca_bcol_iboffload_collreq_t *) + input_args->bcol_opaque_data; + IBOFFLOAD_VERBOSE(10, ("Run progress (ml buffer).\n")); + for (i = 0; i < mca_bcol_iboffload_component.max_progress_pull; i++) { + if (BCOL_IS_COMPLETED(coll_request)) { + + coll_request->user_handle_freed = true; + + if (COLLREQ_IS_DONE(coll_request)) { + IBOFFLOAD_VERBOSE(10, ("Coll request already done.\n")); + RELEASE_COLLREQ(coll_request); + } + IBOFFLOAD_VERBOSE(10, ("Collective finished (ml buffer).\n")); + + return BCOL_FN_COMPLETE; + } + } + IBOFFLOAD_VERBOSE(10, ("Collective not finished (ml buffer).\n")); + return BCOL_FN_STARTED; +} + + +static int mca_bcol_iboffload_collreq_userbuffer_progress( + bcol_function_args_t *input_args, + struct coll_ml_function_t *const_args) +{ + int i; + mca_bcol_iboffload_collreq_t *coll_request = + (mca_bcol_iboffload_collreq_t *) + input_args->bcol_opaque_data; + + IBOFFLOAD_VERBOSE(10, ("Run progress (user buffer)\n")); + + /* Complete the allgather - progress releases full request descriptors */ + + for (i = 0; i < mca_bcol_iboffload_component.max_progress_pull; i++) { + if (coll_request->n_frag_mpi_complete == coll_request->n_fragments && + coll_request->n_frag_net_complete == coll_request->n_fragments) { + + IBOFFLOAD_VERBOSE(10, ("Deregister user buff.\n")); + + if (NULL != coll_request->buffer_info[SBUF].iboffload_reg) { + coll_request->module->device->mpool->mpool_deregister( + coll_request->module->device->mpool, + (mca_mpool_base_registration_t *) coll_request->buffer_info[SBUF].iboffload_reg); + coll_request->buffer_info[SBUF].iboffload_reg = NULL; + } + + + if (NULL != coll_request->buffer_info[RBUF].iboffload_reg) { + coll_request->module->device->mpool->mpool_deregister( + coll_request->module->device->mpool, + (mca_mpool_base_registration_t *) coll_request->buffer_info[RBUF].iboffload_reg); + coll_request->buffer_info[RBUF].iboffload_reg = NULL; + } + + RELEASE_COLLREQ(coll_request); + IBOFFLOAD_VERBOSE(10, ("New bcast done !!!")); + return BCOL_FN_COMPLETE; + } + } + + IBOFFLOAD_VERBOSE(10, ("Collective finished (user buffer).\n")); + + /* We are not done */ + return BCOL_FN_STARTED; +} + +int mca_bcol_iboffload_allgather_register(mca_bcol_base_module_t *super) +{ + mca_bcol_base_coll_fn_comm_attributes_t comm_attribs; + mca_bcol_base_coll_fn_invoke_attributes_t inv_attribs; + + IBOFFLOAD_VERBOSE(10, ("Register iboffload Allgather.\n")); + comm_attribs.bcoll_type = BCOL_ALLGATHER; + + comm_attribs.comm_size_min = 0; + comm_attribs.comm_size_max = 1024 * 1024; + comm_attribs.waiting_semantics = NON_BLOCKING; + + inv_attribs.bcol_msg_min = 0; + inv_attribs.bcol_msg_max = 20000; /* range 1 */ + + inv_attribs.datatype_bitmap = 0xffffffff; + inv_attribs.op_types_bitmap = 0xffffffff; + + comm_attribs.data_src = DATA_SRC_KNOWN; + + mca_bcol_base_set_attributes(super, + &comm_attribs, &inv_attribs, + mca_bcol_iboffload_k_nomial_allgather_mlbuffer_intra, + mca_bcol_iboffload_collreq_mlbuffer_progress); + + inv_attribs.bcol_msg_min = 10000000; + inv_attribs.bcol_msg_max = 10485760; /* range 4 */ + + + /* zero-copy k-nomial algorithm */ +#if 1 + mca_bcol_base_set_attributes(super, + &comm_attribs, &inv_attribs, + mca_bcol_iboffload_k_nomial_allgather_userbuffer_intra, + mca_bcol_iboffload_collreq_userbuffer_progress); +#endif + /* zero-copy neighbor exchange algorithm */ +#if 0 + mca_bcol_base_set_attributes(super, + &comm_attribs, &inv_attribs, + mca_bcol_iboffload_neighbor_allgather_userbuffer_intra, + 
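
Both progress callbacks poll at most max_progress_pull times per invocation and return BCOL_FN_STARTED while the collective is still in flight; only the user-buffer variant has extra completion work, deregistering whatever memory it registered on the fly before releasing the request. A reduced sketch of that polling contract, with a hypothetical is_done() predicate standing in for the fragment counters and release_request() for the deregister + RELEASE_COLLREQ sequence.

#include <stdbool.h>

#define FN_COMPLETE  0
#define FN_STARTED   1

extern bool is_done(void *req);          /* stand-in for the fragment counters   */
extern void release_request(void *req);  /* stand-in for dereg + RELEASE_COLLREQ */

/* Bounded polling, as in the two *_progress callbacks: try a few times, finish
 * the request if it completed, otherwise report "still started" to the caller. */
static int progress_once(void *req, int max_pull)
{
    for (int i = 0; i < max_pull; i++) {
        if (is_done(req)) {
            release_request(req);
            return FN_COMPLETE;
        }
    }
    return FN_STARTED;
}
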
mca_bcol_iboffload_collreq_userbuffer_progress); +#endif + return OMPI_SUCCESS; +} diff --git a/ompi/mca/bcol/iboffload/bcol_iboffload_allreduce.c b/ompi/mca/bcol/iboffload/bcol_iboffload_allreduce.c new file mode 100644 index 0000000000..93f6e67d79 --- /dev/null +++ b/ompi/mca/bcol/iboffload/bcol_iboffload_allreduce.c @@ -0,0 +1,1415 @@ +/* + * Copyright (c) 2009-2012 Oak Ridge National Laboratory. All rights reserved. + * Copyright (c) 2009-2012 Mellanox Technologies. All rights reserved. + * $COPYRIGHT$ + * + * Additional copyrights may follow + * + * $HEADER$ + */ + +/* + * + * Additional copyrights may follow + * + * $HEADER$ + */ +/** @file */ + +#include "ompi_config.h" + +#include +#include +#include +#include +#include +#include + +#include "bcol_iboffload.h" +#include "bcol_iboffload_frag.h" +#include "bcol_iboffload_task.h" +#include "bcol_iboffload_collfrag.h" +#include "bcol_iboffload_endpoint.h" + +#include "opal/include/opal/types.h" + +static int mca_bcol_iboffload_calc_res_to_user(void *callback_data) +{ + int rc; + uint64_t result = 0; + + uint64_t l_operand = 0; + uint64_t r_operand = 0; + + mca_bcol_iboffload_collfrag_t *coll_frag = + (mca_bcol_iboffload_collfrag_t *) callback_data; + + mca_bcol_iboffload_collreq_t *coll_request = coll_frag->coll_full_req; + + ompi_op_t *op = coll_request->op; + ompi_datatype_t *dtype = coll_request->dtype; + + mca_bcol_iboffload_component_t *cm = &mca_bcol_iboffload_component; + struct ibv_context *ib_dev_context = coll_request->module->device->dev.ib_dev_context; + + IBOFFLOAD_VERBOSE(10, ("Start calculating.\n")); + + rc = unpack_data_from_calc(ib_dev_context, + cm->map_ompi_to_ib_calcs[op->op_type], + cm->map_ompi_to_ib_dt[dtype->id], false, + (void *) (uintptr_t) coll_request->l_operand, + NULL, (void *) &l_operand); + if (0 != rc) { + IBOFFLOAD_VERBOSE(10, ("unpack_data_from_calc for l_operand failed: op %s, type %s\n", + op->o_name, dtype->name)); + return OMPI_ERROR; + } + + rc = unpack_data_from_calc(ib_dev_context, + cm->map_ompi_to_ib_calcs[op->op_type], + cm->map_ompi_to_ib_dt[dtype->id], false, + (void *) (uintptr_t) coll_request->r_operand, + NULL, (void *) &r_operand); + if (0 != rc) { + IBOFFLOAD_VERBOSE(10, ("unpack_data_from_calc for r_operand failed: op %s, type %s\n", + op->o_name, dtype->name)); + return OMPI_ERROR; + } + + switch (op->op_type) { + case OMPI_OP_PROD: + break; /* ronni todo - ????? 
*/ + case OMPI_OP_LAND: + result = l_operand && r_operand; + break; + case OMPI_OP_BAND: + result = l_operand & r_operand; + break; + case OMPI_OP_LOR: + result = l_operand || r_operand; + break; + case OMPI_OP_BOR: + result = l_operand | r_operand; + break; + case OMPI_OP_LXOR: + result = ((l_operand && !r_operand) || (!l_operand && r_operand)); + break; + case OMPI_OP_BXOR: + result = l_operand ^ r_operand; + break; + case OMPI_OP_MAXLOC: + case OMPI_OP_MINLOC: + break; + case OMPI_OP_MAX: + case OMPI_OP_MIN: + case OMPI_OP_SUM: + switch (cm->map_ompi_to_ib_dt[dtype->id]) { + case IBV_M_DATA_TYPE_INT8: + MCA_BCOL_IBOFFLOAD_ALLREDUCE_DO_CALC(coll_request->op->op_type, char, l_operand, r_operand, result); + break; + case IBV_M_DATA_TYPE_INT16: + MCA_BCOL_IBOFFLOAD_ALLREDUCE_DO_CALC(coll_request->op->op_type, int16_t, l_operand, r_operand, result); + break; + case IBV_M_DATA_TYPE_INT32: + MCA_BCOL_IBOFFLOAD_ALLREDUCE_DO_CALC(coll_request->op->op_type, int32_t, l_operand, r_operand, result); + break; + case IBV_M_DATA_TYPE_INT64: + MCA_BCOL_IBOFFLOAD_ALLREDUCE_DO_CALC(coll_request->op->op_type, int64_t, l_operand, r_operand, result); + break; + case IBV_M_DATA_TYPE_FLOAT32: + MCA_BCOL_IBOFFLOAD_ALLREDUCE_DO_CALC(coll_request->op->op_type, float, l_operand, r_operand, result); + break; + case IBV_M_DATA_TYPE_FLOAT64: + MCA_BCOL_IBOFFLOAD_ALLREDUCE_DO_CALC(coll_request->op->op_type, double, l_operand, r_operand, result); + break; + default: + IBOFFLOAD_VERBOSE(10, ("Unsupported data type: %s.\n", dtype->name)); + return OMPI_ERROR; + } + + break; + + default: + IBOFFLOAD_VERBOSE(10, ("Unsupported op: %s.\n", coll_request->op->o_name)); + return OMPI_ERROR; + } + + memcpy(coll_request->buffer_info[RBUF].buf, &result, coll_frag->unpack_size); + IBOFFLOAD_VERBOSE(10, ("The output data after calc is %lf, result %lf, l_operand %lf, r_operand %lf: " + "sbuf addr %p, rbuf addr %p.\n", + *(double *) coll_request->buffer_info[RBUF].buf, *(double *) &result, + *(double *) &l_operand, *(double *) &r_operand, + coll_request->buffer_info[SBUF].buf, + coll_request->buffer_info[RBUF].buf)); + + return OMPI_SUCCESS; +} + +static int mca_bcol_iboffload_unpack_res_to_user(void *callback_data) +{ + int rc; + + mca_bcol_iboffload_collfrag_t *coll_frag = + (mca_bcol_iboffload_collfrag_t *) callback_data; + + mca_bcol_iboffload_collreq_t *coll_request = coll_frag->coll_full_req; + mca_bcol_iboffload_task_t *task = (mca_bcol_iboffload_task_t *) coll_frag->signal_task_wr_id; + + mca_bcol_iboffload_frag_t *recv_frag = task->frag; + mca_bcol_iboffload_component_t *cm = &mca_bcol_iboffload_component; + + struct ibv_context *ib_dev_context = coll_request->module->device->dev.ib_dev_context; + + rc = unpack_data_from_calc(ib_dev_context, + cm->map_ompi_to_ib_calcs[coll_request->op->op_type], + cm->map_ompi_to_ib_dt[coll_request->dtype->id], + false, (void*) (uintptr_t) recv_frag->sg_entry.addr, + NULL, coll_request->buffer_info[RBUF].buf); + if (0 != rc) { + IBOFFLOAD_VERBOSE(10, ("unpack_data_from_calc is failed: op %s, type %s\n", + coll_request->op->o_name, coll_request->dtype->name)); + return OMPI_ERROR; + } + + IBOFFLOAD_VERBOSE(10, ("The naitive output data is %" PRId64 ".\n" + "The output data is %" PRId64 ".\n", + *(uint64_t *) recv_frag->sg_entry.addr, + *(uint64_t *) coll_request->buffer_info[RBUF].buf)); + + return OMPI_SUCCESS; +} + +static int +allreduce_extra_node(mca_bcol_iboffload_module_t *iboffload, + mca_bcol_iboffload_collreq_t *coll_request) +/* (EXTRA_NODE == my_exchange_node->node_type) */ +{ + /* 
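
When the final combine is left to the host, mca_bcol_iboffload_calc_res_to_user unpacks the two 64-bit operands from the CALC wire format and applies the MPI operation on the CPU: the logical and bitwise operations are handled inline in the switch, while MAX/MIN/SUM go through a typed macro per IB data type. A minimal host-side combine of two already-unpacked operands for a few of the same cases, independent of the verbs CALC format; the enum and host_calc are hypothetical stand-ins, not OMPI identifiers.

#include <stdint.h>

enum host_op { HOST_OP_SUM, HOST_OP_BAND, HOST_OP_LXOR };

/* Host-side combine of two already-unpacked 64-bit operands, analogous to the
 * switch in mca_bcol_iboffload_calc_res_to_user (only a subset of ops shown). */
static uint64_t host_calc(enum host_op op, uint64_t l, uint64_t r)
{
    switch (op) {
    case HOST_OP_SUM:  return l + r;
    case HOST_OP_BAND: return l & r;
    case HOST_OP_LXOR: return (l && !r) || (!l && r);
    default:           return 0;
    }
}
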
local variables */ + int rc, extra_rank; + + mca_bcol_iboffload_frag_t *send_fragment, + *preposted_recv_frag; + + mca_bcol_iboffload_task_t *send_task, + *wait_task; + + struct mqe_task *last_wait, /* we need ask from completion on last wait */ + *last_send; + + netpatterns_pair_exchange_node_t *my_exchange_node = + &iboffload->recursive_doubling_tree; + + struct mqe_task **mqe_ptr_to_set; + mca_bcol_iboffload_collfrag_t *coll_fragment = (mca_bcol_iboffload_collfrag_t *) + opal_list_get_last(&coll_request->work_requests); + + mqe_ptr_to_set = &coll_fragment->to_post; + + if (OPAL_UNLIKELY(false == BCOL_IBOFFLOAD_MQ_HAVE_CREDITS( + iboffload, coll_fragment->mq_index, coll_fragment->mq_credits))) { + IBOFFLOAD_VERBOSE(10, ("There are not enough credits on MQ.\n")); + + rc = OMPI_ERR_RESOURCE_BUSY; + goto out_of_resources; + } + + /* I will NOT participate in the exchange - so just "register" as here */ + extra_rank = my_exchange_node->rank_extra_source; + + send_fragment = mca_bcol_iboffload_get_send_frag(coll_request, + extra_rank, coll_request->qp_index, + MCA_IBOFFLOAD_IB_DRIVER_OPERAND_SIZE, 0, + SBUF, + MCA_BCOL_IBOFFLOAD_SEND_FRAG_ML_CALC); + + if (OPAL_UNLIKELY(NULL == send_fragment)) { + IBOFFLOAD_VERBOSE(10, ("Failing for getting and packing send frag.\n")); + rc = OMPI_ERR_RESOURCE_BUSY; + goto out_of_resources; + } + + /* send my operand to EXCHANGE NODE */ + send_task = mca_bcol_iboffload_get_send_task(iboffload, extra_rank, + coll_request->qp_index, send_fragment, coll_fragment, INLINE); + if (OPAL_UNLIKELY(NULL == send_task)) { + IBOFFLOAD_VERBOSE(10, ("Failing for getting send task.\n")); + rc = OMPI_ERR_RESOURCE_BUSY; + goto out_of_resources; + } + + APPEND_TO_TASKLIST(mqe_ptr_to_set, send_task, last_send); + MCA_BCOL_IBOFFLOAD_APPEND_TASK_TO_LIST(coll_fragment->task_next, send_task); + + preposted_recv_frag = + mca_bcol_iboffload_get_preposted_recv_frag( + iboffload, extra_rank, coll_request->qp_index); + if (OPAL_UNLIKELY(NULL == preposted_recv_frag)) { + /* RLG need cleanup */ + rc = OMPI_ERR_RESOURCE_BUSY; + goto out_of_resources; + } + + /* Wait for final result from EXCHANGE NODE */ + wait_task = mca_bcol_iboffload_get_wait_task(iboffload, extra_rank, 1, + preposted_recv_frag, coll_request->qp_index, NULL); + if (OPAL_UNLIKELY(NULL == wait_task)) { + IBOFFLOAD_VERBOSE(10, ("Failing for getting wait task.\n")); + rc = OMPI_ERR_RESOURCE_BUSY; + goto out_of_resources; + } + + APPEND_TO_TASKLIST(mqe_ptr_to_set, wait_task, last_wait); + MCA_BCOL_IBOFFLOAD_APPEND_TASK_TO_LIST(coll_fragment->task_next, wait_task); + + *mqe_ptr_to_set = NULL; + + /* finish initializing full message descriptor */ + coll_request->n_fragments = 1; + coll_request->n_frags_sent = 1; + + /* Pasha: need to set to true in upper layer */ + coll_request->user_handle_freed = false; + + last_wait->flags |= MQE_WR_FLAG_SIGNAL; + + coll_fragment->signal_task_wr_id = last_wait->wr_id; + last_wait->wr_id = (uint64_t) (uintptr_t) coll_fragment; + + /* post the mwr */ + IBOFFLOAD_VERBOSE(10, ("Post tasks.\n")); + rc = mca_bcol_iboffload_post_mqe_tasks(iboffload, coll_fragment->to_post); + if (OPAL_UNLIKELY(OMPI_SUCCESS != rc)) { + IBOFFLOAD_ERROR(("MQE task posting failing.\n")); + /* Note: need to clean up */ + return rc; + } + + MCA_BCOL_UPDATE_ORDER_COUNTER(&iboffload->super, coll_request->order_info); + + return OMPI_SUCCESS; + +out_of_resources: + /* Release all resources */ + IBOFFLOAD_VERBOSE(10, ("Allreduce: adding collfrag to collfrag_pending.\n")); + return 
mca_bcol_iboffload_free_resources_and_move_to_pending(coll_fragment, iboffload); +} + +/** + * Start allreduce + */ +static int do_exchange(mca_bcol_iboffload_module_t *iboffload, + mca_bcol_iboffload_collreq_t *coll_request, + struct mqe_task ***mqe_ptr_to_set, + struct mqe_task **last_wait, + struct ibv_sge **l_operand, + struct ibv_sge **r_operand) +{ + int rc = OMPI_SUCCESS, exchange, pair_rank, + my_rank = ((mca_sbgp_base_module_t *) iboffload->ibnet)->my_index; + + mca_bcol_iboffload_frag_t *preposted_recv_frag; + + mca_bcol_iboffload_task_t *wait_task, + *calc_task; + + struct mqe_task *last_send; + netpatterns_pair_exchange_node_t *my_exchange_node = + &iboffload->recursive_doubling_tree; + + mca_bcol_iboffload_collfrag_t *coll_fragment = (mca_bcol_iboffload_collfrag_t *) + opal_list_get_last(&coll_request->work_requests); + + size_t calc_size = MCA_IBOFFLOAD_IB_DRIVER_OPERAND_SIZE + MCA_IBOFFLOAD_CALC_SIZE_EXT; + + pair_rank = my_exchange_node->rank_exchanges[0]; + preposted_recv_frag = + mca_bcol_iboffload_get_preposted_recv_frag( + iboffload, pair_rank, coll_request->qp_index); + if (OPAL_UNLIKELY(NULL == preposted_recv_frag)) { + /* RLG need cleanup */ + IBOFFLOAD_VERBOSE(10, ("Get prepost recv fag fail.\n")); + rc = OMPI_ERR_RESOURCE_BUSY; + goto out_of_resources; + } + + /* Wait for send from first algorithm partner */ + wait_task = mca_bcol_iboffload_get_wait_task(iboffload, pair_rank, 1, + preposted_recv_frag, coll_request->qp_index, NULL); + if (OPAL_UNLIKELY(NULL == wait_task)) { + IBOFFLOAD_VERBOSE(10, ("Failing for getting wait task.\n")); + rc = OMPI_ERR_RESOURCE_BUSY; + goto out_of_resources; + } + + APPEND_TO_TASKLIST((*mqe_ptr_to_set), wait_task, (*last_wait)); + MCA_BCOL_IBOFFLOAD_APPEND_TASK_TO_LIST(coll_fragment->task_next, wait_task); + + (*l_operand)->length = calc_size; + for (exchange = 1; exchange < my_exchange_node->n_exchanges; ++exchange) { + pair_rank = my_exchange_node->rank_exchanges[exchange]; + + (*r_operand) = &preposted_recv_frag->sg_entry; + (*r_operand)->length = calc_size; + + /* Calc and send the result to the partner */ + calc_task = mca_bcol_iboffload_get_calc_task(iboffload, + pair_rank, coll_request->qp_index, NULL, + *l_operand, *r_operand, + coll_request, NO_INLINE); + if (OPAL_UNLIKELY(NULL == calc_task)) { + IBOFFLOAD_VERBOSE(10, ("Failing for getting calc task.\n")); + rc = OMPI_ERR_RESOURCE_BUSY; + goto out_of_resources; + } + + APPEND_TO_TASKLIST((*mqe_ptr_to_set), calc_task, last_send); + MCA_BCOL_IBOFFLOAD_APPEND_TASK_TO_LIST(coll_fragment->task_next, calc_task); + + /* Calc and send the result to myself */ + calc_task = mca_bcol_iboffload_get_calc_task(iboffload, + my_rank, coll_request->qp_index, NULL, + *l_operand, *r_operand, coll_request, NO_INLINE); + if (OPAL_UNLIKELY(NULL == calc_task)) { + IBOFFLOAD_VERBOSE(10, ("Failing for getting calc task.\n")); + rc = OMPI_ERR_RESOURCE_BUSY; + goto out_of_resources; + } + + APPEND_TO_TASKLIST((*mqe_ptr_to_set), calc_task, last_send); + MCA_BCOL_IBOFFLOAD_APPEND_TASK_TO_LIST(coll_fragment->task_next, calc_task); + + preposted_recv_frag = + mca_bcol_iboffload_get_preposted_recv_frag( + iboffload, my_rank, coll_request->qp_index); + if (OPAL_UNLIKELY(NULL == preposted_recv_frag)) { + /* RLG need cleanup */ + IBOFFLOAD_VERBOSE(10, ("Get prepost recv fag fail.\n")); + rc = OMPI_ERR_RESOURCE_BUSY; + goto out_of_resources; + } + + /* Wait for calc from myself */ + wait_task = mca_bcol_iboffload_get_wait_task(iboffload, my_rank, 1, + preposted_recv_frag, coll_request->qp_index, NULL); + if 
(NULL == wait_task) { + IBOFFLOAD_VERBOSE(10, ("Failing for getting wait task.\n")); + rc = OMPI_ERR_RESOURCE_BUSY; + goto out_of_resources; + } + + APPEND_TO_TASKLIST((*mqe_ptr_to_set), wait_task, (*last_wait)); + MCA_BCOL_IBOFFLOAD_APPEND_TASK_TO_LIST(coll_fragment->task_next, wait_task); + + (*l_operand) = &preposted_recv_frag->sg_entry; + (*l_operand)->length = calc_size; + + preposted_recv_frag = + mca_bcol_iboffload_get_preposted_recv_frag( + iboffload, pair_rank, coll_request->qp_index); + if (OPAL_UNLIKELY(NULL == preposted_recv_frag)) { + /* RLG need cleanup */ + IBOFFLOAD_VERBOSE(10, ("Get prepost recv fag fail.\n")); + rc = OMPI_ERR_RESOURCE_BUSY; + goto out_of_resources; + } + + /* Wait for calc from the current algorithm partner */ + wait_task = mca_bcol_iboffload_get_wait_task(iboffload, pair_rank, 1, + preposted_recv_frag, coll_request->qp_index, NULL); + if (OPAL_UNLIKELY(NULL == wait_task)) { + IBOFFLOAD_VERBOSE(10, ("Failing for getting wait task.\n")); + rc = OMPI_ERR_RESOURCE_BUSY; + goto out_of_resources; + } + + APPEND_TO_TASKLIST((*mqe_ptr_to_set), wait_task, (*last_wait)); + MCA_BCOL_IBOFFLOAD_APPEND_TASK_TO_LIST(coll_fragment->task_next, wait_task); + + } + + (*r_operand) = &preposted_recv_frag->sg_entry; + (*r_operand)->length = calc_size; + + return OMPI_SUCCESS; + +out_of_resources: + /* Release all resources */ + IBOFFLOAD_VERBOSE(10, ("Adding collfrag to collfrag_pending")); + return mca_bcol_iboffload_free_resources_and_move_to_pending(coll_fragment, iboffload); +} + +/* Power of 2 case */ +static int +pure_recursive_doubling(mca_bcol_iboffload_module_t *iboffload, + mca_bcol_iboffload_collreq_t *coll_request) +{ + /* local variables */ + int rc = OMPI_SUCCESS, pair_rank, + my_rank = ((mca_sbgp_base_module_t *) iboffload->ibnet)->my_index; + + struct mqe_task *last_send, + *last_wait; + + mca_bcol_iboffload_task_t *send_task, + *wait_task, + *calc_task; + + mca_bcol_iboffload_frag_t *send_fragment, + *preposted_recv_frag; + + mca_bcol_iboffload_component_t *cm = &mca_bcol_iboffload_component; + netpatterns_pair_exchange_node_t *my_exchange_node = + &iboffload->recursive_doubling_tree; + + struct ibv_sge *r_operand = NULL, + *l_operand = NULL; + + struct mqe_task **mqe_ptr_to_set; + mca_bcol_iboffload_collfrag_t *coll_fragment = (mca_bcol_iboffload_collfrag_t *) + opal_list_get_last(&coll_request->work_requests); + + mqe_ptr_to_set = &coll_fragment->to_post; + + if (OPAL_UNLIKELY(false == BCOL_IBOFFLOAD_MQ_HAVE_CREDITS( + iboffload, coll_fragment->mq_index, coll_fragment->mq_credits))) { + IBOFFLOAD_VERBOSE(10, ("There are not enough credits on MQ.\n")); + + rc = OMPI_ERR_RESOURCE_BUSY; + goto out_of_resources; + } + + IBOFFLOAD_VERBOSE(10, ("Allreduce starting: type %d op %d, " + "n_extra_sources - %d.\n", cm->map_ompi_to_ib_dt[coll_request->dtype->id], + cm->map_ompi_to_ib_calcs[coll_request->op->op_type], + my_exchange_node->n_extra_sources)); + + pair_rank = my_exchange_node->rank_exchanges[0]; + + send_fragment = mca_bcol_iboffload_get_send_frag(coll_request, + pair_rank, coll_request->qp_index, + (MCA_IBOFFLOAD_IB_DRIVER_OPERAND_SIZE + MCA_IBOFFLOAD_CALC_SIZE_EXT), 0, + SBUF, + MCA_BCOL_IBOFFLOAD_SEND_FRAG_ML_CALC); + if (OPAL_UNLIKELY(NULL == send_fragment)) { + IBOFFLOAD_VERBOSE(10, ("Failing for getting and packing send frag.\n")); + rc = OMPI_ERR_RESOURCE_BUSY; + goto out_of_resources; + } +/* Vasily: NO_INLINE ????? 
*/ + /* send my operand to the first algorithm partner */ + send_task = mca_bcol_iboffload_get_send_task(iboffload, pair_rank, + coll_request->qp_index, send_fragment, coll_fragment, NO_INLINE); + if (OPAL_UNLIKELY(NULL == send_task)) { + IBOFFLOAD_VERBOSE(10, ("Failing for getting send task.\n")); + rc = OMPI_ERR_RESOURCE_BUSY; + goto out_of_resources; + } + + APPEND_TO_TASKLIST(mqe_ptr_to_set, send_task, last_send); + MCA_BCOL_IBOFFLOAD_APPEND_TASK_TO_LIST(coll_fragment->task_next, send_task); + + l_operand = &send_fragment->sg_entry; + /* Recursive-doubling exchange */ + rc = do_exchange(iboffload, coll_request, &mqe_ptr_to_set, + &last_wait, &l_operand, &r_operand); + if (OPAL_UNLIKELY(OMPI_SUCCESS != rc)) { + return rc; + } + if (false == coll_request->do_calc_in_cpu) { + /* Calc and send the result to myself */ + calc_task = mca_bcol_iboffload_get_calc_task(iboffload, + my_rank, coll_request->qp_index, NULL, + l_operand, + r_operand, coll_request, NO_INLINE); + if (OPAL_UNLIKELY(NULL == calc_task)) { + IBOFFLOAD_VERBOSE(10, ("Failing for getting calc task.\n")); + rc = OMPI_ERR_RESOURCE_BUSY; + goto out_of_resources; + } + + APPEND_TO_TASKLIST(mqe_ptr_to_set, calc_task, last_send); + MCA_BCOL_IBOFFLOAD_APPEND_TASK_TO_LIST(coll_fragment->task_next, calc_task); + + preposted_recv_frag = + mca_bcol_iboffload_get_preposted_recv_frag( + iboffload, my_rank, coll_request->qp_index); + if (OPAL_UNLIKELY(NULL == preposted_recv_frag)) { + /* RLG need cleanup */ + rc = OMPI_ERR_RESOURCE_BUSY; + goto out_of_resources; + } + + /* Wait for calc from myself */ + wait_task = mca_bcol_iboffload_get_wait_task(iboffload, my_rank, 1, + preposted_recv_frag, coll_request->qp_index, NULL); + if (OPAL_UNLIKELY(NULL == wait_task)) { + IBOFFLOAD_VERBOSE(10, ("Failing for getting wait task.\n")); + rc = OMPI_ERR_RESOURCE_BUSY; + goto out_of_resources; + } + + APPEND_TO_TASKLIST(mqe_ptr_to_set, wait_task, last_wait); + MCA_BCOL_IBOFFLOAD_APPEND_TASK_TO_LIST(coll_fragment->task_next, wait_task); + } else { + coll_request->l_operand = l_operand->addr; + coll_request->r_operand = r_operand->addr; + } + + *mqe_ptr_to_set = NULL; +/* Vasily: TODO with MACRO */ + /* finish initializing full message descriptor */ + coll_request->n_fragments = 1; + coll_request->n_frags_sent = 1; + + /* Pasha: need to set to true in upper layer */ + coll_request->user_handle_freed = false; + + last_wait->flags |= MQE_WR_FLAG_SIGNAL; + + coll_fragment->signal_task_wr_id = last_wait->wr_id; + last_wait->wr_id = (uint64_t) (uintptr_t) coll_fragment; + + /* post the mwr */ + IBOFFLOAD_VERBOSE(10, ("Post tasks.\n")); + rc = mca_bcol_iboffload_post_mqe_tasks(iboffload, coll_fragment->to_post); + if (OPAL_UNLIKELY(OMPI_SUCCESS != rc)) { + IBOFFLOAD_ERROR(("MQE task posting failing.\n")); + /* Note: need to clean up */ + return rc; + } + + MCA_BCOL_UPDATE_ORDER_COUNTER(&iboffload->super, coll_request->order_info); + + return OMPI_SUCCESS; + +out_of_resources: + /* Release all resources */ + IBOFFLOAD_VERBOSE(10, ("Adding collfrag to collfrag_pending")); + return mca_bcol_iboffload_free_resources_and_move_to_pending(coll_fragment, iboffload); +} + +static int rdma_do_exchange(mca_bcol_iboffload_module_t *iboffload, + mca_bcol_iboffload_collreq_t *coll_request, + struct mqe_task ***mqe_ptr_to_set, + struct mqe_task **last_wait, + struct ibv_sge **l_operand, + struct ibv_sge **r_operand) +{ + int rc = OMPI_SUCCESS, exchange, pair_rank, + my_rank = ((mca_sbgp_base_module_t *) iboffload->ibnet)->my_index; + + mca_bcol_iboffload_frag_t 
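+ /* rdma_do_exchange: same recursive-doubling pattern as do_exchange above,
+  * but the CALC results are delivered as RDMA writes at fixed buffer offsets
+  * (remote_offset / self_offset advance by 2 * calc_size every exchange)
+  * rather than being carried in send/receive operands. */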
*preposted_recv_frag; + + mca_bcol_iboffload_task_t *wait_task, + *calc_task; + + struct mqe_task *last_send; + netpatterns_pair_exchange_node_t *my_exchange_node = + &iboffload->recursive_doubling_tree; + + mca_bcol_iboffload_collfrag_t *coll_fragment = (mca_bcol_iboffload_collfrag_t *) + opal_list_get_last(&coll_request->work_requests); + + const size_t calc_size = MCA_IBOFFLOAD_IB_DRIVER_OPERAND_SIZE + MCA_IBOFFLOAD_CALC_SIZE_EXT; + size_t remote_offset = calc_size; + size_t self_offset = 0; + + pair_rank = my_exchange_node->rank_exchanges[0]; + preposted_recv_frag = + mca_bcol_iboffload_get_preposted_recv_frag( + iboffload, pair_rank, coll_request->qp_index); + if (OPAL_UNLIKELY(NULL == preposted_recv_frag)) { + /* RLG need cleanup */ + IBOFFLOAD_VERBOSE(10, ("Get prepost recv fag fail.\n")); + rc = OMPI_ERR_RESOURCE_BUSY; + goto out_of_resources; + } + + /* Wait for send from first algorithm partner */ + wait_task = mca_bcol_iboffload_get_wait_task(iboffload, pair_rank, 1, + preposted_recv_frag, coll_request->qp_index, NULL); + if (OPAL_UNLIKELY(NULL == wait_task)) { + IBOFFLOAD_VERBOSE(10, ("Failing for getting wait task.\n")); + rc = OMPI_ERR_RESOURCE_BUSY; + goto out_of_resources; + } + + APPEND_TO_TASKLIST((*mqe_ptr_to_set), wait_task, (*last_wait)); + MCA_BCOL_IBOFFLOAD_APPEND_TASK_TO_LIST(coll_fragment->task_next, wait_task); + + (*l_operand)->length = 2 * calc_size ; + for (exchange = 1; exchange < my_exchange_node->n_exchanges; ++exchange) { + pair_rank = my_exchange_node->rank_exchanges[exchange]; + /* Pasha: Not used + (*r_operand) = &preposted_recv_frag->sg_entry; + (*r_operand)->length = calc_size; + */ + + remote_offset += 2 * calc_size; + self_offset += 2 * calc_size; + + /* Calc and send the result to the partner */ + /* + calc_task = mca_bcol_iboffload_get_calc_task(iboffload, + pair_rank, coll_request->qp_index, NULL, + *l_operand, *r_operand, + coll_request, NO_INLINE); + */ + calc_task = mca_bcol_iboffload_get_rdma_calc_task(iboffload, + pair_rank, coll_request->qp_index, NULL, + *l_operand, NULL, + coll_request, remote_offset); + if (OPAL_UNLIKELY(NULL == calc_task)) { + IBOFFLOAD_VERBOSE(10, ("Failing for getting calc task.\n")); + rc = OMPI_ERR_RESOURCE_BUSY; + goto out_of_resources; + } + + APPEND_TO_TASKLIST((*mqe_ptr_to_set), calc_task, last_send); + MCA_BCOL_IBOFFLOAD_APPEND_TASK_TO_LIST(coll_fragment->task_next, calc_task); + + /* Calc and send the result to myself */ + /* + calc_task = mca_bcol_iboffload_get_calc_task(iboffload, + my_rank, coll_request->qp_index, NULL, + *l_operand, NULL, + coll_request, NO_INLINE); + */ + calc_task = mca_bcol_iboffload_get_rdma_calc_task(iboffload, + my_rank, coll_request->qp_index, NULL, + *l_operand, NULL, + coll_request, self_offset); + if (OPAL_UNLIKELY(NULL == calc_task)) { + IBOFFLOAD_VERBOSE(10, ("Failing for getting calc task.\n")); + rc = OMPI_ERR_RESOURCE_BUSY; + goto out_of_resources; + } + + APPEND_TO_TASKLIST((*mqe_ptr_to_set), calc_task, last_send); + MCA_BCOL_IBOFFLOAD_APPEND_TASK_TO_LIST(coll_fragment->task_next, calc_task); + + preposted_recv_frag = + mca_bcol_iboffload_get_preposted_recv_frag( + iboffload, my_rank, coll_request->qp_index); + if (OPAL_UNLIKELY(NULL == preposted_recv_frag)) { + /* RLG need cleanup */ + IBOFFLOAD_VERBOSE(10, ("Get prepost recv fag fail.\n")); + rc = OMPI_ERR_RESOURCE_BUSY; + goto out_of_resources; + } + + /* Wait for calc from myself */ + wait_task = mca_bcol_iboffload_get_wait_task(iboffload, my_rank, 1, + preposted_recv_frag, coll_request->qp_index, NULL); + if (NULL == 
wait_task) { + IBOFFLOAD_VERBOSE(10, ("Failing for getting wait task.\n")); + rc = OMPI_ERR_RESOURCE_BUSY; + goto out_of_resources; + } + + APPEND_TO_TASKLIST((*mqe_ptr_to_set), wait_task, (*last_wait)); + MCA_BCOL_IBOFFLOAD_APPEND_TASK_TO_LIST(coll_fragment->task_next, wait_task); + + /* + (*l_operand) = &preposted_recv_frag->sg_entry; + */ + + /* (*l_operand)->length = 2 * calc_size; */ + (*l_operand)->addr = (uint64_t) (uintptr_t) ((unsigned char *) (*l_operand)->addr + 2 * calc_size); + + preposted_recv_frag = + mca_bcol_iboffload_get_preposted_recv_frag( + iboffload, pair_rank, coll_request->qp_index); + if (OPAL_UNLIKELY(NULL == preposted_recv_frag)) { + /* RLG need cleanup */ + IBOFFLOAD_VERBOSE(10, ("Get prepost recv fag fail.\n")); + rc = OMPI_ERR_RESOURCE_BUSY; + goto out_of_resources; + } + + /* Wait for calc from the current algorithm partner */ + wait_task = mca_bcol_iboffload_get_wait_task(iboffload, pair_rank, 1, + preposted_recv_frag, coll_request->qp_index, NULL); + if (OPAL_UNLIKELY(NULL == wait_task)) { + IBOFFLOAD_VERBOSE(10, ("Failing for getting wait task.\n")); + rc = OMPI_ERR_RESOURCE_BUSY; + goto out_of_resources; + } + + APPEND_TO_TASKLIST((*mqe_ptr_to_set), wait_task, (*last_wait)); + MCA_BCOL_IBOFFLOAD_APPEND_TASK_TO_LIST(coll_fragment->task_next, wait_task); + + } + /* Pasha: not used + (*r_operand) = &preposted_recv_frag->sg_entry; + (*r_operand)->length = calc_size; + */ + + return OMPI_SUCCESS; + +out_of_resources: + /* Release all resources */ + IBOFFLOAD_VERBOSE(10, ("Adding collfrag to collfrag_pending")); + return mca_bcol_iboffload_free_resources_and_move_to_pending(coll_fragment, iboffload); +} + +#define ALLREDUCE_BASE_OFFSET (MCA_IBOFFLOAD_IB_DRIVER_OPERAND_SIZE + MCA_IBOFFLOAD_CALC_SIZE_EXT) + +/* RDMA Recursive doubling + cache friendly version */ +static int +rdma_pure_recursive_doubling(mca_bcol_iboffload_module_t *iboffload, + mca_bcol_iboffload_collreq_t *coll_request) +{ + /* local variables */ + int rc = OMPI_SUCCESS, pair_rank, + my_rank = ((mca_sbgp_base_module_t *) iboffload->ibnet)->my_index; + + struct mqe_task *last_send, + *last_wait; + + mca_bcol_iboffload_task_t *send_task, + *wait_task, + *calc_task; + + mca_bcol_iboffload_frag_t *send_fragment, + *preposted_recv_frag; + struct ibv_sge operand; + + mca_bcol_iboffload_component_t *cm = &mca_bcol_iboffload_component; + netpatterns_pair_exchange_node_t *my_exchange_node = + &iboffload->recursive_doubling_tree; + + struct ibv_sge *r_operand = NULL, + *l_operand = NULL; + + struct mqe_task **mqe_ptr_to_set; + mca_bcol_iboffload_collfrag_t *coll_fragment = (mca_bcol_iboffload_collfrag_t *) + opal_list_get_last(&coll_request->work_requests); + + mqe_ptr_to_set = &coll_fragment->to_post; + + if (OPAL_UNLIKELY(false == BCOL_IBOFFLOAD_MQ_HAVE_CREDITS( + iboffload, coll_fragment->mq_index, coll_fragment->mq_credits))) { + IBOFFLOAD_VERBOSE(10, ("There are not enough credits on MQ.\n")); + + rc = OMPI_ERR_RESOURCE_BUSY; + goto out_of_resources; + } + + IBOFFLOAD_VERBOSE(10, ("Allreduce starting: type %d op %d, " + "n_extra_sources - %d.\n", cm->map_ompi_to_ib_dt[coll_request->dtype->id], + cm->map_ompi_to_ib_calcs[coll_request->op->op_type], + my_exchange_node->n_extra_sources)); + + pair_rank = my_exchange_node->rank_exchanges[0]; + + send_fragment = mca_bcol_iboffload_get_send_frag(coll_request, + pair_rank, coll_request->qp_index, + (MCA_IBOFFLOAD_IB_DRIVER_OPERAND_SIZE + MCA_IBOFFLOAD_CALC_SIZE_EXT), + 0, + SBUF, + MCA_BCOL_IBOFFLOAD_SEND_FRAG_ML_CALC); + if (OPAL_UNLIKELY(NULL == 
send_fragment)) { + IBOFFLOAD_VERBOSE(10, ("Failing for getting and packing send frag.\n")); + rc = OMPI_ERR_RESOURCE_BUSY; + goto out_of_resources; + } + /* Vasily: NO_INLINE ????? */ + /* send my operand to the first algorithm partner */ + /* send_task = mca_bcol_iboffload_get_send_task(iboffload, pair_rank, + coll_request->qp_index, send_fragment, coll_fragment, NO_INLINE); */ + + send_task = mca_bcol_iboffload_get_rdma_task( + pair_rank, ALLREDUCE_BASE_OFFSET, + send_fragment, iboffload, coll_fragment); + if (OPAL_UNLIKELY(NULL == send_task)) { + IBOFFLOAD_VERBOSE(10, ("Failing for getting send task.\n")); + rc = OMPI_ERR_RESOURCE_BUSY; + goto out_of_resources; + } + + /* Pasha: ugly but faster, set inline on first send */ + SENDWR(send_task)->send_flags |= IBV_SEND_INLINE; + + + APPEND_TO_TASKLIST(mqe_ptr_to_set, send_task, last_send); + MCA_BCOL_IBOFFLOAD_APPEND_TASK_TO_LIST(coll_fragment->task_next, send_task); + + /* l_operand = &send_fragment->sg_entry; */ + operand = send_fragment->sg_entry; + l_operand = &operand; + + /* Recursive-doubling exchange */ + rc = rdma_do_exchange(iboffload, coll_request, &mqe_ptr_to_set, + &last_wait, &l_operand, &r_operand); + if (OPAL_UNLIKELY(OMPI_SUCCESS != rc)) { + return rc; + } + + /* Pasha: This flow is broken, print error */ + if (false == coll_request->do_calc_in_cpu) { + ML_ERROR(("Calc in CPU must be enabled !!!")); + /* Calc and send the result to myself */ + calc_task = mca_bcol_iboffload_get_calc_task(iboffload, + my_rank, coll_request->qp_index, NULL, + l_operand, + r_operand, coll_request, NO_INLINE); + if (OPAL_UNLIKELY(NULL == calc_task)) { + IBOFFLOAD_VERBOSE(10, ("Failing for getting calc task.\n")); + rc = OMPI_ERR_RESOURCE_BUSY; + goto out_of_resources; + } + + APPEND_TO_TASKLIST(mqe_ptr_to_set, calc_task, last_send); + MCA_BCOL_IBOFFLOAD_APPEND_TASK_TO_LIST(coll_fragment->task_next, calc_task); + + preposted_recv_frag = + mca_bcol_iboffload_get_preposted_recv_frag( + iboffload, my_rank, coll_request->qp_index); + if (OPAL_UNLIKELY(NULL == preposted_recv_frag)) { + /* RLG need cleanup */ + rc = OMPI_ERR_RESOURCE_BUSY; + goto out_of_resources; + } + + /* Wait for calc from myself */ + wait_task = mca_bcol_iboffload_get_wait_task(iboffload, my_rank, 1, + preposted_recv_frag, coll_request->qp_index, NULL); + if (OPAL_UNLIKELY(NULL == wait_task)) { + IBOFFLOAD_VERBOSE(10, ("Failing for getting wait task.\n")); + rc = OMPI_ERR_RESOURCE_BUSY; + goto out_of_resources; + } + + APPEND_TO_TASKLIST(mqe_ptr_to_set, wait_task, last_wait); + MCA_BCOL_IBOFFLOAD_APPEND_TASK_TO_LIST(coll_fragment->task_next, wait_task); + } else { + coll_request->l_operand = (uint64_t) (uintptr_t) + ((unsigned char *)l_operand->addr); + coll_request->r_operand = (uint64_t) (uintptr_t) + ((unsigned char *) (coll_request->l_operand) + ALLREDUCE_BASE_OFFSET); + } + + *mqe_ptr_to_set = NULL; +/* Vasily: TODO with MACRO */ + /* finish initializing full message descriptor */ + coll_request->n_fragments = 1; + coll_request->n_frags_sent = 1; + + /* Pasha: need to set to true in upper layer */ + coll_request->user_handle_freed = false; + + last_wait->flags |= MQE_WR_FLAG_SIGNAL; + + coll_fragment->signal_task_wr_id = last_wait->wr_id; + last_wait->wr_id = (uint64_t) (uintptr_t) coll_fragment; + + /* post the mwr */ + IBOFFLOAD_VERBOSE(10, ("Post tasks.\n")); + rc = mca_bcol_iboffload_post_mqe_tasks(iboffload, coll_fragment->to_post); + if (OPAL_UNLIKELY(OMPI_SUCCESS != rc)) { + IBOFFLOAD_ERROR(("MQE task posting failing.\n")); + /* Note: need to clean up */ + return 
rc; + } + + MCA_BCOL_UPDATE_ORDER_COUNTER(&iboffload->super, coll_request->order_info); + + return OMPI_SUCCESS; + +out_of_resources: + /* Release all resources */ + IBOFFLOAD_VERBOSE(10, ("Adding collfrag to collfrag_pending")); + return mca_bcol_iboffload_free_resources_and_move_to_pending(coll_fragment, iboffload); +} +/* + * non power of 2 & EXCHANGE_NODE case, + * need to wait for message from "extra" proc. + */ +static int +non_pure_recursive_doubling(mca_bcol_iboffload_module_t *iboffload, + mca_bcol_iboffload_collreq_t *coll_request) +{ + /* local variables */ + int rc = OMPI_SUCCESS, extra_rank, pair_rank, + my_rank = ((mca_sbgp_base_module_t *) iboffload->ibnet)->my_index; + + mca_bcol_iboffload_frag_t *calc_fragment, + *preposted_recv_frag; + + mca_bcol_iboffload_task_t *wait_task, + *calc_task; + + struct ibv_sge *r_operand = NULL, + *l_operand = NULL; + + struct mqe_task *last_wait, /* we need ask from completion on last wait */ + *last_send; + + mca_bcol_iboffload_component_t *cm = &mca_bcol_iboffload_component; + netpatterns_pair_exchange_node_t *my_exchange_node = + &iboffload->recursive_doubling_tree; + + struct mqe_task **mqe_ptr_to_set; + mca_bcol_iboffload_collfrag_t *coll_fragment = (mca_bcol_iboffload_collfrag_t *) + opal_list_get_last(&coll_request->work_requests); + + mqe_ptr_to_set = &coll_fragment->to_post; + + if (OPAL_UNLIKELY(false == BCOL_IBOFFLOAD_MQ_HAVE_CREDITS( + iboffload, coll_fragment->mq_index, coll_fragment->mq_credits))) { + IBOFFLOAD_VERBOSE(10, ("There are not enough credits on MQ.\n")); + + rc = OMPI_ERR_RESOURCE_BUSY; + goto out_of_resources; + } + + IBOFFLOAD_VERBOSE(10, ("Allreduce starting: type %d op %d, " + "n_extra_sources - %d.\n", cm->map_ompi_to_ib_dt[coll_request->dtype->id], + cm->map_ompi_to_ib_calcs[coll_request->op->op_type], + my_exchange_node->n_extra_sources)); + + extra_rank = my_exchange_node->rank_extra_source; + + preposted_recv_frag = + mca_bcol_iboffload_get_preposted_recv_frag( + iboffload, extra_rank, coll_request->qp_index); + if (OPAL_UNLIKELY(NULL == preposted_recv_frag)) { + /* RLG need cleanup */ + rc = OMPI_ERR_RESOURCE_BUSY; + goto out_of_resources; + } + + /* Wait for data from extra node */ + wait_task = mca_bcol_iboffload_get_wait_task(iboffload, extra_rank, 1, + preposted_recv_frag, coll_request->qp_index, NULL); + if (OPAL_UNLIKELY(NULL == wait_task)) { + IBOFFLOAD_VERBOSE(10, ("Failing for getting wait task.\n")); + rc = OMPI_ERR_RESOURCE_BUSY; + goto out_of_resources; + } + + APPEND_TO_TASKLIST(mqe_ptr_to_set, wait_task, last_wait); + MCA_BCOL_IBOFFLOAD_APPEND_TASK_TO_LIST(coll_fragment->task_next, wait_task); + + pair_rank = my_exchange_node->rank_exchanges[0]; + + calc_fragment = mca_bcol_iboffload_get_send_frag(coll_request, + pair_rank, coll_request->qp_index, + MCA_IBOFFLOAD_IB_DRIVER_OPERAND_SIZE + + MCA_IBOFFLOAD_CALC_SIZE_EXT, 0, + SBUF, + MCA_BCOL_IBOFFLOAD_SEND_FRAG_ML_CALC); + if (OPAL_UNLIKELY(NULL == calc_fragment)) { + IBOFFLOAD_VERBOSE(10, ("Failing for getting and packing send frag.\n")); + rc = OMPI_ERR_RESOURCE_BUSY; + goto out_of_resources; + } + + /* Calc extra node operand with mine and send the result + to the first algorithm partner */ + preposted_recv_frag->sg_entry.length = MCA_IBOFFLOAD_IB_DRIVER_OPERAND_SIZE + + MCA_IBOFFLOAD_CALC_SIZE_EXT; + calc_task = mca_bcol_iboffload_get_calc_task(iboffload, + pair_rank, coll_request->qp_index, calc_fragment, + &preposted_recv_frag->sg_entry, + &calc_fragment->sg_entry, coll_request, NO_INLINE); + if (OPAL_UNLIKELY(NULL == calc_task)) { + 
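+ /* No CALC task available: defer the whole fragment rather than posting a
+  * partial task list. */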
IBOFFLOAD_VERBOSE(10, ("Failing for getting calc task.\n")); + rc = OMPI_ERR_RESOURCE_BUSY; + goto out_of_resources; + } + + APPEND_TO_TASKLIST(mqe_ptr_to_set, calc_task, last_send); + MCA_BCOL_IBOFFLOAD_APPEND_TASK_TO_LIST(coll_fragment->task_next, calc_task); + + /* Calc extra node operand with mine and store the result on my buff */ + calc_task = mca_bcol_iboffload_get_calc_task(iboffload, + my_rank, coll_request->qp_index, NULL, + &preposted_recv_frag->sg_entry, + &calc_fragment->sg_entry, coll_request, NO_INLINE); + if (OPAL_UNLIKELY(NULL == calc_task)) { + IBOFFLOAD_VERBOSE(10, ("Failing for getting calc task.\n")); + rc = OMPI_ERR_RESOURCE_BUSY; + goto out_of_resources; + } + + APPEND_TO_TASKLIST(mqe_ptr_to_set, calc_task, last_send); + MCA_BCOL_IBOFFLOAD_APPEND_TASK_TO_LIST(coll_fragment->task_next, calc_task); + + preposted_recv_frag = + mca_bcol_iboffload_get_preposted_recv_frag( + iboffload, my_rank, coll_request->qp_index); + if (OPAL_UNLIKELY(NULL == preposted_recv_frag)) { + /* RLG need cleanup */ + rc = OMPI_ERR_RESOURCE_BUSY; + goto out_of_resources; + } + + /* Wait for calc from myself */ + wait_task = mca_bcol_iboffload_get_wait_task(iboffload, my_rank, 1, + preposted_recv_frag, coll_request->qp_index, NULL); + if (OPAL_UNLIKELY(NULL == wait_task)) { + IBOFFLOAD_VERBOSE(10, ("Failing for getting wait task.\n")); + rc = OMPI_ERR_RESOURCE_BUSY; + goto out_of_resources; + } + + APPEND_TO_TASKLIST(mqe_ptr_to_set, wait_task, last_wait); + MCA_BCOL_IBOFFLOAD_APPEND_TASK_TO_LIST(coll_fragment->task_next, wait_task); + + l_operand = &preposted_recv_frag->sg_entry; + l_operand->length = MCA_IBOFFLOAD_IB_DRIVER_OPERAND_SIZE + + MCA_IBOFFLOAD_CALC_SIZE_EXT; + /* Recursive-doubling exchange */ + rc = do_exchange(iboffload, coll_request, &mqe_ptr_to_set, + &last_wait, &l_operand, &r_operand); + if (OPAL_UNLIKELY(OMPI_SUCCESS != rc)) { + return rc; + } + + /* Need to send message to "extra" proc => + one more final result calc for extra node */ + calc_task = mca_bcol_iboffload_get_calc_task(iboffload, + extra_rank, coll_request->qp_index, NULL, + l_operand, + r_operand, coll_request, NO_INLINE); + if (OPAL_UNLIKELY(NULL == calc_task)) { + IBOFFLOAD_VERBOSE(10, ("Failing for getting calc task.\n")); + rc = OMPI_ERR_RESOURCE_BUSY; + goto out_of_resources; + } + + APPEND_TO_TASKLIST(mqe_ptr_to_set, calc_task, last_send); + MCA_BCOL_IBOFFLOAD_APPEND_TASK_TO_LIST(coll_fragment->task_next, calc_task); + + if (false == coll_request->do_calc_in_cpu) { + /* Calc and send the result to myself */ + calc_task = mca_bcol_iboffload_get_calc_task(iboffload, + my_rank, coll_request->qp_index, NULL, + l_operand, + r_operand, coll_request, NO_INLINE); + if (OPAL_UNLIKELY(NULL == calc_task)) { + IBOFFLOAD_VERBOSE(10, ("Failing for getting calc task.\n")); + rc = OMPI_ERR_RESOURCE_BUSY; + goto out_of_resources; + } + + APPEND_TO_TASKLIST(mqe_ptr_to_set, calc_task, last_send); + MCA_BCOL_IBOFFLOAD_APPEND_TASK_TO_LIST(coll_fragment->task_next, calc_task); + + preposted_recv_frag = + mca_bcol_iboffload_get_preposted_recv_frag( + iboffload, my_rank, coll_request->qp_index); + if (OPAL_UNLIKELY(NULL == preposted_recv_frag)) { + /* RLG need cleanup */ + rc = OMPI_ERR_RESOURCE_BUSY; + goto out_of_resources; + } + + /* Wait for calc from myself */ + wait_task = mca_bcol_iboffload_get_wait_task(iboffload, my_rank, 1, + preposted_recv_frag, coll_request->qp_index, NULL); + if (OPAL_UNLIKELY(NULL == wait_task)) { + IBOFFLOAD_VERBOSE(10, ("Failing for getting wait task.\n")); + rc = OMPI_ERR_RESOURCE_BUSY; + goto 
out_of_resources; + } + + APPEND_TO_TASKLIST(mqe_ptr_to_set, wait_task, last_wait); + MCA_BCOL_IBOFFLOAD_APPEND_TASK_TO_LIST(coll_fragment->task_next, wait_task); + } else { + coll_request->l_operand = l_operand->addr; + coll_request->r_operand = r_operand->addr; + } + + *mqe_ptr_to_set = NULL; + + /* finish initializing full message descriptor */ + coll_request->n_fragments = 1; + coll_request->n_frags_sent = 1; + + assert(NULL != last_wait); + + last_wait->flags |= MQE_WR_FLAG_SIGNAL; + coll_fragment->signal_task_wr_id = last_wait->wr_id; + last_wait->wr_id = (uint64_t) (uintptr_t) coll_fragment; + + /* post the mwr */ + IBOFFLOAD_VERBOSE(10, ("Post tasks.\n")); + rc = mca_bcol_iboffload_post_mqe_tasks(iboffload, coll_fragment->to_post); + if(OPAL_UNLIKELY(OMPI_SUCCESS != rc)) { + IBOFFLOAD_ERROR(("MQE task posting failing.\n")); + /* Note: need to clean up */ + return rc; + } + + MCA_BCOL_UPDATE_ORDER_COUNTER(&iboffload->super, coll_request->order_info); + + return OMPI_SUCCESS; + +out_of_resources: + /* Release all resources */ + IBOFFLOAD_VERBOSE(10, ("Adding collfrag to collfrag_pending")); + return mca_bcol_iboffload_free_resources_and_move_to_pending(coll_fragment, iboffload); +} + +static int mca_bcol_iboffload_allreduce_init( + bcol_function_args_t *fn_arguments, + mca_bcol_iboffload_module_t *iboffload, + struct mca_bcol_iboffload_collreq_t **coll_request, + bool if_bcol_last) +{ + int rc; + + bool exclude_case; + ompi_free_list_item_t *item; + mca_bcol_iboffload_collfrag_t *coll_fragment; + + mca_bcol_iboffload_component_t *cm = &mca_bcol_iboffload_component; + + IBOFFLOAD_VERBOSE(10, ("Calling for mca_bcol_iboffload_allreduce_init.\n")); + + OMPI_FREE_LIST_WAIT(&cm->collreqs_free, item, rc); + if (OPAL_UNLIKELY(OMPI_SUCCESS != rc)) { + IBOFFLOAD_VERBOSE(10, ("Failing for coll request free list waiting.\n")); + return rc; + } + + (*coll_request) = (mca_bcol_iboffload_collreq_t *) item; + (*coll_request)->progress_fn = iboffload->allreduce_algth; + + (*coll_request)->if_bcol_last = if_bcol_last; + + exclude_case = (non_pure_recursive_doubling == iboffload->allreduce_algth && + (OMPI_OP_SUM == fn_arguments->op->op_type && + OMPI_DATATYPE_MPI_DOUBLE == fn_arguments->dtype->id)); + + (*coll_request)->do_calc_in_cpu = cm->last_calc_in_cpu && !exclude_case; + + if (false == (*coll_request)->do_calc_in_cpu || + allreduce_extra_node == iboffload->allreduce_algth) { + (*coll_request)->do_calc_in_cpu = false; /* Relevant for extra node only */ + (*coll_request)->completion_cb_fn = + mca_bcol_iboffload_unpack_res_to_user; + } else { + (*coll_request)->completion_cb_fn = + mca_bcol_iboffload_calc_res_to_user; + } + + (*coll_request)->module = iboffload; + (*coll_request)->op = fn_arguments->op; + + (*coll_request)->dtype = fn_arguments->dtype; + (*coll_request)->count = fn_arguments->count; + + (*coll_request)->ml_buffer_index = fn_arguments->buffer_index; + (*coll_request)->buffer_info[SBUF].lkey = iboffload->rdma_block.ib_info.lkey; + + (*coll_request)->order_info = &fn_arguments->order_info; + + /* ML buffer was provided, no need to pack the data. 
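+ * (the payload already resides in the registered ML buffer at the given
+ * sbuf/rbuf offsets, so it can be used in place)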
+ * It is few assumption here: + * we CAN touch and change ML buffer + */ + (*coll_request)->buffer_info[SBUF].buf = (void *) ( + (unsigned char *) fn_arguments->sbuf + + (size_t) fn_arguments->sbuf_offset); + + (*coll_request)->buffer_info[SBUF].offset = fn_arguments->sbuf_offset; + + (*coll_request)->buffer_info[RBUF].buf = (void *) ( + (unsigned char *) fn_arguments->rbuf + + (size_t) fn_arguments->rbuf_offset); + + (*coll_request)->buffer_info[RBUF].offset = fn_arguments->rbuf_offset; + + if(mca_bcol_iboffload_component.enable_rdma_calc) { + (*coll_request)->qp_index = MCA_BCOL_IBOFFLOAD_QP_BARRIER; + } else { + (*coll_request)->qp_index = MCA_BCOL_IBOFFLOAD_QP_REGULAR; + } + + (*coll_request)->n_frag_mpi_complete = 0; + (*coll_request)->n_frag_net_complete = 0; + + fn_arguments->bcol_opaque_data = (void *) (*coll_request); + + /* + * setup collective work request + */ + + /* get collective frag */ + coll_fragment = &((*coll_request)->first_collfrag); + mca_bcol_iboffload_collfrag_init(coll_fragment); + + coll_fragment->mq_index = COLL_MQ; + coll_fragment->alg = RECURSIVE_DOUBLING_ALLREDUCE_ALG; + + coll_fragment->mq_credits = + iboffload->alg_task_consump[RECURSIVE_DOUBLING_ALLREDUCE_ALG]; + + /* set pointers for (coll frag) <-> (coll full request) */ + MCA_BCOL_IBOFFLOAD_SET_COLL_REQ_LINKS(*coll_request, coll_fragment); + + coll_fragment->unpack_size = + mca_bcol_base_get_buff_length(fn_arguments->dtype, fn_arguments->count); + + IBOFFLOAD_VERBOSE(10, ("The input data is %lf", *(double *) (*coll_request)->buffer_info[SBUF].buf)); + + return OMPI_SUCCESS; +} + +static int mca_bcol_iboffload_allreduce_intra(bcol_function_args_t *fn_arguments, + struct coll_ml_function_t *const_args) +{ + /* local variables */ + int rc; + + mca_bcol_iboffload_collreq_t *coll_request = NULL; + mca_bcol_iboffload_module_t *iboffload = + (mca_bcol_iboffload_module_t *) const_args->bcol_module; + + /* Pasha: please do not touch this line, it used for ML buffer recycling barrier call */ + bool if_bcol_last = ((const_args->index_of_this_type_in_collective + 1) == + const_args->n_of_this_type_in_collective); + + MCA_BCOL_CHECK_ORDER(const_args->bcol_module, fn_arguments); + + IBOFFLOAD_VERBOSE(10, ("n_of_this_type_in_a_row %d, index_in_consecutive_same_bcol_calls %d", + const_args->n_of_this_type_in_a_row, + const_args->index_in_consecutive_same_bcol_calls + 1)); + + IBOFFLOAD_VERBOSE(10, ("Allreduce started.\n")); + fn_arguments->result_in_rbuf = true; + + rc = mca_bcol_iboffload_allreduce_init(fn_arguments, iboffload, + &coll_request, if_bcol_last); + if (OPAL_UNLIKELY(OMPI_SUCCESS != rc)) { + IBOFFLOAD_VERBOSE(10, ("Get error from mca_bcol_iboffload_allreduce_init.\n")); + return rc; + } + + /* Allreduce starting */ + rc = iboffload->allreduce_algth(iboffload, coll_request); + if (OPAL_UNLIKELY(OMPI_ERROR == rc)) { + return BCOL_FN_NOT_STARTED; + } + + IBOFFLOAD_VERBOSE(10, ("Wait for completions.\n")); + + /* done */ + return BCOL_FN_STARTED; +} + +static int mca_bcol_iboffload_allreduce_progress( + bcol_function_args_t *input_args, + struct coll_ml_function_t *const_args) +{ + mca_bcol_iboffload_collreq_t *coll_request = + (mca_bcol_iboffload_collreq_t *) + input_args->bcol_opaque_data; + + if (BCOL_IS_COMPLETED(coll_request)) { + coll_request->user_handle_freed = true; + if (COLLREQ_IS_DONE(coll_request)) { + IBOFFLOAD_VERBOSE(10, ("Coll request already done.\n")); + RELEASE_COLLREQ(coll_request); + } + + IBOFFLOAD_VERBOSE(10, ("Allreduce already done.\n")); + return BCOL_FN_COMPLETE; + } + + return 
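+ /* Collective still in flight: report BCOL_FN_STARTED so that, presumably,
+  * the ML level keeps calling this progress function until completion. */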
BCOL_FN_STARTED; +} + +int mca_bcol_iboffload_allreduce_first_call(mca_bcol_iboffload_module_t *iboffload, + mca_bcol_iboffload_collreq_t *coll_request) +{ + netpatterns_pair_exchange_node_t *my_exchange_node = + &iboffload->recursive_doubling_tree; + + int i = 0, my_rank = iboffload->ibnet->super.my_index, + n_exchanges = my_exchange_node->n_exchanges, + *exchanges = my_exchange_node->rank_exchanges, + n_extra_src = my_exchange_node->n_extra_sources, + rank_extra_src = my_exchange_node->rank_extra_source; + + mca_bcol_iboffload_endpoint_t *ep = iboffload->endpoints[my_rank]; + + /* Connecting to myself */ + while (OMPI_SUCCESS != + check_endpoint_state(ep, NULL, NULL)) { + opal_progress(); + } + + iboffload->alg_task_consump[RECURSIVE_DOUBLING_ALLREDUCE_ALG] = 0; + + if (0 < n_extra_src) { + iboffload->alg_task_consump[RECURSIVE_DOUBLING_ALLREDUCE_ALG] += 4; /* Two CALCs and two WAITs tasks */ + ep = iboffload->endpoints[rank_extra_src]; + while (OMPI_SUCCESS != + check_endpoint_state(ep, NULL, NULL)) { + opal_progress(); + } + } + + for (i = 0; i < n_exchanges; ++i) { + iboffload->alg_task_consump[RECURSIVE_DOUBLING_ALLREDUCE_ALG] += 4; /* Two CALCs and two WAITs tasks */ + ep = iboffload->endpoints[exchanges[i]]; + + while (OMPI_SUCCESS != + check_endpoint_state(ep, NULL, NULL)) { + opal_progress(); + } + } + + iboffload->alg_task_consump[RECURSIVE_DOUBLING_ALLREDUCE_ALG] += 4; /* Two CALCs and two WAITs tasks */ + + if (0 < my_exchange_node->n_extra_sources) { + iboffload->allreduce_algth = + (EXTRA_NODE == my_exchange_node->node_type)? + allreduce_extra_node: + non_pure_recursive_doubling; + } else { + if(mca_bcol_iboffload_component.enable_rdma_calc) { + iboffload->allreduce_algth = + rdma_pure_recursive_doubling; + } else { + iboffload->allreduce_algth = + pure_recursive_doubling; + } + } + + return iboffload->allreduce_algth(iboffload, coll_request); +} + +int mca_bcol_iboffload_allreduce_register(mca_bcol_base_module_t *super) +{ + mca_bcol_base_coll_fn_comm_attributes_t comm_attribs; + mca_bcol_base_coll_fn_invoke_attributes_t inv_attribs; + + IBOFFLOAD_VERBOSE(10, ("Register iboffload Allreduce.\n")); + + comm_attribs.bcoll_type = BCOL_ALLREDUCE; + + comm_attribs.comm_size_min = 0; + comm_attribs.comm_size_max = 1024 * 1024; + comm_attribs.waiting_semantics = NON_BLOCKING; + + inv_attribs.bcol_msg_min = 0; + inv_attribs.bcol_msg_max = 20000; /* range 1 */ + + inv_attribs.datatype_bitmap = 0xffffffff; + inv_attribs.op_types_bitmap = 0xffffffff; + + comm_attribs.data_src = DATA_SRC_KNOWN; + + mca_bcol_base_set_attributes(super, + &comm_attribs, &inv_attribs, + mca_bcol_iboffload_allreduce_intra, + mca_bcol_iboffload_allreduce_progress); + + return OMPI_SUCCESS; +} diff --git a/ompi/mca/bcol/iboffload/bcol_iboffload_collfrag.h b/ompi/mca/bcol/iboffload/bcol_iboffload_collfrag.h index fca156cf0c..5f8e2e5adf 100644 --- a/ompi/mca/bcol/iboffload/bcol_iboffload_collfrag.h +++ b/ompi/mca/bcol/iboffload/bcol_iboffload_collfrag.h @@ -90,6 +90,10 @@ struct mca_bcol_iboffload_collfrag_t { * there isn't any wait in the coll request */ int32_t last_wait_num; + /* fragment descriptor for non contiguous data */ + bcol_fragment_descriptor_t *bcol_frag_info; + /* frag-len of ml buffer */ + int frag_len; }; typedef struct mca_bcol_iboffload_collfrag_t mca_bcol_iboffload_collfrag_t; OBJ_CLASS_DECLARATION(mca_bcol_iboffload_collfrag_t); diff --git a/ompi/mca/bcol/iboffload/bcol_iboffload_frag.c b/ompi/mca/bcol/iboffload/bcol_iboffload_frag.c index e9a03eed9c..0ecf1ef62e 100644 --- 
a/ompi/mca/bcol/iboffload/bcol_iboffload_frag.c +++ b/ompi/mca/bcol/iboffload/bcol_iboffload_frag.c @@ -48,7 +48,7 @@ OBJ_CLASS_INSTANCE( frag_constructor, NULL); -#if 0 + static mca_bcol_iboffload_frag_t* mca_bcol_iboffload_get_ml_frag_calc(mca_bcol_iboffload_module_t *iboffload, mca_bcol_iboffload_collreq_t *coll_request, @@ -85,8 +85,6 @@ static mca_bcol_iboffload_frag_t* return fragment; } -#endif - static mca_bcol_iboffload_frag_t * mca_bcol_iboffload_get_packed_frag(mca_bcol_iboffload_module_t *iboffload, @@ -130,7 +128,6 @@ mca_bcol_iboffload_get_packed_frag(mca_bcol_iboffload_module_t *iboffload, return frag; } -#if 0 static mca_bcol_iboffload_frag_t * mca_bcol_iboffload_get_calc_frag(mca_bcol_iboffload_module_t *iboffload, int qp_index, struct mca_bcol_iboffload_collreq_t *coll_request) @@ -169,7 +166,6 @@ mca_bcol_iboffload_get_calc_frag(mca_bcol_iboffload_module_t *iboffload, int qp_ return frag; } -#endif mca_bcol_iboffload_frag_t* mca_bcol_iboffload_get_send_frag(mca_bcol_iboffload_collreq_t *coll_request, @@ -219,24 +215,24 @@ mca_bcol_iboffload_get_send_frag(mca_bcol_iboffload_collreq_t *coll_request, IBOFFLOAD_VERBOSE(10, ("Getting MCA_BCOL_IBOFFLOAD_SEND_FRAG_CONVERT")); frag = mca_bcol_iboffload_get_packed_frag(iboffload, destination, qp_index, len, &coll_request->send_convertor); -#if 0 + break; case MCA_BCOL_IBOFFLOAD_SEND_FRAG_CALC: IBOFFLOAD_VERBOSE(10, ("Getting MCA_BCOL_IBOFFLOAD_SEND_FRAG_CALC")); frag = mca_bcol_iboffload_get_calc_frag(iboffload, qp_index, coll_request); -#endif + break; case MCA_BCOL_IBOFFLOAD_SEND_FRAG_ML: IBOFFLOAD_VERBOSE(10, ("Getting MCA_BCOL_IBOFFLOAD_SEND_FRAG_ML")); frag = mca_bcol_iboffload_get_ml_frag( iboffload, qp_index, len, coll_request->buffer_info[buf_index].lkey, (uint64_t)(uintptr_t) coll_request->buffer_info[buf_index].buf + src_offset); -#if 0 + break; case MCA_BCOL_IBOFFLOAD_SEND_FRAG_ML_CALC: frag = mca_bcol_iboffload_get_ml_frag_calc(iboffload, coll_request, len, src_offset); IBOFFLOAD_VERBOSE(10, ("Getting MCA_BCOL_IBOFFLOAD_SEND_FRAG_ML_CALC")); -#endif + break; default: IBOFFLOAD_VERBOSE(10, ("Getting default")); diff --git a/ompi/mca/bcol/iboffload/bcol_iboffload_mca.c b/ompi/mca/bcol/iboffload/bcol_iboffload_mca.c index 3adc67c10f..f62da05363 100644 --- a/ompi/mca/bcol/iboffload/bcol_iboffload_mca.c +++ b/ompi/mca/bcol/iboffload/bcol_iboffload_mca.c @@ -267,10 +267,19 @@ int mca_bcol_iboffload_register_params(void) "Increment size of free lists (must be >= 1)", 32, &mca_bcol_iboffload_component.free_list_inc, REGINT_GE_ONE)); + /* rdma mpool no longer exists - must use the grdma mpool component, should resolve errors in + * mtt testing + */ + /* CHECK(reg_string("mpool", NULL, "Name of the memory pool to be used (it is unlikely that you will ever want to change this", "rdma", &mca_bcol_iboffload_component.mpool_name, 0)); + */ + CHECK(reg_string("mpool", NULL, + "Name of the memory pool to be used (it is unlikely that you will ever want to change this", + "grdma", &mca_bcol_iboffload_component.mpool_name, + 0)); CHECK(reg_int("cq_size", "cq_size", "Size of the OpenFabrics completion " "queue (will automatically be set to a minimum of " diff --git a/ompi/mca/bcol/iboffload/bcol_iboffload_module.c b/ompi/mca/bcol/iboffload/bcol_iboffload_module.c index 94034a6a44..fccf26a006 100644 --- a/ompi/mca/bcol/iboffload/bcol_iboffload_module.c +++ b/ompi/mca/bcol/iboffload/bcol_iboffload_module.c @@ -731,9 +731,9 @@ static void load_func(mca_bcol_base_module_t *super) super->bcol_function_init_table[BCOL_BARRIER] = 
mca_bcol_iboffload_barrier_register; super->bcol_function_init_table[BCOL_BCAST] = mca_bcol_iboffload_bcast_register; /*super->bcol_function_init_table[BCOL_ALLTOALL] = mca_bcol_iboffload_alltoall_register;*/ - /*super->bcol_function_init_table[BCOL_ALLGATHER] = mca_bcol_iboffload_allgather_register;*/ + super->bcol_function_init_table[BCOL_ALLGATHER] = mca_bcol_iboffload_allgather_register; super->bcol_function_init_table[BCOL_SYNC] = mca_bcol_iboffload_memsync_register; - /*super->bcol_function_init_table[BCOL_ALLREDUCE] = mca_bcol_iboffload_allreduce_register;*/ + super->bcol_function_init_table[BCOL_ALLREDUCE] = mca_bcol_iboffload_allreduce_register; super->bcol_memory_init = mca_bcol_iboffload_init_buffer_memory; @@ -1523,7 +1523,7 @@ int mca_bcol_iboffload_exchange_rem_addr(mca_bcol_iboffload_endpoint_t *ep) coll_request->user_handle_freed = true; /* complete the exchange - progress releases full request descriptors */ while (!BCOL_IS_COMPLETED(coll_request)) { - opal_progress(); + mca_bcol_iboffload_component_progress(); } IBOFFLOAD_VERBOSE(10, ("RDMA addr exchange with comm rank: %d was finished.\n", diff --git a/ompi/mca/bcol/ptpcoll/Makefile.am b/ompi/mca/bcol/ptpcoll/Makefile.am index c9444a741b..c8addc7c00 100644 --- a/ompi/mca/bcol/ptpcoll/Makefile.am +++ b/ompi/mca/bcol/ptpcoll/Makefile.am @@ -1,6 +1,8 @@ # # Copyright (c) 2009-2012 Oak Ridge National Laboratory. All rights reserved. -# Copyright (c) 2009-2012 Mellanox Technologies. All rights reserved. +# Copyright (c) 2009-2013 Mellanox Technologies. All rights reserved. +# Copyright (c) 2013 Los Alamos National Security, LLC. All rights +# reserved. # $COPYRIGHT$ # # Additional copyrights may follow @@ -22,9 +24,14 @@ sources = \ bcol_ptpcoll_component.c \ bcol_ptpcoll_fanin.c \ bcol_ptpcoll_fanout.c \ - bcol_ptpcoll_module.c - - + bcol_ptpcoll_module.c \ + bcol_ptpcoll_allreduce.h \ + bcol_ptpcoll_allreduce.c \ + bcol_ptpcoll_reduce.h \ + bcol_ptpcoll_reduce.c \ + bcol_ptpcoll_allgather.c + + # Make the output library in this directory, and name it either # mca__.la (for DSO builds) or libmca__.la # (for static builds). diff --git a/ompi/mca/bcol/ptpcoll/bcol_ptpcoll.h b/ompi/mca/bcol/ptpcoll/bcol_ptpcoll.h index 916300f322..dda443a495 100644 --- a/ompi/mca/bcol/ptpcoll/bcol_ptpcoll.h +++ b/ompi/mca/bcol/ptpcoll/bcol_ptpcoll.h @@ -230,6 +230,12 @@ struct mca_bcol_ptpcoll_ml_buffer_desc_t { int iteration; /* buffer iteration in knomial, binomail, etc. algorithms */ int tag; /* tag number that is attached to this operation */ int status; /* operation status */ + /* Fixme: Probably we can get rid of these fields by redesigning + * the reduce implementation + */ + int reduction_status; /* used for reduction to cache internal + reduction status */ + bool reduce_init_called; }; typedef struct mca_bcol_ptpcoll_ml_buffer_desc_t mca_bcol_ptpcoll_ml_buffer_desc_t; diff --git a/ompi/mca/bcol/ptpcoll/bcol_ptpcoll_allgather.c b/ompi/mca/bcol/ptpcoll/bcol_ptpcoll_allgather.c new file mode 100644 index 0000000000..8a2e07fd35 --- /dev/null +++ b/ompi/mca/bcol/ptpcoll/bcol_ptpcoll_allgather.c @@ -0,0 +1,607 @@ +/* + * Copyright (c) 2009-2012 Oak Ridge National Laboratory. All rights reserved. + * Copyright (c) 2009-2012 Mellanox Technologies. All rights reserved. 
+ * $COPYRIGHT$ + * + * Additional copyrights may follow + * + * $HEADER$ + */ + +#include "ompi_config.h" + +#include "ompi/include/ompi/constants.h" +#include "ompi/mca/coll/ml/coll_ml.h" +#include "ompi/mca/bcol/bcol.h" +#include "bcol_ptpcoll_allreduce.h" +#include "ompi/mca/coll/base/coll_tags.h" /* debug */ +/* + * Recursive K-ing allgather + */ + +/* + * + * Recurssive k-ing algorithm + * Example k=3 n=9 + * + * + * Number of Exchange steps = log (basek) n + * Number of steps in exchange step = k (radix) + * + */ + +int bcol_ptpcoll_k_nomial_allgather_init(bcol_function_args_t *input_args, + struct coll_ml_function_t *const_args) +{ + /* local variables */ + + mca_bcol_ptpcoll_module_t *ptpcoll_module = (mca_bcol_ptpcoll_module_t *) const_args->bcol_module; + int *group_list = ptpcoll_module->super.sbgp_partner_module->group_list; + netpatterns_k_exchange_node_t *exchange_node = &ptpcoll_module->knomial_allgather_tree; + int my_group_index = ptpcoll_module->super.sbgp_partner_module->my_index; + int group_size = ptpcoll_module->group_size; + int *list_connected = ptpcoll_module->super.list_n_connected; /* critical for hierarchical colls */ + + int tag; + int i, j; + int knt; + int comm_src, comm_dst, src, dst; + int recv_offset, recv_len; + int send_offset, send_len; + + uint32_t buffer_index = input_args->buffer_index; + int pow_k, tree_order; + int rc = OMPI_SUCCESS; + ompi_communicator_t* comm = ptpcoll_module->super.sbgp_partner_module->group_comm; + ompi_request_t **requests = + ptpcoll_module->ml_mem.ml_buf_desc[buffer_index].requests; + int *active_requests = + &(ptpcoll_module->ml_mem.ml_buf_desc[buffer_index].active_requests); + int completed = 0; /* initialized */ + void *data_buffer = (void*)( + (unsigned char *) input_args->sbuf + + (size_t) input_args->sbuf_offset); + int pack_len = input_args->count * input_args->dtype->super.size; + +#if 0 + fprintf(stderr,"entering p2p allgather pack_len %d. 
exchange node: %p\n",pack_len, exchange_node); +#endif + /* initialize the iteration counter */ + int *iteration = &ptpcoll_module->ml_mem.ml_buf_desc[buffer_index].iteration; + *iteration = 0; + + /* reset active request counter */ + *active_requests = 0; + + /* keep tag within the limit supported by the pml */ + tag = (PTPCOLL_TAG_OFFSET + input_args->sequence_num * PTPCOLL_TAG_FACTOR) & (ptpcoll_module->tag_mask); + /* mark this as a collective tag, to avoid conflict with user-level flags */ + tag = -tag; + + /* k-nomial parameters */ + tree_order = exchange_node->tree_order; + pow_k = exchange_node->log_tree_order; + + + /* let's begin the collective, starting with extra ranks and their + * respective proxies + */ + if( EXTRA_NODE == exchange_node->node_type ) { + + /* then I will send to my proxy rank*/ + dst = exchange_node->rank_extra_sources_array[0]; + /* find rank in the communicator */ + comm_dst = group_list[dst]; + /* now I need to calculate my own offset */ + knt = 0; + for (i = 0 ; i < my_group_index; i++){ + knt += list_connected[i]; + } + + /* send the data to my proxy */ + rc = MCA_PML_CALL(isend((void *) ( (unsigned char *) data_buffer + + knt*pack_len), + pack_len * list_connected[my_group_index], + MPI_BYTE, + comm_dst, tag, + MCA_PML_BASE_SEND_STANDARD, comm, + &(requests[*active_requests]))); + + if( OMPI_SUCCESS != rc ) { + PTPCOLL_VERBOSE(10,("Failed to isend data")); + return OMPI_ERROR; + } + ++(*active_requests); + + /* now I go ahead and post the receive from my proxy */ + comm_src = comm_dst; + knt = 0; + for( i =0; i < group_size; i++){ + knt += list_connected[i]; + } + rc = MCA_PML_CALL(irecv(data_buffer, + knt * pack_len, + MPI_BYTE, + comm_src, + tag , comm, &(requests[*active_requests]))); + if( OMPI_SUCCESS != rc ) { + PTPCOLL_VERBOSE(10, ("Failed to post ireceive ")); + return OMPI_ERROR; + } + + ++(*active_requests); + /* poll for completion */ + /* this polls internally */ + completed = mca_bcol_ptpcoll_test_all_for_match(active_requests, requests, &rc); + if(completed){ + /* go to buffer release */ + goto FINISHED; + }else{ + /* save state and hop out + * nothing to save here + */ + return ((OMPI_SUCCESS != rc) ? OMPI_ERROR : BCOL_FN_STARTED); + } + }else if ( 0 < exchange_node->n_extra_sources ) { + + /* I am a proxy for someone */ + src = exchange_node->rank_extra_sources_array[0]; + /* find the rank in the communicator */ + comm_src = group_list[src]; + knt = 0; + for(i = 0; i < src; i++){ + knt += list_connected[i]; + } + /* post the receive */ + rc = MCA_PML_CALL(irecv((void *) ( (unsigned char *) data_buffer + + knt*pack_len), + pack_len * list_connected[src], + MPI_BYTE, + comm_src, + tag , comm, &(requests[*active_requests]))); + if( OMPI_SUCCESS != rc ) { + PTPCOLL_VERBOSE(10, ("Failed to post ireceive ")); + return OMPI_ERROR; + } + + ++(*active_requests); + /* poll for completion */ + /* this routine polls internally */ + completed = mca_bcol_ptpcoll_test_all_for_match(active_requests, requests, &rc); + if(!completed){ + /* save state and hop out + * We really do need to block here so set + * the iteration to -1 indicating we need to + * finish this part first + */ + *iteration = -1; + return ((OMPI_SUCCESS != rc )? 
OMPI_ERROR : BCOL_FN_STARTED); + } + + } + + /* we start the recursive k - ing phase */ + /* fprintf(stderr,"tree order %d pow_k %d \n",tree_order,pow_k);*/ + for( i = 0; i < pow_k; i++) { + for(j = 0; j < (tree_order - 1); j++) { + + /* send phase */ + dst = exchange_node->rank_exchanges[i][j]; + if( dst < 0 ){ + continue; + } + comm_dst = group_list[dst]; + send_offset = exchange_node->payload_info[i][j].s_offset * pack_len; + send_len = exchange_node->payload_info[i][j].s_len * pack_len; + /* debug print */ + /* fprintf(stderr,"sending %d bytes to rank %d at offset %d\n",send_len, */ + /* comm_dst,send_offset); */ + rc = MCA_PML_CALL(isend((void*)((unsigned char *) data_buffer + + send_offset), + send_len, + MPI_BYTE, + comm_dst, tag, + MCA_PML_BASE_SEND_STANDARD, comm, + &(requests[*active_requests]))); + + if( OMPI_SUCCESS != rc ) { + PTPCOLL_VERBOSE(10,("Failed to isend data")); + return OMPI_ERROR; + } + ++(*active_requests); + + /* sends are posted */ + } + + /* Now post the recv's */ + for( j = 0; j < (tree_order - 1); j++ ) { + + /* recv phase */ + src = exchange_node->rank_exchanges[i][j]; + if( src < 0 ) { + continue; + } + comm_src = group_list[src]; + recv_offset = exchange_node->payload_info[i][j].r_offset * pack_len; + recv_len = exchange_node->payload_info[i][j].r_len * pack_len; + /* debug print */ + /* fprintf(stderr,"recving %d bytes to rank %d at offset %d\n",recv_len, */ + /* comm_src,recv_offset); */ + /* post the receive */ + rc = MCA_PML_CALL(irecv((void *) ((unsigned char *) data_buffer + + recv_offset), + recv_len, + MPI_BYTE, + comm_src, + tag, comm, &(requests[*active_requests]))); + if( OMPI_SUCCESS != rc ) { + PTPCOLL_VERBOSE(10, ("Failed to post ireceive ")); + return OMPI_ERROR; + } + + ++(*active_requests); + } + /* finished all send/recv's now poll for completion before + * continuing to next iteration + */ + completed = 0; + /* polling internally on 2*(k - 1) requests */ + completed = mca_bcol_ptpcoll_test_all_for_match(active_requests, requests, &rc); + + if(!completed){ + /* save state and hop out + * only the iteration needs to be tracked + */ + *iteration = i; /* need to pick up here */ + + return ((OMPI_SUCCESS != rc) ? OMPI_ERROR : BCOL_FN_STARTED); + } + } + + /* finish off the last piece, send the data back to the extra */ + if( 0 < exchange_node->n_extra_sources ) { + dst = exchange_node->rank_extra_sources_array[0]; + comm_dst = group_list[dst]; + knt = 0; + for( i = 0; i < group_size; i++){ + knt += list_connected[i]; + } + /* debug print */ + /* + fprintf(stderr,"sending %d bytes to extra %d \n",pack_len*knt,comm_dst); + */ + rc = MCA_PML_CALL(isend(data_buffer, + pack_len * knt, + MPI_BYTE, + comm_dst, tag, + MCA_PML_BASE_SEND_STANDARD, comm, + &(requests[*active_requests]))); + + if( OMPI_SUCCESS != rc ) { + PTPCOLL_VERBOSE(10,("Failed to isend data")); + return OMPI_ERROR; + } + ++(*active_requests); + + /* probe for send completion */ + completed = 0; + /* polling internally */ + completed = mca_bcol_ptpcoll_test_all_for_match(active_requests, requests, &rc); + if(!completed){ + /* save state and hop out + * We really do need to block here so set + * the iteration to pow_k +1 indicating we need to + * finish progressing the last part + */ + *iteration = pow_k + 1; + + return (OMPI_SUCCESS != rc ? 
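+ /* The saved iteration value encodes where to resume: -1 means the proxy is
+  * still waiting on the extra rank's data, 0 .. pow_k-1 name the recursive
+  * k-ing step to continue from, and pow_k + 1 means only the final send back
+  * to the extra rank is outstanding. */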
OMPI_ERROR : BCOL_FN_STARTED); + } + } + +FINISHED: + /* recycle buffer if need be */ + return BCOL_FN_COMPLETE; +} + +/* allgather progress function */ + +int bcol_ptpcoll_k_nomial_allgather_progress(bcol_function_args_t *input_args, + struct coll_ml_function_t *const_args) +{ + + + /* local variables */ + + mca_bcol_ptpcoll_module_t *ptpcoll_module = (mca_bcol_ptpcoll_module_t *) const_args->bcol_module; + int *group_list = ptpcoll_module->super.sbgp_partner_module->group_list; + netpatterns_k_exchange_node_t *exchange_node = &ptpcoll_module->knomial_allgather_tree; + int group_size = ptpcoll_module->group_size; + int *list_connected = ptpcoll_module->super.list_n_connected; /* critical for hierarchical colls */ + + + int tag; + int i, j; + int knt; + int comm_src, comm_dst, src, dst; + int recv_offset, recv_len; + int send_offset, send_len; + uint32_t buffer_index = input_args->buffer_index; + + int pow_k, tree_order; + int rc = OMPI_SUCCESS; + ompi_communicator_t* comm = ptpcoll_module->super.sbgp_partner_module->group_comm; + ompi_request_t **requests = + ptpcoll_module->ml_mem.ml_buf_desc[buffer_index].requests; + int *active_requests = + &(ptpcoll_module->ml_mem.ml_buf_desc[buffer_index].active_requests); + int completed = 0; /* initialized */ + void *data_buffer = (void*)( + (unsigned char *) input_args->sbuf + + (size_t) input_args->sbuf_offset); + int pack_len = input_args->count * input_args->dtype->super.size; + /* initialize the counter */ + int *iteration = &ptpcoll_module->ml_mem.ml_buf_desc[buffer_index].iteration; + + +#if 0 + fprintf(stderr,"%d: entering p2p allgather progress AR: %d iter: %d\n",my_group_index,*active_requests, + *iteration); +#endif + /* keep tag within the limit supported by the pml */ + tag = (PTPCOLL_TAG_OFFSET + input_args->sequence_num * PTPCOLL_TAG_FACTOR) & (ptpcoll_module->tag_mask); + /* mark this as a collective tag, to avoid conflict with user-level flags */ + tag = -tag; + + /* k-nomial tree parameters */ + tree_order = exchange_node->tree_order; + pow_k = exchange_node->log_tree_order; + + /* let's begin the collective, starting with extra ranks and their + * respective proxies + */ + if( EXTRA_NODE == exchange_node->node_type ) { + + /* debug print */ + /*fprintf(stderr,"666 \n");*/ + /* simply poll for completion */ + completed = 0; + /* polling internally */ + completed = mca_bcol_ptpcoll_test_all_for_match(active_requests, requests, &rc); + if(completed){ + /* go to buffer release */ + goto FINISHED; + }else{ + /* save state and hop out + * nothing to save here + */ + return ((OMPI_SUCCESS != rc) ? OMPI_ERROR : BCOL_FN_STARTED); + } + }else if ( 0 < exchange_node->n_extra_sources && (-1 == *iteration)) { + + /* I am a proxy for someone */ + /* Simply poll for completion */ + completed = 0; + /* polling internally */ + assert( 1 == *active_requests); + completed = mca_bcol_ptpcoll_test_all_for_match(active_requests, requests, &rc); + if(!completed){ + /* save state and hop out + * We really do need to block here so set + * the iteration to -1 indicating we need to + * finish this part first + */ + (*iteration) = -1; + return ((OMPI_SUCCESS != rc) ? 
OMPI_ERROR : BCOL_FN_STARTED); + } + /* I may now proceed to the recursive k - ing phase */ + *iteration = 0; + } + + + /* the ordering here between the extra rank and progress active requests + * is critical + */ + /* extra rank */ + if( (pow_k + 1) == *iteration ){ + /* finish off the last one */ + goto PROGRESS_EXTRA; + } + + /* active requests must be completed before continuing on to + * recursive k -ing step + * CAREFUL HERE, IT THIS REALLY WHAT YOU WANT?? + */ + if( 0 < (*active_requests) ) { + /* then we have something to progress from last step */ + /* debug print */ + /* + fprintf(stderr,"%d: entering progress AR: %d iter: %d\n",my_group_index,*active_requests, + *iteration); + */ + completed = 0; + completed = mca_bcol_ptpcoll_test_all_for_match(active_requests, requests, &rc); + if(!completed){ + /* save state and hop out + * state hasn't changed + */ + + return ((MPI_SUCCESS != rc) ? OMPI_ERROR : BCOL_FN_STARTED); + } + ++(*iteration); + } + + + + /* we start the recursive k - ing phase */ + for( i = *iteration; i < pow_k; i++) { + /* nothing changes here */ + for(j = 0; j < (tree_order - 1); j++) { + + /* send phase */ + dst = exchange_node->rank_exchanges[i][j]; + if( dst < 0 ){ + continue; + } + comm_dst = group_list[dst]; + send_offset = exchange_node->payload_info[i][j].s_offset * pack_len; + send_len = exchange_node->payload_info[i][j].s_len * pack_len; + rc = MCA_PML_CALL(isend((void*)((unsigned char *) data_buffer + + send_offset), + send_len, + MPI_BYTE, + comm_dst, tag, + MCA_PML_BASE_SEND_STANDARD, comm, + &(requests[*active_requests]))); + + if( OMPI_SUCCESS != rc ) { + PTPCOLL_VERBOSE(10,("Failed to isend data")); + return OMPI_ERROR; + } + ++(*active_requests); + + /* sends are posted */ + } + + /* Now post the recv's */ + for( j = 0; j < (tree_order - 1); j++ ) { + + /* recv phase */ + src = exchange_node->rank_exchanges[i][j]; + if( src < 0 ) { + continue; + } + comm_src = group_list[src]; + recv_offset = exchange_node->payload_info[i][j].r_offset * pack_len; + recv_len = exchange_node->payload_info[i][j].r_len * pack_len; + /* post the receive */ + rc = MCA_PML_CALL(irecv((void *) ((unsigned char *) data_buffer + + recv_offset), + recv_len, + MPI_BYTE, + comm_src, + tag, comm, &(requests[*active_requests]))); + if( OMPI_SUCCESS != rc ) { + PTPCOLL_VERBOSE(10, ("Failed to post ireceive ")); + return OMPI_ERROR; + } + + ++(*active_requests); + } + /* finished all send/recv's now poll for completion before + * continuing to next iteration + */ + completed = 0; + /* make this non-blocking */ + completed = mca_bcol_ptpcoll_test_all_for_match(active_requests, requests, &rc); + if(!completed){ + /* save state and hop out + * We really do need to block here so set + * the iteration to -1 indicating we need to + * finish this part first + */ + *iteration = i; /* need to pick up here */ + + return ((OMPI_SUCCESS != rc) ? 
OMPI_ERROR : BCOL_FN_STARTED); + } + } + + /* finish off the last piece, send the data back to the extra */ + if( 0 < exchange_node->n_extra_sources ) { + dst = exchange_node->rank_extra_sources_array[0]; + comm_dst = group_list[dst]; + knt = 0; + for( i = 0; i < group_size; i++){ + knt += list_connected[i]; + } + rc = MCA_PML_CALL(isend(data_buffer, + pack_len * knt, + MPI_BYTE, + comm_dst, tag, + MCA_PML_BASE_SEND_STANDARD, comm, + &(requests[*active_requests]))); + + if( OMPI_SUCCESS != rc ) { + PTPCOLL_VERBOSE(10,("Failed to isend data")); + return OMPI_ERROR; + } + ++(*active_requests); + + /* probe for send completion */ + completed = 0; + /* make this non-blocking */ + completed = mca_bcol_ptpcoll_test_all_for_match(active_requests, requests, &rc); + if(!completed){ + /* save state and hop out + * We really do need to block here so set + * the iteration to pow_k +1 indicating we need to + * finish progressing the last part + */ + *iteration = pow_k + 1; + + return ((OMPI_SUCCESS != rc) ? OMPI_ERROR : BCOL_FN_STARTED); + } + } + /* folks need to skip this unless they really are the proxy + * reentering with the intent of progressing the final send + */ + goto FINISHED; + +PROGRESS_EXTRA: + + /* probe for send completion */ + completed = 0; + /* make this non-blocking */ + completed = mca_bcol_ptpcoll_test_all_for_match(active_requests, requests, &rc); + if(!completed){ + /* save state and hop out + * We really do need to block here so set + * the iteration to pow_k +1 indicating we need to + * finish progressing the last part + */ + + return ((OMPI_SUCCESS != rc) ? OMPI_ERROR : BCOL_FN_STARTED); + } + +FINISHED: + /* recycle buffer if need be */ + return BCOL_FN_COMPLETE; +} + +/* + * Register allreduce functions to the BCOL function table, + * so they can be selected + */ +int bcol_ptpcoll_allgather_init(mca_bcol_base_module_t *super) +{ + mca_bcol_base_coll_fn_comm_attributes_t comm_attribs; + mca_bcol_base_coll_fn_invoke_attributes_t inv_attribs; + + comm_attribs.bcoll_type = BCOL_ALLGATHER; + comm_attribs.comm_size_min = 0; + comm_attribs.comm_size_max = 1024 * 1024; + comm_attribs.waiting_semantics = NON_BLOCKING; + + inv_attribs.bcol_msg_min = 0; + inv_attribs.bcol_msg_max = 20000; /* range 1 */ + + inv_attribs.datatype_bitmap = 0xffffffff; + inv_attribs.op_types_bitmap = 0xffffffff; + + comm_attribs.data_src = DATA_SRC_KNOWN; + + mca_bcol_base_set_attributes(super, &comm_attribs, &inv_attribs, + bcol_ptpcoll_k_nomial_allgather_init, + bcol_ptpcoll_k_nomial_allgather_progress); + + + comm_attribs.data_src = DATA_SRC_KNOWN; + inv_attribs.bcol_msg_min = 10000000; + inv_attribs.bcol_msg_max = 10485760; /* range 4 */ + + mca_bcol_base_set_attributes(super, &comm_attribs, &inv_attribs, + bcol_ptpcoll_k_nomial_allgather_init, + bcol_ptpcoll_k_nomial_allgather_progress); + + return OMPI_SUCCESS; +} diff --git a/ompi/mca/bcol/ptpcoll/bcol_ptpcoll_allreduce.c b/ompi/mca/bcol/ptpcoll/bcol_ptpcoll_allreduce.c new file mode 100644 index 0000000000..d0087ea774 --- /dev/null +++ b/ompi/mca/bcol/ptpcoll/bcol_ptpcoll_allreduce.c @@ -0,0 +1,1030 @@ +/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */ +/* + * Copyright (c) 2009-2013 Oak Ridge National Laboratory. All rights reserved. + * Copyright (c) 2009-2012 Mellanox Technologies. All rights reserved. + * Copyright (c) 2013 Los Alamos National Security, LLC. All rights + * reserved. 
+ * $COPYRIGHT$ + * + * Additional copyrights may follow + * + * $HEADER$ + */ + +#include "ompi_config.h" + +#include "ompi/include/ompi/constants.h" +#include "ompi/mca/coll/ml/coll_ml.h" +#include "ompi/mca/bcol/bcol.h" +#include "bcol_ptpcoll_allreduce.h" + +/* + * Recursive K-ing allreduce + */ +static inline int bcol_ptpcoll_allreduce_narray_schedule_extra_node_exchange (mca_bcol_ptpcoll_module_t *ptpcoll_module, netpatterns_k_exchange_node_t *k_node, + void *data_buffer, size_t data_size, ompi_request_t **requests, int *active_requests, + int tag) +{ + ompi_communicator_t *comm = ptpcoll_module->super.sbgp_partner_module->group_comm; + int peer_comm_rank, k, offset, rc; + + if (EXCHANGE_NODE == k_node->node_type) { + /* the send data resides in the first part of the buffer */ + for (k = 0, offset = data_size ; k < k_node->n_extra_sources ; ++k, offset += data_size) { + peer_comm_rank = ptpcoll_module->super.sbgp_partner_module->group_list[k_node->rank_extra_sources_array[k]]; + + PTPCOLL_VERBOSE(10, ("Recv data from %d, addr %p len %d tag %d", + peer_comm_rank, data_buffer, data_size, tag)); + rc = MCA_PML_CALL(irecv((void *)((unsigned char *)data_buffer + offset), + data_size, MPI_BYTE, peer_comm_rank, tag, comm, + &requests[*active_requests])); + if( OMPI_SUCCESS != rc ) { + PTPCOLL_VERBOSE(10, ("Failed to receive data")); + return OMPI_ERROR; + } + + ++(*active_requests); + } + } else { + peer_comm_rank = ptpcoll_module->super.sbgp_partner_module->group_list[k_node->rank_extra_sources_array[0]]; + + PTPCOLL_VERBOSE(10, ("Send data to %d, addr %p len %d tag %d", + peer_comm_rank, data_buffer, data_size, tag)); + + rc = MCA_PML_CALL(isend(data_buffer, data_size, MPI_BYTE, peer_comm_rank, + tag, MCA_PML_BASE_SEND_STANDARD, comm, + &(requests[*active_requests]))); + if( OMPI_SUCCESS != rc ) { + PTPCOLL_VERBOSE(10, ("Failed to send data")); + return OMPI_ERROR; + } + + ++(*active_requests); + } + + return OMPI_SUCCESS; +} + +static inline void bcol_ptpcoll_allreduce_narray_reduce (void *data_buffer, struct ompi_datatype_t *data_type, int count, struct ompi_op_t *op, int sources) +{ + size_t data_size = mca_bcol_base_get_buff_length(data_type, count); + + for (int k = 0, offset = data_size ; k < sources ; ++k, offset += data_size) { + ompi_op_reduce(op, (char *) data_buffer + offset, data_buffer, count, data_type); + } +} + +static int bcol_ptpcoll_allreduce_narraying_progress (bcol_function_args_t *input_args, + struct coll_ml_function_t *const_args) +{ + mca_bcol_ptpcoll_module_t *ptpcoll_module = (mca_bcol_ptpcoll_module_t *) const_args->bcol_module; + void *data_buffer = (void *) ( (unsigned char *) input_args->sbuf + + (size_t) input_args->sbuf_offset); + struct ompi_datatype_t *data_type = input_args->dtype; + uint32_t buffer_index = input_args->buffer_index; + struct ompi_op_t *op = input_args->op; + int count = input_args->count; + int *active_requests = + &(ptpcoll_module->ml_mem.ml_buf_desc[buffer_index].active_requests); + int tag = ptpcoll_module->ml_mem.ml_buf_desc[buffer_index].tag - 1; + int *group_list = ptpcoll_module->super.sbgp_partner_module->group_list; + int k, rc, peer, group_peer; + int offset = 0; + ompi_communicator_t *comm = ptpcoll_module->super.sbgp_partner_module->group_comm; + ompi_request_t **requests = + ptpcoll_module->ml_mem.ml_buf_desc[buffer_index].requests; + + netpatterns_k_exchange_node_t *k_node = &ptpcoll_module->knomial_exchange_tree; + int k_radix = k_node->tree_order; + + size_t data_size = mca_bcol_base_get_buff_length(data_type, count); + 
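/* per-buffer progress state: the iteration counter and the active request count live in the ML buffer descriptor, which is what allows this progress function to be re-entered until the collective completes; iteration == -1 means the extra-rank exchange has not yet finished */ +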
int *iteration = + &(ptpcoll_module->ml_mem.ml_buf_desc[buffer_index].iteration); + + /* if we are just staring the collective and there are extra sources then schedule the + * extra node exchange. otherwise check if the exchange is complete. */ + if (-1 == *iteration) { + if (0 < k_node->n_extra_sources) { + if (!(*active_requests)) { + rc = bcol_ptpcoll_allreduce_narray_schedule_extra_node_exchange (ptpcoll_module, k_node, data_buffer, data_size, + requests, active_requests, tag); + if (OPAL_UNLIKELY(OMPI_SUCCESS != rc)) { + return rc; + } + } + + /* check for extra node exchange completion */ + if (!mca_bcol_ptpcoll_test_all_for_match (active_requests, requests, &rc)) { + return (OMPI_SUCCESS == rc) ? BCOL_FN_STARTED : rc; + } + + if (EXCHANGE_NODE == k_node->node_type) { + bcol_ptpcoll_allreduce_narray_reduce (data_buffer, data_type, count, op, k_node->n_extra_sources); + } + } + + /* start recursive k-ing */ + *iteration = 0; + } + + if (*iteration < k_node->n_exchanges) { + if (*active_requests) { + if (!mca_bcol_ptpcoll_test_all_for_match(active_requests, requests, &rc)) { + return (OMPI_SUCCESS == rc) ? BCOL_FN_STARTED : rc; + } + + ++(*iteration); + bcol_ptpcoll_allreduce_narray_reduce (data_buffer, data_type, count, op, k_radix - 1); + } + } + + for ( ; *iteration < k_node->n_exchanges ; ++(*iteration)) { + for (k = 0; k < k_radix - 1; k++) { + group_peer = k_node->rank_exchanges[*iteration][k]; + + peer = group_list[group_peer]; + + PTPCOLL_VERBOSE(10, ("Send data to %d, addr %p len %d tag %d", + peer, data_buffer, data_size, tag)); + rc = MCA_PML_CALL(isend(data_buffer, data_size, MPI_BYTE, peer, tag, + MCA_PML_BASE_SEND_STANDARD, comm, + &(requests[*active_requests]))); + if( OMPI_SUCCESS != rc ) { + PTPCOLL_VERBOSE(10, ("Failed to send data")); + return OMPI_ERROR; + } + + ++(*active_requests); + } + + for (k = 0, offset = data_size ; k < k_radix - 1 ; ++k, offset += data_size) { + group_peer = k_node->rank_exchanges[*iteration][k]; + peer = group_list[group_peer]; + + PTPCOLL_VERBOSE(10, ("Recv data from %d, addr %p len %d tag %d", + peer, data_buffer, data_size, tag)); + rc = MCA_PML_CALL(irecv((void *)((unsigned char *)data_buffer + offset ), + data_size, MPI_BYTE, peer, tag, comm, + &requests[*active_requests])); + if( OMPI_SUCCESS != rc ) { + PTPCOLL_VERBOSE(10, ("Failed to receive data")); + return OMPI_ERROR; + } + + ++(*active_requests); + } + + if (!mca_bcol_ptpcoll_test_all_for_match(active_requests, requests, &rc)) { + return (OMPI_SUCCESS == rc) ? 
BCOL_FN_STARTED : rc; + } + + bcol_ptpcoll_allreduce_narray_reduce (data_buffer, data_type, count, op, k_radix - 1); + } + + /* ensure extra nodes get the result */ + if (0 < k_node->n_extra_sources) { + if (!(*active_requests)) { + int peer_comm_rank; + + if (EXTRA_NODE == k_node->node_type) { + peer_comm_rank = ptpcoll_module->super.sbgp_partner_module->group_list[k_node->rank_extra_sources_array[0]]; + + PTPCOLL_VERBOSE(10, ("EXTRA_NODE: Recv data from %d, addr %p len %d tag %d", + peer_comm_rank, data_buffer, data_size, tag)); + rc = MCA_PML_CALL(irecv(data_buffer, data_size, MPI_BYTE, peer_comm_rank, + tag, comm, &requests[*active_requests])); + if( OMPI_SUCCESS != rc ) { + PTPCOLL_VERBOSE(10, ("Failed to receive data")); + return OMPI_ERROR; + } + + ++(*active_requests); + } else { + for (k = 0; k < k_node->n_extra_sources; k++) { + peer_comm_rank = ptpcoll_module->super.sbgp_partner_module->group_list[k_node->rank_extra_sources_array[k]]; + + PTPCOLL_VERBOSE(10, ("EXCHANGE_NODE: Send data to %d, addr %p len %d tag %d", + peer_comm_rank, data_buffer, data_size, tag)); + rc = MCA_PML_CALL(isend(data_buffer, data_size, MPI_BYTE, peer_comm_rank, + tag, MCA_PML_BASE_SEND_STANDARD, comm, + &(requests[*active_requests]))); + + if( OMPI_SUCCESS != rc ) { + PTPCOLL_VERBOSE(10, ("Failed to send data")); + return OMPI_ERROR; + } + + ++(*active_requests); + } + } + } + + if (!mca_bcol_ptpcoll_test_all_for_match(active_requests, requests, &rc)) { + return (OMPI_SUCCESS == rc) ? BCOL_FN_STARTED : rc; + } + } + + return BCOL_FN_COMPLETE; +} + +int bcol_ptpcoll_allreduce_narraying_init(bcol_function_args_t *input_args, + struct coll_ml_function_t *const_args){ + + mca_bcol_ptpcoll_module_t *ptpcoll_module = (mca_bcol_ptpcoll_module_t *)const_args->bcol_module; + uint64_t sequence_number = input_args->sequence_num; + uint32_t buffer_index = input_args->buffer_index; + int count = input_args->count; + struct ompi_datatype_t *dtype = input_args->dtype; + size_t buffer_size; + int tag; + + tag = (PTPCOLL_TAG_OFFSET + sequence_number * PTPCOLL_TAG_FACTOR) & (ptpcoll_module->tag_mask); + ptpcoll_module->ml_mem.ml_buf_desc[buffer_index].tag = tag = -tag; + ptpcoll_module->ml_mem.ml_buf_desc[buffer_index].radix_mask = 1; + + /* start with extra node exchange if needed */ + ptpcoll_module->ml_mem.ml_buf_desc[buffer_index].iteration = -1; + ptpcoll_module->ml_mem.ml_buf_desc[buffer_index].active_requests = 0; + ptpcoll_module->ml_mem.ml_buf_desc[buffer_index].status = PTPCOLL_NOT_STARTED; + + /* + * ML bufer is segmented into k segments and each of the k segment is used + * for reductions + */ + /* This has to be based on ml buffer size. Need to take into account the space used + * by the headers of other bcol modules. */ + buffer_size = ptpcoll_module->ml_mem.size_buffer - BCOL_HEADER_MAX; + assert(buffer_size >= count * dtype->super.size * + ptpcoll_module->k_nomial_radix); + + return bcol_ptpcoll_allreduce_narraying_progress (input_args, const_args); +} + +static inline int compute_seg_index(int peer, int kpow_num, int tree_order) { + + int peer_base, peer_position, peer_base_rank, peer_index; + + peer_base = peer / (kpow_num * tree_order); + peer_base_rank = peer_base * kpow_num * tree_order ; + peer_position = peer_base_rank == 0 ? 
peer : peer % (peer_base_rank); + peer_index = peer_position / kpow_num ; + + return peer_index; +} + +int compute_knomial_allgather_offsets(int group_index, int count, struct + ompi_datatype_t *dtype,int k_radix,int n_exchanges, + int **offsets){ + + int modulo_group_size; + size_t seg_count, seg_size, seg_index, seg_offset; + size_t block_offset, block_count; + int exchange_step; + ptrdiff_t lb, extent; + + if (0 >= n_exchanges) { + PTPCOLL_VERBOSE(10,("Nothing to initialize ")); + return 0; + } + modulo_group_size = 1; + seg_count = count / k_radix; + ompi_datatype_get_extent(dtype, &lb, &extent); + seg_size = seg_count * extent; + + seg_index = group_index % k_radix; + seg_offset = seg_index * seg_size; + + offsets[0][BLOCK_OFFSET] = block_offset = 0; + offsets[0][BLOCK_COUNT] = block_count = count; + offsets[0][LOCAL_REDUCE_SEG_OFFSET] = seg_offset; + offsets[0][SEG_SIZE] = seg_size; + + + for(exchange_step = 1; exchange_step < n_exchanges; exchange_step++) { + + /* Previous step's segment is this exchange step's block */ + block_count = seg_count; + block_offset = seg_offset; + + /* Divide the segment into k parts */ + seg_count = seg_count / k_radix; + seg_size = seg_count * extent; + + /* Among different segments in block, which segment should I reduce ? */ + /* For allgather phase, I will not send out this segment to peers */ + modulo_group_size *= k_radix; + seg_index = compute_seg_index(group_index, modulo_group_size, k_radix); + seg_offset = seg_index * seg_size; + + + offsets[exchange_step][BLOCK_OFFSET] = block_offset; + offsets[exchange_step][LOCAL_REDUCE_SEG_OFFSET] = seg_offset; + offsets[exchange_step][BLOCK_COUNT] = block_count; + offsets[exchange_step][SEG_SIZE] = seg_size; + + /* Change to absolute offset */ + seg_offset = block_offset + seg_offset; + + } + + return 0; +} + +static inline int compute_send_segment_size(int block_offset, + int send_offset, + int segment_size, + int padded_offset) { + int send_size = -1; + /* segment to be sent starts here */ + int segment_offset = block_offset + send_offset ; + send_size = (segment_offset + segment_size) >= padded_offset ? + segment_size - (segment_offset + segment_size - padded_offset) : segment_size; + return send_size; +} + +static inline int compute_recv_segment_size(int block_offset, + int recv_offset, + int segment_size, + int padded_offset) { + int recv_size = -1; + /* segment to be sent starts here */ + int segment_offset = block_offset + recv_offset ; + recv_size = (segment_offset + segment_size) >= padded_offset ? 
+ segment_size - (segment_offset + segment_size - padded_offset) : segment_size; + + return recv_size; +} + +/* + * + * K-nomial Reduce Scatter + * Example k=3 n=9 + * + * | ABCDEFGH |0| + * + * Number of exchange steps = log_k (n) + * Number of steps in an exchange step = k (radix) + * + * block_size = Size of data that is reduced in an exchange step + * segment_size = Size of data that is sent or received by a rank in a radix step + * + * block_size = segment_size * k + * + * my_block_start_addr = Address of the segment in the block where I reference my + * offsets + * + * This is version 1: Experimenting with decoupling offset calculations + */ +int bcol_ptpcoll_allreduce_recursivek_scatter_reduce(mca_bcol_ptpcoll_module_t *ptpcoll_module, + const int buffer_index, void *sbuf, + void *rbuf, + struct ompi_op_t *op, + const int count, struct ompi_datatype_t *dtype, + const int relative_group_index, + const int padded_start_byte){ + int blocks_in_step = + ptpcoll_module->ml_mem.ml_buf_desc[buffer_index].radix_mask; + int tag = ptpcoll_module->ml_mem.ml_buf_desc[buffer_index].tag - 1; + int *group_list = ptpcoll_module->super.sbgp_partner_module->group_list; + netpatterns_k_exchange_node_t *k_node = &ptpcoll_module->knomial_exchange_tree; + mca_bcol_ptpcoll_component_t *cm = + &mca_bcol_ptpcoll_component; + void *my_block_start_addr = NULL, *my_block_addr = NULL; + int i, k, group_peer, peer ; + int k_radix = k_node->tree_order; + int rc = OMPI_SUCCESS; + ompi_communicator_t* comm = ptpcoll_module->super.sbgp_partner_module->group_comm; + ompi_request_t **requests = + ptpcoll_module->ml_mem.ml_buf_desc[buffer_index].requests; + int *active_requests = + &(ptpcoll_module->ml_mem.ml_buf_desc[buffer_index].active_requests); + int completed; + void *my_recv_start_addr, *my_recv_addr; + size_t block_offset, reduce_seg_offset, send_offset, recv_offset; + int seg_size, block_size; + int block_count, seg_count; + ptrdiff_t lb, extent; + ompi_datatype_get_extent(dtype, &lb, &extent); + + my_recv_start_addr = rbuf; + my_block_start_addr = sbuf; + block_count = count; + block_size = count * extent; + + + for (i = ptpcoll_module->ml_mem.ml_buf_desc[buffer_index].iteration; + i < k_node->n_exchanges; i++, blocks_in_step *= cm->narray_knomial_radix) { + + block_offset = ptpcoll_module->allgather_offsets[i][BLOCK_OFFSET]; + reduce_seg_offset = ptpcoll_module->allgather_offsets[i][LOCAL_REDUCE_SEG_OFFSET]; + block_count = ptpcoll_module->allgather_offsets[i][BLOCK_COUNT]; + seg_size = ptpcoll_module->allgather_offsets[i][SEG_SIZE]; + block_size = block_count * extent; + + PTPCOLL_VERBOSE(10,("Block offset %d, reduce_seg_offset %d, block_count %d seg_size %d", + block_offset, reduce_seg_offset, block_count, seg_size)); + + seg_count = block_count / k_radix; + my_block_addr = (void*)((char*)my_block_start_addr + block_offset); + my_recv_addr = (void*)((char*)my_recv_start_addr + block_offset); + + for (k = 0; k < k_radix - 1; k++) { + size_t soffset; + int snd_size = 0; + + group_peer = k_node->rank_exchanges[i][k]; + peer = group_list[group_peer]; + + send_offset = reduce_seg_offset + (seg_size * (k + 1)); + + if ((int)send_offset + seg_size > block_size) { + send_offset = send_offset % block_size; + } + + PTPCOLL_VERBOSE(10, ("Send data to %d, send offset %d len %d", + peer, send_offset, seg_size)); + + soffset = send_offset; + snd_size = + compute_send_segment_size((int)block_offset,(int)soffset,(int)seg_size,padded_start_byte); + + if (snd_size > 0) { + rc = MCA_PML_CALL(isend((void *)((unsigned char
*)my_block_addr + + soffset), + snd_size, MPI_BYTE, + peer, tag, MCA_PML_BASE_SEND_STANDARD, comm, + &(requests[*active_requests]))); + + if( OMPI_SUCCESS != rc ) { + PTPCOLL_VERBOSE(10, ("Failed to send the segment to %d", peer)); + return OMPI_ERROR; + } + ++(*active_requests); + } + + } + + /* + * Receive the segments to tmp addr and then do a reduction + */ + for (k = 0; k < k_radix - 1; k++) { + int recv_size=0; + + group_peer = k_node->rank_exchanges[i][k]; + peer = group_list[group_peer]; + + recv_offset = reduce_seg_offset + (seg_size * (k+1)); + + if ((int)recv_offset + seg_size > block_size) { + recv_offset = recv_offset % block_size; + } + + PTPCOLL_VERBOSE(10, ("Receive data to receive buffer at offset %d\n", + recv_offset)); + recv_size = compute_recv_segment_size((int)block_offset, + (int)reduce_seg_offset, (int)seg_size, + padded_start_byte); + + if (recv_size > 0 ) { + rc = MCA_PML_CALL(irecv((void *)((unsigned char *) + my_recv_addr + recv_offset), + recv_size, MPI_BYTE, + peer, tag, comm, &requests[*active_requests])); + if( OMPI_SUCCESS != rc ) { + PTPCOLL_VERBOSE(10, ("Failed to receive the segment from %d", peer)); + return OMPI_ERROR; + } + ++(*active_requests); + } + + } + + completed = 0; + while(!completed){ + completed = mca_bcol_ptpcoll_test_all_for_match(active_requests, requests, &rc); + } + + /* Do a reduction on received buffers */ + { + void *src_data_buffer = NULL, *dst_data_buffer = NULL; + int reduce_data_count = 0; + + src_data_buffer = my_block_addr; + dst_data_buffer = my_recv_addr; + + for (k = 0; k < k_radix - 1; k++) { + recv_offset = reduce_seg_offset + (seg_size * (k+1)); + + if ((int)recv_offset + seg_size > block_size) { + recv_offset = recv_offset % block_size; + } + + reduce_data_count = (int)(block_offset + reduce_seg_offset) + seg_size >= padded_start_byte ? 
+ (seg_size - (((int)(block_offset + reduce_seg_offset) + seg_size) - padded_start_byte))/(int)dtype->super.size + : (int)seg_count; + + if (reduce_data_count > 0) { + ompi_3buff_op_reduce(op, + (void*)((unsigned char*)my_recv_addr + recv_offset), + (void*)((unsigned char*)src_data_buffer + + reduce_seg_offset), + (void*)((unsigned char*)dst_data_buffer + + reduce_seg_offset), + reduce_data_count,dtype); + } + + src_data_buffer = dst_data_buffer; + + } + } + + /* After first iteration we have data (to work with) in recv buffer */ + my_block_start_addr = rbuf; + + } + + return rc; +} + + +int bcol_ptpcoll_allreduce_knomial_allgather(mca_bcol_ptpcoll_module_t *ptpcoll_module, + const int buffer_index, + void *sbuf,void *rbuf, int count, struct + ompi_datatype_t *dtype, + const int relative_group_index, + const int padded_start_byte){ + + size_t block_offset = 0, send_offset = 0, recv_offset = 0; + int seg_size=0, block_size=0; + int i,k,completed; + void *my_block_start_addr = rbuf, *my_block_addr; + size_t block_count = count; + netpatterns_k_exchange_node_t *k_node = &ptpcoll_module->knomial_exchange_tree; + int k_radix = k_node->tree_order; + int peer, group_peer; + int rc = OMPI_SUCCESS; + int tag = ptpcoll_module->ml_mem.ml_buf_desc[buffer_index].tag - 1; + int *active_requests = + &(ptpcoll_module->ml_mem.ml_buf_desc[buffer_index].active_requests); + ompi_request_t **requests = + ptpcoll_module->ml_mem.ml_buf_desc[buffer_index].requests; + ompi_communicator_t* comm = ptpcoll_module->super.sbgp_partner_module->group_comm; + int exchange_step; + int *group_list = ptpcoll_module->super.sbgp_partner_module->group_list; + ptrdiff_t lb, extent; + ompi_datatype_get_extent(dtype, &lb, &extent); + + + for (i = ptpcoll_module->ml_mem.ml_buf_desc[buffer_index].iteration; + i < k_node->n_exchanges; i++) { + + exchange_step = k_node->n_exchanges - 1 - i; + + block_offset = ptpcoll_module->allgather_offsets[exchange_step][BLOCK_OFFSET]; + send_offset = ptpcoll_module->allgather_offsets[exchange_step][LOCAL_REDUCE_SEG_OFFSET]; + block_count = ptpcoll_module->allgather_offsets[exchange_step][BLOCK_COUNT]; + seg_size = ptpcoll_module->allgather_offsets[exchange_step][SEG_SIZE]; + block_size = block_count * extent; + + + PTPCOLL_VERBOSE(10, ("Send offset %d block_offset %d seg_size %d\n", + send_offset, block_offset, seg_size)); + + my_block_addr = (void*)((unsigned char*)my_block_start_addr + block_offset); + + for (k = 0; k < k_radix - 1; k++) { + size_t soffset=0; int snd_size = 0; + group_peer = k_node->rank_exchanges[exchange_step][k]; + peer = group_list[group_peer]; + + soffset = send_offset; + snd_size = compute_send_segment_size((int)block_offset, + (int)soffset, + (int)seg_size, + padded_start_byte); + if (snd_size > 0) { + rc = MCA_PML_CALL(isend((void *)((unsigned char *)my_block_addr + + soffset), + snd_size, MPI_BYTE, + peer, tag, MCA_PML_BASE_SEND_STANDARD, comm, + &(requests[*active_requests]))); + + if( OMPI_SUCCESS != rc ) { + PTPCOLL_VERBOSE(10, ("Failed to send the segment to %d", peer)); + return OMPI_ERROR; + } + + ++(*active_requests); + } + + PTPCOLL_VERBOSE(10, ("Send data to receive buffer at offset %d to %d\n", + send_offset, peer)); + } + + for (k = 0; k < k_radix - 1; k++) { + int recv_size=0; + + group_peer = k_node->rank_exchanges[exchange_step][k]; + peer = group_list[group_peer]; + + recv_offset = send_offset + (k + 1) * seg_size; + + if ((int)recv_offset + seg_size > block_size){ + recv_offset = recv_offset % block_size; + } + + PTPCOLL_VERBOSE(10, ("Receive data to
receive buffer at offset %d from %d\n", + recv_offset, peer)); + + + recv_size = compute_recv_segment_size((int)block_offset, + (int)recv_offset, + (int)seg_size, + padded_start_byte); + if (recv_size > 0) { + rc = MCA_PML_CALL(irecv((void *)((unsigned char *) + my_block_addr + recv_offset), + recv_size, MPI_BYTE, + peer, tag, comm, &requests[*active_requests])); + + if( OMPI_SUCCESS != rc ) { + PTPCOLL_VERBOSE(10, ("Failed to receive the segment from %d", peer)); + return OMPI_ERROR; + } + ++(*active_requests); + } + + } + + completed = 0; + while(!completed){ + completed = mca_bcol_ptpcoll_test_all_for_match(active_requests, requests, &rc); + } + + block_count = block_count * k_radix; + block_size = block_count * extent; + + } + + return rc; + +} + +static inline int compute_padding_count(int count, int k_radix, int n_exchanges){ + bool fpadding = false; + size_t dsize; + int i, pad_count=0, kpow; + + /* is padding required */ + dsize = count; + kpow = 1; + for ( i=0; i < n_exchanges; i++) { + if (dsize % k_radix) { + fpadding = true; + } + dsize /= k_radix; + kpow *= k_radix; + } + + if (fpadding) { + pad_count = count % kpow; + pad_count = kpow - pad_count; + } + + return pad_count; +} + + +int bcol_ptpcoll_allreduce_recursivek_scatter_reduce_allgather_init(bcol_function_args_t *input_args, + struct coll_ml_function_t *const_args){ + + mca_bcol_ptpcoll_module_t *ptpcoll_module = (mca_bcol_ptpcoll_module_t *)const_args->bcol_module; + struct ompi_op_t *op = input_args->op; + int tag; + int my_group_index = ptpcoll_module->super.sbgp_partner_module->my_index; + uint64_t sequence_number = input_args->sequence_num; + uint32_t buffer_index = input_args->buffer_index; + void *src_buffer = (void *) ( + (unsigned char *)input_args->sbuf + + (size_t)input_args->sbuf_offset); + + void *recv_buffer = (void *) ( + (unsigned char *)input_args->rbuf + + (size_t)input_args->rbuf_offset); + + int count = input_args->count; + struct ompi_datatype_t *dtype = input_args->dtype; + int *iteration = + &(ptpcoll_module->ml_mem.ml_buf_desc[buffer_index].iteration); + int *active_requests = + &(ptpcoll_module->ml_mem.ml_buf_desc[buffer_index].active_requests); + int *status = + &(ptpcoll_module->ml_mem.ml_buf_desc[buffer_index].status); + ptrdiff_t lb, extent; + + /* Get the knomial tree */ + netpatterns_k_exchange_node_t *k_node = &ptpcoll_module->knomial_exchange_tree; + int k_radix = k_node->tree_order; + int n_exchanges = k_node->n_exchanges; + int padded_start_byte; + int padding_count = compute_padding_count(count, k_radix, n_exchanges); + + ompi_datatype_get_extent(dtype, &lb, &extent); + padded_start_byte = count * extent; + + + /* Init for making the functions Re-entrant */ + tag = (PTPCOLL_TAG_OFFSET + sequence_number * PTPCOLL_TAG_FACTOR) & (ptpcoll_module->tag_mask); + ptpcoll_module->ml_mem.ml_buf_desc[buffer_index].tag = tag = -tag; + ptpcoll_module->ml_mem.ml_buf_desc[buffer_index].radix_mask = 1; + *active_requests = 0; + *iteration = -1; + *status = PTPCOLL_NOT_STARTED; + *iteration = 0; + + compute_knomial_allgather_offsets(my_group_index,count + padding_count, dtype,k_radix,n_exchanges, + ptpcoll_module->allgather_offsets); + + /* Perform a recursive k'ing reduce scatter */ + bcol_ptpcoll_allreduce_recursivek_scatter_reduce(ptpcoll_module, buffer_index, + src_buffer, recv_buffer, op, count + padding_count, dtype, + my_group_index,padded_start_byte); + + + /* Perform a recursive k'ing allgather */ + bcol_ptpcoll_allreduce_knomial_allgather(ptpcoll_module, + buffer_index, + src_buffer, 
recv_buffer, count + padding_count, dtype, + my_group_index, padded_start_byte); + + return BCOL_FN_COMPLETE; +} + +int bcol_ptpcoll_allreduce_recursivek_scatter_reduce_extra(mca_bcol_ptpcoll_module_t *ptpcoll_module, + int buffer_index, + void *sbuf, + void *rbuf, + struct ompi_op_t *op, + const int count, struct ompi_datatype_t *dtype){ + int tag = ptpcoll_module->ml_mem.ml_buf_desc[buffer_index].tag - 1; + netpatterns_k_exchange_node_t *k_node = &ptpcoll_module->knomial_exchange_tree; + int k, peer ; + int rc = OMPI_SUCCESS; + ompi_communicator_t* comm = ptpcoll_module->super.sbgp_partner_module->group_comm; + int block_count, block_size; + char *tmprecv_buffer = NULL, *data_src_buffer, *data_dst_buffer; + ptrdiff_t lb, extent; + ompi_datatype_get_extent(dtype, &lb, &extent); + + block_count = count; + block_size = count * extent; + + + if (0 < block_size) { + tmprecv_buffer = (void*)malloc(block_size); + } + + data_src_buffer = sbuf; + data_dst_buffer = rbuf; + + if (EXCHANGE_NODE == k_node->node_type) { + for (k = 0; k < k_node->n_extra_sources; k++){ + + peer = ptpcoll_module->super.sbgp_partner_module->group_list[ + k_node->rank_extra_sources_array[k]]; + + rc = MCA_PML_CALL(recv((void *)((unsigned char *)tmprecv_buffer), + block_size, MPI_BYTE, + peer, tag, comm, MPI_STATUS_IGNORE)); + + if( OMPI_SUCCESS != rc ) { + PTPCOLL_VERBOSE(10, ("Failed to receive the segment from %d", peer)); + rc = OMPI_ERROR; + goto clean; + } + + ompi_3buff_op_reduce(op, (void*)((unsigned char*)data_src_buffer), + (void*)((unsigned char*)tmprecv_buffer), + (void*)((unsigned char*)data_dst_buffer), + block_count,dtype); + data_src_buffer = data_dst_buffer; + } + } else { + peer = ptpcoll_module->super.sbgp_partner_module->group_list[ + k_node->rank_extra_sources_array[0]]; + + rc = MCA_PML_CALL(send((void *)((unsigned char *)sbuf), + block_size, MPI_BYTE, + peer, tag, MCA_PML_BASE_SEND_STANDARD, comm)); + + if( OMPI_SUCCESS != rc ) { + PTPCOLL_VERBOSE(10, ("Failed to send data")); + rc = OMPI_ERROR; + goto clean; + } + } + +clean: + if (tmprecv_buffer) { + free(tmprecv_buffer); + } + return rc; +} + +int bcol_ptpcoll_allreduce_knomial_allgather_extra(mca_bcol_ptpcoll_module_t *ptpcoll_module, + int buffer_index, + void *sbuf, + void *rbuf, + const int count, struct ompi_datatype_t *dtype){ + int tag = ptpcoll_module->ml_mem.ml_buf_desc[buffer_index].tag - 1; + netpatterns_k_exchange_node_t *k_node = &ptpcoll_module->knomial_exchange_tree; + int k, peer ; + int rc = OMPI_SUCCESS; + ompi_communicator_t* comm = ptpcoll_module->super.sbgp_partner_module->group_comm; + int block_size, completed; + ompi_request_t **requests = + ptpcoll_module->ml_mem.ml_buf_desc[buffer_index].requests; + int *active_requests = + &(ptpcoll_module->ml_mem.ml_buf_desc[buffer_index].active_requests); + ptrdiff_t lb, extent; + ompi_datatype_get_extent(dtype, &lb, &extent); + + + block_size = count * extent; + + if (EXTRA_NODE == k_node->node_type) { + peer = ptpcoll_module->super.sbgp_partner_module->group_list[ + k_node->rank_extra_sources_array[0]]; + + rc = MCA_PML_CALL(irecv((void *)((unsigned char *)rbuf), + block_size, MPI_BYTE, + peer, tag, comm, &requests[*active_requests])); + if( OMPI_SUCCESS != rc ) { + PTPCOLL_VERBOSE(10, ("Failed to receive data")); + return OMPI_ERROR; + } + + ++(*active_requests); + } else { + for (k = 0; k < k_node->n_extra_sources; k++) { + peer = ptpcoll_module->super.sbgp_partner_module->group_list[ + k_node->rank_extra_sources_array[k]]; + + rc = MCA_PML_CALL(isend((void *)((unsigned char 
*)rbuf), + block_size, MPI_BYTE, + peer, tag, MCA_PML_BASE_SEND_STANDARD, comm, + &(requests[*active_requests]))); + + if( OMPI_SUCCESS != rc ) { + PTPCOLL_VERBOSE(10, ("Failed to send data")); + return OMPI_ERROR; + } + + ++(*active_requests); + } + + } + + completed = 0; + + while(!completed){ + completed = mca_bcol_ptpcoll_test_all_for_match(active_requests, requests, &rc); + } + + return rc; +} + +int bcol_ptpcoll_allreduce_recursivek_scatter_reduce_allgather_extra_init(bcol_function_args_t *input_args, + struct coll_ml_function_t *const_args){ + + mca_bcol_ptpcoll_module_t *ptpcoll_module = (mca_bcol_ptpcoll_module_t *)const_args->bcol_module; + struct ompi_op_t *op = input_args->op; + int tag; + int my_group_index = ptpcoll_module->super.sbgp_partner_module->my_index; + uint64_t sequence_number = input_args->sequence_num; + uint32_t buffer_index = input_args->buffer_index; + void *src_buffer = (void *) ( + (unsigned char *)input_args->sbuf + + (size_t)input_args->sbuf_offset); + + void *recv_buffer = (void *) ( + (unsigned char *)input_args->rbuf + + (size_t)input_args->rbuf_offset); + + int count = input_args->count; + struct ompi_datatype_t *dtype = input_args->dtype; + int *iteration = + &(ptpcoll_module->ml_mem.ml_buf_desc[buffer_index].iteration); + int *active_requests = + &(ptpcoll_module->ml_mem.ml_buf_desc[buffer_index].active_requests); + int *status = + &(ptpcoll_module->ml_mem.ml_buf_desc[buffer_index].status); + ptrdiff_t lb, extent; + /* Get the knomial tree */ + netpatterns_k_exchange_node_t *k_node = &ptpcoll_module->knomial_exchange_tree; + int k_radix = k_node->tree_order; + int n_exchanges = k_node->n_exchanges; + int padded_start_byte; + int padding_count = compute_padding_count(count, k_radix, n_exchanges); + void *tmpsrc_buffer = NULL; + + ompi_datatype_get_extent(dtype, &lb, &extent); + padded_start_byte = count * extent; + + /* Init for making the functions Re-entrant */ + tag = (PTPCOLL_TAG_OFFSET + sequence_number * PTPCOLL_TAG_FACTOR) & (ptpcoll_module->tag_mask); + ptpcoll_module->ml_mem.ml_buf_desc[buffer_index].tag = tag = -tag; + ptpcoll_module->ml_mem.ml_buf_desc[buffer_index].radix_mask = 1; + *active_requests = 0; + *iteration = -1; + *status = PTPCOLL_NOT_STARTED; + *iteration = 0; + + compute_knomial_allgather_offsets(my_group_index,count + padding_count, dtype,k_radix,n_exchanges, + ptpcoll_module->allgather_offsets); + + if (EXCHANGE_NODE == k_node->node_type) { + bcol_ptpcoll_allreduce_recursivek_scatter_reduce_extra(ptpcoll_module, + buffer_index, + src_buffer, recv_buffer, op, count, dtype); + tmpsrc_buffer = src_buffer; + if ( k_node->n_extra_sources > 0){ + tmpsrc_buffer = recv_buffer; + } + bcol_ptpcoll_allreduce_recursivek_scatter_reduce(ptpcoll_module, buffer_index, + tmpsrc_buffer, recv_buffer, op, count + padding_count, dtype, + my_group_index,padded_start_byte); + bcol_ptpcoll_allreduce_knomial_allgather(ptpcoll_module, + buffer_index, + src_buffer, recv_buffer, count + padding_count, dtype, + my_group_index, padded_start_byte); + bcol_ptpcoll_allreduce_knomial_allgather_extra(ptpcoll_module, + buffer_index, + src_buffer, recv_buffer, count, dtype); + + } + else if (EXTRA_NODE == k_node->node_type) { + bcol_ptpcoll_allreduce_recursivek_scatter_reduce_extra(ptpcoll_module, + buffer_index, + src_buffer, recv_buffer, op, count, dtype); + bcol_ptpcoll_allreduce_knomial_allgather_extra(ptpcoll_module, + buffer_index, + src_buffer, recv_buffer, count, dtype); + } + + return BCOL_FN_COMPLETE; +} + + + +/* + * Register allreduce functions to 
the BCOL function table, + * so they can be selected + */ +int bcol_ptpcoll_allreduce_init(mca_bcol_base_module_t *super) +{ + mca_bcol_ptpcoll_module_t *ptpcoll_module = + (mca_bcol_ptpcoll_module_t *) super; + + mca_bcol_base_coll_fn_comm_attributes_t comm_attribs; + mca_bcol_base_coll_fn_invoke_attributes_t inv_attribs; + + comm_attribs.bcoll_type = BCOL_ALLREDUCE; + comm_attribs.comm_size_min = 0; + comm_attribs.comm_size_max = 1024 * 1024; + + /* not an accurate attribute, none of these algorithms + * are non-blocking + */ + comm_attribs.waiting_semantics = NON_BLOCKING; + + inv_attribs.bcol_msg_min = 0; + inv_attribs.bcol_msg_max = 20000; /* range 1 */ + + inv_attribs.datatype_bitmap = 0xffffffff; + inv_attribs.op_types_bitmap = 0xffffffff; + + comm_attribs.data_src = DATA_SRC_KNOWN; + + mca_bcol_base_set_attributes(super, &comm_attribs, &inv_attribs, + bcol_ptpcoll_allreduce_narraying_init, + bcol_ptpcoll_allreduce_narraying_progress); + + inv_attribs.bcol_msg_min = 10000000; + inv_attribs.bcol_msg_max = 10485760; /* range 4 */ + + if (ptpcoll_module->pow_knum == ptpcoll_module->group_size) { + mca_bcol_base_set_attributes(super, &comm_attribs, &inv_attribs, + bcol_ptpcoll_allreduce_recursivek_scatter_reduce_allgather_init, + NULL); + + } else { + + mca_bcol_base_set_attributes(super, &comm_attribs, &inv_attribs, + bcol_ptpcoll_allreduce_recursivek_scatter_reduce_allgather_extra_init, + NULL); + + } + + return OMPI_SUCCESS; +} diff --git a/ompi/mca/bcol/ptpcoll/bcol_ptpcoll_allreduce.h b/ompi/mca/bcol/ptpcoll/bcol_ptpcoll_allreduce.h new file mode 100644 index 0000000000..600bf338ac --- /dev/null +++ b/ompi/mca/bcol/ptpcoll/bcol_ptpcoll_allreduce.h @@ -0,0 +1,95 @@ +/* + * Copyright (c) 2009-2012 Oak Ridge National Laboratory. All rights reserved. + * Copyright (c) 2009-2012 Mellanox Technologies. All rights reserved. 
+ * $COPYRIGHT$ + * + * Additional copyrights may follow + * + * $HEADER$ + */ + +#ifndef MCA_BCOL_PTPCOLL_ALLREDUCE_H +#define MCA_BCOL_PTPCOLL_ALLREDUCE_H + +#include "ompi_config.h" +#include "ompi/op/op.h" +#include "ompi/datatype/ompi_datatype.h" +#include "bcol_ptpcoll.h" +#include "bcol_ptpcoll_utils.h" + +enum { + BLOCK_OFFSET = 0, + LOCAL_REDUCE_SEG_OFFSET, + BLOCK_COUNT, + SEG_SIZE, + NOFFSETS +}; + +BEGIN_C_DECLS +int bcol_ptpcoll_allreduce_narraying(mca_bcol_ptpcoll_module_t *ptpcoll_module, + const int buffer_index, void *data_buffer, + struct ompi_op_t *op, + const int count, struct ompi_datatype_t *dtype, const int + buffer_size, const int relative_group_index); + + +int bcol_ptpcoll_allreduce_narraying_init(bcol_function_args_t *input_args, + struct coll_ml_function_t *const_args); + +int bcol_ptpcoll_allreduce_recursivek_scatter_reduce(mca_bcol_ptpcoll_module_t *ptpcoll_module, + const int buffer_index, void *sbuf, + void *rbuf, + struct ompi_op_t *op, + const int count, struct ompi_datatype_t *dtype, + const int relative_group_index, + const int padded_start_byte); + +int bcol_ptpcoll_allreduce_knomial_allgather(mca_bcol_ptpcoll_module_t *ptpcoll_module, + const int buffer_index, + void *sbuf,void *rbuf, int count, struct + ompi_datatype_t *dtype, + const int relative_group_index, + const int padded_start_byte); + +int bcol_ptpcoll_allreduce_recursivek_scatter_reduce_allgather_init(bcol_function_args_t *input_args, + struct coll_ml_function_t *const_args); + + +int compute_knomial_allgather_offsets(int group_index, int count, struct + ompi_datatype_t *dtype,int k_radix,int n_exchanges, + int **offsets); + + +int bcol_ptpcoll_allreduce_recursivek_scatter_reduce_extra(mca_bcol_ptpcoll_module_t *ptpcoll_module, + int buffer_index, + void *sbuf, + void *rbuf, + struct ompi_op_t *op, + const int count, struct ompi_datatype_t *dtype); + +int bcol_ptpcoll_allreduce_knomial_allgather_extra(mca_bcol_ptpcoll_module_t *ptpcoll_module, + int buffer_index, + void *sbuf, + void *rbuf, + const int count, struct ompi_datatype_t *dtype); + +int bcol_ptpcoll_allreduce_recursivek_scatter_reduce_allgather_extra_init(bcol_function_args_t *input_args, + struct coll_ml_function_t *const_args); + +int bcol_ptpcoll_allreduce_init(mca_bcol_base_module_t *super); + +#if 0 +int knomial_reduce_scatter_offsets(int group_index,int count, struct ompi_datatype_t *dtype, int k_radix, + int n_exchanges, int nth_exchange, size_t *recv_offset, size_t + *block_offset, size_t *block_count, size_t *block_size, size_t + *seg_size); + +int allgather_offsets(int group_index,int count, struct ompi_datatype_t *dtype, int k_radix, + int n_exchanges, int nth_exchange, size_t *send_offset, size_t + *block_offset, size_t *block_count, size_t *block_size, size_t + *seg_size); +#endif + +END_C_DECLS + +#endif diff --git a/ompi/mca/bcol/ptpcoll/bcol_ptpcoll_bcast.h b/ompi/mca/bcol/ptpcoll/bcol_ptpcoll_bcast.h index d8e01ae6d0..6bc9f2d10f 100644 --- a/ompi/mca/bcol/ptpcoll/bcol_ptpcoll_bcast.h +++ b/ompi/mca/bcol/ptpcoll/bcol_ptpcoll_bcast.h @@ -453,7 +453,7 @@ int bcol_ptpcoll_bcast_binomial_probe_and_scatter_anyroot(mca_bcol_ptpcoll_modul int pow2_distance; int my_left_boundary_rank; int my_group_index = ptpcoll_module->super.sbgp_partner_module->my_index; - int group_root_index; + int group_root_index = 0; void *curr_data_buffer = NULL; int tag = ptpcoll_module->ml_mem.ml_buf_desc[buffer_index].tag; diff --git a/ompi/mca/bcol/ptpcoll/bcol_ptpcoll_component.c b/ompi/mca/bcol/ptpcoll/bcol_ptpcoll_component.c index 
450802c71b..147adb9d9a 100644 --- a/ompi/mca/bcol/ptpcoll/bcol_ptpcoll_component.c +++ b/ompi/mca/bcol/ptpcoll/bcol_ptpcoll_component.c @@ -69,6 +69,7 @@ mca_bcol_ptpcoll_component_t mca_bcol_ptpcoll_component = { ptpcoll_open, ptpcoll_close, + .mca_register_component_params = mca_bcol_ptpcoll_register_mca_params }, /* Initialization / querying functions */ @@ -109,14 +110,6 @@ OBJ_CLASS_INSTANCE(mca_bcol_ptpcoll_collreq_t, */ static int ptpcoll_open(void) { - int rc; - - rc = mca_bcol_ptpcoll_register_mca_params(); - if (OMPI_SUCCESS != rc) { - PTPCOLL_VERBOSE(10, ("Failed to register parametres for the component")); - return OMPI_ERROR; - } - return OMPI_SUCCESS; } diff --git a/ompi/mca/bcol/ptpcoll/bcol_ptpcoll_mca.c b/ompi/mca/bcol/ptpcoll/bcol_ptpcoll_mca.c index 6a303c1ab3..ea6dc53dc8 100644 --- a/ompi/mca/bcol/ptpcoll/bcol_ptpcoll_mca.c +++ b/ompi/mca/bcol/ptpcoll/bcol_ptpcoll_mca.c @@ -1,6 +1,9 @@ +/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */ /* * Copyright (c) 2009-2012 Oak Ridge National Laboratory. All rights reserved. * Copyright (c) 2009-2012 Mellanox Technologies. All rights reserved. + * Copyright (c) 2013 Los Alamos National Security, LLC. All rights + * reserved. * $COPYRIGHT$ * * Additional copyrights may follow @@ -141,12 +144,15 @@ int mca_bcol_ptpcoll_register_mca_params(void) CHECK(reg_int("priority", NULL, "PTPCOLL component priority" "(from 0(low) to 90 (high))", 90, &cm->super.priority, 0)); + CHECK(reg_int("verbose", NULL, "Output some verbose PTPCOLL information " "(0 = no output, nonzero = output)", 0, &cm->verbose, REGINT_GE_ZERO)); + CHECK(reg_int("k_nomial_radix", NULL, "The radix of K-Nomial Tree " "(starts from 2)", 2, &cm->k_nomial_radix, REGINT_GE_ONE)); + CHECK(reg_int("narray_radix", NULL, "The radix of Narray Tree " "(starts from 2)", 2, &cm->narray_radix, REGINT_GE_ONE)); @@ -160,9 +166,9 @@ int mca_bcol_ptpcoll_register_mca_params(void) "(starts from 8)", 8, &cm->num_to_probe, REGINT_GE_ONE)); CHECK(reg_int("bcast_small_msg_known_root_alg", NULL, - "Algoritm selection for bcast small messages known root" - "(1 - K-nomial, 2 - N-array)", 1, &cm->bcast_small_messages_known_root_alg, - REGINT_GE_ZERO)); + "Algoritm selection for bcast small messages known root" + "(1 - K-nomial, 2 - N-array)", 1, &cm->bcast_small_messages_known_root_alg, + REGINT_GE_ZERO)); CHECK(reg_int("bcast_large_msg_known_root_alg", NULL, "Algoritm selection for bcast large messages known root" @@ -187,10 +193,5 @@ int mca_bcol_ptpcoll_register_mca_params(void) "User memory can be used by the collective algorithms", 1, &cm->super.can_use_user_buffers)); - CHECK(reg_int("use_brucks_smsg_alltoall_rdma", NULL, - "Use brucks algorithm for smsg alltoall and RDMA semantics 1 = No Temp buffer recycling" - "1 = Alg with no Temp Buffer Recycling (faster), 2 = Alg with temp Buffer Recycling (slower)", - 0, &cm->use_brucks_smsg_alltoall_rdma, 0)); - return ret; } diff --git a/ompi/mca/bcol/ptpcoll/bcol_ptpcoll_module.c b/ompi/mca/bcol/ptpcoll/bcol_ptpcoll_module.c index e2aa7004d4..3d2c453206 100644 --- a/ompi/mca/bcol/ptpcoll/bcol_ptpcoll_module.c +++ b/ompi/mca/bcol/ptpcoll/bcol_ptpcoll_module.c @@ -1,8 +1,9 @@ +/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */ /* - * Copyright (c) 2009-2012 Oak Ridge National Laboratory. All rights reserved. + * Copyright (c) 2009-2013 Oak Ridge National Laboratory. All rights reserved. * Copyright (c) 2009-2012 Mellanox Technologies. All rights reserved. - * Copyright (c) 2012 Los Alamos National Security, LLC. 
- * All rights reserved. + * Copyright (c) 2012-2013 Los Alamos National Security, LLC. All rights + * reserved. * $COPYRIGHT$ * * Additional copyrights may follow @@ -34,17 +35,61 @@ #include "bcol_ptpcoll.h" #include "bcol_ptpcoll_utils.h" #include "bcol_ptpcoll_bcast.h" +#include "bcol_ptpcoll_allreduce.h" +#include "bcol_ptpcoll_reduce.h" #define BCOL_PTP_CACHE_LINE_SIZE 128 /* * Local functions */ +static int alloc_allreduce_offsets_array(mca_bcol_ptpcoll_module_t *ptpcoll_module) +{ + int rc = OMPI_SUCCESS, i = 0; + netpatterns_k_exchange_node_t *k_node = &ptpcoll_module->knomial_exchange_tree; + int n_exchanges = k_node->n_exchanges; + + /* Precalculate the allreduce offsets */ + if (0 < k_node->n_exchanges) { + ptpcoll_module->allgather_offsets = (int **)malloc(n_exchanges * sizeof(int*)); + + if (!ptpcoll_module->allgather_offsets) { + rc = OMPI_ERROR; + return rc; + } + + for (i=0; i < n_exchanges ; i++) { + ptpcoll_module->allgather_offsets[i] = (int *)malloc (sizeof(int) * NOFFSETS); + + if (!ptpcoll_module->allgather_offsets[i]){ + rc = OMPI_ERROR; + return rc; + } + } + } + return rc; +} + +static int free_allreduce_offsets_array(mca_bcol_ptpcoll_module_t *ptpcoll_module) +{ + int rc = OMPI_SUCCESS, i = 0; + netpatterns_k_exchange_node_t *k_node = &ptpcoll_module->knomial_exchange_tree; + int n_exchanges = k_node->n_exchanges; + + if (ptpcoll_module->allgather_offsets) { + for (i=0; i < n_exchanges; i++) { + free (ptpcoll_module->allgather_offsets[i]); + } + } + + free(ptpcoll_module->allgather_offsets); + return rc; +} static void mca_bcol_ptpcoll_module_construct(mca_bcol_ptpcoll_module_t *ptpcoll_module) { - int i; + uint64_t i; /* Pointer to component */ ptpcoll_module->super.bcol_component = (mca_bcol_base_component_t *) &mca_bcol_ptpcoll_component; ptpcoll_module->super.list_n_connected = NULL; @@ -56,7 +101,7 @@ mca_bcol_ptpcoll_module_construct(mca_bcol_ptpcoll_module_t *ptpcoll_module) /* set the upper limit on the tag */ i = 2; ptpcoll_module->tag_mask = 1; - while ( i <= mca_pml.pml_max_tag && i > 0) { + while ( i <= (uint64_t) mca_pml.pml_max_tag && i > 0) { i <<= 1; } ptpcoll_module->ml_mem.ml_buf_desc = NULL; @@ -84,6 +129,9 @@ mca_bcol_ptpcoll_module_destruct(mca_bcol_ptpcoll_module_t *ptpcoll_module) free(ml_mem->ml_buf_desc); } + if (NULL != ptpcoll_module->allgather_offsets) { + free_allreduce_offsets_array(ptpcoll_module); + } if (NULL != ptpcoll_module->narray_node) { for (i = 0; i < ptpcoll_module->group_size; i++) { @@ -111,7 +159,7 @@ OBJ_CLASS_INSTANCE(mca_bcol_ptpcoll_module_t, mca_bcol_ptpcoll_module_destruct); static int init_ml_buf_desc(mca_bcol_ptpcoll_ml_buffer_desc_t **desc, void *base_addr, uint32_t num_banks, - uint32_t num_buffers_per_bank, uint32_t size_buffer, uint32_t header_size, int group_size, int pow_k) + uint32_t num_buffers_per_bank, uint32_t size_buffer, uint32_t header_size, int group_size, int pow_k) { uint32_t i, j, ci; mca_bcol_ptpcoll_ml_buffer_desc_t *tmp_desc = NULL; @@ -124,7 +172,7 @@ static int init_ml_buf_desc(mca_bcol_ptpcoll_ml_buffer_desc_t **desc, void *base *desc = (mca_bcol_ptpcoll_ml_buffer_desc_t *)calloc(num_banks * num_buffers_per_bank, - sizeof(mca_bcol_ptpcoll_ml_buffer_desc_t)); + sizeof(mca_bcol_ptpcoll_ml_buffer_desc_t)); if (NULL == *desc) { PTPCOLL_ERROR(("Failed to allocate memory")); return OMPI_ERROR; @@ -151,6 +199,10 @@ static int init_ml_buf_desc(mca_bcol_ptpcoll_ml_buffer_desc_t **desc, void *base tmp_desc[ci].data_addr = (void *) ((unsigned char*)base_addr + ci * size_buffer + header_size); 
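+ /* note: data_addr skips the per-buffer ML header, so the payload for slot ci starts header_size bytes into its size_buffer chunk */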
PTPCOLL_VERBOSE(10, ("ml memory cache setup %d %d - %p", i, j, tmp_desc[ci].data_addr)); + + /* init reduce implementation flags */ + tmp_desc[ci].reduce_init_called = false; + tmp_desc[ci].reduction_status = 0; } } @@ -160,24 +212,33 @@ static int init_ml_buf_desc(mca_bcol_ptpcoll_ml_buffer_desc_t **desc, void *base static void mca_bcol_ptpcoll_set_small_msg_thresholds(struct mca_bcol_base_module_t *super) { mca_bcol_ptpcoll_module_t *ptpcoll_module = - (mca_bcol_ptpcoll_module_t *) super; + (mca_bcol_ptpcoll_module_t *) super; + mca_bcol_ptpcoll_component_t *cm = &mca_bcol_ptpcoll_component; + + /* Subtract out the maximum header size when calculating the thresholds. This + * will account for the headers used by the basesmuma component. If we do not + * take these headers into account we may overrun our buffer. */ /* Set the Allgather threshold equals to a ML buff size */ super->small_message_thresholds[BCOL_ALLGATHER] = - ptpcoll_module->ml_mem.size_buffer / - ompi_comm_size(ptpcoll_module->super.sbgp_partner_module->group_comm); + (ptpcoll_module->ml_mem.size_buffer - BCOL_HEADER_MAX) / + ompi_comm_size(ptpcoll_module->super.sbgp_partner_module->group_comm); /* Set the Bcast threshold, all Bcast algths have the same threshold */ super->small_message_thresholds[BCOL_BCAST] = - ptpcoll_module->ml_mem.size_buffer; + (ptpcoll_module->ml_mem.size_buffer - BCOL_HEADER_MAX); /* Set the Alltoall threshold, the Ring algth sets some limitation */ super->small_message_thresholds[BCOL_ALLTOALL] = - ptpcoll_module->ml_mem.size_buffer / 2; + (ptpcoll_module->ml_mem.size_buffer - BCOL_HEADER_MAX) / 2; /* Set the Allreduce threshold, NARRAY algth sets some limitation */ super->small_message_thresholds[BCOL_ALLREDUCE] = - ptpcoll_module->ml_mem.size_buffer / ptpcoll_module->k_nomial_radix; + (ptpcoll_module->ml_mem.size_buffer - BCOL_HEADER_MAX) / ptpcoll_module->k_nomial_radix; + + /* Set the Reduce threshold, NARRAY algth sets some limitation */ + super->small_message_thresholds[BCOL_REDUCE] = + (ptpcoll_module->ml_mem.size_buffer - BCOL_HEADER_MAX) / cm->narray_radix; } /* @@ -200,7 +261,7 @@ static int mca_bcol_ptpcoll_cache_ml_memory_info(struct mca_coll_ml_module_t *ml ml_mem->size_buffer = desc->size_buffer; PTPCOLL_VERBOSE(10, ("ML buffer configuration num banks %d num_per_bank %d size %d base addr %p", - desc->num_banks, desc->num_buffers_per_bank, desc->size_buffer, desc->block->base_addr)); + desc->num_banks, desc->num_buffers_per_bank, desc->size_buffer, desc->block->base_addr)); /* pointer to ml level descriptor */ ml_mem->ml_mem_desc = desc; @@ -209,19 +270,19 @@ static int mca_bcol_ptpcoll_cache_ml_memory_info(struct mca_coll_ml_module_t *ml ml_mem->bank_index_for_release = 0; if (OMPI_SUCCESS != init_ml_buf_desc(&ml_mem->ml_buf_desc, - desc->block->base_addr, - ml_mem->num_banks, - ml_mem->num_buffers_per_bank, - ml_mem->size_buffer, - ml_module->data_offset, - group_size, - ptpcoll_module->pow_k)) { + desc->block->base_addr, + ml_mem->num_banks, + ml_mem->num_buffers_per_bank, + ml_mem->size_buffer, + ml_module->data_offset, + group_size, + ptpcoll_module->pow_k)) { PTPCOLL_VERBOSE(10, ("Failed to allocate rdma memory descriptor\n")); return OMPI_ERROR; } PTPCOLL_VERBOSE(10, ("ml_module = %p, ptpcoll_module = %p, ml_mem_desc = %p.\n", - ml_module, ptpcoll_module, ml_mem->ml_mem_desc)); + ml_module, ptpcoll_module, ml_mem->ml_mem_desc)); return OMPI_SUCCESS; } @@ -244,11 +305,12 @@ static void load_func(mca_bcol_ptpcoll_module_t *ptpcoll_module) 
ptpcoll_module->super.bcol_function_init_table[BCOL_BARRIER] = bcol_ptpcoll_barrier_init; ptpcoll_module->super.bcol_function_init_table[BCOL_BCAST] = bcol_ptpcoll_bcast_init; - ptpcoll_module->super.bcol_function_init_table[BCOL_ALLREDUCE] = NULL; - ptpcoll_module->super.bcol_function_init_table[BCOL_ALLGATHER] = NULL; + ptpcoll_module->super.bcol_function_init_table[BCOL_ALLREDUCE] = bcol_ptpcoll_allreduce_init; + ptpcoll_module->super.bcol_function_init_table[BCOL_ALLGATHER] = bcol_ptpcoll_allgather_init; ptpcoll_module->super.bcol_function_table[BCOL_BCAST] = bcol_ptpcoll_bcast_k_nomial_anyroot; ptpcoll_module->super.bcol_function_init_table[BCOL_ALLTOALL] = NULL; ptpcoll_module->super.bcol_function_init_table[BCOL_SYNC] = mca_bcol_ptpcoll_memsync_init; + ptpcoll_module->super.bcol_function_init_table[BCOL_REDUCE] = bcol_ptpcoll_reduce_init; /* ML memory cacher */ ptpcoll_module->super.bcol_memory_init = mca_bcol_ptpcoll_cache_ml_memory_info; @@ -262,16 +324,17 @@ static void load_func(mca_bcol_ptpcoll_module_t *ptpcoll_module) int mca_bcol_ptpcoll_setup_knomial_tree(mca_bcol_base_module_t *super) { - mca_bcol_ptpcoll_module_t *p2p_module = (mca_bcol_ptpcoll_module_t *) super; - int rc = 0; - rc = netpatterns_setup_recursive_knomial_allgather_tree_node( - p2p_module->super.sbgp_partner_module->group_size, - p2p_module->super.sbgp_partner_module->my_index, - mca_bcol_ptpcoll_component.k_nomial_radix, - super->list_n_connected, - &p2p_module->knomial_allgather_tree); + mca_bcol_ptpcoll_module_t *p2p_module = (mca_bcol_ptpcoll_module_t *) super; + int rc = 0; - return rc; + rc = netpatterns_setup_recursive_knomial_allgather_tree_node( + p2p_module->super.sbgp_partner_module->group_size, + p2p_module->super.sbgp_partner_module->my_index, + mca_bcol_ptpcoll_component.k_nomial_radix, + super->list_n_connected, + &p2p_module->knomial_allgather_tree); + + return rc; } /* The function used to calculate size */ @@ -301,9 +364,9 @@ static int load_narray_knomial_tree (mca_bcol_ptpcoll_module_t *ptpcoll_module) mca_bcol_ptpcoll_component_t *cm = &mca_bcol_ptpcoll_component; ptpcoll_module->full_narray_tree_size = calc_full_tree_size( - cm->narray_knomial_radix, - ptpcoll_module->group_size, - &ptpcoll_module->full_narray_tree_num_leafs); + cm->narray_knomial_radix, + ptpcoll_module->group_size, + &ptpcoll_module->full_narray_tree_num_leafs); ptpcoll_module->narray_knomial_proxy_extra_index = (int *) malloc(sizeof(int) * (cm->narray_knomial_radix)); @@ -313,21 +376,21 @@ static int load_narray_knomial_tree (mca_bcol_ptpcoll_module_t *ptpcoll_module) } ptpcoll_module->narray_knomial_node = calloc( - ptpcoll_module->full_narray_tree_size, - sizeof(netpatterns_narray_knomial_tree_node_t)); + ptpcoll_module->full_narray_tree_size, + sizeof(netpatterns_narray_knomial_tree_node_t)); if(NULL == ptpcoll_module->narray_knomial_node) { goto Error; } PTPCOLL_VERBOSE(10 ,("My type is proxy, full tree size = %d [%d]", - ptpcoll_module->full_narray_tree_size, - cm->narray_knomial_radix - )); + ptpcoll_module->full_narray_tree_size, + cm->narray_knomial_radix + )); if (ptpcoll_module->super.sbgp_partner_module->my_index < - ptpcoll_module->full_narray_tree_size) { + ptpcoll_module->full_narray_tree_size) { if (ptpcoll_module->super.sbgp_partner_module->my_index < - ptpcoll_module->group_size - ptpcoll_module->full_narray_tree_size) { + ptpcoll_module->group_size - ptpcoll_module->full_narray_tree_size) { ptpcoll_module->narray_type = PTPCOLL_PROXY; for (i = 0; i < cm->narray_knomial_radix; i++) { peer = @@ 
-346,10 +409,10 @@ static int load_narray_knomial_tree (mca_bcol_ptpcoll_module_t *ptpcoll_module) /* Setting node info */ for(i = 0; i < ptpcoll_module->full_narray_tree_size; i++) { rc = netpatterns_setup_narray_knomial_tree( - cm->narray_knomial_radix, - i, - ptpcoll_module->full_narray_tree_size, - &ptpcoll_module->narray_knomial_node[i]); + cm->narray_knomial_radix, + i, + ptpcoll_module->full_narray_tree_size, + &ptpcoll_module->narray_knomial_node[i]); if(OMPI_SUCCESS != rc) { goto Error; } @@ -381,17 +444,17 @@ static int load_narray_tree(mca_bcol_ptpcoll_module_t *ptpcoll_module) mca_bcol_ptpcoll_component_t *cm = &mca_bcol_ptpcoll_component; ptpcoll_module->narray_node = calloc(ptpcoll_module->group_size, - sizeof(netpatterns_tree_node_t)); + sizeof(netpatterns_tree_node_t)); if(NULL == ptpcoll_module->narray_node ) { goto Error; } for(i = 0; i < ptpcoll_module->group_size; i++) { rc = netpatterns_setup_narray_tree( - cm->narray_radix, - i, - ptpcoll_module->group_size, - &ptpcoll_module->narray_node[i]); + cm->narray_radix, + i, + ptpcoll_module->group_size, + &ptpcoll_module->narray_node[i]); if(OMPI_SUCCESS != rc) { goto Error; } @@ -417,8 +480,8 @@ static int load_knomial_info(mca_bcol_ptpcoll_module_t *ptpcoll_module) cm->k_nomial_radix; ptpcoll_module->pow_k = pow_k_calc(ptpcoll_module->k_nomial_radix, - ptpcoll_module->group_size, - &ptpcoll_module->pow_knum); + ptpcoll_module->group_size, + &ptpcoll_module->pow_knum); ptpcoll_module->kn_proxy_extra_index = (int *) malloc(sizeof(int) * (ptpcoll_module->k_nomial_radix - 1)); @@ -430,37 +493,37 @@ static int load_knomial_info(mca_bcol_ptpcoll_module_t *ptpcoll_module) /* Setting peer type for K-nomial algorithm*/ if (ptpcoll_module->super.sbgp_partner_module->my_index < ptpcoll_module->pow_knum ) { if (ptpcoll_module->super.sbgp_partner_module->my_index < - ptpcoll_module->group_size - ptpcoll_module->pow_knum) { + ptpcoll_module->group_size - ptpcoll_module->pow_knum) { for (i = 0; - i < (ptpcoll_module->k_nomial_radix - 1) && - ptpcoll_module->super.sbgp_partner_module->my_index * - (ptpcoll_module->k_nomial_radix - 1) + - i + ptpcoll_module->pow_knum < ptpcoll_module->group_size - ; i++) { + i < (ptpcoll_module->k_nomial_radix - 1) && + ptpcoll_module->super.sbgp_partner_module->my_index * + (ptpcoll_module->k_nomial_radix - 1) + + i + ptpcoll_module->pow_knum < ptpcoll_module->group_size + ; i++) { ptpcoll_module->pow_ktype = PTPCOLL_KN_PROXY; ptpcoll_module->kn_proxy_extra_index[i] = ptpcoll_module->super.sbgp_partner_module->my_index * (ptpcoll_module->k_nomial_radix - 1) + i + ptpcoll_module->pow_knum; PTPCOLL_VERBOSE(10 ,("My type is proxy, pow_knum = %d [%d] my extra %d", - ptpcoll_module->pow_knum, - ptpcoll_module->pow_k, - ptpcoll_module->kn_proxy_extra_index[i])); + ptpcoll_module->pow_knum, + ptpcoll_module->pow_k, + ptpcoll_module->kn_proxy_extra_index[i])); } ptpcoll_module->kn_proxy_extra_num = i; } else { PTPCOLL_VERBOSE(10 ,("My type is in group, pow_knum = %d [%d]", ptpcoll_module->pow_knum, - ptpcoll_module->pow_k)); + ptpcoll_module->pow_k)); ptpcoll_module->pow_ktype = PTPCOLL_KN_IN_GROUP; } } else { ptpcoll_module->pow_ktype = PTPCOLL_KN_EXTRA; ptpcoll_module->kn_proxy_extra_index[0] = (ptpcoll_module->super.sbgp_partner_module->my_index - - ptpcoll_module->pow_knum) / (ptpcoll_module->k_nomial_radix - 1); + ptpcoll_module->pow_knum) / (ptpcoll_module->k_nomial_radix - 1); PTPCOLL_VERBOSE(10 ,("My type is extra , pow_knum = %d [%d] my proxy %d", - ptpcoll_module->pow_knum, - ptpcoll_module->pow_k, - 
ptpcoll_module->kn_proxy_extra_index[0])); + ptpcoll_module->pow_knum, + ptpcoll_module->pow_k, + ptpcoll_module->kn_proxy_extra_index[0])); } return OMPI_SUCCESS; @@ -476,8 +539,8 @@ Error: static int load_binomial_info(mca_bcol_ptpcoll_module_t *ptpcoll_module) { ptpcoll_module->pow_2 = pow_k_calc(2, - ptpcoll_module->group_size, - &ptpcoll_module->pow_2num); + ptpcoll_module->group_size, + &ptpcoll_module->pow_2num); assert(ptpcoll_module->pow_2num == 1 << ptpcoll_module->pow_2); assert(ptpcoll_module->pow_2num <= ptpcoll_module->group_size); @@ -485,20 +548,20 @@ static int load_binomial_info(mca_bcol_ptpcoll_module_t *ptpcoll_module) /* Setting peer type for binary algorithm*/ if (ptpcoll_module->super.sbgp_partner_module->my_index < ptpcoll_module->pow_2num ) { if (ptpcoll_module->super.sbgp_partner_module->my_index < - ptpcoll_module->group_size - ptpcoll_module->pow_2num) { + ptpcoll_module->group_size - ptpcoll_module->pow_2num) { PTPCOLL_VERBOSE(10 ,("My type is proxy, pow_2num = %d [%d]", ptpcoll_module->pow_2num, - ptpcoll_module->pow_2)); + ptpcoll_module->pow_2)); ptpcoll_module->pow_2type = PTPCOLL_PROXY; ptpcoll_module->proxy_extra_index = ptpcoll_module->super.sbgp_partner_module->my_index + ptpcoll_module->pow_2num; } else { PTPCOLL_VERBOSE(10 ,("My type is in group, pow_2num = %d [%d]", ptpcoll_module->pow_2num, - ptpcoll_module->pow_2)); + ptpcoll_module->pow_2)); ptpcoll_module->pow_2type = PTPCOLL_IN_GROUP; } } else { PTPCOLL_VERBOSE(10 ,("My type is extra , pow_2num = %d [%d]", ptpcoll_module->pow_2num, - ptpcoll_module->pow_2)); + ptpcoll_module->pow_2)); ptpcoll_module->pow_2type = PTPCOLL_EXTRA; ptpcoll_module->proxy_extra_index = ptpcoll_module->super.sbgp_partner_module->my_index - ptpcoll_module->pow_2num; @@ -510,10 +573,10 @@ static int load_recursive_knomial_info(mca_bcol_ptpcoll_module_t *ptpcoll_module { int rc = OMPI_SUCCESS; rc = netpatterns_setup_recursive_knomial_tree_node( - ptpcoll_module->group_size, - ptpcoll_module->super.sbgp_partner_module->my_index, - mca_bcol_ptpcoll_component.k_nomial_radix, - &ptpcoll_module->knomial_exchange_tree); + ptpcoll_module->group_size, + ptpcoll_module->super.sbgp_partner_module->my_index, + mca_bcol_ptpcoll_component.k_nomial_radix, + &ptpcoll_module->knomial_exchange_tree); return rc; } @@ -523,14 +586,14 @@ static void bcol_ptpcoll_collreq_init(ompi_free_list_item_t *item, void* ctx) mca_bcol_ptpcoll_collreq_t *collreq = (mca_bcol_ptpcoll_collreq_t *) item; switch(mca_bcol_ptpcoll_component.barrier_alg) { - case 1: - collreq->requests = (ompi_request_t **) - calloc(2, sizeof(ompi_request_t *)); - break; - case 2: - collreq->requests = (ompi_request_t **) - calloc(2 * ptpcoll_module->k_nomial_radix, sizeof(ompi_request_t *)); - break; + case 1: + collreq->requests = (ompi_request_t **) + calloc(2, sizeof(ompi_request_t *)); + break; + case 2: + collreq->requests = (ompi_request_t **) + calloc(2 * ptpcoll_module->k_nomial_radix, sizeof(ompi_request_t *)); + break; } } @@ -539,7 +602,7 @@ static void bcol_ptpcoll_collreq_init(ompi_free_list_item_t *item, void* ctx) * the backing shared-memory file is created. 
*/ mca_bcol_base_module_t **mca_bcol_ptpcoll_comm_query(mca_sbgp_base_module_t *sbgp, - int *num_modules) + int *num_modules) { int rc; /* local variables */ @@ -612,33 +675,32 @@ mca_bcol_base_module_t **mca_bcol_ptpcoll_comm_query(mca_sbgp_base_module_t *sbg /* creating collfrag free list */ OBJ_CONSTRUCT(&ptpcoll_module->collreqs_free, ompi_free_list_t); rc = ompi_free_list_init_ex_new(&ptpcoll_module->collreqs_free, - sizeof(mca_bcol_ptpcoll_collreq_t), - BCOL_PTP_CACHE_LINE_SIZE, - OBJ_CLASS(mca_bcol_ptpcoll_collreq_t), - 0, BCOL_PTP_CACHE_LINE_SIZE, - 256 /* free_list_num */, - -1 /* free_list_max, -1 = infinite */, - 32 /* free_list_inc */, - NULL, - bcol_ptpcoll_collreq_init, - ptpcoll_module); + sizeof(mca_bcol_ptpcoll_collreq_t), + BCOL_PTP_CACHE_LINE_SIZE, + OBJ_CLASS(mca_bcol_ptpcoll_collreq_t), + 0, BCOL_PTP_CACHE_LINE_SIZE, + 256 /* free_list_num */, + -1 /* free_list_max, -1 = infinite */, + 32 /* free_list_inc */, + NULL, + bcol_ptpcoll_collreq_init, + ptpcoll_module); if (OMPI_SUCCESS != rc) { goto CLEANUP; } load_func(ptpcoll_module); - /* + rc = alloc_allreduce_offsets_array(ptpcoll_module); if (OMPI_SUCCESS != rc) { goto CLEANUP; } - */ /* Allocating iovec for PTP alltoall */ iovec_size = ptpcoll_module->group_size / 2 + ptpcoll_module->group_size % 2; ptpcoll_module->alltoall_iovec = (struct iovec *) malloc(sizeof(struct iovec) - * iovec_size); - ptpcoll_module->log_group_size = lognum(ptpcoll_module->group_size); + * iovec_size); + ptpcoll_module->log_group_size = lognum(ptpcoll_module->group_size); rc = mca_bcol_base_bcol_fns_table_init(&(ptpcoll_module->super)); if (OMPI_SUCCESS != rc) { diff --git a/ompi/mca/bcol/ptpcoll/bcol_ptpcoll_reduce.c b/ompi/mca/bcol/ptpcoll/bcol_ptpcoll_reduce.c new file mode 100644 index 0000000000..29d118b290 --- /dev/null +++ b/ompi/mca/bcol/ptpcoll/bcol_ptpcoll_reduce.c @@ -0,0 +1,406 @@ +/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */ +/* + * Copyright (c) 2009-2013 Oak Ridge National Laboratory. All rights reserved. + * Copyright (c) 2009-2012 Mellanox Technologies. All rights reserved. + * Copyright (c) 2013 Los Alamos National Security, LLC. All rights + * reserved. + * $COPYRIGHT$ + * + * Additional copyrights may follow + * + * $HEADER$ + */ + +#include "ompi_config.h" + +#include "ompi/include/ompi/constants.h" +#include "ompi/mca/coll/ml/coll_ml.h" +#include "ompi/mca/bcol/bcol.h" +#include "bcol_ptpcoll_reduce.h" +#include "bcol_ptpcoll_utils.h" + +static int bcol_ptpcoll_reduce_narray_progress(bcol_function_args_t *input_args, + struct coll_ml_function_t *const_args); + +static int bcol_ptpcoll_reduce_narray(bcol_function_args_t *input_args, + struct coll_ml_function_t *const_args); + + +#define NARRAY_RECV_NB(narray_node, process_shift, group_size, \ + recv_buffer, pack_len, tag, comm, recv_requests, \ + num_pending_recvs) \ +do { \ + int n, rc = OMPI_SUCCESS; \ + int dst; \ + int comm_dst; \ + int offset = 0 ; \ + \ + /* Receive data from all relevant children */ \ + for (n = 0; n < narray_node->n_children; n++) { \ + \ + dst = narray_node->children_ranks[n] + process_shift; \ + if (dst >= group_size) { \ + dst -= group_size; \ + } \ + comm_dst = group_list[dst]; \ + \ + /* Non-blocking receive ....
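Each child contributes pack_len bytes at a consecutive offset, so          \
recv_buffer must have room for narray_node->n_children * pack_len bytes;  \
narray_reduce() later combines those per-child chunks, plus the local     \
contribution, into the first chunk of recv_buffer.                        \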
*/ \ + PTPCOLL_VERBOSE(1 , ("Reduce, Irecv data to %d[%d], count %d, tag %d, addr %p", \ + dst, comm_dst, pack_len, tag, \ + data_buffer)); \ + rc = MCA_PML_CALL(irecv((void *)((unsigned char*)recv_buffer + offset), pack_len, MPI_BYTE, \ + comm_dst, tag, comm, \ + &(recv_requests[*num_pending_recvs]))); \ + if( OMPI_SUCCESS != rc ) { \ + PTPCOLL_VERBOSE(10, ("Failed to start non-blocking receive")); \ + return OMPI_ERROR; \ + } \ + ++(*num_pending_recvs); \ + offset += pack_len; \ + } \ +} while(0) + + +static inline int narray_reduce(void *data_buffer, void *recv_buffer, + int nrecvs, int count, + struct ompi_datatype_t *dtype, struct ompi_op_t *op, + int *reduction_status) { + int pack_len = count * dtype->super.size; + int i = 0; + void *source_buffer = NULL, *result_buffer = NULL; + + source_buffer = data_buffer; + result_buffer = recv_buffer; + + for (i = 0; i < nrecvs; i++) { + ompi_op_reduce(op, (void*)((unsigned char*) source_buffer) , + (void*)((unsigned char*) result_buffer), + count,dtype); + + source_buffer = (void *)((unsigned char*)recv_buffer + + (i+1) * pack_len); + } + + *reduction_status = 1; + return OMPI_SUCCESS; +} +static int bcol_ptpcoll_reduce_narray_progress(bcol_function_args_t *input_args, + struct coll_ml_function_t *const_args) +{ + mca_bcol_ptpcoll_module_t *ptpcoll_module = (mca_bcol_ptpcoll_module_t *)const_args->bcol_module; + + int tag = -1; + int rc; + int group_size = ptpcoll_module->group_size; + int *group_list = ptpcoll_module->super.sbgp_partner_module->group_list; + uint32_t buffer_index = input_args->buffer_index; + struct ompi_op_t *op = input_args->op; + ompi_communicator_t* comm = ptpcoll_module->super.sbgp_partner_module->group_comm; + ompi_request_t **send_request = + &ptpcoll_module->ml_mem.ml_buf_desc[buffer_index].requests[0]; + ompi_request_t **recv_requests = + &ptpcoll_module->ml_mem.ml_buf_desc[buffer_index].requests[1]; + void *data_buffer = NULL; + void *src_buffer = (void *) ( + (unsigned char *)input_args->sbuf + + (size_t)input_args->sbuf_offset); + void *recv_buffer = (void *) ( + (unsigned char *)input_args->rbuf + + (size_t)input_args->rbuf_offset); + int count = input_args->count; + struct ompi_datatype_t *dtype = input_args->dtype; + int pack_len = input_args->count * input_args->dtype->super.size; + int *active_requests = + &(ptpcoll_module->ml_mem.ml_buf_desc[buffer_index].active_requests); + int matched = false; + int my_group_index = ptpcoll_module->super.sbgp_partner_module->my_index; + int relative_group_index = 0; + netpatterns_tree_node_t *narray_node = NULL; + bool not_sent = false; + int parent_rank = -1, comm_parent_rank = -1; + int group_root_index = input_args->root; + + if (!ptpcoll_module->ml_mem.ml_buf_desc[buffer_index].reduce_init_called) { + bcol_ptpcoll_reduce_narray(input_args, const_args); + } + /* + * By default the src buffer is the data buffer, + * only after reduction, the recv buffer becomes the + * data buffer + */ + data_buffer = src_buffer; + + relative_group_index = my_group_index - group_root_index; + if (relative_group_index < 0) { + relative_group_index +=group_size; + } + + /* keep tag within the limit support by the pml */ + tag = (PTPCOLL_TAG_OFFSET + input_args->sequence_num * PTPCOLL_TAG_FACTOR) & (ptpcoll_module->tag_mask); + /* mark this as a collective tag, to avoid conflict with user-level tags */ + tag = -tag; + + narray_node = &ptpcoll_module->narray_node[relative_group_index]; + + PTPCOLL_VERBOSE(3, ("reduce, Narray tree Progress")); + + PTPCOLL_VERBOSE(8, 
("bcol_ptpcoll_reduce_narray, buffer index: %d " + "tag: %d " + "tag_mask: %d " + "sn: %d " + "root: %d [%d]" + "buff: %p ", + buffer_index, tag, + ptpcoll_module->tag_mask, input_args->sequence_num, + input_args->root_flag, input_args->root_route->rank, + data_buffer)); + + /* + Check if the data was received + */ + if (0 != *active_requests) { + matched = mca_bcol_ptpcoll_test_all_for_match + (active_requests, recv_requests, &rc); + if (OMPI_SUCCESS != rc) { + return OMPI_ERROR; + } + + + /* All data was received, then do a reduction*/ + if(matched) { + narray_reduce(data_buffer, recv_buffer, narray_node->n_children, count, dtype, op, + &ptpcoll_module->ml_mem.ml_buf_desc[buffer_index].reduction_status); + + /* + * The reduction result is in the recv buffer, so it is the new data + * buffer + */ + data_buffer = recv_buffer; + + /* If not reduced, means also, you might not posted a send */ + not_sent = true; + } else { + PTPCOLL_VERBOSE(10, ("reduce root is started")); + return BCOL_FN_STARTED; + } + } + + /* I'm root, I'm done */ + if (input_args->root_flag) { + return BCOL_FN_COMPLETE; + } + + PTPCOLL_VERBOSE(1,("Testing Sending Match")); + + /* If send was not posted */ + /* Manju: Leaf node should never post in the progress logic */ + if (not_sent) { + parent_rank = + ptpcoll_module->narray_node[relative_group_index].parent_rank + + group_root_index; + if (parent_rank >= group_size) { + parent_rank -= group_size; + } + + comm_parent_rank = group_list[parent_rank]; + PTPCOLL_VERBOSE(1,("Sending data to %d ",comm_parent_rank)); + + rc = MCA_PML_CALL(isend(data_buffer, pack_len, MPI_BYTE, + comm_parent_rank, + tag, MCA_PML_BASE_SEND_STANDARD, comm, send_request)); + if( OMPI_SUCCESS != rc ) { + PTPCOLL_VERBOSE(10, ("Failed to send data")); + return OMPI_ERROR; + } + } + + if (0 == mca_bcol_ptpcoll_test_for_match(send_request, &rc)) { + PTPCOLL_VERBOSE(10, ("Test was not matched - %d", rc)); + /* Data has not been sent. Return that the collective has been stated + * because we MUST call test on this request once it is finished to + * ensure that it is properly freed. */ + return (OMPI_SUCCESS != rc) ? 
rc : BCOL_FN_STARTED; + } + + return BCOL_FN_COMPLETE; +} + +static int bcol_ptpcoll_reduce_narray(bcol_function_args_t *input_args, + struct coll_ml_function_t *const_args) +{ + mca_bcol_ptpcoll_module_t *ptpcoll_module = (mca_bcol_ptpcoll_module_t *)const_args->bcol_module; + + int tag; + int rc; + int group_size = ptpcoll_module->group_size; + int *group_list = ptpcoll_module->super.sbgp_partner_module->group_list; + uint32_t buffer_index = input_args->buffer_index; + + struct ompi_op_t *op = input_args->op; + ompi_communicator_t* comm = ptpcoll_module->super.sbgp_partner_module->group_comm; + ompi_request_t **recv_requests = + &ptpcoll_module->ml_mem.ml_buf_desc[buffer_index].requests[1]; + ompi_request_t **send_request = + &ptpcoll_module->ml_mem.ml_buf_desc[buffer_index].requests[0]; + + void *data_buffer = NULL; + void *src_buffer = (void *) ( + (unsigned char *)input_args->sbuf + + (size_t)input_args->sbuf_offset); + void *recv_buffer = (void *) ( + (unsigned char *)input_args->rbuf + + (size_t)input_args->rbuf_offset); + int count = input_args->count; + struct ompi_datatype_t *dtype = input_args->dtype; + int pack_len = input_args->count * input_args->dtype->super.size; + int *active_requests = + &(ptpcoll_module->ml_mem.ml_buf_desc[buffer_index].active_requests); + int matched = true; + int my_group_index = ptpcoll_module->super.sbgp_partner_module->my_index; + int group_root_index = -1; + int relative_group_index = 0; + netpatterns_tree_node_t *narray_node = NULL; + int parent_rank = -1, comm_parent_rank = -1; + + + /* This is first function call that should be called, not progress. + * The fragmentation code does this, so switch from progress to here. + * The flag indicates whether, we have entered this code * + */ + ptpcoll_module->ml_mem.ml_buf_desc[buffer_index].reduce_init_called = true; + + PTPCOLL_VERBOSE(1, ("Reduce, Narray tree")); + /* reset active request counter */ + (*active_requests) = 0; + /* keep tag within the limit support by the pml */ + tag = (PTPCOLL_TAG_OFFSET + input_args->sequence_num * PTPCOLL_TAG_FACTOR) & (ptpcoll_module->tag_mask); + /* mark this as a collective tag, to avoid conflict with user-level flags */ + tag = -tag; + + PTPCOLL_VERBOSE(1, ("bcol_ptpcoll_reduce_narray, buffer index: %d " + "tag: %d " + "tag_mask: %d " + "sn: %d " + "root: %d " + "buff: %p ", + buffer_index, tag, + ptpcoll_module->tag_mask, input_args->sequence_num, + input_args->root_flag, + src_buffer)); + + /* Compute Root Index Shift */ + group_root_index = input_args->root; + relative_group_index = my_group_index - group_root_index; + if (relative_group_index < 0) { + relative_group_index += group_size; + } + + narray_node = &ptpcoll_module->narray_node[relative_group_index]; + + if (0 == narray_node->n_children) { + PTPCOLL_VERBOSE(10, ("I'm leaf of the data")); + /* + * I'm root of the operation + * send data to N childrens + */ + data_buffer = src_buffer; + goto NARRAY_SEND_DATA; + } + + /* Not leaf, either an internal node or root */ + NARRAY_RECV_NB(narray_node, group_root_index, group_size, + recv_buffer, pack_len, tag, comm, recv_requests, + active_requests); + + + /* We have not done reduction, yet */ + ptpcoll_module->ml_mem.ml_buf_desc[buffer_index].reduction_status = 0; + + /* We can not block. 
So run a couple of tests for data arrival */ + matched = mca_bcol_ptpcoll_test_all_for_match + (active_requests, recv_requests, &rc); + + /* Check if received the data */ + if(matched) { + + narray_reduce(src_buffer, recv_buffer, narray_node->n_children, + count, dtype, op, &ptpcoll_module->ml_mem.ml_buf_desc[buffer_index].reduction_status); + PTPCOLL_VERBOSE(1, ("Reduce, received data from all children")); + data_buffer = recv_buffer; + + } else { + + PTPCOLL_VERBOSE(1, ("reduce root is started")); + return BCOL_FN_STARTED; + } + + /* I'm root, I'm done */ + if (input_args->root_flag) { + return BCOL_FN_COMPLETE; + } + + +NARRAY_SEND_DATA: + + /* + * Send the data (reduce in case of internal nodes, or just data in + * case of leaf nodes) to the parent + */ + narray_node = &ptpcoll_module->narray_node[relative_group_index]; + + parent_rank = + ptpcoll_module->narray_node[relative_group_index].parent_rank + + group_root_index; + if (parent_rank >= group_size) { + parent_rank -= group_size; + } + + comm_parent_rank = group_list[parent_rank]; + PTPCOLL_VERBOSE(1,("Sending data to %d ",comm_parent_rank)); + + rc = MCA_PML_CALL(isend(data_buffer, pack_len, MPI_BYTE, + comm_parent_rank, + tag, MCA_PML_BASE_SEND_STANDARD, comm, send_request)); + if( OMPI_SUCCESS != rc ) { + PTPCOLL_VERBOSE(10, ("Failed to send data")); + return OMPI_ERROR; + } + + /* We can not block. So run a couple of tests for data arrival */ + if (0 == mca_bcol_ptpcoll_test_for_match(send_request, &rc)) { + PTPCOLL_VERBOSE(10, ("Test was not matched - %d", rc)); + /* The send has not completed yet; return started so the progress function tests it again */ + return (OMPI_SUCCESS != rc) ? rc : BCOL_FN_STARTED; + } + + return BCOL_FN_COMPLETE; +} + + +int bcol_ptpcoll_reduce_init(mca_bcol_base_module_t *super) +{ + mca_bcol_base_coll_fn_comm_attributes_t comm_attribs; + mca_bcol_base_coll_fn_invoke_attributes_t inv_attribs; + + PTPCOLL_VERBOSE(1,("Initialization Reduce - Narray")); + comm_attribs.bcoll_type = BCOL_REDUCE; + comm_attribs.comm_size_min = 0; + comm_attribs.comm_size_max = 1024 * 1024; + comm_attribs.waiting_semantics = NON_BLOCKING; + + inv_attribs.bcol_msg_min = 0; + inv_attribs.bcol_msg_max = 20000; /* range 1 */ + + inv_attribs.datatype_bitmap = 0xffffffff; + inv_attribs.op_types_bitmap = 0xffffffff; + + + comm_attribs.data_src = DATA_SRC_KNOWN; + mca_bcol_base_set_attributes(super, &comm_attribs, &inv_attribs, + bcol_ptpcoll_reduce_narray, + bcol_ptpcoll_reduce_narray_progress); + + comm_attribs.data_src = DATA_SRC_KNOWN; + + return OMPI_SUCCESS; +} diff --git a/ompi/mca/bcol/ptpcoll/bcol_ptpcoll_reduce.h b/ompi/mca/bcol/ptpcoll/bcol_ptpcoll_reduce.h new file mode 100644 index 0000000000..195ce7fad9 --- /dev/null +++ b/ompi/mca/bcol/ptpcoll/bcol_ptpcoll_reduce.h @@ -0,0 +1,25 @@ +/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */ +/* + * Copyright (c) 2009-2013 Oak Ridge National Laboratory. All rights reserved. + * Copyright (c) 2009-2012 Mellanox Technologies. All rights reserved.
+ * $COPYRIGHT$ + * + * Additional copyrights may follow + * + * $HEADER$ + */ + +#ifndef MCA_BCOL_PTPCOLL_REDUCE_H +#define MCA_BCOL_PTPCOLL_REDUCE_H + +#include "ompi_config.h" +#include "bcol_ptpcoll.h" +#include "bcol_ptpcoll_utils.h" + +BEGIN_C_DECLS + +int bcol_ptpcoll_reduce_init(mca_bcol_base_module_t *super); + +int bcol_ptpcoll_reduce_init(mca_bcol_base_module_t *super); + +#endif /* MCA_BCOL_PTPCOLL_REDUCE_H */ diff --git a/ompi/mca/coll/ml/Makefile.am b/ompi/mca/coll/ml/Makefile.am index 498570fdc5..e3c6f88852 100644 --- a/ompi/mca/coll/ml/Makefile.am +++ b/ompi/mca/coll/ml/Makefile.am @@ -1,6 +1,8 @@ # # Copyright (c) 2009-2012 Oak Ridge National Laboratory. All rights reserved. # Copyright (c) 2009-2012 Mellanox Technologies. All rights reserved. +# Copyright (c) 2013 Los Alamos National Security, LLC. All rights +# reserved. # $COPYRIGHT$ # # Additional copyrights may follow @@ -30,12 +32,19 @@ sources = coll_ml.h \ coll_ml_hier_algorithms.c \ coll_ml_hier_algorithms_setup.c \ coll_ml_hier_algorithms_bcast_setup.c \ + coll_ml_hier_algorithms_allreduce_setup.c \ + coll_ml_hier_algorithms_reduce_setup.c \ coll_ml_hier_algorithms_common_setup.c \ coll_ml_hier_algorithms_common_setup.h \ + coll_ml_hier_algorithms_allgather_setup.c \ coll_ml_hier_algorithm_memsync_setup.c \ coll_ml_custom_utils.h \ coll_ml_custom_utils.c \ + coll_ml_hier_algorithms_ibarrier.c \ coll_ml_progress.c \ + coll_ml_reduce.c \ + coll_ml_allreduce.c \ + coll_ml_allgather.c \ coll_ml_mca.h \ coll_ml_mca.c \ coll_ml_lmngr.h \ diff --git a/ompi/mca/coll/ml/coll_ml.h b/ompi/mca/coll/ml/coll_ml.h index e129ddc53e..56fab98f00 100644 --- a/ompi/mca/coll/ml/coll_ml.h +++ b/ompi/mca/coll/ml/coll_ml.h @@ -1,6 +1,9 @@ +/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */ /* * Copyright (c) 2009-2012 Oak Ridge National Laboratory. All rights reserved. * Copyright (c) 2009-2012 Mellanox Technologies. All rights reserved. + * Copyright (c) 2013 Los Alamos National Security, LLC. All rights + * reserved. * $COPYRIGHT$ * * Additional copyrights may follow @@ -90,6 +93,13 @@ enum { ML_NUM_OF_FUNCTIONS }; +/* ML broadcast algorithms */ +enum { + COLL_ML_STATIC_BCAST, + COLL_ML_SEQ_BCAST, + COLL_ML_UNKNOWN_BCAST, +}; + struct mca_bcol_base_module_t; /* function description */ struct coll_ml_function_t { @@ -210,8 +220,6 @@ typedef struct coll_ml_collective_description_t coll_ml_collective_description_t struct rank_properties_t { int rank; int leaf; - int n_connected_subgroups; - int *list_connected_subgroups; int num_of_ranks_represented; }; typedef struct rank_properties_t rank_properties_t; @@ -237,17 +245,11 @@ struct sub_group_params_t { /* * level in the hierarchy - subgroups at the same - * level don't - * overlap. + * level don't overlap. May not be the same as the + * sbgp level. */ int level_in_hierarchy; - /* - * Connected nodes - */ - int n_connected_nodes; - int *list_connected_nodes; - /* * Information on the ranks in the subgroup. This includes * the rank, and wether or not the rank is a source/sink of @@ -255,11 +257,6 @@ struct sub_group_params_t { */ rank_properties_t *rank_data; - /* - * Temp list of ranks - */ - int *list_ranks; - /* level one index - for example, for( i = 0; i < level_one_index; i++) will loop through all level one subgroups, this is significant @@ -267,7 +264,6 @@ struct sub_group_params_t { i.e. 
all ranks appear once and only once at level one */ int level_one_index; - }; typedef struct sub_group_params_t sub_group_params_t; @@ -397,6 +393,7 @@ do { \ /* pasha - why we duplicate it ? */ \ (coll_op)->variable_fn_params.src_desc = desc; \ (coll_op)->variable_fn_params.hier_factor = 1; \ + (coll_op)->variable_fn_params.need_dt_support = false; \ } while (0) /*Full message descriptor*/ @@ -486,9 +483,6 @@ struct mca_coll_ml_component_t { /** MCA parameter: Priority of this component */ int ml_priority; - /** MCA parameter: Number of levels */ - int ml_n_levels; - /** MCA parameter: subgrouping components to use */ char *subgroups_string; @@ -519,17 +513,14 @@ struct mca_coll_ml_component_t { int use_knomial_allreduce; - /* Use global knowledge bcast algorithm */ - bool use_static_bcast; - /* use hdl_framework */ bool use_hdl_bcast; /* Enable / Disable fragmentation (0 - off, 1 - on, 2 - auto) */ int enable_fragmentation; - /* Use sequential bcast algorithm */ - bool use_sequential_bcast; + /* Broadcast algorithm */ + int bcast_algorithm; /* frag size that is used by list memory_manager */ size_t lmngr_block_size; @@ -584,6 +575,9 @@ struct mca_coll_ml_component_t { /* Temporary hack for IMB test - not all bcols have alltoall */ bool disable_alltoall; + /* Disable Reduce */ + bool disable_reduce; + /* Brucks alltoall mca and other params */ int use_brucks_smsg_alltoall; @@ -727,6 +721,11 @@ struct mca_coll_ml_module_t { mca_coll_ml_collective_operation_description_t * coll_ml_allreduce_functions[ML_NUM_ALLREDUCE_FUNCTIONS]; + /** Reduce functions */ + mca_coll_ml_collective_operation_description_t * + coll_ml_reduce_functions[ML_NUM_REDUCE_FUNCTIONS]; + + /** scatter */ mca_coll_ml_collective_operation_description_t * coll_ml_scatter_functions[ML_NUM_SCATTER_FUNCTIONS]; @@ -764,9 +763,6 @@ struct mca_coll_ml_module_t { uint64_t fragment_size; uint32_t ml_fragment_size; - /* For carto graph */ - /* opal_carto_graph_t *sm_graph; */ - /* opal_carto_graph_t *ib_graph; */ /* Bcast index table. Pasha: Do we need to define something more generic ? 
the table x 2 (large/small)*/ int bcast_fn_index_table[2]; @@ -784,6 +780,9 @@ struct mca_coll_ml_module_t { /* On this list we keep coll_op descriptors that were not * be able to start, since no ml buffers were available */ opal_list_t waiting_for_memory_list; + + /* fallback collectives */ + mca_coll_base_comm_coll_t fallback; }; typedef struct mca_coll_ml_module_t mca_coll_ml_module_t; @@ -812,25 +811,46 @@ int mca_coll_ml_ibarrier_intra(struct ompi_communicator_t *comm, ompi_request_t **req, mca_coll_base_module_t *module); +/* Allreduce with EXTRA TOPO using - blocking */ int mca_coll_ml_allreduce_dispatch(void *sbuf, void *rbuf, int count, struct ompi_datatype_t *dtype, struct ompi_op_t *op, struct ompi_communicator_t *comm, mca_coll_base_module_t *module); +/* Allreduce with EXTRA TOPO using - Non-blocking */ +int mca_coll_ml_allreduce_dispatch_nb(void *sbuf, void *rbuf, int count, + ompi_datatype_t *dtype, ompi_op_t *op, + ompi_communicator_t *comm, + ompi_request_t **req, + mca_coll_base_module_t *module); + /* Allreduce - blocking */ -int mca_coll_ml_allreduce_intra(void *sbuf, void *rbuf, int count, +int mca_coll_ml_allreduce(void *sbuf, void *rbuf, int count, struct ompi_datatype_t *dtype, struct ompi_op_t *op, struct ompi_communicator_t *comm, mca_coll_base_module_t *module); -int mca_coll_ml_memsync_intra(mca_coll_ml_module_t *module, int bank_index); +/* Allreduce - Non-blocking */ +int mca_coll_ml_allreduce_nb(void *sbuf, void *rbuf, int count, + struct ompi_datatype_t *dtype, struct ompi_op_t *op, + struct ompi_communicator_t *comm, + ompi_request_t **req, + mca_coll_base_module_t *module); -/* Reduce blocking */ +/* Reduce - Blocking */ int mca_coll_ml_reduce(void *sbuf, void *rbuf, int count, struct ompi_datatype_t *dtype, struct ompi_op_t *op, - int root, - struct ompi_communicator_t *comm, + int root, struct ompi_communicator_t *comm, mca_coll_base_module_t *module); +int mca_coll_ml_reduce_nb(void *sbuf, void *rbuf, int count, + struct ompi_datatype_t *dtype, struct ompi_op_t *op, + int root, struct ompi_communicator_t *comm, + ompi_request_t **req, + mca_coll_base_module_t *module); + +int mca_coll_ml_memsync_intra(mca_coll_ml_module_t *module, int bank_index); + + int coll_ml_progress_individual_message(mca_coll_ml_fragment_t *frag_descriptor); /* @@ -902,6 +922,17 @@ int mca_coll_ml_fulltree_iboffload_only_hierarchy_discovery(mca_coll_ml_module_t void mca_coll_ml_allreduce_matrix_init(mca_coll_ml_module_t *ml_module, const mca_bcol_base_component_2_0_0_t *bcol_component); +static inline int mca_coll_ml_err(const char* fmt, ...) +{ + va_list list; + int ret; + + va_start(list, fmt); + ret = vfprintf(stderr, fmt, list); + va_end(list); + return ret; +} + #define ML_ERROR(args) \ do { \ diff --git a/ompi/mca/coll/ml/coll_ml_allgather.c b/ompi/mca/coll/ml/coll_ml_allgather.c new file mode 100644 index 0000000000..8f86bcbd0d --- /dev/null +++ b/ompi/mca/coll/ml/coll_ml_allgather.c @@ -0,0 +1,631 @@ +/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */ +/* + * Copyright (c) 2009-2012 Oak Ridge National Laboratory. All rights reserved. + * Copyright (c) 2009-2012 Mellanox Technologies. All rights reserved. + * Copyright (c) 2013 Los Alamos National Security, LLC. All rights + * reserved. 
+ * $COPYRIGHT$ + * + * Additional copyrights may follow + * + * $HEADER$ + */ +/** @file */ + +#include "ompi_config.h" + +#include + +#include "ompi/constants.h" +#include "opal/threads/mutex.h" +#include "ompi/communicator/communicator.h" +#include "ompi/mca/coll/coll.h" +#include "ompi/mca/bcol/bcol.h" +#include "opal/sys/atomic.h" +#include "coll_ml.h" +#include "coll_ml_select.h" +#include "coll_ml_allocation.h" + +static int mca_coll_ml_allgather_small_unpack_data(mca_coll_ml_collective_operation_progress_t *coll_op) +{ + bool rcontig = coll_op->full_message.recv_data_continguous; + int n_ranks_in_comm = ompi_comm_size(OP_ML_MODULE(coll_op)->comm); + + void *dest = (void *)((uintptr_t)coll_op->full_message.dest_user_addr + + (uintptr_t)coll_op->full_message.n_bytes_delivered); + void *src = (void *)((uintptr_t)coll_op->fragment_data.buffer_desc->data_addr + + (size_t)coll_op->variable_fn_params.rbuf_offset); + + if (rcontig) { + memcpy(dest, src, n_ranks_in_comm * coll_op->full_message.n_bytes_scheduled); + } else { + mca_coll_ml_convertor_unpack(src, n_ranks_in_comm * coll_op->full_message.n_bytes_scheduled, + &coll_op->fragment_data.message_descriptor->recv_convertor); + } + + return OMPI_SUCCESS; +} + +static inline void copy_data (mca_coll_ml_collective_operation_progress_t *coll_op, rank_properties_t *rank_props, int soffset) { + bool rcontig = coll_op->fragment_data.message_descriptor->recv_data_continguous; + size_t total_bytes = coll_op->fragment_data.message_descriptor->n_bytes_total; + size_t pack_len = coll_op->fragment_data.fragment_size; + int doffset = rank_props->rank; + void *dest, *src; + + src = (void *) ((uintptr_t)coll_op->fragment_data.buffer_desc->data_addr + + (size_t)coll_op->variable_fn_params.rbuf_offset + soffset * pack_len); + + if (rcontig) { + dest = (void *) ((uintptr_t) coll_op->full_message.dest_user_addr + + (uintptr_t) coll_op->fragment_data.offset_into_user_buffer + + doffset * total_bytes); + + memcpy(dest, src, pack_len); + } else { + size_t position; + opal_convertor_t *recv_convertor = + &coll_op->fragment_data.message_descriptor->recv_convertor; + + position = (size_t) coll_op->fragment_data.offset_into_user_buffer + + doffset * total_bytes; + + opal_convertor_set_position(recv_convertor, &position); + mca_coll_ml_convertor_unpack(src, pack_len, recv_convertor); + } +} + +static int mca_coll_ml_allgather_noncontiguous_unpack_data(mca_coll_ml_collective_operation_progress_t *coll_op) +{ + int i, j, n_level_one_sbgps; + size_t soffset; + + mca_coll_ml_topology_t *topo_info = coll_op->coll_schedule->topo_info; + sub_group_params_t *array_of_all_subgroup_ranks = topo_info->array_of_all_subgroups; + + n_level_one_sbgps = array_of_all_subgroup_ranks->level_one_index; + + for (i = 0 ; i < n_level_one_sbgps; i++) { + /* determine where in the source buffer the data can be found */ + soffset = array_of_all_subgroup_ranks[i].index_of_first_element; + for (j = 0 ; j < array_of_all_subgroup_ranks[i].n_ranks; j++, ++soffset) { + copy_data (coll_op, array_of_all_subgroup_ranks[i].rank_data + j, soffset); + } + } + + return OMPI_SUCCESS; +} + +/* Allgather dependencies seem easy, everyone needs to work from the "bottom up". + * Following Pasha, I too will put the simplest dependencies graph and change it later + * when we add hierarchy. Basically, allgather has the same dependency profile as the + * sequential broadcast except that there is only a single ordering of tasks. 
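As a rough sketch of that single ordering (illustrative only; n_fns and seq_task_setup stand in for the schedule fields used by the real launcher), the sequential schedule is executed one bcol task at a time from the lowest level up, with the local leader of each subgroup acting as the root:

    for (fn_idx = 0; fn_idx < n_fns; fn_idx++) {
        // component_functions[fn_idx].h_level selects the subgroup for this step
        seq_task_setup(coll_op);   // root_flag is set only on the local leader (index 0)
        // run the allgather bcol for that level, then move up one level
    }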
+ */ +static int mca_coll_ml_allgather_task_setup(mca_coll_ml_collective_operation_progress_t *coll_op) +{ + int fn_idx, h_level, my_index, root; + mca_sbgp_base_module_t *sbgp; + mca_coll_ml_topology_t *topo = coll_op->coll_schedule->topo_info; + + fn_idx = coll_op->sequential_routine.current_active_bcol_fn; + h_level = coll_op->coll_schedule->component_functions[fn_idx].h_level; + sbgp = topo->component_pairs[h_level]. + subgroup_module; + my_index = sbgp->my_index; + + /* In the case of allgather, the local leader is always the root */ + root = 0; + if (my_index == root) { + coll_op->variable_fn_params.root_flag = true; + coll_op->variable_fn_params.root_route = NULL; + } else { + coll_op->variable_fn_params.root_flag = false; + coll_op->variable_fn_params.root_route = &topo->route_vector[root]; + } + + return OMPI_SUCCESS; +} + +static int mca_coll_ml_allgather_frag_progress(mca_coll_ml_collective_operation_progress_t *coll_op) +{ + /* local variables */ + int ret; + size_t frag_len, dt_size; + + void *buf; + ml_payload_buffer_desc_t *src_buffer_desc; + mca_coll_ml_collective_operation_progress_t *new_op; + + mca_coll_ml_module_t *ml_module = OP_ML_MODULE(coll_op); + bool scontig = coll_op->fragment_data.message_descriptor->send_data_continguous; + + ompi_datatype_type_size(coll_op->variable_fn_params.dtype, &dt_size); + /* Keep the pipeline filled with fragments */ + while (coll_op->fragment_data.message_descriptor->n_active < + coll_op->fragment_data.message_descriptor->pipeline_depth) { + /* If an active fragment happens to have completed the collective during + * a hop into the progress engine, then don't launch a new fragment, + * instead break and return. + */ + if (coll_op->fragment_data.message_descriptor->n_bytes_scheduled + == coll_op->fragment_data.message_descriptor->n_bytes_total) { + break; + } + /* Get an ml buffer */ + src_buffer_desc = mca_coll_ml_alloc_buffer(ml_module); + if (NULL == src_buffer_desc) { + /* If there exist outstanding fragments, then break out + * and let an active fragment deal with this later, + * there are no buffers available. 
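Otherwise the descriptor is flagged with REQ_OUT_OF_MEMORY and parked on the module's waiting_for_memory_list, so it can be restarted later once the memsync path has recycled a buffer bank (see the matching comment in the allreduce fragment launcher below).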
+ */ + if (0 < coll_op->fragment_data.message_descriptor->n_active) { + return OMPI_SUCCESS; + } else { + /* The fragment is already on list and + * the we still have no ml resources + * Return busy */ + if (coll_op->pending & REQ_OUT_OF_MEMORY) { + ML_VERBOSE(10,("Out of resources %p", coll_op)); + return OMPI_ERR_TEMP_OUT_OF_RESOURCE; + } + + coll_op->pending |= REQ_OUT_OF_MEMORY; + opal_list_append(&((OP_ML_MODULE(coll_op))->waiting_for_memory_list), + (opal_list_item_t *)coll_op); + ML_VERBOSE(10,("Out of resources %p adding to pending queue", coll_op)); + return OMPI_ERR_TEMP_OUT_OF_RESOURCE; + } + } + + /* Get a new collective descriptor and initialize it */ + new_op = mca_coll_ml_alloc_op_prog_single_frag_dag(ml_module, + ml_module->coll_ml_allgather_functions[ML_SMALL_DATA_ALLGATHER], + coll_op->fragment_data.message_descriptor->src_user_addr, + coll_op->fragment_data.message_descriptor->dest_user_addr, + coll_op->fragment_data.message_descriptor->n_bytes_total, + coll_op->fragment_data.message_descriptor->n_bytes_scheduled); + + new_op->fragment_data.current_coll_op = coll_op->fragment_data.current_coll_op; + new_op->fragment_data.message_descriptor = coll_op->fragment_data.message_descriptor; + + /* set the task setup callback */ + new_op->sequential_routine.seq_task_setup = mca_coll_ml_allgather_task_setup; + + /* + MCA_COLL_IBOFFLOAD_SET_ML_BUFFER_INFO(new_op, + src_buffer_desc->buffer_index, src_buffer_desc); + */ + + /* We need this address for pointer arithmetic in memcpy */ + buf = coll_op->fragment_data.message_descriptor->src_user_addr; + + if (!scontig) { + frag_len = ml_module->small_message_thresholds[BCOL_ALLGATHER]; + mca_coll_ml_convertor_get_send_frag_size( + ml_module, &frag_len, + coll_op->fragment_data.message_descriptor); + + mca_coll_ml_convertor_pack( + (void *) ((uintptr_t) src_buffer_desc->data_addr + + frag_len * coll_op->coll_schedule->topo_info->hier_layout_info[0].offset + + frag_len * coll_op->coll_schedule->topo_info->hier_layout_info[0].level_one_index), + frag_len, &coll_op->fragment_data.message_descriptor->send_convertor); + } else { + /* calculate new frag length, there are some issues here */ + frag_len = (coll_op->fragment_data.message_descriptor->n_bytes_total - + coll_op->fragment_data.message_descriptor->n_bytes_scheduled < + coll_op->fragment_data.fragment_size ? 
+ coll_op->fragment_data.message_descriptor->n_bytes_total - + coll_op->fragment_data.message_descriptor->n_bytes_scheduled : + coll_op->fragment_data.fragment_size); + + /* everybody copies in, based on the new values */ + memcpy((void *) ((uintptr_t)src_buffer_desc->data_addr + + frag_len * new_op->coll_schedule->topo_info->hier_layout_info[0].offset + + frag_len * new_op->coll_schedule->topo_info->hier_layout_info[0].level_one_index), + (void *) ((uintptr_t) buf + (uintptr_t) + coll_op->fragment_data.message_descriptor->n_bytes_scheduled), frag_len); + } + + new_op->variable_fn_params.sbuf = (void *) src_buffer_desc->data_addr; + new_op->variable_fn_params.rbuf = (void *) src_buffer_desc->data_addr; + + /* update the number of bytes scheduled */ + new_op->fragment_data.message_descriptor->n_bytes_scheduled += frag_len; + /* everyone needs an unpack function */ + new_op->process_fn = mca_coll_ml_allgather_noncontiguous_unpack_data; + + new_op->fragment_data.fragment_size = frag_len; + new_op->fragment_data.buffer_desc = src_buffer_desc; + + /* Setup fragment specific data */ + ++(new_op->fragment_data.message_descriptor->n_active); + + ML_VERBOSE(10, ("Start more, My index %d ", + new_op->fragment_data.buffer_desc->buffer_index)); + + /* this is a bit buggy */ + ML_SET_VARIABLE_PARAMS_BCAST( + new_op, + OP_ML_MODULE(new_op), + frag_len /* yes, we have consistent units, so this makes sense */, + MPI_BYTE /* we fragment according to buffer size + * we don't reduce the data thus we needn't + * keep "whole" datatypes, we may freely + * fragment without regard for multiples + * of any specific datatype + */, + src_buffer_desc, + 0, + 0, + frag_len, + src_buffer_desc->data_addr); + /* initialize first coll */ + ret = new_op->sequential_routine.seq_task_setup(new_op); + if (OMPI_SUCCESS != ret) { + ML_VERBOSE(3, ("Fragment failed to initialize itself")); + return ret; + } + + new_op->variable_fn_params.buffer_size = frag_len; + new_op->variable_fn_params.hier_factor = coll_op->variable_fn_params.hier_factor; + new_op->variable_fn_params.root = 0; + + MCA_COLL_ML_SET_NEW_FRAG_ORDER_INFO(new_op); + + /* append this collective !! 
*/ + OPAL_THREAD_LOCK(&(mca_coll_ml_component.sequential_collectives_mutex)); + opal_list_append(&mca_coll_ml_component.sequential_collectives, + (opal_list_item_t *)new_op); + OPAL_THREAD_UNLOCK(&(mca_coll_ml_component.sequential_collectives_mutex)); + } + + return OMPI_SUCCESS; +} + +static inline __opal_attribute_always_inline__ +int mca_coll_ml_allgather_start (void *sbuf, int scount, + struct ompi_datatype_t *sdtype, + void* rbuf, int rcount, + struct ompi_datatype_t *rdtype, + struct ompi_communicator_t *comm, + mca_coll_base_module_t *module, + ompi_request_t **req) +{ + size_t pack_len, sdt_size; + int ret, n_fragments = 1, comm_size; + + mca_coll_ml_topology_t *topo_info; + ml_payload_buffer_desc_t *src_buffer_desc; + + mca_coll_ml_component_t *cm = &mca_coll_ml_component; + + mca_coll_ml_collective_operation_progress_t *coll_op; + mca_coll_ml_module_t *ml_module = (mca_coll_ml_module_t *) module; + + ptrdiff_t lb, extent; + bool scontig, rcontig, in_place = false; + + /* check for in place setting */ + if (MPI_IN_PLACE == sbuf) { + in_place = true; + sdtype = rdtype; + scount = rcount; + } + + /* scontig could be != to rcontig */ + scontig = ompi_datatype_is_contiguous_memory_layout(sdtype, scount); + rcontig = ompi_datatype_is_contiguous_memory_layout(rdtype, rcount); + + comm_size = ompi_comm_size(comm); + + ML_VERBOSE(10, ("Starting allgather")); + + assert(NULL != sdtype); + /* Calculate size of the data, + * at this stage, only contiguous data is supported */ + + /* this is valid for allagther */ + ompi_datatype_type_size(sdtype, &sdt_size); + pack_len = scount * sdt_size; + + if (in_place) { + sbuf = rbuf + ompi_comm_rank(comm) * pack_len; + } + + /* Allocate collective schedule and pack message */ + /* this is the total ending message size that will need to fit in the ml-buffer */ + if (pack_len <= (size_t) ml_module->small_message_thresholds[BCOL_ALLGATHER]) { + /* The len of the message can not be larger than ML buffer size */ + ML_VERBOSE(10, ("Single frag %d %d %d", pack_len, comm_size, ml_module->payload_block->size_buffer)); + assert(pack_len * comm_size <= ml_module->payload_block->size_buffer); + + src_buffer_desc = mca_coll_ml_alloc_buffer(ml_module); + while (NULL == src_buffer_desc) { + opal_progress(); + src_buffer_desc = mca_coll_ml_alloc_buffer(ml_module); + } + + /* change 1 */ + coll_op = mca_coll_ml_alloc_op_prog_single_frag_dag(ml_module, + ml_module->coll_ml_allgather_functions[ML_SMALL_DATA_ALLGATHER], + sbuf, rbuf, pack_len, 0 /* offset for first pack */); + + MCA_COLL_IBOFFLOAD_SET_ML_BUFFER_INFO(coll_op, + src_buffer_desc->buffer_index, src_buffer_desc); + + coll_op->fragment_data.current_coll_op = ML_SMALL_DATA_ALLGATHER; + /* task setup callback function */ + coll_op->sequential_routine.seq_task_setup = mca_coll_ml_allgather_task_setup; + + /* change 2 */ + if (!scontig) { + coll_op->full_message.n_bytes_scheduled = + mca_coll_ml_convertor_prepare(sdtype, scount, sbuf, + &coll_op->full_message.send_convertor, MCA_COLL_ML_NET_STREAM_SEND); + + mca_coll_ml_convertor_pack( + (void *) ((uintptr_t) src_buffer_desc->data_addr + pack_len * + (coll_op->coll_schedule->topo_info->hier_layout_info[0].offset + + coll_op->coll_schedule->topo_info->hier_layout_info[0].level_one_index)), + pack_len, &coll_op->full_message.send_convertor); + } else { + /* change 3 */ + memcpy((void *)((uintptr_t) src_buffer_desc->data_addr + pack_len * + (coll_op->coll_schedule->topo_info->hier_layout_info[0].offset + + 
coll_op->coll_schedule->topo_info->hier_layout_info[0].level_one_index)), + sbuf, pack_len); + + coll_op->full_message.n_bytes_scheduled = pack_len; + } + + if (!rcontig) { + mca_coll_ml_convertor_prepare(rdtype, rcount * comm_size, rbuf, + &coll_op->full_message.recv_convertor, MCA_COLL_ML_NET_STREAM_RECV); + } + + if (coll_op->coll_schedule->topo_info->ranks_contiguous) { + coll_op->process_fn = mca_coll_ml_allgather_small_unpack_data; + } else { + coll_op->process_fn = mca_coll_ml_allgather_noncontiguous_unpack_data; + } + + /* whole ml-buffer is used to send AND receive */ + coll_op->variable_fn_params.sbuf = (void *) src_buffer_desc->data_addr; + coll_op->variable_fn_params.rbuf = (void *) src_buffer_desc->data_addr; + + /* we can set the initial offset here */ + coll_op->variable_fn_params.sbuf_offset = 0; + coll_op->variable_fn_params.rbuf_offset = 0; + + coll_op->variable_fn_params.count = scount; + coll_op->fragment_data.fragment_size = + coll_op->full_message.n_bytes_scheduled; + + /* For small CINCO, we may use the native datatype */ + coll_op->variable_fn_params.dtype = sdtype; + coll_op->variable_fn_params.buffer_size = pack_len; + coll_op->variable_fn_params.root = 0; + } else if (cm->enable_fragmentation || pack_len * comm_size < (1 << 20)) { + /* calculate the number of fragments and the size of each frag */ + size_t n_dts_per_frag, frag_len; + int pipeline_depth = mca_coll_ml_component.pipeline_depth; + + /* Calculate the number of fragments required for this message careful watch the integer division !*/ + frag_len = (pack_len <= (size_t) ml_module->small_message_thresholds[BCOL_ALLGATHER] ? + pack_len : (size_t) ml_module->small_message_thresholds[BCOL_ALLGATHER]); + + n_dts_per_frag = frag_len / sdt_size; + n_fragments = (pack_len + sdt_size * n_dts_per_frag - 1) / (sdt_size * n_dts_per_frag); + pipeline_depth = (n_fragments < pipeline_depth ? 
n_fragments : pipeline_depth); + + src_buffer_desc = mca_coll_ml_alloc_buffer(ml_module); + while (NULL == src_buffer_desc) { + opal_progress(); + src_buffer_desc = mca_coll_ml_alloc_buffer(ml_module); + } + + /* change 4 */ + coll_op = mca_coll_ml_alloc_op_prog_single_frag_dag(ml_module, + ml_module->coll_ml_allgather_functions[ML_SMALL_DATA_ALLGATHER], + sbuf, rbuf, pack_len, + 0 /* offset for first pack */); + + MCA_COLL_IBOFFLOAD_SET_ML_BUFFER_INFO(coll_op, + src_buffer_desc->buffer_index, src_buffer_desc); + topo_info = coll_op->coll_schedule->topo_info; + + /* task setup callback function */ + coll_op->sequential_routine.seq_task_setup = mca_coll_ml_allgather_task_setup; + + if (!scontig) { + coll_op->full_message.send_converter_bytes_packed = + mca_coll_ml_convertor_prepare( + sdtype, scount, NULL, + &coll_op->full_message.dummy_convertor, + MCA_COLL_ML_NET_STREAM_SEND); + + coll_op->full_message.dummy_conv_position = 0; + mca_coll_ml_convertor_get_send_frag_size( + ml_module, &frag_len, + &coll_op->full_message); + + /* change 5 */ + mca_coll_ml_convertor_prepare(sdtype, scount, sbuf, + &coll_op->full_message.send_convertor, MCA_COLL_ML_NET_STREAM_SEND); + + mca_coll_ml_convertor_pack( + (void *) ((uintptr_t) src_buffer_desc->data_addr + frag_len * + (coll_op->coll_schedule->topo_info->hier_layout_info[0].offset + + coll_op->coll_schedule->topo_info->hier_layout_info[0].level_one_index)), + frag_len, &coll_op->full_message.send_convertor); + } else { + /* change 6 */ + memcpy((void *)((uintptr_t)src_buffer_desc->data_addr + frag_len * + (coll_op->coll_schedule->topo_info->hier_layout_info[0].offset + + coll_op->coll_schedule->topo_info->hier_layout_info[0].level_one_index)), + sbuf, frag_len); + } + + if (!rcontig) { + mca_coll_ml_convertor_prepare(rdtype, rcount * comm_size, rbuf, + &coll_op->full_message.recv_convertor, MCA_COLL_ML_NET_STREAM_RECV); + } + + coll_op->process_fn = mca_coll_ml_allgather_noncontiguous_unpack_data; + + /* hopefully this doesn't royaly screw things up idea behind this is the + * whole ml-buffer is used to send and receive + */ + coll_op->variable_fn_params.sbuf = (void *) src_buffer_desc->data_addr; + coll_op->variable_fn_params.rbuf = (void *) src_buffer_desc->data_addr; + + /* we can set the initial offset here */ + coll_op->variable_fn_params.sbuf_offset = 0; + coll_op->variable_fn_params.rbuf_offset = 0; + + coll_op->fragment_data.buffer_desc = src_buffer_desc; + + coll_op->fragment_data.fragment_size = frag_len; + coll_op->fragment_data.message_descriptor->n_active = 1; + + coll_op->full_message.n_bytes_scheduled = frag_len; + coll_op->full_message.fragment_launcher = mca_coll_ml_allgather_frag_progress; + + coll_op->full_message.pipeline_depth = pipeline_depth; + coll_op->fragment_data.current_coll_op = ML_SMALL_DATA_ALLGATHER; + + /* remember this is different for frags !! Caused data corruption when + * not properly set. Need to be sure you have consistent units. + */ + coll_op->variable_fn_params.count = frag_len; + coll_op->variable_fn_params.dtype = MPI_BYTE; /* for fragmented data, we work in + * units of bytes. 
This means that + * all of our arithmetic is done + * in terms of bytes + */ + + coll_op->variable_fn_params.root = 0; + coll_op->variable_fn_params.frag_size = frag_len; + coll_op->variable_fn_params.buffer_size = frag_len; + } else { + /* change 7 */ + ML_VERBOSE(10, ("ML_ALLGATHER_LARGE_DATA_KNOWN case.")); + coll_op = mca_coll_ml_alloc_op_prog_single_frag_dag(ml_module, + ml_module->coll_ml_allgather_functions[ML_LARGE_DATA_ALLGATHER], + sbuf, rbuf, pack_len, 0 /* offset for first pack */); + topo_info = coll_op->coll_schedule->topo_info; + if (MCA_BCOL_BASE_NO_ML_BUFFER_FOR_LARGE_MSG & topo_info->all_bcols_mode) { + MCA_COLL_IBOFFLOAD_SET_ML_BUFFER_INFO(coll_op, MCA_COLL_ML_NO_BUFFER, NULL); + } else { + src_buffer_desc = mca_coll_ml_alloc_buffer(ml_module); + while (NULL == src_buffer_desc) { + opal_progress(); + src_buffer_desc = mca_coll_ml_alloc_buffer(ml_module); + } + + MCA_COLL_IBOFFLOAD_SET_ML_BUFFER_INFO(coll_op, src_buffer_desc->buffer_index, src_buffer_desc); + } + + /* not sure if I really need this here */ + coll_op->sequential_routine.seq_task_setup = mca_coll_ml_allgather_task_setup; + coll_op->process_fn = NULL; + /* probably the most important piece */ + coll_op->variable_fn_params.sbuf = sbuf; + coll_op->variable_fn_params.rbuf = rbuf; + coll_op->variable_fn_params.sbuf_offset = 0; + coll_op->variable_fn_params.rbuf_offset = 0; + coll_op->variable_fn_params.count = scount; + coll_op->variable_fn_params.dtype = sdtype;/* for zero copy, we want the + * native datatype and actual count + */ + coll_op->variable_fn_params.root = 0; + + /* you still need to copy in your own data into the rbuf */ + /* don't need to do this if you have in place data */ + if (!in_place) { + memcpy((char *) rbuf + ompi_comm_rank(comm) * pack_len, sbuf, pack_len); + } + } + + coll_op->full_message.send_count = scount; + coll_op->full_message.recv_count = rcount; + + coll_op->full_message.send_data_continguous = scontig; + coll_op->full_message.recv_data_continguous = rcontig; + + ompi_datatype_get_extent(sdtype, &lb, &extent); + coll_op->full_message.send_extent = (size_t) extent; + + ompi_datatype_get_extent(rdtype, &lb, &extent); + coll_op->full_message.recv_extent = (size_t) extent; + + + /* Fill in the function arguments */ + coll_op->variable_fn_params.sequence_num = + OPAL_THREAD_ADD64(&(ml_module->collective_sequence_num), 1); + coll_op->variable_fn_params.hier_factor = comm_size; + + MCA_COLL_ML_SET_ORDER_INFO(coll_op, n_fragments); + + + ret = mca_coll_ml_launch_sequential_collective (coll_op); + if (OMPI_SUCCESS != ret) { + ML_VERBOSE(10, ("Failed to launch")); + return ret; + } + + *req = &coll_op->full_message.super; + + return OMPI_SUCCESS; +} + +int mca_coll_ml_allgather(void *sbuf, int scount, + struct ompi_datatype_t *sdtype, + void* rbuf, int rcount, + struct ompi_datatype_t *rdtype, + struct ompi_communicator_t *comm, + mca_coll_base_module_t *module) +{ + ompi_request_t *req; + int ret; + + ML_VERBOSE(10, ("Starting blocking allgather")); + + ret = mca_coll_ml_allgather_start (sbuf, scount, sdtype, + rbuf, rcount, rdtype, + comm, module, &req); + if (OPAL_UNLIKELY(OMPI_SUCCESS != ret)) { + return ret; + } + + ret = ompi_request_wait (&req, MPI_STATUS_IGNORE); + + ML_VERBOSE(10, ("Blocking allgather is complete")); + + return ret; +} + +int mca_coll_ml_allgather_nb(void *sbuf, int scount, + struct ompi_datatype_t *sdtype, + void* rbuf, int rcount, + struct ompi_datatype_t *rdtype, + struct ompi_communicator_t *comm, + ompi_request_t **req, + mca_coll_base_module_t *module) +{ 
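    /* Usage sketch (illustrative only): the non-blocking variant hands back an
     * ompi request that is completed by the ML progress engine, so a caller can
     * overlap work before completing it, e.g.:
     *
     *     ompi_request_t *req;
     *     mca_coll_ml_allgather_nb(sbuf, scount, sdtype, rbuf, rcount, rdtype,
     *                              comm, &req, module);
     *     ... overlap computation ...
     *     ompi_request_wait(&req, MPI_STATUS_IGNORE);
     */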
+ int ret; + + ML_VERBOSE(10, ("Starting non-blocking allgather")); + + ret = mca_coll_ml_allgather_start (sbuf, scount, sdtype, + rbuf, rcount, rdtype, + comm, module, req); + if (OPAL_UNLIKELY(OMPI_SUCCESS != ret)) { + return ret; + } + + ML_VERBOSE(10, ("Non-blocking allgather started")); + + return ret; +} diff --git a/ompi/mca/coll/ml/coll_ml_allocation.c b/ompi/mca/coll/ml/coll_ml_allocation.c index 767fbad7cf..119b58b235 100644 --- a/ompi/mca/coll/ml/coll_ml_allocation.c +++ b/ompi/mca/coll/ml/coll_ml_allocation.c @@ -1,3 +1,4 @@ +/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */ /* * Copyright (c) 2009-2012 Oak Ridge National Laboratory. All rights reserved. * Copyright (c) 2009-2012 Mellanox Technologies. All rights reserved. @@ -20,10 +21,8 @@ long memory_buffer_index; -ml_memory_block_desc_t *mca_coll_ml_allocate_block( - struct mca_coll_ml_component_t *ml_component, - ml_memory_block_desc_t *ml_memblock - ) +ml_memory_block_desc_t *mca_coll_ml_allocate_block(struct mca_coll_ml_component_t *ml_component, + ml_memory_block_desc_t *ml_memblock) { ml_memory_block_desc_t *ret = NULL; ml_memory_block_desc_t *memory_block = NULL; @@ -33,7 +32,7 @@ ml_memory_block_desc_t *mca_coll_ml_allocate_block( ML_ERROR(("Memory already allocated - expecting NULL pointer")); return ret; } - memory_block = (ml_memory_block_desc_t*) malloc(sizeof(ml_memory_block_desc_t)); + memory_block = (ml_memory_block_desc_t*) calloc(1, sizeof(ml_memory_block_desc_t)); if (NULL == memory_block){ ML_ERROR(("Couldn't allocate memory for ml_memblock")); @@ -61,38 +60,31 @@ exit_ERROR: return ret; } -void mca_coll_ml_free_block( - ml_memory_block_desc_t *ml_memblock - ) +void mca_coll_ml_free_block (ml_memory_block_desc_t *ml_memblock) { - ml_payload_buffer_desc_t **pbuff_descs = NULL; - if (!ml_memblock) return; - if(pbuff_descs){ - free(pbuff_descs); + if (ml_memblock->buffer_descs){ + free(ml_memblock->buffer_descs); } mca_coll_ml_lmngr_free(ml_memblock->block); - free(ml_memblock->buffer_descs); free(ml_memblock->bank_release_counters); free(ml_memblock->ready_for_memsync); free(ml_memblock->bank_is_busy); free(ml_memblock); } -int mca_coll_ml_initialize_block( - ml_memory_block_desc_t *ml_memblock, - uint32_t num_buffers, - uint32_t num_banks, - uint32_t buffer_size, - int32_t data_offset, - opal_list_t *bcols_in_use - ) +int mca_coll_ml_initialize_block(ml_memory_block_desc_t *ml_memblock, + uint32_t num_buffers, + uint32_t num_banks, + uint32_t buffer_size, + int32_t data_offset, + opal_list_t *bcols_in_use) { int ret = OMPI_SUCCESS; - uint32_t loop, bank_loop, buff_loop; + uint32_t bank_loop, buff_loop; uint64_t addr_offset = 0; ml_payload_buffer_desc_t *pbuff_descs = NULL,*pbuff_desc = NULL; @@ -122,30 +114,31 @@ int mca_coll_ml_initialize_block( addr_offset+=buffer_size; pbuff_desc->buffer_index = BUFFER_INDEX(bank_loop,num_buffers,buff_loop); -#if 0 - ML_ERROR(("Setting buffer_index %lld %d %d", pbuff_desc->buffer_index, - BANK_FROM_BUFFER_IDX(pbuff_desc->buffer_index), - BUFFER_FROM_BUFFER_IDX(pbuff_desc->buffer_index))); -#endif + pbuff_desc->bank_index=bank_loop; pbuff_desc->generation_number=0; } /* Initialize ml memory block */ - ml_memblock->bank_release_counters = (uint32_t *) malloc(sizeof(uint32_t) * - num_banks); + /* gvm FIX:This counter when zero indicates that the bank is ready for + * recycle. This is initialized to number of bcol components as each bcol is responsible for + * releasing the buffers of a bank. 
This initialization will have + * faulty behavior, example in case of multiple interfaces, when more than + * one bcol module of the component type is in use. + */ + ml_memblock->bank_release_counters = (uint32_t *) calloc(num_banks, sizeof(uint32_t)); if (NULL == ml_memblock->bank_release_counters) { ret = OMPI_ERR_OUT_OF_RESOURCE; goto exit_ERROR; } - ml_memblock->ready_for_memsync = (bool *)malloc(sizeof(bool) * num_banks); + ml_memblock->ready_for_memsync = (bool *) calloc(num_banks, sizeof(bool)); if (NULL == ml_memblock->ready_for_memsync) { ret = OMPI_ERR_OUT_OF_RESOURCE; goto exit_ERROR; } - ml_memblock->bank_is_busy = (bool *)malloc(sizeof(bool) * num_banks); + ml_memblock->bank_is_busy = (bool *) calloc(num_banks, sizeof(bool)); if (NULL == ml_memblock->bank_is_busy) { ret = OMPI_ERR_OUT_OF_RESOURCE; goto exit_ERROR; @@ -154,18 +147,6 @@ int mca_coll_ml_initialize_block( /* Set index for first bank to sync */ ml_memblock->memsync_counter = 0; - /* gvm FIX:This counter when zero indicates that the bank is ready for - * recycle. This is initialized to number of bcol components as each bcol is responsible for - * releasing the buffers of a bank. This initialization will have - * faulty behavior, example in case of multiple interfaces, when more than - * one bcol module of the component type is in use. - */ - for(loop = 0; loop < num_banks; loop++) { - ml_memblock->bank_release_counters[loop] = 0; - ml_memblock->ready_for_memsync[loop] = false; - ml_memblock->bank_is_busy[loop] = false; - } - /* use first bank and first buffer */ ml_memblock->next_free_buffer = 0; @@ -186,22 +167,20 @@ exit_ERROR: return ret; } -ml_payload_buffer_desc_t *mca_coll_ml_alloc_buffer( - mca_coll_ml_module_t *module){ - +ml_payload_buffer_desc_t *mca_coll_ml_alloc_buffer (mca_coll_ml_module_t *module) +{ uint64_t bindex; uint32_t bank, buffer, num_buffers; ml_memory_block_desc_t *ml_memblock = module->payload_block; ml_payload_buffer_desc_t *pbuff_descs = NULL, - *ml_membuffer = NULL; + *ml_membuffer = NULL; /* Return a buffer */ num_buffers = ml_memblock->num_buffers_per_bank; pbuff_descs = ml_memblock->buffer_descs; bindex = ml_memblock->next_free_buffer; - buffer = bindex%num_buffers; - bank = bindex/num_buffers; - + buffer = bindex % num_buffers; + bank = bindex/num_buffers; ML_VERBOSE(10, ("ML allocator: allocating buffer index %d, bank index %d", buffer, bank)); @@ -210,13 +189,12 @@ ml_payload_buffer_desc_t *mca_coll_ml_alloc_buffer( if(!ml_memblock->bank_is_busy[bank]) { /* the bank is free, mark it busy */ ml_memblock->bank_is_busy[bank] = true; - ML_VERBOSE(10, ("ML allocator: reset bank %d to value %d", - bank, - ml_memblock->bank_release_counters[bank])); + ML_VERBOSE(10, ("ML allocator: reset bank %d to value %d", bank, + ml_memblock->bank_release_counters[bank])); } else { /* the bank is busy, return NULL and upper layer will handle it */ - ML_VERBOSE(10, ("No free payload buffers are available for use.\ - Next memory bank is still used by one of bcols \n")); + ML_VERBOSE(10, ("No free payload buffers are available for use." + " Next memory bank is still used by one of bcols")); return NULL; } } @@ -227,11 +205,9 @@ ml_payload_buffer_desc_t *mca_coll_ml_alloc_buffer( ML_VERBOSE(10, ("ML allocator: ml buffer index %d", bindex)); /* Compute next free buffer */ - ++buffer; - buffer %= num_buffers; + buffer = (buffer == num_buffers - 1) ? 0 : buffer + 1; if (0 == buffer) { - ++bank; - bank %= ml_memblock->num_banks; + bank = (bank == ml_memblock->num_banks - 1) ? 
0 : bank + 1; } ml_memblock->next_free_buffer = BUFFER_INDEX(bank,num_buffers,buffer); diff --git a/ompi/mca/coll/ml/coll_ml_allreduce.c b/ompi/mca/coll/ml/coll_ml_allreduce.c new file mode 100644 index 0000000000..7a25d908cd --- /dev/null +++ b/ompi/mca/coll/ml/coll_ml_allreduce.c @@ -0,0 +1,553 @@ +/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */ +/* + * Copyright (c) 2009-2012 Oak Ridge National Laboratory. All rights reserved. + * Copyright (c) 2009-2012 Mellanox Technologies. All rights reserved. + * Copyright (c) 2013 Los Alamos National Security, LLC. All rights + * reserved. + * $COPYRIGHT$ + * + * Additional copyrights may follow + * + * $HEADER$ + */ +/** @file */ + +#include "ompi_config.h" + +#include + +#include "ompi/constants.h" +#include "opal/threads/mutex.h" +#include "ompi/communicator/communicator.h" +#include "ompi/mca/coll/coll.h" +#include "ompi/mca/bcol/bcol.h" +#include "opal/sys/atomic.h" +#include "coll_ml.h" +#include "coll_ml_select.h" +#include "coll_ml_allocation.h" + +static int mca_coll_ml_allreduce_small_unpack(mca_coll_ml_collective_operation_progress_t *coll_op) +{ + int ret; + /* need to put in more */ + int count = coll_op->variable_fn_params.count; + ompi_datatype_t *dtype = coll_op->variable_fn_params.dtype; + + void *dest = (void *)((uintptr_t)coll_op->full_message.dest_user_addr + + (uintptr_t)coll_op->fragment_data.offset_into_user_buffer); + void *src = (void *)((uintptr_t)coll_op->fragment_data.buffer_desc->data_addr + + (size_t)coll_op->variable_fn_params.rbuf_offset); + + ret = ompi_datatype_copy_content_same_ddt(dtype, (int32_t) count, (char *) dest, + (char *) src); + if (ret < 0) { + return OMPI_ERROR; + } + + ML_VERBOSE(10, ("sbuf addr %p, sbuf offset %d, rbuf addr %p, rbuf offset %d.", + src, coll_op->variable_fn_params.sbuf_offset, dest, + coll_op->variable_fn_params.rbuf_offset)); + + return OMPI_SUCCESS; +} + +static int mca_coll_ml_allreduce_task_setup(mca_coll_ml_collective_operation_progress_t *coll_op) +{ + int fn_idx, h_level, my_index, root; + mca_sbgp_base_module_t *sbgp; + mca_coll_ml_topology_t *topo = coll_op->coll_schedule->topo_info; + + fn_idx = coll_op->sequential_routine.current_active_bcol_fn; + h_level = coll_op->coll_schedule->component_functions[fn_idx].h_level; + sbgp = topo->component_pairs[h_level].subgroup_module; + my_index = sbgp->my_index; + + /* In the case of allreduce, the local leader is always the root */ + root = 0; + if (my_index == root) { + coll_op->variable_fn_params.root_flag = true; + coll_op->variable_fn_params.root_route = NULL; + } else { + coll_op->variable_fn_params.root_flag = false; + coll_op->variable_fn_params.root_route = &topo->route_vector[root]; + } + + /* NTH: This was copied from the old allreduce launcher. 
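For every hierarchy level above the first (fn_idx > 0), the partial result that the previous bcol left in rbuf becomes the input of this level, which is why sbuf and userbuf are re-pointed at rbuf below.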
*/ + if (0 < fn_idx) { + coll_op->variable_fn_params.sbuf = coll_op->variable_fn_params.rbuf; + coll_op->variable_fn_params.userbuf = coll_op->variable_fn_params.rbuf; + } + + return OMPI_SUCCESS; +} + +static int mca_coll_ml_allreduce_frag_progress(mca_coll_ml_collective_operation_progress_t *coll_op) +{ + /* local variables */ + void *buf; + + size_t dt_size; + int ret, frag_len, count; + + ptrdiff_t lb, extent; + + ml_payload_buffer_desc_t *src_buffer_desc; + mca_coll_ml_collective_operation_progress_t *new_op; + + mca_coll_ml_module_t *ml_module = OP_ML_MODULE(coll_op); + + ret = ompi_datatype_get_extent(coll_op->variable_fn_params.dtype, &lb, &extent); + if (ret < 0) { + return OMPI_ERROR; + } + + dt_size = (size_t) extent; + + /* Keep the pipeline filled with fragments */ + while (coll_op->fragment_data.message_descriptor->n_active < + coll_op->fragment_data.message_descriptor->pipeline_depth) { + /* If an active fragment happens to have completed the collective during + * a hop into the progress engine, then don't launch a new fragment, + * instead break and return. + */ + if (coll_op->fragment_data.message_descriptor->n_bytes_scheduled + == coll_op->fragment_data.message_descriptor->n_bytes_total) { + break; + } + + /* Get an ml buffer */ + src_buffer_desc = mca_coll_ml_alloc_buffer(OP_ML_MODULE(coll_op)); + if (NULL == src_buffer_desc) { + /* If there exist outstanding fragments, then break out + * and let an active fragment deal with this later, + * there are no buffers available. + */ + if (0 < coll_op->fragment_data.message_descriptor->n_active) { + return OMPI_SUCCESS; + } + + /* It is useless to call progress from here, since + * ml progress can't be executed as result ml memsync + * call will not be completed and no memory will be + * recycled. 
So we put the element on the list, and we will + * progress it later when memsync will recycle some memory*/ + + /* The fragment is already on list and + * the we still have no ml resources + * Return busy */ + if (!(coll_op->pending & REQ_OUT_OF_MEMORY)) { + coll_op->pending |= REQ_OUT_OF_MEMORY; + opal_list_append(&((OP_ML_MODULE(coll_op))->waiting_for_memory_list), + (opal_list_item_t *)coll_op); + ML_VERBOSE(10,("Out of resources %p adding to pending queue", coll_op)); + } else { + ML_VERBOSE(10,("Out of resources %p", coll_op)); + } + + return OMPI_ERR_TEMP_OUT_OF_RESOURCE; + } + + /* Get a new collective descriptor and initialize it */ + new_op = mca_coll_ml_alloc_op_prog_single_frag_dag(ml_module, + ml_module->coll_ml_allreduce_functions[coll_op->fragment_data.current_coll_op], + coll_op->fragment_data.message_descriptor->src_user_addr, + coll_op->fragment_data.message_descriptor->dest_user_addr, + coll_op->fragment_data.message_descriptor->n_bytes_total, + coll_op->fragment_data.message_descriptor->n_bytes_scheduled); + + MCA_COLL_IBOFFLOAD_SET_ML_BUFFER_INFO(new_op, + src_buffer_desc->buffer_index, src_buffer_desc); + + new_op->fragment_data.current_coll_op = coll_op->fragment_data.current_coll_op; + new_op->fragment_data.message_descriptor = coll_op->fragment_data.message_descriptor; + + /* set the task setup callback */ + new_op->sequential_routine.seq_task_setup = mca_coll_ml_allreduce_task_setup; + /* We need this address for pointer arithmetic in memcpy */ + buf = coll_op->fragment_data.message_descriptor->src_user_addr; + /* calculate the number of data types in this packet */ + count = (coll_op->fragment_data.message_descriptor->n_bytes_total - + coll_op->fragment_data.message_descriptor->n_bytes_scheduled < + (size_t) OP_ML_MODULE(coll_op)->small_message_thresholds[BCOL_ALLREDUCE] ? 
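/* last, partial fragment: convert the remaining bytes into an element count */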
+ (coll_op->fragment_data.message_descriptor->n_bytes_total - + coll_op->fragment_data.message_descriptor->n_bytes_scheduled) / dt_size : + (size_t) coll_op->variable_fn_params.count); + + /* calculate the fragment length */ + frag_len = count*dt_size; + + ret = ompi_datatype_copy_content_same_ddt(coll_op->variable_fn_params.dtype, count, + (char *) src_buffer_desc->data_addr, (char *) ((uintptr_t) buf + (uintptr_t) + coll_op->fragment_data.message_descriptor->n_bytes_scheduled)); + if (ret < 0) { + return OMPI_ERROR; + } + + /* No unpack for root */ + new_op->process_fn = mca_coll_ml_allreduce_small_unpack; + + /* Setup fragment specific data */ + new_op->fragment_data.message_descriptor->n_bytes_scheduled += frag_len; + new_op->fragment_data.buffer_desc = src_buffer_desc; + new_op->fragment_data.fragment_size = frag_len; + (new_op->fragment_data.message_descriptor->n_active)++; + + ML_SET_VARIABLE_PARAMS_BCAST( + new_op, + OP_ML_MODULE(new_op), + count, + MPI_BYTE, + src_buffer_desc, + 0, + 0, + frag_len, + src_buffer_desc->data_addr); + /* Fill in bcast specific arguments */ + /* TBD: remove buffer_size */ + new_op->variable_fn_params.buffer_size = frag_len; + new_op->variable_fn_params.count = count; + new_op->variable_fn_params.hier_factor = coll_op->variable_fn_params.hier_factor; + new_op->variable_fn_params.op = coll_op->variable_fn_params.op; + new_op->variable_fn_params.dtype = coll_op->variable_fn_params.dtype; + new_op->variable_fn_params.root = 0; + new_op->variable_fn_params.sbuf = src_buffer_desc->data_addr; + new_op->variable_fn_params.rbuf = src_buffer_desc->data_addr; + new_op->sequential_routine.current_bcol_status = SEQ_TASK_PENDING; + + MCA_COLL_ML_SET_NEW_FRAG_ORDER_INFO(new_op); + + ML_VERBOSE(10,("FFFF Contig + fragmentation [0-sk, 1-lk, 3-su, 4-lu] %d %d %d\n", + new_op->variable_fn_params.buffer_size, + new_op->fragment_data.fragment_size, + new_op->fragment_data.message_descriptor->n_bytes_scheduled)); + /* initialize first coll */ + ret = new_op->sequential_routine.seq_task_setup(new_op); + if (OMPI_SUCCESS != ret) { + ML_VERBOSE(3,("Fragment failed to initialize itself")); + return ret; + } + + /* append this collective !! 
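to the component-wide sequential_collectives list; the ML progress engine drains that list, and the append below is guarded by sequential_collectives_mutex.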
*/ + OPAL_THREAD_LOCK(&(mca_coll_ml_component.sequential_collectives_mutex)); + opal_list_append(&mca_coll_ml_component.sequential_collectives, + (opal_list_item_t *)new_op); + OPAL_THREAD_UNLOCK(&(mca_coll_ml_component.sequential_collectives_mutex)); + + } + + return OMPI_SUCCESS; +} + +static inline __opal_attribute_always_inline__ +int parallel_allreduce_start(void *sbuf, void *rbuf, int count, + struct ompi_datatype_t *dtype, struct ompi_op_t *op, + struct ompi_communicator_t *comm, + mca_coll_ml_module_t *ml_module, + ompi_request_t **req, + int small_data_allreduce, + int large_data_allreduce) +{ + int ret, n_fragments = 1, frag_len, + pipeline_depth, n_dts_per_frag ; + + ptrdiff_t lb, extent; + size_t pack_len, dt_size; + + ml_payload_buffer_desc_t *src_buffer_desc; + mca_coll_ml_collective_operation_progress_t *coll_op; + + mca_coll_ml_component_t *cm = &mca_coll_ml_component; + + bool contiguous = ompi_datatype_is_contiguous_memory_layout(dtype, count); + + if (MPI_IN_PLACE == sbuf) { + sbuf = rbuf; + } + + ret = ompi_datatype_get_extent(dtype, &lb, &extent); + if (ret < 0) { + return OMPI_ERROR; + } + + dt_size = (size_t) extent; + pack_len = count * dt_size; + + ML_VERBOSE(1,("The allreduce requested %d enable fragmentation %d ", + pack_len, + cm->enable_fragmentation)); + if (pack_len <= (size_t) ml_module->small_message_thresholds[BCOL_ALLREDUCE]) { + /* The len of the message can not be larger than ML buffer size */ + assert(pack_len <= ml_module->payload_block->size_buffer); + + ML_VERBOSE(1,("Using small data allreduce (threshold = %d)", + ml_module->small_message_thresholds[BCOL_ALLREDUCE])); + + src_buffer_desc = mca_coll_ml_alloc_buffer(ml_module); + while (OPAL_UNLIKELY(NULL == src_buffer_desc)) { + opal_progress(); + src_buffer_desc = mca_coll_ml_alloc_buffer(ml_module); + } + + coll_op = mca_coll_ml_alloc_op_prog_single_frag_dag(ml_module, + ml_module->coll_ml_allreduce_functions[small_data_allreduce], + sbuf, rbuf, pack_len, 0); + + coll_op->variable_fn_params.rbuf = src_buffer_desc->data_addr; + coll_op->variable_fn_params.sbuf = src_buffer_desc->data_addr; + coll_op->variable_fn_params.count = count; + + ret = ompi_datatype_copy_content_same_ddt(dtype, count, + (void *) (uintptr_t) src_buffer_desc->data_addr, (char *) sbuf); + if (ret < 0){ + return OMPI_ERROR; + } + + /* unpack function */ + coll_op->process_fn = mca_coll_ml_allreduce_small_unpack; + } else if (cm->enable_fragmentation || !contiguous) { + ML_VERBOSE(1,("Using Fragmented Allreduce")); + + /* fragment the data */ + /* check for retarded application programming decisions */ + if (dt_size > (size_t) ml_module->small_message_thresholds[BCOL_ALLREDUCE]) { + ML_ERROR(("Sorry, but we don't support datatypes that large")); + return OMPI_ERROR; + } + + /* calculate the number of data types that can fit per ml-buffer */ + n_dts_per_frag = ml_module->small_message_thresholds[BCOL_ALLREDUCE] / dt_size; + + /* calculate the number of fragments */ + n_fragments = (count + n_dts_per_frag - 1) / n_dts_per_frag; /* round up */ + + /* calculate the actual pipeline depth */ + pipeline_depth = n_fragments < cm->pipeline_depth ? 
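/* cap the pipeline depth at the number of fragments that actually exist */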
n_fragments : cm->pipeline_depth; + + /* calculate the fragment size */ + frag_len = n_dts_per_frag * dt_size; + + /* allocate an ml buffer */ + src_buffer_desc = mca_coll_ml_alloc_buffer(ml_module); + while (NULL == src_buffer_desc) { + opal_progress(); + src_buffer_desc = mca_coll_ml_alloc_buffer(ml_module); + } + + coll_op = mca_coll_ml_alloc_op_prog_single_frag_dag(ml_module, + ml_module->coll_ml_allreduce_functions[small_data_allreduce], + sbuf, rbuf, pack_len, 0 /* offset for first pack */); + + /* task setup callback function */ + coll_op->sequential_routine.seq_task_setup = mca_coll_ml_allreduce_task_setup; + + coll_op->process_fn = mca_coll_ml_allreduce_small_unpack; + + coll_op->variable_fn_params.sbuf = (void *) src_buffer_desc->data_addr; + coll_op->variable_fn_params.rbuf = (void *) src_buffer_desc->data_addr; + + coll_op->fragment_data.message_descriptor->n_active = 1; + coll_op->full_message.n_bytes_scheduled = frag_len; + coll_op->full_message.fragment_launcher = mca_coll_ml_allreduce_frag_progress; + coll_op->full_message.pipeline_depth = pipeline_depth; + coll_op->fragment_data.current_coll_op = small_data_allreduce; + coll_op->fragment_data.fragment_size = frag_len; + + coll_op->variable_fn_params.count = n_dts_per_frag; /* seems fishy */ + coll_op->variable_fn_params.buffer_size = frag_len; + + /* copy into the ml-buffer */ + ret = ompi_datatype_copy_content_same_ddt(dtype, n_dts_per_frag, + (char *) src_buffer_desc->data_addr, (char *) sbuf); + if (ret < 0) { + return OMPI_ERROR; + } + } else { + ML_VERBOSE(1,("Using zero-copy ptp allreduce")); + coll_op = mca_coll_ml_alloc_op_prog_single_frag_dag(ml_module, + ml_module->coll_ml_allreduce_functions[large_data_allreduce], + sbuf, rbuf, pack_len, 0); + + coll_op->variable_fn_params.userbuf = + coll_op->variable_fn_params.sbuf = sbuf; + + coll_op->variable_fn_params.rbuf = rbuf; + + /* The ML buffer is used for testing. 
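In this zero-copy branch the payload stays in the user buffers; an ML buffer is presumably still allocated so that the descriptor handed to the bcol level carries a valid buffer index.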
Later, when we + * switch to use knem/mmap/portals this should be replaced + * appropriately + */ + src_buffer_desc = mca_coll_ml_alloc_buffer(ml_module); + while (NULL == src_buffer_desc) { + opal_progress(); + src_buffer_desc = mca_coll_ml_alloc_buffer(ml_module); + } + + coll_op->variable_fn_params.count = count; + } + + MCA_COLL_IBOFFLOAD_SET_ML_BUFFER_INFO(coll_op, src_buffer_desc->buffer_index, + src_buffer_desc); + + /* set the offset */ + coll_op->variable_fn_params.sbuf_offset = 0; + coll_op->variable_fn_params.rbuf_offset = 0; + + /* Fill in the function arguments */ + coll_op->variable_fn_params.sequence_num = + OPAL_THREAD_ADD64(&(ml_module->collective_sequence_num), 1); + coll_op->sequential_routine.current_active_bcol_fn = 0; + coll_op->variable_fn_params.dtype = dtype; + coll_op->variable_fn_params.op = op; + coll_op->variable_fn_params.root = 0; + coll_op->sequential_routine.seq_task_setup = mca_coll_ml_allreduce_task_setup; /* invoked after each level in sequential + * progress call + */ + MCA_COLL_ML_SET_ORDER_INFO(coll_op, n_fragments); + + ret = mca_coll_ml_launch_sequential_collective (coll_op); + if (ret != OMPI_SUCCESS) { + ML_VERBOSE(10, ("Failed to launch")); + return ret; + } + + *req = &coll_op->full_message.super; + + return OMPI_SUCCESS; +} + +int mca_coll_ml_allreduce(void *sbuf, void *rbuf, int count, + struct ompi_datatype_t *dtype, struct ompi_op_t *op, + struct ompi_communicator_t *comm, + mca_coll_base_module_t *module) +{ + mca_coll_ml_module_t *ml_module = (mca_coll_ml_module_t*)module; + ompi_request_t *req; + int ret; + + if (OPAL_UNLIKELY(!ompi_op_is_commute(op))) { + fprintf (stderr, "Falling back for allreduce\n"); + /* coll/ml does not handle non-commutative operations at this time. fallback + * on another collective module */ + return ml_module->fallback.coll_allreduce (sbuf, rbuf, count, dtype, op, comm, + ml_module->fallback.coll_allreduce_module); + } + + ret = parallel_allreduce_start(sbuf, rbuf, count, dtype, op, comm, + (mca_coll_ml_module_t *) module, &req, + ML_SMALL_DATA_ALLREDUCE, + ML_LARGE_DATA_ALLREDUCE); + if (OPAL_UNLIKELY(OMPI_SUCCESS != ret)) { + ML_ERROR(("Failed to launch")); + return ret; + } + + ompi_request_wait_completion(req); + ompi_request_free(&req); + + ML_VERBOSE(10, ("Blocking NB allreduce is done")); + + return OMPI_SUCCESS; +} + +int mca_coll_ml_allreduce_nb(void *sbuf, void *rbuf, int count, + struct ompi_datatype_t *dtype, struct ompi_op_t *op, + struct ompi_communicator_t *comm, + ompi_request_t **req, + mca_coll_base_module_t *module) +{ + mca_coll_ml_module_t *ml_module = (mca_coll_ml_module_t*)module; + int ret; + + if (OPAL_UNLIKELY(!ompi_op_is_commute(op))) { + fprintf (stderr, "Falling back for iallreduce\n"); + /* coll/ml does not handle non-commutative operations at this time.
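This is most likely because the hierarchical schedule combines partial results in subgroup order rather than strict rank order, which is only guaranteed to be correct for commutative reductions, so we simply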
fallback + * on another collective module */ + return ml_module->fallback.coll_iallreduce (sbuf, rbuf, count, dtype, op, comm, req, + ml_module->fallback.coll_iallreduce_module); + } + + ret = parallel_allreduce_start(sbuf, rbuf, count, dtype, op, comm, + (mca_coll_ml_module_t *) module, req, + ML_SMALL_DATA_ALLREDUCE, + ML_LARGE_DATA_ALLREDUCE); + if (OPAL_UNLIKELY(OMPI_SUCCESS != ret)) { + ML_ERROR(("Failed to launch")); + return ret; + } + + ML_VERBOSE(10, ("Blocking NB allreduce is done")); + + return OMPI_SUCCESS; +} + +int mca_coll_ml_allreduce_dispatch(void *sbuf, void *rbuf, int count, + struct ompi_datatype_t *dtype, struct ompi_op_t *op, + struct ompi_communicator_t *comm, mca_coll_base_module_t *module) +{ + int rc; + bool use_extra_topo; + ompi_request_t *req; + + mca_coll_ml_module_t *ml_module = (mca_coll_ml_module_t *) module; + + use_extra_topo = (count > 1) ? + !ml_module->allreduce_matrix[op->op_type][dtype->id][BCOL_MULTI_ELEM_TYPE] : + !ml_module->allreduce_matrix[op->op_type][dtype->id][BCOL_SINGLE_ELEM_TYPE]; + + if (use_extra_topo) { + rc = parallel_allreduce_start(sbuf, rbuf, count, dtype, + op, comm, ml_module, &req, + ML_SMALL_DATA_EXTRA_TOPO_ALLREDUCE, + ML_LARGE_DATA_EXTRA_TOPO_ALLREDUCE); + } else { + rc = parallel_allreduce_start(sbuf, rbuf, count, dtype, + op, comm, ml_module, &req, + ML_SMALL_DATA_ALLREDUCE, + ML_LARGE_DATA_ALLREDUCE); + } + + if (OPAL_UNLIKELY(OMPI_SUCCESS != rc)) { + ML_ERROR(("Failed to launch")); + return rc; + } + + ompi_request_wait_completion(req); + ompi_request_free(&req); + + return OMPI_SUCCESS; +} + +int mca_coll_ml_allreduce_dispatch_nb(void *sbuf, void *rbuf, int count, + ompi_datatype_t *dtype, ompi_op_t *op, + ompi_communicator_t *comm, + ompi_request_t **req, + mca_coll_base_module_t *module) +{ + int rc; + bool use_extra_topo; + + mca_coll_ml_module_t *ml_module = (mca_coll_ml_module_t *) module; + + use_extra_topo = (count > 1) ? + !ml_module->allreduce_matrix[op->op_type][dtype->id][BCOL_MULTI_ELEM_TYPE] : + !ml_module->allreduce_matrix[op->op_type][dtype->id][BCOL_SINGLE_ELEM_TYPE]; + + if (use_extra_topo) { + rc = parallel_allreduce_start(sbuf, rbuf, count, dtype, + op, comm, ml_module, req, + ML_SMALL_DATA_EXTRA_TOPO_ALLREDUCE, + ML_LARGE_DATA_EXTRA_TOPO_ALLREDUCE); + } else { + rc = parallel_allreduce_start(sbuf, rbuf, count, dtype, + op, comm, ml_module, req, + ML_SMALL_DATA_ALLREDUCE, + ML_LARGE_DATA_ALLREDUCE); + } + + if (OPAL_UNLIKELY(OMPI_SUCCESS != rc)) { + ML_ERROR(("Failed to launch")); + return rc; + } + + return OMPI_SUCCESS; +} diff --git a/ompi/mca/coll/ml/coll_ml_bcast.c b/ompi/mca/coll/ml/coll_ml_bcast.c index 1d4182ee60..871c201d42 100644 --- a/ompi/mca/coll/ml/coll_ml_bcast.c +++ b/ompi/mca/coll/ml/coll_ml_bcast.c @@ -1,6 +1,9 @@ +/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */ /* * Copyright (c) 2009-2012 Oak Ridge National Laboratory. All rights reserved. * Copyright (c) 2009-2012 Mellanox Technologies. All rights reserved. + * Copyright (c) 2013 Los Alamos National Security, LLC. All rights + * reserved. * $COPYRIGHT$ * * Additional copyrights may follow @@ -193,35 +196,33 @@ static int mca_coll_ml_bcast_frag_converter_progress(mca_coll_ml_collective_oper /* Get an ml buffer */ src_buffer_desc = mca_coll_ml_alloc_buffer(ml_module); - if (NULL == src_buffer_desc) { + if (OPAL_UNLIKELY(NULL == src_buffer_desc)) { /* If there exist outstanding fragments, then break out * and let an active fragment deal with this later, * there are no buffers available. 
*/ if (0 < coll_op->fragment_data.message_descriptor->n_active) { return OMPI_SUCCESS; - } else { - /* It is useless to call progress from here, since - * ml progress can't be executed as result ml memsync - * call will not be completed and no memory will be - * recycled. So we put the element on the list, and we will - * progress it later when memsync will recycle some memory*/ - if (NULL == src_buffer_desc) { - /* The fragment is already on list and - * the we still have no ml resources - * Return busy */ - if (coll_op->pending & REQ_OUT_OF_MEMORY) { - return OMPI_ERR_TEMP_OUT_OF_RESOURCE; - } - - coll_op->pending |= REQ_OUT_OF_MEMORY; - opal_list_append(&ml_module->waiting_for_memory_list, - (opal_list_item_t *)coll_op); - - return OMPI_ERR_TEMP_OUT_OF_RESOURCE; - } } + + /* It is useless to call progress from here, since + * ml progress can't be executed as result ml memsync + * call will not be completed and no memory will be + * recycled. So we put the element on the list, and we will + * progress it later when memsync will recycle some memory*/ + + /* The fragment is already on list and + * the we still have no ml resources + * Return busy */ + if (!(coll_op->pending & REQ_OUT_OF_MEMORY)) { + coll_op->pending |= REQ_OUT_OF_MEMORY; + opal_list_append(&ml_module->waiting_for_memory_list, + (opal_list_item_t *)coll_op); + } + + return OMPI_ERR_TEMP_OUT_OF_RESOURCE; } + /* Get a new collective descriptor and initialize it */ new_op = mca_coll_ml_duplicate_op_prog_single_frag_dag (ml_module, coll_op); @@ -237,6 +238,7 @@ static int mca_coll_ml_bcast_frag_converter_progress(mca_coll_ml_collective_oper iov.iov_len = ml_module->ml_fragment_size; assert(0 != iov.iov_len); + max_data = ml_module->small_message_thresholds[BCOL_BCAST]; opal_convertor_pack(&new_op->fragment_data.message_descriptor->send_convertor, &iov, &iov_count, &max_data); @@ -256,7 +258,8 @@ static int mca_coll_ml_bcast_frag_converter_progress(mca_coll_ml_collective_oper coll_ml_bcast_functions[new_op->fragment_data.current_coll_op]-> task_setup_fn[COLL_ML_GENERAL_TASK_FN]; - mca_coll_ml_convertor_get_send_frag_size( + max_data = ml_module->small_message_thresholds[BCOL_BCAST]; + mca_coll_ml_convertor_get_send_frag_size( ml_module, &max_data, new_op->fragment_data.message_descriptor); } @@ -339,28 +342,27 @@ static int mca_coll_ml_bcast_frag_progress(mca_coll_ml_collective_operation_prog */ if (0 < coll_op->fragment_data.message_descriptor->n_active) { return OMPI_SUCCESS; - } else { - /* It is useless to call progress from here, since - * ml progress can't be executed as result ml memsync - * call will not be completed and no memory will be - * recycled. So we put the element on the list, and we will - * progress it later when memsync will recycle some memory*/ + } - /* The fragment is already on list and - * the we still have no ml resources - * Return busy */ - if (coll_op->pending & REQ_OUT_OF_MEMORY) { - ML_VERBOSE(10,("Out of resources %p", coll_op)); - return OMPI_ERR_TEMP_OUT_OF_RESOURCE; - } + /* It is useless to call progress from here, since + * ml progress can't be executed as result ml memsync + * call will not be completed and no memory will be + * recycled. 
So we put the element on the list, and we will + * progress it later when memsync will recycle some memory*/ + /* The fragment is already on list and + * the we still have no ml resources + * Return busy */ + if (!(coll_op->pending & REQ_OUT_OF_MEMORY)) { + ML_VERBOSE(10,("Out of resources %p adding to pending queue", coll_op)); coll_op->pending |= REQ_OUT_OF_MEMORY; opal_list_append(&((OP_ML_MODULE(coll_op))->waiting_for_memory_list), (opal_list_item_t *) coll_op); - - ML_VERBOSE(10,("Out of resources %p adding to pending queue", coll_op)); - return OMPI_ERR_TEMP_OUT_OF_RESOURCE; + } else { + ML_VERBOSE(10,("Out of resources %p", coll_op)); } + + return OMPI_ERR_TEMP_OUT_OF_RESOURCE; } /* Get a new collective descriptor and initialize it */ @@ -448,8 +450,12 @@ static inline __opal_attribute_always_inline__ mca_coll_ml_module_t *ml_module = (mca_coll_ml_module_t *) module; ml_payload_buffer_desc_t *src_buffer_desc = NULL; mca_coll_ml_task_setup_fn_t task_setup; + OPAL_PTRDIFF_TYPE lb, extent; - ML_VERBOSE(10, ("Starting bcast, mca_coll_ml_bcast_uknown_root")); + /* actual starting place of the user buffer (lb added) */ + void *actual_buf; + + ML_VERBOSE(10, ("Starting bcast, mca_coll_ml_bcast_uknown_root buf: %p", buf)); ompi_datatype_type_size(dtype, &dt_size); pack_len = count * dt_size; @@ -459,6 +465,10 @@ static inline __opal_attribute_always_inline__ /* Get information about memory layout */ contig = opal_datatype_is_contiguous_memory_layout((opal_datatype_t *)dtype, count); + ompi_datatype_get_extent (dtype, &lb, &extent); + + actual_buf = (void *) ((uintptr_t) buf + lb); + /* Allocate collective schedule and pack message */ if (contig) { if (pack_len <= (size_t) ml_module->small_message_thresholds[BCOL_BCAST]) { @@ -466,9 +476,8 @@ static inline __opal_attribute_always_inline__ bcast_index = ml_module->bcast_fn_index_table[SMALL_BCAST]; ML_VERBOSE(10, ("Contig + small message %d [0-sk, 1-lk, 3-su, 4-lu]\n", bcast_index)); - ALLOCATE_AND_PACK_CONTIG_BCAST_FRAG(ml_module, coll_op, - bcast_index, root, - pack_len, pack_len, buf, src_buffer_desc); + ALLOCATE_AND_PACK_CONTIG_BCAST_FRAG(ml_module, coll_op, bcast_index, root, pack_len, + pack_len, actual_buf, src_buffer_desc); ML_SET_VARIABLE_PARAMS_BCAST(coll_op, ml_module, count, dtype, src_buffer_desc, 0, 0, ml_module->payload_block->size_buffer, @@ -490,9 +499,8 @@ static inline __opal_attribute_always_inline__ n_fragments = (pack_len + dt_size*n_dts_per_frag - 1)/(dt_size*n_dts_per_frag); pipeline_depth = (n_fragments < pipeline_depth ? 
n_fragments : pipeline_depth); - ALLOCATE_AND_PACK_CONTIG_BCAST_FRAG(ml_module, coll_op, - bcast_index, root, - pack_len, frag_len, buf, src_buffer_desc); + ALLOCATE_AND_PACK_CONTIG_BCAST_FRAG(ml_module, coll_op, bcast_index, root, pack_len, + frag_len, actual_buf, src_buffer_desc); ML_SET_VARIABLE_PARAMS_BCAST(coll_op, ml_module, (frag_len/dt_size), dtype, src_buffer_desc, 0, 0, frag_len, (src_buffer_desc->data_addr)); @@ -500,7 +508,6 @@ static inline __opal_attribute_always_inline__ coll_op->full_message.pipeline_depth = pipeline_depth; /* Initialize fragment specific information */ coll_op->fragment_data.current_coll_op = bcast_index; - coll_op->fragment_data.buffer_desc = src_buffer_desc; /* coll_op->fragment_data.message_descriptor->n_bytes_scheduled += frag_len; */ coll_op->fragment_data.fragment_size = frag_len; coll_op->fragment_data.message_descriptor->n_active++; @@ -515,10 +522,9 @@ static inline __opal_attribute_always_inline__ ML_VERBOSE(10, ("Contig + zero copy %d [0-sk, 1-lk, 3-su, 4-lu]\n", bcast_index)); coll_op = mca_coll_ml_alloc_op_prog_single_frag_dag(ml_module, - ml_module->coll_ml_bcast_functions[bcast_index], - buf, buf, - pack_len, - 0 /* offset for first pack */); + ml_module->coll_ml_bcast_functions[bcast_index], + actual_buf, actual_buf, pack_len, + 0 /* offset for first pack */); /* For large messages (bcast) this points to userbuf */ /* Pasha: temporary work around for basesmuma, userbuf should be removed */ @@ -536,10 +542,9 @@ static inline __opal_attribute_always_inline__ ML_VERBOSE(10, ("NON Contig + fragmentation %d [0-sk, 1-lk, 3-su, 4-lu]\n", bcast_index)); coll_op = mca_coll_ml_alloc_op_prog_single_frag_dag(ml_module, - ml_module->coll_ml_bcast_functions[bcast_index], - buf, buf, - pack_len, - 0 /* offset for first pack */); + ml_module->coll_ml_bcast_functions[bcast_index], + actual_buf, actual_buf, pack_len, + 0 /* offset for first pack */); if (OPAL_LIKELY(pack_len > 0)) { size_t max_data = 0; @@ -560,10 +565,9 @@ static inline __opal_attribute_always_inline__ iov.iov_base = (IOVBASE_TYPE*) src_buffer_desc->data_addr; iov.iov_len = ml_module->ml_fragment_size; - + max_data = ml_module->small_message_thresholds[BCOL_BCAST]; opal_convertor_pack(&coll_op->full_message.send_convertor, &iov, &iov_count, &max_data); - coll_op->process_fn = NULL; coll_op->full_message.n_bytes_scheduled = max_data; @@ -571,6 +575,7 @@ static inline __opal_attribute_always_inline__ coll_op->full_message.fragment_launcher = mca_coll_ml_bcast_frag_converter_progress; coll_op->full_message.pipeline_depth = mca_coll_ml_component.pipeline_depth; coll_op->full_message.root = true; + } else { opal_convertor_copy_and_prepare_for_send( ompi_mpi_local_convertor, @@ -597,6 +602,7 @@ static inline __opal_attribute_always_inline__ coll_op->full_message.fragment_launcher = mca_coll_ml_bcast_frag_converter_progress; coll_op->full_message.pipeline_depth = mca_coll_ml_component.pipeline_depth; + max_data = ml_module->small_message_thresholds[BCOL_BCAST]; coll_op->full_message.dummy_conv_position = 0; mca_coll_ml_convertor_get_send_frag_size( ml_module, &max_data, @@ -605,7 +611,6 @@ static inline __opal_attribute_always_inline__ coll_op->full_message.n_bytes_scheduled = max_data; } } - coll_op->fragment_data.current_coll_op = bcast_index; coll_op->fragment_data.message_descriptor->n_active++; coll_op->fragment_data.fragment_size = coll_op->full_message.n_bytes_scheduled; @@ -675,9 +680,9 @@ int mca_coll_ml_parallel_bcast(void *buf, int count, struct ompi_datatype_t *dty } int 
mca_coll_ml_parallel_bcast_nb(void *buf, int count, struct ompi_datatype_t *dtype, - int root, struct ompi_communicator_t *comm, - ompi_request_t **req, - mca_coll_base_module_t *module) + int root, struct ompi_communicator_t *comm, + ompi_request_t **req, + mca_coll_base_module_t *module) { int ret; @@ -693,8 +698,8 @@ int mca_coll_ml_parallel_bcast_nb(void *buf, int count, struct ompi_datatype_t * } int mca_coll_ml_bcast_sequential_root(void *buf, int count, struct ompi_datatype_t *dtype, - int root, struct ompi_communicator_t *comm, - mca_coll_base_module_t *module) + int root, struct ompi_communicator_t *comm, + mca_coll_base_module_t *module) { /* local variables */ @@ -707,6 +712,10 @@ int mca_coll_ml_bcast_sequential_root(void *buf, int count, struct ompi_datatype mca_coll_ml_module_t *ml_module = (mca_coll_ml_module_t *) module; ml_payload_buffer_desc_t *src_buffer_desc = NULL; mca_bcol_base_coll_fn_desc_t *func; + OPAL_PTRDIFF_TYPE lb, extent; + + /* actual starting place of the user buffer (lb added) */ + void *actual_buf; ML_VERBOSE(10, ("Starting static bcast, small messages")); @@ -716,6 +725,8 @@ int mca_coll_ml_bcast_sequential_root(void *buf, int count, struct ompi_datatype ompi_datatype_type_size(dtype, &dt_size); pack_len = count * dt_size; + actual_buf = (void *) ((uintptr_t) buf + lb); + /* Setup data buffer */ src_buffer_desc = mca_coll_ml_alloc_buffer(ml_module); while (NULL == src_buffer_desc) { @@ -729,10 +740,9 @@ int mca_coll_ml_bcast_sequential_root(void *buf, int count, struct ompi_datatype assert(pack_len <= ml_module->payload_block->size_buffer); coll_op = mca_coll_ml_alloc_op_prog_single_frag_dag(ml_module, - ml_module->coll_ml_bcast_functions[ML_BCAST_SMALL_DATA_SEQUENTIAL], - buf, buf, - pack_len, - 0 /* offset for first pack */); + ml_module->coll_ml_bcast_functions[ML_BCAST_SMALL_DATA_SEQUENTIAL], + actual_buf, actual_buf, pack_len, + 0 /* offset for first pack */); if (ompi_comm_rank(comm) == root) { /* single frag, pack the data */ memcpy((void *)(uintptr_t)src_buffer_desc->data_addr, @@ -748,15 +758,14 @@ int mca_coll_ml_bcast_sequential_root(void *buf, int count, struct ompi_datatype } else { ML_VERBOSE(10, ("ML_BCAST_LARGE_DATA_KNOWN case.")); coll_op = mca_coll_ml_alloc_op_prog_single_frag_dag(ml_module, - ml_module->coll_ml_bcast_functions[ML_BCAST_LARGE_DATA_SEQUENTIAL], - buf, buf, - pack_len, - 0 /* offset for first pack */); + ml_module->coll_ml_bcast_functions[ML_BCAST_LARGE_DATA_SEQUENTIAL], + actual_buf, actual_buf, pack_len, + 0 /* offset for first pack */); /* For large messages (bcast) this points to userbuf */ /* Pasha: temporary work around for basesmuma, userbuf should be removed */ coll_op->variable_fn_params.userbuf = - coll_op->variable_fn_params.sbuf = buf; + coll_op->variable_fn_params.sbuf = actual_buf; coll_op->process_fn = NULL; } diff --git a/ompi/mca/coll/ml/coll_ml_colls.h b/ompi/mca/coll/ml/coll_ml_colls.h index 01cc50ab2e..037773b3e3 100644 --- a/ompi/mca/coll/ml/coll_ml_colls.h +++ b/ompi/mca/coll/ml/coll_ml_colls.h @@ -288,8 +288,8 @@ struct mca_coll_ml_collective_operation_progress_t { int64_t send_count; int64_t recv_count; /* extent of the data types */ - int32_t send_extent; - int32_t recv_extent; + size_t send_extent; + size_t recv_extent; /* send data type */ struct ompi_datatype_t * send_data_type; /* needed for non-contigous buffers */ @@ -309,9 +309,11 @@ struct mca_coll_ml_collective_operation_progress_t { size_t recv_converter_bytes_packed; /* In case if ordering is needed: order num for next frag */ int 
next_frag_num; - /* The variable is used by non-blocking memory synchronization code + /* The variable is used by non-blocking memory synchronization code * for caching bank index */ int bank_index_to_recycle; + /* need a handle for collective progress e.g. alltoall*/ + bcol_fragment_descriptor_t frag_info; } full_message; /* collective operation being progressed */ @@ -347,6 +349,8 @@ struct mca_coll_ml_collective_operation_progress_t { /* ML buffer descriptor attached to this buffer */ struct ml_payload_buffer_desc_t *buffer_desc; + /* handle for collective progress, e.g. alltoall */ + bcol_fragment_descriptor_t bcol_fragment_desc; /* Which collective algorithm */ int current_coll_op; @@ -359,6 +363,7 @@ struct mca_coll_ml_collective_operation_progress_t { * function is exited, and for nonblocking collective functions this * is until test or wait completes the collective. */ + int global_root; bcol_function_args_t variable_fn_params; struct{ @@ -407,9 +412,8 @@ do { /* back to the free list (free list may release memory on distruct )*/ \ struct ompi_communicator_t *comm = GET_COMM(op); \ bool is_coll_sync = IS_COLL_SYNCMEM(op); \ - assert(&(op)->full_message != \ - (op)->fragment_data.message_descriptor); \ ML_VERBOSE(10, ("Releasing %p", op)); \ + OMPI_REQUEST_FINI(&(op)->full_message.super); \ OMPI_FREE_LIST_RETURN_MT(&(((mca_coll_ml_module_t *)(op)->coll_module)-> \ coll_ml_collective_descriptors), \ (ompi_free_list_item_t *)op); \ @@ -468,5 +472,66 @@ do { } \ } while (0) +enum { + MCA_COLL_ML_NET_STREAM_SEND, + MCA_COLL_ML_NET_STREAM_RECV +}; + +static inline __opal_attribute_always_inline__ + int mca_coll_ml_convertor_prepare(ompi_datatype_t *dtype, int count, void *buff, + opal_convertor_t *convertor, int stream) +{ + size_t bytes_packed; + + if (MCA_COLL_ML_NET_STREAM_SEND == stream) { + opal_convertor_copy_and_prepare_for_send( + ompi_mpi_local_convertor, + &dtype->super, count, buff, 0, + convertor); + } else { + opal_convertor_copy_and_prepare_for_recv( + ompi_mpi_local_convertor, + &dtype->super, count, buff, 0, + convertor); + } + + opal_convertor_get_packed_size(convertor, &bytes_packed); + + return bytes_packed; +} + +static inline __opal_attribute_always_inline__ + int mca_coll_ml_convertor_pack(void *data_addr, size_t buff_size, + opal_convertor_t *convertor) +{ + struct iovec iov; + + size_t max_data = 0; + uint32_t iov_count = 1; + + iov.iov_base = (IOVBASE_TYPE*) data_addr; + iov.iov_len = buff_size; + + opal_convertor_pack(convertor, &iov, &iov_count, &max_data); + + return max_data; +} + +static inline __opal_attribute_always_inline__ + int mca_coll_ml_convertor_unpack(void *data_addr, size_t buff_size, + opal_convertor_t *convertor) +{ + struct iovec iov; + + size_t max_data = 0; + uint32_t iov_count = 1; + + iov.iov_base = (void *) (uintptr_t) data_addr; + iov.iov_len = buff_size; + + opal_convertor_unpack(convertor, &iov, &iov_count, &max_data); + + return max_data; +} #endif /* MCA_COLL_ML_COLLS_H */ diff --git a/ompi/mca/coll/ml/coll_ml_component.c b/ompi/mca/coll/ml/coll_ml_component.c index 2c0c96a63e..54f16701a5 100644 --- a/ompi/mca/coll/ml/coll_ml_component.c +++ b/ompi/mca/coll/ml/coll_ml_component.c @@ -2,6 +2,8 @@ /* * Copyright (c) 2009-2012 Oak Ridge National Laboratory. All rights reserved. * Copyright (c) 2009-2012 Mellanox Technologies. All rights reserved. + * Copyright (c) 2013 Los Alamos National Security, LLC. All rights + * reserved. 
* $COPYRIGHT$ * * Additional copyrights may follow @@ -60,54 +62,35 @@ mca_coll_ml_component_t mca_coll_ml_component = { /* First, fill in the super */ - { + .super = { /* First, the mca_component_t struct containing meta information about the component itself */ - { + .collm_version = { MCA_COLL_BASE_VERSION_2_0_0, /* Component name and version */ - "ml", - OMPI_MAJOR_VERSION, - OMPI_MINOR_VERSION, - OMPI_RELEASE_VERSION, + .mca_component_name = "ml", + .mca_component_major_version = OMPI_MAJOR_VERSION, + .mca_component_minor_version = OMPI_MINOR_VERSION, + .mca_component_release_version = OMPI_RELEASE_VERSION, - /* Component open and close functions */ + /* Component open, close, and register functions */ - ml_open, - ml_close, + .mca_open_component = ml_open, + .mca_close_component = ml_close, .mca_register_component_params = mca_coll_ml_register_params }, - { + .collm_data = { /* The component is not checkpoint ready */ MCA_BASE_METADATA_PARAM_NONE }, /* Initialization / querying functions */ - mca_coll_ml_init_query, - mca_coll_ml_comm_query, + .collm_init_query = mca_coll_ml_init_query, + .collm_comm_query = mca_coll_ml_comm_query, }, - - /* ml-component specifc information */ - - /* (default) priority */ - 0, - /* Number of levels */ - 0, - /* subgrouping components to use */ - NULL, - /* basic collectives components to use */ - NULL, - /* verbose */ - 0, - /* max of communicators */ - 0, - /* min size of comm */ - 0, - /* base_sequence_number */ - 0, }; void mca_coll_ml_abort_ml(char *message) @@ -160,124 +143,63 @@ static int coll_ml_progress() /* progress sequential collective operations */ /* RLG - need to do better here for parallel progress */ OPAL_THREAD_LOCK(&(cm->sequential_collectives_mutex)); - for (seq_coll_op = (mca_coll_ml_collective_operation_progress_t *)opal_list_get_first(SEQ_L); - seq_coll_op != (mca_coll_ml_collective_operation_progress_t *)opal_list_get_end(SEQ_L); - seq_coll_op = (mca_coll_ml_collective_operation_progress_t *)opal_list_get_next((opal_list_item_t *)seq_coll_op)){ - - fn_idx = seq_coll_op->sequential_routine.current_active_bcol_fn; - /* initialize the task */ - - if(SEQ_TASK_IN_PROG == seq_coll_op->sequential_routine.current_bcol_status){ - progress_fn = seq_coll_op->coll_schedule-> - component_functions[fn_idx].bcol_function->progress_fn; - }else{ - /* PPP Pasha - apparently task setup should be called only here. 
see linr 190 */ - progress_fn = seq_coll_op->coll_schedule-> - component_functions[fn_idx].bcol_function->coll_fn; - seq_coll_op->sequential_routine.current_bcol_status = - SEQ_TASK_IN_PROG; - } - - const_args = &seq_coll_op->coll_schedule->component_functions[fn_idx].constant_group_data; - /* RLG - note need to move to useing coll_ml_utility_data_t as - * collective argument, rather than coll_ml_function_t - */ - rc = progress_fn(&(seq_coll_op->variable_fn_params), (coll_ml_function_t *)const_args); - if (BCOL_FN_COMPLETE == rc) { - /* done with this routine */ - seq_coll_op->sequential_routine.current_active_bcol_fn++; - /* this is totally hardwired for bcast, need a general call-back */ - /*seq_coll_op->variable_fn_params.root_flag = true;*/ + OPAL_LIST_FOREACH_SAFE(seq_coll_op, seq_coll_op_tmp, SEQ_L, mca_coll_ml_collective_operation_progress_t) { + do { fn_idx = seq_coll_op->sequential_routine.current_active_bcol_fn; - if (seq_coll_op->sequential_routine.current_active_bcol_fn == - seq_coll_op->coll_schedule->n_fns) { - /* done with this collective - recycle descriptor */ + /* initialize the task */ - /* remove from the progress list */ - seq_coll_op_tmp = (mca_coll_ml_collective_operation_progress_t *) - opal_list_remove_item(SEQ_L, (opal_list_item_t *)seq_coll_op); - - /* handle fragment completion */ - rc = coll_ml_fragment_completion_processing(seq_coll_op); - - if (OMPI_SUCCESS != rc) { - mca_coll_ml_abort_ml("Failed to run coll_ml_fragment_completion_processing"); - } - /* make sure that for will pick up right one */ - seq_coll_op = seq_coll_op_tmp; - }else { - /* task setup */ - /* Pasha - Another call for task setup ? Why ?*/ - rc = seq_coll_op->sequential_routine.seq_task_setup(seq_coll_op); - /* else, start firing bcol functions */ - while(true) { - - fn_idx = seq_coll_op->sequential_routine.current_active_bcol_fn; - const_args = &seq_coll_op->coll_schedule-> - component_functions[fn_idx].constant_group_data; - coll_fn = seq_coll_op->coll_schedule-> - component_functions[fn_idx].bcol_function->coll_fn; - rc = coll_fn(&seq_coll_op->variable_fn_params, - (coll_ml_function_t *) const_args); - - if (BCOL_FN_COMPLETE == rc) { - - seq_coll_op->sequential_routine.current_active_bcol_fn++; - fn_idx = seq_coll_op->sequential_routine.current_active_bcol_fn; - - /* done with this routine, - * check for collective completion */ - if (seq_coll_op->sequential_routine.current_active_bcol_fn == - seq_coll_op->coll_schedule->n_fns) { - /* remove from the progress list */ - seq_coll_op_tmp = (mca_coll_ml_collective_operation_progress_t *) - opal_list_remove_item(SEQ_L, (opal_list_item_t *)seq_coll_op); - - /* handle fragment completion */ - rc = coll_ml_fragment_completion_processing(seq_coll_op); - if (OMPI_SUCCESS != rc) { - mca_coll_ml_abort_ml("Failed to run coll_ml_fragment_completion_processing"); - } - - /* make sure that for will pick up right one */ - seq_coll_op = seq_coll_op_tmp; - - /* break out of while loop */ - break; - }else { - /* setup the next task */ - /* sequential task setup */ - seq_coll_op->sequential_routine.seq_task_setup(seq_coll_op); - } - - }else if (BCOL_FN_NOT_STARTED == rc) { - - seq_coll_op->sequential_routine.current_bcol_status = SEQ_TASK_PENDING; - - break; - } else { - break; - } - - } + if (SEQ_TASK_IN_PROG == seq_coll_op->sequential_routine.current_bcol_status){ + progress_fn = seq_coll_op->coll_schedule-> + component_functions[fn_idx].bcol_function->progress_fn; + } else { + /* PPP Pasha - apparently task setup should be called only here. 
see linr 190 */ + progress_fn = seq_coll_op->coll_schedule-> + component_functions[fn_idx].bcol_function->coll_fn; } + const_args = &seq_coll_op->coll_schedule->component_functions[fn_idx].constant_group_data; + /* RLG - note need to move to useing coll_ml_utility_data_t as + * collective argument, rather than coll_ml_function_t + */ + rc = progress_fn(&(seq_coll_op->variable_fn_params), (coll_ml_function_t *)const_args); + if (BCOL_FN_COMPLETE == rc) { + /* done with this routine */ + seq_coll_op->sequential_routine.current_active_bcol_fn++; + /* this is totally hardwired for bcast, need a general call-back */ - } else if (BCOL_FN_NOT_STARTED == rc ){ - seq_coll_op->sequential_routine.current_bcol_status = SEQ_TASK_PENDING; - } + fn_idx = seq_coll_op->sequential_routine.current_active_bcol_fn; + if (fn_idx == seq_coll_op->coll_schedule->n_fns) { + /* done with this collective - recycle descriptor */ + /* remove from the progress list */ + (void) opal_list_remove_item(SEQ_L, (opal_list_item_t *)seq_coll_op); + + /* handle fragment completion */ + rc = coll_ml_fragment_completion_processing(seq_coll_op); + + if (OMPI_SUCCESS != rc) { + mca_coll_ml_abort_ml("Failed to run coll_ml_fragment_completion_processing"); + } + } else { + rc = seq_coll_op->sequential_routine.seq_task_setup(seq_coll_op); + seq_coll_op->sequential_routine.current_bcol_status = SEQ_TASK_PENDING; + continue; + } + } else if (BCOL_FN_NOT_STARTED == rc) { + seq_coll_op->sequential_routine.current_bcol_status = SEQ_TASK_PENDING; + } else if (BCOL_FN_STARTED == rc) { + seq_coll_op->sequential_routine.current_bcol_status = SEQ_TASK_IN_PROG; + } + + break; + } while (true); } OPAL_THREAD_UNLOCK(&(cm->sequential_collectives_mutex)); /* general dag's */ /* see if active tasks can be progressed */ OPAL_THREAD_LOCK(&(cm->active_tasks_mutex)); - for (task_status = (mca_coll_ml_task_status_t *)opal_list_get_first(ACTIVE_L); - task_status != (mca_coll_ml_task_status_t *)opal_list_get_end(ACTIVE_L); - task_status = (mca_coll_ml_task_status_t *)opal_list_get_next(task_status) - ) - { + OPAL_LIST_FOREACH(task_status, ACTIVE_L, mca_coll_ml_task_status_t) { /* progress task */ progress_fn = task_status->bcol_fn->progress_fn; const_args = &task_status->ml_coll_operation->coll_schedule-> @@ -300,10 +222,7 @@ static int coll_ml_progress() /* see if new tasks can be initiated */ OPAL_THREAD_LOCK(&(cm->pending_tasks_mutex)); - for (task_status = (mca_coll_ml_task_status_t *) opal_list_get_first(PENDING_L); - task_status != (mca_coll_ml_task_status_t *) opal_list_get_end(PENDING_L); - task_status = (mca_coll_ml_task_status_t *) opal_list_get_next(task_status)) - { + OPAL_LIST_FOREACH_SAFE(task_status, task_status_tmp, PENDING_L, mca_coll_ml_task_status_t) { /* check to see if dependencies are satisfied */ int n_dependencies = task_status->rt_num_dependencies; int n_dependencies_satisfied = task_status->n_dep_satisfied; @@ -323,15 +242,13 @@ static int coll_ml_progress() } } else if ( BCOL_FN_STARTED == rc ) { ML_VERBOSE(3, ("GOT BCOL_STARTED!")); - task_status_tmp = (mca_coll_ml_task_status_t *) - opal_list_remove_item(PENDING_L, (opal_list_item_t *)task_status); + (void) opal_list_remove_item(PENDING_L, (opal_list_item_t *)task_status); /* RLG - is there potential for deadlock here ? 
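Note that active_tasks_mutex is acquired below while pending_tasks_mutex is already held, so any other code path that nests these two locks must take them in the same order.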
Need to * look at this closely */ OPAL_THREAD_LOCK(&(cm->active_tasks_mutex)); opal_list_append(ACTIVE_L, (opal_list_item_t *)task_status); OPAL_THREAD_UNLOCK(&(cm->active_tasks_mutex)); - task_status = task_status_tmp; } else if( BCOL_FN_NOT_STARTED == rc ) { /* nothing to do */ ML_VERBOSE(10, ("GOT BCOL_FN_NOT_STARTED!")); @@ -342,6 +259,7 @@ static int coll_ml_progress() * the way the code is implemented now */ ML_VERBOSE(3, ("GOT error !")); rc = OMPI_ERROR; + OMPI_ERRHANDLER_RETURN(rc,MPI_COMM_WORLD,rc,"Error returned from bcol function: aborting"); break; } } @@ -357,14 +275,11 @@ static int coll_ml_progress() static void adjust_coll_config_by_mca_param(void) { - assert(false == mca_coll_ml_component.use_static_bcast || - false == mca_coll_ml_component.use_sequential_bcast); - /* setting bcast mca params */ - if (mca_coll_ml_component.use_static_bcast) { + if (COLL_ML_STATIC_BCAST == mca_coll_ml_component.bcast_algorithm) { mca_coll_ml_component.coll_config[ML_BCAST][ML_SMALL_MSG].algorithm_id = ML_BCAST_SMALL_DATA_KNOWN; mca_coll_ml_component.coll_config[ML_BCAST][ML_LARGE_MSG].algorithm_id = ML_BCAST_LARGE_DATA_KNOWN; - } else if (mca_coll_ml_component.use_sequential_bcast){ + } else if (COLL_ML_SEQ_BCAST == mca_coll_ml_component.bcast_algorithm) { mca_coll_ml_component.coll_config[ML_BCAST][ML_SMALL_MSG].algorithm_id = ML_BCAST_SMALL_DATA_SEQUENTIAL; mca_coll_ml_component.coll_config[ML_BCAST][ML_LARGE_MSG].algorithm_id = ML_BCAST_LARGE_DATA_SEQUENTIAL; } else { /* Unknown root */ @@ -468,7 +383,7 @@ static int ml_close(void) mca_coll_ml_component_t *cs = &mca_coll_ml_component; - /* There is not need to release/close resource if the + /* There is not need to release/close resource if the * priority was set to zero */ if (cs->ml_priority <= 0) { return OMPI_SUCCESS; diff --git a/ompi/mca/coll/ml/coll_ml_config.c b/ompi/mca/coll/ml/coll_ml_config.c index 7d8bba859a..bd3c35593f 100644 --- a/ompi/mca/coll/ml/coll_ml_config.c +++ b/ompi/mca/coll/ml/coll_ml_config.c @@ -73,6 +73,14 @@ static int algorithm_name_to_id(char *name) return ML_SMALL_DATA_ALLREDUCE; if (!strcasecmp(name,"ML_LARGE_DATA_ALLREDUCE")) return ML_LARGE_DATA_ALLREDUCE; + if (!strcasecmp(name,"ML_SMALL_DATA_REDUCE")) + return ML_SMALL_DATA_ALLREDUCE; + if (!strcasecmp(name,"ML_LARGE_DATA_REDUCE")) + return ML_LARGE_DATA_ALLREDUCE; + if (!strcasecmp(name,"ML_SMALL_DATA_REDUCE")) + return ML_SMALL_DATA_REDUCE; + if (!strcasecmp(name,"ML_LARGE_DATA_REDUCE")) + return ML_LARGE_DATA_REDUCE; if (!strcasecmp(name,"ML_NUM_ALLREDUCE_FUNCTIONS")) return ML_NUM_ALLREDUCE_FUNCTIONS; if (!strcasecmp(name,"ML_SMALL_DATA_ALLTOALL")) diff --git a/ompi/mca/coll/ml/coll_ml_functions.h b/ompi/mca/coll/ml/coll_ml_functions.h index f5b50e7610..5d0d0d7b1a 100644 --- a/ompi/mca/coll/ml/coll_ml_functions.h +++ b/ompi/mca/coll/ml/coll_ml_functions.h @@ -86,6 +86,17 @@ enum { ML_NUM_ALLREDUCE_FUNCTIONS }; +/* Reduce functions */ +enum { + /* small data algorithm */ + ML_SMALL_DATA_REDUCE, + + /* Large data algorithm */ + ML_LARGE_DATA_REDUCE, + + /* number of functions */ + ML_NUM_REDUCE_FUNCTIONS +}; /* Alltoall functions */ enum { /* small data algorithm */ diff --git a/ompi/mca/coll/ml/coll_ml_hier_algorithms.c b/ompi/mca/coll/ml/coll_ml_hier_algorithms.c index 4f9a6661a4..2a5d8bb52d 100644 --- a/ompi/mca/coll/ml/coll_ml_hier_algorithms.c +++ b/ompi/mca/coll/ml/coll_ml_hier_algorithms.c @@ -78,37 +78,32 @@ int ml_coll_schedule_setup(mca_coll_ml_module_t *ml_module) } /* Allreduce */ - /* if (!mca_coll_ml_component.use_knomial_allreduce) { 
ret = ml_coll_hier_allreduce_setup(ml_module); } else { ret = ml_coll_hier_allreduce_setup_new(ml_module); } - */ if( OMPI_SUCCESS != ret ) { return ret; } - + /* Alltoall */ - /* + /* ret = ml_coll_hier_alltoall_setup_new(ml_module); if( OMPI_SUCCESS != ret ) { return ret; } */ - - + /* Allgather */ - /* ret = ml_coll_hier_allgather_setup(ml_module); if( OMPI_SUCCESS != ret ) { return ret; } - */ /* Gather */ /* @@ -120,12 +115,10 @@ int ml_coll_schedule_setup(mca_coll_ml_module_t *ml_module) */ /* Reduce */ - ret = ml_coll_hier_reduce_setup(ml_module); if( OMPI_SUCCESS != ret ) { return ret; } - /* Scatter */ /* @@ -134,6 +127,7 @@ int ml_coll_schedule_setup(mca_coll_ml_module_t *ml_module) return ret; } */ + ret = ml_coll_memsync_setup(ml_module); if( OMPI_SUCCESS != ret ) { return ret; @@ -150,7 +144,9 @@ int ml_coll_schedule_setup(mca_coll_ml_module_t *ml_module) /* Pasha: Do we have to keep the max_dag_size ? In most generic case, it will be equal to max_fn_calls */ ml_module->max_dag_size = ml_module->max_fn_calls; + assert(ml_module->max_dag_size > 0); + /* initialize the mca_coll_ml_collective_operation_progress_t free list */ /* NOTE: as part of initialization each routine needs to make sure that * the module element max_dag_size is set large enough - space for diff --git a/ompi/mca/coll/ml/coll_ml_hier_algorithms_allgather_setup.c b/ompi/mca/coll/ml/coll_ml_hier_algorithms_allgather_setup.c new file mode 100644 index 0000000000..074cbc658c --- /dev/null +++ b/ompi/mca/coll/ml/coll_ml_hier_algorithms_allgather_setup.c @@ -0,0 +1,193 @@ +/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */ +/* + * Copyright (c) 2009-2012 Oak Ridge National Laboratory. All rights reserved. + * Copyright (c) 2009-2012 Mellanox Technologies. All rights reserved. + * Copyright (c) 2013 Los Alamos National Security, LLC. All rights + * reserved. 
+ * $COPYRIGHT$ + * + * Additional copyrights may follow + * + * $HEADER$ + */ + +#include "ompi_config.h" + +#include "ompi/mca/coll/ml/coll_ml.h" +#include "ompi/include/ompi/constants.h" +#include "ompi/mca/coll/ml/coll_ml_functions.h" +#include "ompi/mca/coll/ml/coll_ml_hier_algorithms_common_setup.h" +#include "ompi/patterns/net/netpatterns_knomial_tree.h" + +#define SMALL_MSG_RANGE 1 +#define LARGE_MSG_RANGE 5 + +static int mca_coll_ml_build_allgather_schedule(mca_coll_ml_topology_t *topo_info, + mca_coll_ml_collective_operation_description_t **coll_desc, int bcol_func_index) +{ + int ret; /* exit code in case of error */ + int nfn = 0; + int i; + int *scratch_indx = NULL, + *scratch_num = NULL; + + mca_coll_ml_collective_operation_description_t *schedule = NULL; + mca_coll_ml_compound_functions_t *comp_fn; + mca_coll_ml_schedule_hier_info_t h_info; + + ML_VERBOSE(9, ("Setting hierarchy, inputs : n_levels %d, hiest %d ", + topo_info->n_levels, topo_info->global_highest_hier_group_index)); + MCA_COLL_ML_INIT_HIER_INFO(h_info, topo_info->n_levels, + topo_info->global_highest_hier_group_index, topo_info); + + ret = mca_coll_ml_schedule_init_scratch(topo_info, &h_info, + &scratch_indx, &scratch_num); + if (OMPI_SUCCESS != ret) { + ML_ERROR(("Can't mca_coll_ml_schedule_init_scratch.\n")); + goto Error; + } + assert(NULL != scratch_indx); + assert(NULL != scratch_num); + + schedule = *coll_desc = + mca_coll_ml_schedule_alloc(&h_info); + if (NULL == schedule) { + ML_ERROR(("Can't allocate memory.\n")); + ret = OMPI_ERR_OUT_OF_RESOURCE; + goto Error; + } + /* Setting topology information */ + schedule->topo_info = topo_info; + + /* Set dependencies equal to number of hierarchies */ + for (i = 0; i < h_info.num_up_levels; i++) { + int query_conf[MCA_COLL_ML_QUERY_SIZE]; + MCA_COLL_ML_SET_QUERY(query_conf, DATA_SRC_KNOWN, BLOCKING, BCOL_GATHER, bcol_func_index, 0, 0); + comp_fn = &schedule->component_functions[i]; + MCA_COLL_ML_SET_COMP_FN(comp_fn, i, topo_info, + i, scratch_indx, scratch_num, query_conf, "GATHER_DATA"); + } + + nfn = i; + if (h_info.call_for_top_function) { + int query_conf[MCA_COLL_ML_QUERY_SIZE]; + MCA_COLL_ML_SET_QUERY(query_conf, DATA_SRC_KNOWN, NON_BLOCKING, BCOL_ALLGATHER, bcol_func_index, 0, 0); + comp_fn = &schedule->component_functions[nfn]; + MCA_COLL_ML_SET_COMP_FN(comp_fn, nfn, topo_info, + nfn, scratch_indx, scratch_num, query_conf, "ALLGATHER_DATA"); + ++nfn; + } + + /* coming down the hierarchy */ + for (i = h_info.num_up_levels - 1; i >= 0; i--, nfn++) { + int query_conf[MCA_COLL_ML_QUERY_SIZE]; + MCA_COLL_ML_SET_QUERY(query_conf, DATA_SRC_KNOWN, NON_BLOCKING, BCOL_BCAST, bcol_func_index, 0, 0); + comp_fn = &schedule->component_functions[nfn]; + MCA_COLL_ML_SET_COMP_FN(comp_fn, i, topo_info, + nfn, scratch_indx, scratch_num, query_conf, "BCAST_DATA"); + } + + /* Fill the rest of constant data */ + mca_coll_ml_call_types(&h_info, schedule); + + MCA_COLL_ML_SET_SCHEDULE_ORDER_INFO(schedule); + + free(scratch_num); + free(scratch_indx); + + return OMPI_SUCCESS; + + Error: + if (NULL != scratch_indx) { + free(scratch_indx); + } + if (NULL != scratch_num) { + free(scratch_num); + } + if (NULL != schedule->component_functions) { + free(schedule->component_functions); + } + return ret; +} + +int ml_coll_hier_allgather_setup(mca_coll_ml_module_t *ml_module) +{ + /* Hierarchy Setup */ + int ret, topo_index, alg; + mca_coll_ml_topology_t *topo_info = ml_module->topo_list; + + ML_VERBOSE(10,("entering allgather setup\n")); + +#if 0 + /* used to validate the recursive k - 
ing allgather tree */ + { + /* debug print */ + int ii, jj; + netpatterns_k_exchange_node_t exchange_node; + + ret = netpatterns_setup_recursive_knomial_allgather_tree_node(8, 3, 3, &exchange_node); + fprintf(stderr,"log tree order %d tree_order %d\n", exchange_node.log_tree_order,exchange_node.tree_order); + if( EXCHANGE_NODE == exchange_node.node_type){ + if( exchange_node.n_extra_sources > 0){ + fprintf(stderr,"Receiving data from extra rank %d\n",exchange_node.rank_extra_sources_array[0]); + } + for( ii = 0; ii < exchange_node.log_tree_order; ii++){ + for( jj = 0; jj < (exchange_node.tree_order-1); jj++) { + if( exchange_node.rank_exchanges[ii][jj] >= 0){ + fprintf(stderr,"level %d I send %d bytes to %d from offset %d \n",ii+1, + exchange_node.payload_info[ii][jj].s_len, + exchange_node.rank_exchanges[ii][jj], + exchange_node.payload_info[ii][jj].s_offset); + fprintf(stderr,"level %d I receive %d bytes from %d at offset %d\n",ii+1, + exchange_node.payload_info[ii][jj].r_len, + exchange_node.rank_exchanges[ii][jj], + exchange_node.payload_info[ii][jj].r_offset); + } + } + } + fprintf(stderr,"exchange_node.n_extra_sources %d\n",exchange_node.n_extra_sources); + fprintf(stderr,"exchange_node.myid_reindex %d\n",exchange_node.reindex_myid); + if( exchange_node.n_extra_sources > 0){ + fprintf(stderr,"Sending back data to extra rank %d\n",exchange_node.rank_extra_sources_array[0]); + } + } else { + fprintf(stderr,"I am an extra and send to proxy %d\n", + exchange_node.rank_extra_sources_array[0]); + } + } +#endif + + alg = mca_coll_ml_component.coll_config[ML_ALLGATHER][ML_SMALL_MSG].algorithm_id; + topo_index = ml_module->collectives_topology_map[ML_ALLGATHER][alg]; + if (ML_UNDEFINED == alg || ML_UNDEFINED == topo_index) { + ML_ERROR(("No topology index or algorithm was defined")); + topo_info->hierarchical_algorithms[ML_ALLGATHER] = NULL; + return OMPI_ERROR; + } + + ret = mca_coll_ml_build_allgather_schedule(&ml_module->topo_list[topo_index], + &ml_module->coll_ml_allgather_functions[alg], + SMALL_MSG_RANGE); + if (OPAL_UNLIKELY(OMPI_SUCCESS != ret)) { + ML_VERBOSE(10, ("Failed to setup small message allgather")); + return ret; + } + + alg = mca_coll_ml_component.coll_config[ML_ALLGATHER][ML_LARGE_MSG].algorithm_id; + topo_index = ml_module->collectives_topology_map[ML_ALLGATHER][alg]; + if (ML_UNDEFINED == alg || ML_UNDEFINED == topo_index) { + ML_ERROR(("No topology index or algorithm was defined")); + topo_info->hierarchical_algorithms[ML_ALLGATHER] = NULL; + return OMPI_ERROR; + } + + ret = mca_coll_ml_build_allgather_schedule(&ml_module->topo_list[topo_index], + &ml_module->coll_ml_allgather_functions[alg], + LARGE_MSG_RANGE); + if (OPAL_UNLIKELY(OMPI_SUCCESS != ret)) { + ML_VERBOSE(10, ("Failed to setup large message allgather")); + return ret; + } + + return OMPI_SUCCESS; +} diff --git a/ompi/mca/coll/ml/coll_ml_hier_algorithms_allreduce_setup.c b/ompi/mca/coll/ml/coll_ml_hier_algorithms_allreduce_setup.c new file mode 100644 index 0000000000..17ce35cd02 --- /dev/null +++ b/ompi/mca/coll/ml/coll_ml_hier_algorithms_allreduce_setup.c @@ -0,0 +1,347 @@ +/* + * Copyright (c) 2009-2012 Oak Ridge National Laboratory. All rights reserved. + * Copyright (c) 2009-2012 Mellanox Technologies. All rights reserved.
+ * $COPYRIGHT$ + * + * Additional copyrights may follow + * + * $HEADER$ + */ + +#include "ompi_config.h" + +#include "ompi/mca/coll/ml/coll_ml.h" +#include "ompi/include/ompi/constants.h" +#include "ompi/mca/coll/ml/coll_ml_functions.h" + +#define ALLREDUCE_SMALL 1 +#define ALLREDUCE_LARGE 5 +#define SMALL_MSG_RANGE 1 +#define LARGE_MSG_RANGE 5 + +static int mca_coll_ml_build_allreduce_schedule( + mca_coll_ml_topology_t *topo_info, + mca_coll_ml_collective_operation_description_t **coll_desc, int bcol_func_index) +{ + + bool call_for_top_function, prev_is_zero; + int n_hiers = topo_info->n_levels; + int i_hier, j_hier; + int cnt, value_to_set = 0; + int ret; /* exit code in case of error */ + int nfn=0; + int *scratch_indx = NULL, + *scratch_num = NULL; + int global_high_hierarchy_index = + topo_info->global_highest_hier_group_index; + + mca_coll_ml_collective_operation_description_t *schedule; + mca_coll_ml_compound_functions_t *comp_fn; + mca_bcol_base_module_t *prev_bcol, + *bcol_module; + int num_up_levels,nbcol_functions,i; + + if (global_high_hierarchy_index == + topo_info->component_pairs[n_hiers - 1].bcol_index) { + /* The process that is member of highest level subgroup + should call for top algorithms in addition to fan-in/out steps*/ + call_for_top_function = true; + /* hier level run only top algorithm, so we deduct 1 */ + num_up_levels = n_hiers - 1; + /* Top algorithm is called only once, so we deduct 1 */ + nbcol_functions = 2 * n_hiers - 1; + } else { + /* The process is not member of highest level subgroup, + as result it does not call for top algorithm, + but it calls for all fan-in/out steps */ + call_for_top_function = false; + num_up_levels = n_hiers; + nbcol_functions = 2 * n_hiers; + } + + *coll_desc = (mca_coll_ml_collective_operation_description_t *) + malloc(sizeof(mca_coll_ml_collective_operation_description_t)); + schedule = *coll_desc; + if (NULL == schedule) { + ML_ERROR(("Can't allocate memory.\n")); + ret = OMPI_ERR_OUT_OF_RESOURCE; + goto Allreduce_Setup_Error; + } + + scratch_indx = (int *) malloc(sizeof(int) * (n_hiers * 2)); + if (NULL == scratch_indx) { + ML_ERROR(("Can't allocate memory.\n")); + ret = OMPI_ERR_OUT_OF_RESOURCE; + goto Allreduce_Setup_Error; + } + + scratch_num = (int *) malloc(sizeof(int) * (n_hiers * 2)); + if (NULL == scratch_num) { + ML_ERROR(("Can't allocate memory.\n")); + ret = OMPI_ERR_OUT_OF_RESOURCE; + goto Allreduce_Setup_Error; + } + + prev_bcol = NULL; + + for (i = 0, cnt = 0; i < num_up_levels; ++i, ++cnt) { + if (IS_BCOL_TYPE_IDENTICAL(prev_bcol, GET_BCOL(topo_info, i))) { + scratch_indx[cnt] = scratch_indx[cnt - 1] + 1; + } else { + scratch_indx[cnt] = 0; + prev_bcol = GET_BCOL(topo_info, i); + } + } + + /* top - only if the proc arrive to highest_level_is_global_highest_level */ + if (call_for_top_function) { + if (IS_BCOL_TYPE_IDENTICAL(prev_bcol, GET_BCOL(topo_info, n_hiers - 1))) { + scratch_indx[cnt] = scratch_indx[cnt - 1] + 1; + } else { + scratch_indx[cnt] = 0; + prev_bcol = GET_BCOL(topo_info, n_hiers - 1); + } + + ++cnt; + } + + /* going down */ + for (i = num_up_levels - 1; i >= 0; --i, ++cnt) { + if (IS_BCOL_TYPE_IDENTICAL(prev_bcol, GET_BCOL(topo_info, i))) { + scratch_indx[cnt] = scratch_indx[cnt - 1] + 1; + } else { + scratch_indx[cnt] = 0; + prev_bcol = GET_BCOL(topo_info, i); + } + } + + i = cnt - 1; + prev_is_zero = true; + + do { + if (prev_is_zero) { + value_to_set = scratch_indx[i] + 1; + prev_is_zero = false; + } + + if (0 == scratch_indx[i]) { + prev_is_zero = true; + } + + scratch_num[i] = 
value_to_set; + --i; + } while(i >= 0); + + /* Set dependencies equal to number of hierarchies */ + schedule->n_fns = nbcol_functions; + schedule->topo_info = topo_info; + schedule->progress_type = 0; + + /* Allocated the component function */ + schedule->component_functions = (struct mca_coll_ml_compound_functions_t *) + calloc(nbcol_functions, sizeof(struct mca_coll_ml_compound_functions_t)); + + if (NULL == schedule->component_functions) { + ML_ERROR(("Can't allocate memory.\n")); + ret = OMPI_ERR_OUT_OF_RESOURCE; + goto Allreduce_Setup_Error; + } + + + for (i = 0; i < num_up_levels; i++) { + comp_fn = &schedule->component_functions[i]; + comp_fn->h_level = i; /* hierarchy level */ + bcol_module = GET_BCOL(topo_info, i); + + /* strcpy (comp_fn->fn_name, "ALLREDUCE_SMALL_DATA"); */ + + comp_fn->num_dependent_tasks = 0; + comp_fn->num_dependencies = 0; + + comp_fn->bcol_function = + bcol_module->filtered_fns_table[DATA_SRC_KNOWN][NON_BLOCKING][BCOL_REDUCE][bcol_func_index][0][0]; + + comp_fn->task_comp_fn = NULL; + + comp_fn->constant_group_data.bcol_module = bcol_module; + comp_fn->constant_group_data.index_in_consecutive_same_bcol_calls = scratch_indx[i]; + comp_fn->constant_group_data.n_of_this_type_in_a_row = scratch_num[i]; + comp_fn->constant_group_data.n_of_this_type_in_collective = 0; + comp_fn->constant_group_data.index_of_this_type_in_collective = 0; + } + + nfn = i; + if (call_for_top_function) { + comp_fn = &schedule->component_functions[nfn]; + comp_fn->h_level = nfn; /* hierarchy level */ + bcol_module = GET_BCOL(topo_info, nfn); + + /* strcpy (comp_fn->fn_name, "ALLREDUCE_SMALL_DATA"); */ + + /* The allreduce should depend on the reduce */ + comp_fn->num_dependent_tasks = 0; + comp_fn->num_dependencies = 0; + comp_fn->bcol_function = + bcol_module->filtered_fns_table[DATA_SRC_KNOWN][NON_BLOCKING][BCOL_ALLREDUCE][bcol_func_index][0][0]; + + comp_fn->task_comp_fn = NULL; + + comp_fn->constant_group_data.bcol_module = bcol_module; + comp_fn->constant_group_data.index_in_consecutive_same_bcol_calls = scratch_indx[nfn]; + comp_fn->constant_group_data.n_of_this_type_in_a_row = scratch_num[nfn]; + comp_fn->constant_group_data.n_of_this_type_in_collective = 0; + comp_fn->constant_group_data.index_of_this_type_in_collective = 0; + + ++nfn; + } + + for (i = num_up_levels - 1; i >= 0; i--) { + comp_fn = &schedule->component_functions[nfn]; + comp_fn->h_level = i; /* hierarchy level */ + bcol_module = GET_BCOL(topo_info, i); + + /* strcpy (comp_fn->fn_name, "ALLREDUCE_SMALL_DATA"); */ + + comp_fn->num_dependent_tasks = 0; + comp_fn->num_dependencies = 0; + + comp_fn->bcol_function = + bcol_module->filtered_fns_table[DATA_SRC_KNOWN][NON_BLOCKING][BCOL_BCAST][bcol_func_index][0][0]; + + comp_fn->task_comp_fn = NULL; + + comp_fn->constant_group_data.bcol_module = bcol_module; + comp_fn->constant_group_data.index_in_consecutive_same_bcol_calls = scratch_indx[nfn]; + comp_fn->constant_group_data.n_of_this_type_in_a_row = scratch_num[nfn]; + comp_fn->constant_group_data.n_of_this_type_in_collective = 0; + comp_fn->constant_group_data.index_of_this_type_in_collective = 0; + + ++nfn; + } + + /* Fill the rest of constant data */ + for (i_hier = 0; i_hier < n_hiers; i_hier++) { + mca_bcol_base_module_t *current_bcol = + schedule->component_functions[i_hier]. + constant_group_data.bcol_module; + cnt = 0; + for (j_hier = 0; j_hier < n_hiers; j_hier++) { + if (current_bcol == + schedule->component_functions[j_hier]. + constant_group_data.bcol_module) { + schedule->component_functions[j_hier]. 
+ constant_group_data.index_of_this_type_in_collective = cnt; + cnt++; + } + } + + schedule->component_functions[i_hier]. + constant_group_data.n_of_this_type_in_collective = cnt; + } + + MCA_COLL_ML_SET_SCHEDULE_ORDER_INFO(schedule); + + free(scratch_num); + free(scratch_indx); + + return OMPI_SUCCESS; + +Allreduce_Setup_Error: + + if (NULL != scratch_indx) { + free(scratch_indx); + } + + if (NULL != scratch_num) { + free(scratch_num); + } + + if (NULL != schedule && NULL != schedule->component_functions) { + free(schedule->component_functions); + } + + return ret; +} + +int ml_coll_hier_allreduce_setup_new(mca_coll_ml_module_t *ml_module) +{ + /* Hierarchy Setup */ + int ret; + int topo_index; + int alg; + mca_coll_ml_topology_t *topo_info = ml_module->topo_list; + + alg = mca_coll_ml_component.coll_config[ML_ALLREDUCE][ML_SMALL_MSG].algorithm_id; + topo_index = ml_module->collectives_topology_map[ML_ALLREDUCE][alg]; + if (ML_UNDEFINED == alg || ML_UNDEFINED == topo_index) { + ML_ERROR(("No topology index or algorithm was defined")); + topo_info->hierarchical_algorithms[ML_ALLREDUCE] = NULL; + return OMPI_ERROR; + } + + ret = mca_coll_ml_build_allreduce_schedule( + &ml_module->topo_list[topo_index], + &ml_module->coll_ml_allreduce_functions[alg], + SMALL_MSG_RANGE); + + if (OPAL_UNLIKELY(OMPI_SUCCESS != ret)) { + ML_VERBOSE(10, ("Failed to setup Small Message Allreduce")); + return ret; + } + + alg = mca_coll_ml_component.coll_config[ML_ALLREDUCE][ML_LARGE_MSG].algorithm_id; + topo_index = ml_module->collectives_topology_map[ML_ALLREDUCE][alg]; + if (ML_UNDEFINED == alg || ML_UNDEFINED == topo_index) { + ML_ERROR(("No topology index or algorithm was defined")); + topo_info->hierarchical_algorithms[ML_ALLREDUCE] = NULL; + return OMPI_ERROR; + } + + ret = mca_coll_ml_build_allreduce_schedule( + &ml_module->topo_list[topo_index], + &ml_module->coll_ml_allreduce_functions[alg], + LARGE_MSG_RANGE); + + if (OPAL_UNLIKELY(OMPI_SUCCESS != ret)) { + ML_VERBOSE(10, ("Failed to setup Large Message Allreduce")); + return ret; + } + + if (true == mca_coll_ml_component.need_allreduce_support) { + topo_index = ml_module->collectives_topology_map[ML_ALLREDUCE][ML_SMALL_DATA_EXTRA_TOPO_ALLREDUCE]; + if (ML_UNDEFINED == topo_index) { + ML_ERROR(("No topology index was defined")); + topo_info->hierarchical_algorithms[ML_ALLREDUCE] = NULL; + return OMPI_ERROR; + } + + ret = mca_coll_ml_build_allreduce_schedule( + &ml_module->topo_list[topo_index], + &ml_module->coll_ml_allreduce_functions[ML_SMALL_DATA_EXTRA_TOPO_ALLREDUCE], + SMALL_MSG_RANGE); + + if (OPAL_UNLIKELY(OMPI_SUCCESS != ret)) { + ML_VERBOSE(10, ("Failed to setup Extra Small Message Allreduce")); + return ret; + } + + topo_index = ml_module->collectives_topology_map[ML_ALLREDUCE][ML_LARGE_DATA_EXTRA_TOPO_ALLREDUCE]; + if (ML_UNDEFINED == topo_index) { + ML_ERROR(("No topology index was defined")); + topo_info->hierarchical_algorithms[ML_ALLREDUCE] = NULL; + return OMPI_ERROR; + } + + ret = mca_coll_ml_build_allreduce_schedule( + &ml_module->topo_list[topo_index], + &ml_module->coll_ml_allreduce_functions[ML_LARGE_DATA_EXTRA_TOPO_ALLREDUCE], + LARGE_MSG_RANGE); + + if (OPAL_UNLIKELY(OMPI_SUCCESS != ret)) { + ML_VERBOSE(10, ("Failed to setup Extra Large Message Allreduce")); + return ret; + } + } + + return OMPI_SUCCESS; +} diff --git a/ompi/mca/coll/ml/coll_ml_hier_algorithms_barrier_setup.c b/ompi/mca/coll/ml/coll_ml_hier_algorithms_barrier_setup.c index 74f7b65803..4eaa760895 100755 --- a/ompi/mca/coll/ml/coll_ml_hier_algorithms_barrier_setup.c +++
b/ompi/mca/coll/ml/coll_ml_hier_algorithms_barrier_setup.c @@ -17,7 +17,9 @@ static int mca_coll_ml_build_barrier_schedule( mca_coll_ml_topology_t *topo_info, - mca_coll_ml_collective_operation_description_t **coll_desc) + mca_coll_ml_collective_operation_description_t + **coll_desc, + mca_coll_ml_module_t *ml_module) { int i_hier, rc, i_fn, n_fcns, i, n_hiers = topo_info->n_levels; @@ -52,6 +54,10 @@ static int mca_coll_ml_build_barrier_schedule( n_fcns = 2 * n_hiers; } + if( ml_module->max_fn_calls < n_fcns ) { + ml_module->max_fn_calls = n_fcns; + } + /* Set dependencies equal to number of hierarchies */ schedule->n_fns = n_fcns; schedule->topo_info = topo_info; @@ -170,7 +176,7 @@ int ml_coll_hier_barrier_setup(mca_coll_ml_module_t *ml_module) &ml_module->topo_list[ml_module->collectives_topology_map[ML_BARRIER][ML_SMALL_MSG]]; rc = mca_coll_ml_build_barrier_schedule(topo_info, - &ml_module->coll_ml_barrier_function); + &ml_module->coll_ml_barrier_function, ml_module); if (OPAL_UNLIKELY(OMPI_SUCCESS != rc)) { /* Make sure to reset the barrier pointer to NULL */ topo_info->hierarchical_algorithms[BCOL_BARRIER] = NULL; diff --git a/ompi/mca/coll/ml/coll_ml_hier_algorithms_bcast_setup.c b/ompi/mca/coll/ml/coll_ml_hier_algorithms_bcast_setup.c index ef79645006..e594528da6 100644 --- a/ompi/mca/coll/ml/coll_ml_hier_algorithms_bcast_setup.c +++ b/ompi/mca/coll/ml/coll_ml_hier_algorithms_bcast_setup.c @@ -409,12 +409,11 @@ static int mca_coll_ml_build_bcast_sequential_schedule_no_attributes( /* if(comp_fn->coll_fn_started){ fprintf(stderr,"this statement is true\n"); - } else{ + } else { fprintf(stderr,"done setting to false \n"); } */ - comp_fn->task_comp_fn = mca_coll_ml_task_comp_dynamic_root_small_message; /* assert(NULL != comp_fn->bcol_function); */ /* Constants */ diff --git a/ompi/mca/coll/ml/coll_ml_hier_algorithms_ibarrier.c b/ompi/mca/coll/ml/coll_ml_hier_algorithms_ibarrier.c new file mode 100644 index 0000000000..0b654dbca3 --- /dev/null +++ b/ompi/mca/coll/ml/coll_ml_hier_algorithms_ibarrier.c @@ -0,0 +1,148 @@ +/* + * Copyright (c) 2009-2012 Oak Ridge National Laboratory. All rights reserved. + * Copyright (c) 2009-2012 Mellanox Technologies. All rights reserved. + * $COPYRIGHT$ + * + * Additional copyrights may follow + * + * $HEADER$ + */ + +#include "ompi/include/ompi/constants.h" +#include "ompi/mca/coll/ml/coll_ml.h" + +int ml_coll_hier_nonblocking_barrier_setup(mca_coll_ml_module_t *ml_module, + mca_coll_ml_topology_t *topo_info) +{ + /* local variables */ + int ret=OMPI_SUCCESS; + /* int global_low_hierarchy_index=topo_info->global_lowest_hier_group_index; */ + int global_high_hierarchy_index=topo_info->global_highest_hier_group_index; + int my_n_hierarchies=topo_info->n_levels; + int hier_loop_range,i,cnt; + mca_bcol_base_module_t *bcol_module; + /* collective algorithms */ + coll_ml_collective_description_t **hierarchical_algorithms = topo_info->hierarchical_algorithms; + + /* RLG: one non-blocking barrier collective algorithm - this is really a hack, + * we need to figure out how to do this in a bit more extensible + * manner. 
+ */ + hierarchical_algorithms[BCOL_IBARRIER]= (coll_ml_collective_description_t *) + malloc(sizeof(coll_ml_collective_description_t)); + if( NULL == hierarchical_algorithms[BCOL_IBARRIER]) { + ret=OMPI_ERROR; + goto Error; + } + + /* am I a member of the highest level subgroup */ + if( global_high_hierarchy_index == + topo_info->component_pairs[my_n_hierarchies-1].bcol_index ) + { + hier_loop_range=my_n_hierarchies-1; + /* allocate the function description */ + hierarchical_algorithms[BCOL_IBARRIER][0].n_functions=2*my_n_hierarchies-1; + /* collective description */ + hierarchical_algorithms[BCOL_IBARRIER][0].alg_params.coll_fn. + ibarrier_recursive_doubling.n_fanin_steps=my_n_hierarchies-1; + hierarchical_algorithms[BCOL_IBARRIER][0].alg_params.coll_fn. + ibarrier_recursive_doubling.n_fanout_steps=my_n_hierarchies-1; + hierarchical_algorithms[BCOL_IBARRIER][0].alg_params.coll_fn. + ibarrier_recursive_doubling.n_recursive_doubling_steps=1; + } else { + hier_loop_range=my_n_hierarchies; + /* allocate the function description */ + hierarchical_algorithms[BCOL_IBARRIER][0].n_functions=2*my_n_hierarchies; + /* collective description */ + hierarchical_algorithms[BCOL_IBARRIER][0].alg_params.coll_fn. + ibarrier_recursive_doubling.n_fanin_steps=my_n_hierarchies; + hierarchical_algorithms[BCOL_IBARRIER][0].alg_params.coll_fn. + ibarrier_recursive_doubling.n_fanout_steps=my_n_hierarchies; + hierarchical_algorithms[BCOL_IBARRIER][0].alg_params.coll_fn. + ibarrier_recursive_doubling.n_recursive_doubling_steps=0; + } + + /* number of temp buffers */ + hierarchical_algorithms[BCOL_IBARRIER]->n_buffers=0; + + /* allocate space for the functions */ + hierarchical_algorithms[BCOL_IBARRIER][0].functions=(coll_ml_function_t *) + malloc(sizeof(coll_ml_function_t)* + hierarchical_algorithms[BCOL_IBARRIER][0].n_functions); + if( NULL == hierarchical_algorithms[BCOL_IBARRIER][0].functions) { + ret=OMPI_ERROR; + goto Error; + } + + + /* + * The algorithm used here for an N level system + * - up to level N-2, inclusive : fan in + * - level N-1: barrier + * - level N-2, to level 0: fanout + */ + + cnt=0; + /* fan-in phase */ + for(i=0 ; i < hier_loop_range ; i++ ) { + bcol_module=topo_info->component_pairs[i].bcol_modules[0]; + hierarchical_algorithms[BCOL_IBARRIER][0].functions[cnt].fn_idx = BCOL_IFANIN; +/* RLG NOTE - is this a bug ? We do not set the bcol module pointer in the functions array ... */ + cnt++; + } + + + /* barrier */ + if( hier_loop_range != my_n_hierarchies ) { + bcol_module=topo_info->component_pairs[my_n_hierarchies-1].bcol_modules[0]; + hierarchical_algorithms[BCOL_IBARRIER][0].functions[cnt].fn_idx = BCOL_IBARRIER; + hierarchical_algorithms[BCOL_IBARRIER][0].functions[cnt].bcol_module = bcol_module; + cnt++; + } + + /* fan-out */ + for( i=hier_loop_range-1 ; i >=0 ; i-- ) { + bcol_module=topo_info->component_pairs[i].bcol_modules[0]; + hierarchical_algorithms[BCOL_IBARRIER][0].functions[cnt].fn_idx = BCOL_IFANOUT; + cnt++; + } + + /* set the number of functions pointers needed by this routine - this is + * used to figure out how large to allocate the function argument + * array. + */ + if( ml_module->max_fn_calls < cnt ) { + ml_module->max_fn_calls=cnt; + } + + /* set algorithm parameters */ + hierarchical_algorithms[BCOL_IBARRIER][0]. + alg_params.coll_fn.ibarrier_recursive_doubling.n_fanin_steps=hier_loop_range; + hierarchical_algorithms[BCOL_IBARRIER][0]. 
+ alg_params.coll_fn.ibarrier_recursive_doubling.n_fanout_steps=hier_loop_range; + if( hier_loop_range != my_n_hierarchies ) { + hierarchical_algorithms[BCOL_IBARRIER][0]. + alg_params.coll_fn.ibarrier_recursive_doubling.n_recursive_doubling_steps=1; + } else { + hierarchical_algorithms[BCOL_IBARRIER][0]. + alg_params.coll_fn.ibarrier_recursive_doubling.n_recursive_doubling_steps=0; + } + + + /* done */ + return ret; + +Error : + + if( hierarchical_algorithms[BCOL_IBARRIER][0].functions) { + free(hierarchical_algorithms[BCOL_IBARRIER][0].functions); + hierarchical_algorithms[BCOL_IBARRIER][0].functions=NULL; + } + + if( hierarchical_algorithms[BCOL_IBARRIER]) { + free(hierarchical_algorithms[BCOL_IBARRIER]); + hierarchical_algorithms[BCOL_IBARRIER]=NULL; + } + + return ret; +} diff --git a/ompi/mca/coll/ml/coll_ml_hier_algorithms_reduce_setup.c b/ompi/mca/coll/ml/coll_ml_hier_algorithms_reduce_setup.c new file mode 100644 index 0000000000..f40e2879e5 --- /dev/null +++ b/ompi/mca/coll/ml/coll_ml_hier_algorithms_reduce_setup.c @@ -0,0 +1,308 @@ +/* + * Copyright (c) 2009-2012 Oak Ridge National Laboratory. All rights reserved. + * Copyright (c) 2009-2012 Mellanox Technologies. All rights reserved. + * $COPYRIGHT$ + * + * Additional copyrights may follow + * + * $HEADER$ + */ + +#include "ompi_config.h" + +#include "ompi/mca/coll/ml/coll_ml.h" +#include "ompi/include/ompi/constants.h" +#include "ompi/mca/coll/ml/coll_ml_functions.h" +static int mca_coll_ml_task_comp_static_reduce + (struct mca_coll_ml_task_status_t *task) { + + task->ml_coll_operation->variable_fn_params.root_flag = true; + + return OMPI_SUCCESS; +} + +static void mca_coll_ml_static_reduce_non_root(mca_coll_ml_task_status_t *task_status, int index, + mca_coll_ml_compound_functions_t *func) +{ + /* I am not a root rank, but someone in my group is a root*/ + if (task_status->ml_coll_operation->variable_fn_params.root_route->level == index) { + task_status->rt_num_dependencies = func->num_dependencies; + task_status->rt_num_dependent_tasks = 0; + task_status->rt_dependent_task_indecies = NULL; + task_status->ml_coll_operation->variable_fn_params.root = + task_status->ml_coll_operation->variable_fn_params.root_route->rank; + } else { + task_status->rt_num_dependencies = 0; + task_status->rt_num_dependent_tasks = 1; + task_status->rt_dependent_task_indecies = task_status->ml_coll_operation->variable_fn_params.root_route->level; + } + +} + +static void mca_coll_ml_static_reduce_root(mca_coll_ml_task_status_t *task_status, int index, + mca_coll_ml_compound_functions_t *func) +{ + task_status->rt_num_dependencies = func->num_dependencies; + task_status->rt_num_dependent_tasks = 0; + task_status->rt_dependent_task_indecies = NULL; +} + +/* + * Fill up the collective descriptor + * + */ +static int mca_coll_ml_build_static_reduce_schedule( + mca_coll_ml_topology_t *topo_info, + mca_coll_ml_collective_operation_description_t **coll_desc) +{ + int i_hier, j_hier, n_fcns, + n_hiers = topo_info->n_levels; + int *scratch_indx = NULL, + *scratch_num = NULL; + int cnt, value_to_set = 0; + int ret = OMPI_SUCCESS; + bool prev_is_zero; + mca_coll_ml_compound_functions_t *comp_fns_temp; + mca_bcol_base_module_t *prev_bcol, + *bcol_module; + mca_coll_ml_compound_functions_t *comp_fn; + mca_coll_ml_collective_operation_description_t *schedule = NULL; + + *coll_desc = (mca_coll_ml_collective_operation_description_t *) + malloc(sizeof(mca_coll_ml_collective_operation_description_t)); + + schedule = *coll_desc; + if (OPAL_UNLIKELY(NULL == schedule)) { 
+ ML_ERROR(("Can't allocate memory.\n")); + ret = OMPI_ERR_OUT_OF_RESOURCE; + goto Error; + } + + scratch_indx = (int *) malloc(sizeof(int) * (n_hiers)); + if (NULL == scratch_indx) { + ML_ERROR(("Can't allocate memory.\n")); + ret = OMPI_ERR_OUT_OF_RESOURCE; + goto Error; + } + + scratch_num = (int *) malloc(sizeof(int) * (n_hiers)); + if (NULL == scratch_num) { + ML_ERROR(("Can't allocate memory.\n")); + ret = OMPI_ERR_OUT_OF_RESOURCE; + goto Error; + } + + prev_bcol = NULL; + + /* Calculate scratch numbers */ + for (i_hier = 0; i_hier < n_hiers; i_hier++) { + if (IS_BCOL_TYPE_IDENTICAL(prev_bcol, GET_BCOL(topo_info, i_hier))) { + scratch_indx[i_hier] = scratch_indx[i_hier - 1] + 1; + } else { + scratch_indx[i_hier] = 0; + prev_bcol = GET_BCOL(topo_info, i_hier); + } + } + + --i_hier; + prev_is_zero = true; + + do { + if (prev_is_zero) { + value_to_set = scratch_indx[i_hier] + 1; + prev_is_zero = false; + } + + if (0 == scratch_indx[i_hier]) { + prev_is_zero = true; + } + + scratch_num[i_hier] = value_to_set; + --i_hier; + } while(i_hier >= 0); + + /* All hierarchies call one function, unlike other collectives */ + n_fcns = n_hiers; + + /* Set dependencies equal to number of hierarchies */ + schedule->n_fns = n_fcns; + schedule->topo_info = topo_info; + schedule->progress_type = 0; + /* Allocated the component function */ + schedule->component_functions = (struct mca_coll_ml_compound_functions_t *) + calloc(n_fcns, sizeof(struct mca_coll_ml_compound_functions_t)); + + if (OPAL_UNLIKELY(NULL == schedule->component_functions)) { + ML_ERROR(("Can't allocate memory.\n")); + ret = OMPI_ERR_OUT_OF_RESOURCE; + goto Error; + } + + + for (i_hier = 0; i_hier < n_hiers; ++i_hier) { + comp_fn = &schedule->component_functions[i_hier]; + + /* The hierarchial level */ + comp_fn->h_level = i_hier; + bcol_module = GET_BCOL(topo_info, i_hier); + + comp_fn->bcol_function = + bcol_module->filtered_fns_table[DATA_SRC_KNOWN][NON_BLOCKING][BCOL_REDUCE][1][0][0]; + + strcpy(comp_fn->fn_name, "REDUCE"); + ML_VERBOSE(10, ("func indx %d set to %p", i_hier, comp_fn->bcol_function)); + + + ML_VERBOSE(1,("In ML_REDUCE_SETUP .. looks fine here")); + /* No need completion func for Barrier */ + comp_fn->task_comp_fn = mca_coll_ml_task_comp_static_reduce; + + /* Constants */ + comp_fn->constant_group_data.bcol_module = bcol_module; + comp_fn->constant_group_data.index_in_consecutive_same_bcol_calls = scratch_indx[i_hier]; + comp_fn->constant_group_data.n_of_this_type_in_a_row = scratch_num[i_hier]; + comp_fn->constant_group_data.n_of_this_type_in_collective = 0; + comp_fn->constant_group_data.index_of_this_type_in_collective = 0; + + ML_VERBOSE(10, ("Setting collective [reduce] fn_idx %d, n_of_this_type_in_a_row %d, " + "index_in_consecutive_same_bcol_calls %d.", + i_hier, comp_fn->constant_group_data.n_of_this_type_in_a_row, + comp_fn->constant_group_data.index_in_consecutive_same_bcol_calls)); + } + + + /* Fill the rest of constant data */ + for (i_hier = 0; i_hier < n_hiers; i_hier++) { + mca_bcol_base_module_t *current_bcol = + schedule->component_functions[i_hier]. + constant_group_data.bcol_module; + cnt = 0; + for (j_hier = 0; j_hier < n_hiers; j_hier++) { + if (current_bcol == + schedule->component_functions[j_hier]. + constant_group_data.bcol_module) { + schedule->component_functions[j_hier]. + constant_group_data.index_of_this_type_in_collective = cnt; + cnt++; + } + } + schedule->component_functions[i_hier]. 
+ constant_group_data.n_of_this_type_in_collective = cnt; + } + + /* Manju: Reduction should always use the fixed schedule. + * The subgroups that this process is leader should be executed first, then + * it should execute the subgroups where this process is not a leader, and + * then execute the subgroup that includes the root. + */ + + /* Allocate the schedule list */ + schedule->comp_fn_arr = (struct mca_coll_ml_compound_functions_t **) + calloc(n_hiers,sizeof(struct mca_coll_ml_compound_functions_t *)); + if (NULL == schedule->comp_fn_arr) { + ML_ERROR(("Can't allocate memory.\n")); + ret = OMPI_ERR_OUT_OF_RESOURCE; + goto Error; + } + + /* Now that the functions have been set-up properly, we can simple permute the ordering a bit */ + + for (i_hier = 0; i_hier < n_hiers; i_hier++) { + /* first one is trivial */ + int leader_hierarchy = 0; + int non_leader_hierarchy = 0; + int func_index; + + comp_fns_temp = (struct mca_coll_ml_compound_functions_t *) + calloc(n_hiers, sizeof(struct mca_coll_ml_compound_functions_t)); + + leader_hierarchy = 0; + non_leader_hierarchy = n_hiers - 2; + + for(j_hier = 0; j_hier < n_hiers - 1 ; j_hier++) { + + func_index = j_hier < i_hier ? j_hier : j_hier + 1; + /* I'm a leader for this group */ + if (0 == topo_info->component_pairs->subgroup_module->my_index) { + comp_fns_temp[leader_hierarchy++] = + schedule->component_functions[func_index]; + } + else { + comp_fns_temp[non_leader_hierarchy--] = + schedule->component_functions[func_index]; + } + } + + comp_fns_temp[j_hier] = schedule->component_functions[i_hier]; + /* now let's attach this list to our array of lists */ + schedule->comp_fn_arr[i_hier] = comp_fns_temp; + } + + /* Manju: Do we need this ? */ + + /* I'm going to just loop over each schedule and + * set up the scratch indices, scratch numbers + * and other constant data + */ + /* + for( i_hier = 1; i_hier < n_hiers; i_hier++) { + ret = mca_coll_ml_setup_scratch_vals(schedule->comp_fn_arr[i_hier], scratch_indx, + scratch_num, n_hiers); + if( OMPI_SUCCESS != ret ) { + ret = OMPI_ERROR; + goto Error; + } + + } + */ + + /* Do I need this ? 
*/ + schedule->task_setup_fn[COLL_ML_ROOT_TASK_FN] = mca_coll_ml_static_reduce_root; + schedule->task_setup_fn[COLL_ML_GENERAL_TASK_FN] = mca_coll_ml_static_reduce_non_root; + + MCA_COLL_ML_SET_SCHEDULE_ORDER_INFO(schedule); + + free(scratch_num); + free(scratch_indx); + + return OMPI_SUCCESS; + +Error: + if (NULL != schedule->component_functions) { + free(schedule->component_functions); + schedule->component_functions = NULL; + } + + return ret; +} + + +int ml_coll_hier_reduce_setup(mca_coll_ml_module_t *ml_module) +{ + int alg, ret, topo_index=0; + mca_coll_ml_topology_t *topo_info = + &ml_module->topo_list[ml_module->collectives_topology_map[ML_REDUCE][ML_SMALL_MSG]]; + + if ( ml_module->max_fn_calls < topo_info->n_levels ) { + ml_module->max_fn_calls = topo_info->n_levels; + } + + + alg = mca_coll_ml_component.coll_config[ML_REDUCE][ML_SMALL_MSG].algorithm_id; + topo_index = ml_module->collectives_topology_map[ML_REDUCE][alg]; + if (ML_UNDEFINED == alg || ML_UNDEFINED == topo_index) { + ML_ERROR(("No topology index or algorithm was defined")); + topo_info->hierarchical_algorithms[ML_REDUCE] = NULL; + return OMPI_ERROR; + } + + ret = mca_coll_ml_build_static_reduce_schedule(&ml_module->topo_list[topo_index], + &ml_module->coll_ml_reduce_functions[alg]); + if (OPAL_UNLIKELY(OMPI_SUCCESS != ret)) { + ML_VERBOSE(10, ("Failed to setup static reduce")); + return ret; + } + + + return OMPI_SUCCESS; +} diff --git a/ompi/mca/coll/ml/coll_ml_hier_algorithms_setup.c b/ompi/mca/coll/ml/coll_ml_hier_algorithms_setup.c index 7f1af65bab..2a63d5e130 100644 --- a/ompi/mca/coll/ml/coll_ml_hier_algorithms_setup.c +++ b/ompi/mca/coll/ml/coll_ml_hier_algorithms_setup.c @@ -300,7 +300,10 @@ int ml_coll_hier_allreduce_setup(mca_coll_ml_module_t *ml_module) return ret; } - +#if 0 +/* + * Manju: New setup function in coll_ml_hier_algorithms_reduce_setup.c + */ /* Ishai: Reduce is not an hier algorithm (it is rooted) - it needs a different ML algorithm */ /* Need to rewrite */ int ml_coll_hier_reduce_setup(mca_coll_ml_module_t *ml_module) @@ -320,6 +323,7 @@ int ml_coll_hier_reduce_setup(mca_coll_ml_module_t *ml_module) ml_module->topo_list[topo_index].hierarchical_algorithms[BCOL_BCAST] = NULL; return ret; } +#endif int ml_coll_barrier_constant_group_data_setup( mca_coll_ml_topology_t *topo_info, diff --git a/ompi/mca/coll/ml/coll_ml_ibarrier.c b/ompi/mca/coll/ml/coll_ml_ibarrier.c new file mode 100644 index 0000000000..12fe3913d8 --- /dev/null +++ b/ompi/mca/coll/ml/coll_ml_ibarrier.c @@ -0,0 +1,145 @@ +/* + * Copyright (c) 2009-2012 Oak Ridge National Laboratory. All rights reserved. + * Copyright (c) 2009-2012 Mellanox Technologies. All rights reserved. 
+ * $COPYRIGHT$ + * + * Additional copyrights may follow + * + * $HEADER$ + */ +/** @file */ + +#include "ompi_config.h" + +#include "ompi/constants.h" +#include "opal/threads/mutex.h" +#include "ompi/communicator/communicator.h" +#include "ompi/mca/coll/coll.h" +#include "ompi/mca/bcol/bcol.h" +#include "opal/sys/atomic.h" +#include "ompi/mca/coll/ml/coll_ml.h" +#include "ompi/mca/coll/ml/coll_ml_hier_algorithms.h" + + coll_ml_collective_description_t *collective_op; + bcol_fn_args_t fn_arguments; + mca_coll_ml_descriptor_t *msg_desc; + + static int coll_ml_setup_ibarrier_instance_recursive_doubling( + mca_coll_ml_descriptor_t *msg_desc, + coll_ml_collective_description_t *collective_op) +{ + /* local variables */ + int ret=OMPI_SUCCESS; + int i_fn,cnt; + + /* initialize function arguments */ + + /* mark all routines as not yet started - need this, so that + * when we try to progress the barrier, we know where to pick up + * when a function is called - MOVE this into the setup function. + */ + for(i_fn=0 ; i_fn < collective_op->n_functions ; i_fn++ ) { + msg_desc->fragment.fn_args[i_fn].function_status=FUNCTION_NOT_STARTED; + } + + /* setup the fanin root */ + for(i_fn=0 ; i_fn < collective_op-> + alg_params.coll_fn.ibarrier_recursive_doubling.n_fanin_steps ; + i_fn++ ) { + mca_bcol_base_module_t *bcol_mod= + msg_desc->local_comm_description->functions[i_fn].bcol_module; + /* the lowest rank in the group will be the root */ + msg_desc->fragment.fn_args[i_fn].root=0; + } + + /* setup the fanout root */ + cnt=collective_op->alg_params.coll_fn.ibarrier_recursive_doubling.n_fanin_steps+ + collective_op->alg_params.coll_fn.ibarrier_recursive_doubling.n_recursive_doubling_steps; + for(i_fn=cnt ; i_fn < cnt + collective_op-> + alg_params.coll_fn.ibarrier_recursive_doubling.n_fanin_steps ; + i_fn++ ) { + mca_bcol_base_module_t *bcol_mod= + msg_desc->local_comm_description->functions[i_fn].bcol_module; + /* the lowest rank in the group will be the root */ + msg_desc->fragment.fn_args[i_fn].root=0; + } + + /* successful completion */ + return ret; +} + +/** + * Hierarchical non-blocking barrier + */ +int mca_coll_ml_nb_barrier_intra( struct ompi_communicator_t *comm, + ompi_request_t ** request, mca_coll_base_module_t *module) +{ + /* local variables */ + int ret=OMPI_SUCCESS; + mca_coll_ml_module_t *ml_module; + uint64_t sequence_number; + int i_fn; + coll_ml_collective_description_t *collective_op; + bcol_fn_args_t fn_arguments; + mca_coll_ml_descriptor_t *msg_desc; + + ml_module=(mca_coll_ml_module_t *) module; +/* debug */ +fprintf(stderr," mca_coll_ml_nb_barrier_intra called \n"); +fflush(stderr); +/* end debug */ + + /* grab full message descriptor - RLG: this is really not optimal, + * as we may be doing too much initialization if the collective + * routine completes on the first call to progress which is called + * within this routine. Need to see if we can be more efficient + * here. The current issue is that the only way that I can think + * to do this now is with two separate code paths, which I want to + * avoid at this stage.
+ */ + OMPI_FREE_LIST_GET(&(ml_module->message_descriptors), + msg_desc,ret); + if( OMPI_SUCCESS != ret) { + goto Error; + } + + /* get message sequence number */ + sequence_number=OPAL_THREAD_ADD64( + &(ml_module->no_data_collective_sequence_num),1); + fn_arguments.sequence_num=sequence_number; + + + /* get pointer to schedule - only one algorithm at this stage */ + collective_op=&(ml_module->hierarchical_algorithms[BCOL_NB_BARRIER][0]); + + /* call setup function - RLG: right now this is totally extra, + * but if we are going to have more than one algorithm, + * this is a better way to do this. */ + coll_ml_setup_ibarrier_instance_recursive_doubling( + msg_desc,collective_op); + + /* call the progress function to actually start the barrier */ + + /* recycle resources - RLG: need to think about this one */ + +#if 0 + /* run barrier */ + /* need to add bcol context for the call */ + for( i_fn =0 ; i_fn < collective_op->n_functions ; i_fn++ ) { + mca_bcol_base_module_t *bcol_module= + collective_op->functions[i_fn].bcol_module; + /* for barrier, all we need is the group information that is + * captured in the bcol module + */ + ret=collective_op->functions[i_fn].fn(&fn_arguments, + NULL,NULL,bcol_module); + if( OMPI_SUCCESS != ret) { + } goto Error; + } +#endif + + return OMPI_SUCCESS; + +Error: + return ret; +} diff --git a/ompi/mca/coll/ml/coll_ml_inlines.h b/ompi/mca/coll/ml/coll_ml_inlines.h index fc3733a89a..4f1e3a189c 100644 --- a/ompi/mca/coll/ml/coll_ml_inlines.h +++ b/ompi/mca/coll/ml/coll_ml_inlines.h @@ -1,3 +1,4 @@ +/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */ /* * Copyright (c) 2009-2012 Oak Ridge National Laboratory. All rights reserved. * Copyright (c) 2009-2012 Mellanox Technologies. All rights reserved. @@ -19,17 +20,6 @@ BEGIN_C_DECLS -static inline int mca_coll_ml_err(const char* fmt, ...) -{ - va_list list; - int ret; - - va_start(list, fmt); - ret = vfprintf(stderr, fmt, list); - va_end(list); - return ret; -} - static inline __opal_attribute_always_inline__ int ml_fls(int num) { int i = 1; @@ -186,15 +176,23 @@ static inline __opal_attribute_always_inline__ int coll_ml_fragment_completion_ * the MPI level request object, wich is the first element * in the fragment descriptor. */ + /* I contend that this is a bug. This is not the right way to check + * for the first fragment as it assumes that the first fragment would always + * for any collective have zero as the first offset or that other subsequent + * fragments would not. It is not safe to assume this. The correct check is + * the following one + */ + ML_VERBOSE(10, ("Master ? %p %d", coll_op, coll_op->fragment_data.offset_into_user_buffer)); - /* offset_into_user_buffer == 0 is not a valid definition for the first frag. - * if (0 != coll_op->fragment_data.offset_into_user_buffer && - * !out_of_resource) { - * A well posed definition of the first frag is the following + /* This check is in fact a bug. Not the correct definiton of first + * fragment. 
First fragment is the only fragment that satisfies the + * following criteria */ - if((&coll_op->full_message != - coll_op->fragment_data.message_descriptor) && - !out_of_resource){ + /*if (0 != coll_op->fragment_data.offset_into_user_buffer && + !out_of_resource) { + */ + if (((&coll_op->full_message != coll_op->fragment_data.message_descriptor) && + !out_of_resource) || IS_COLL_SYNCMEM(coll_op)) { /* non-zero offset ==> this is not fragment 0 */ CHECK_AND_RECYCLE(coll_op); } @@ -414,7 +412,7 @@ static inline __opal_attribute_always_inline__ int mca_coll_ml_generic_collectiv for (fn_index = 0; fn_index < op_desc->n_fns; fn_index++) { func = &op_desc->component_functions[fn_index]; task_status = &op_prog->dag_description.status_array[fn_index]; - /* fire the collective imidiate if it has no dependencies */ + /* fire the collective immediately if it has no dependencies */ if (0 == task_status->rt_num_dependencies) { rc = func->bcol_function->coll_fn(&op_prog->variable_fn_params, /* Pasha: Need to update the prototype of the func, @@ -436,7 +434,7 @@ static inline __opal_attribute_always_inline__ int mca_coll_ml_generic_collectiv OPAL_THREAD_UNLOCK(&(mca_coll_ml_component.active_tasks_mutex)); break; case BCOL_FN_COMPLETE: - /* the tast is done ! lets start relevant dependencies */ + /* the task is done ! lets start relevant dependencies */ ML_VERBOSE(9, ("Call to bcol collecitive return BCOL_FN_COMPLETE")); /* the task does not belong to any list, yes. So passing NULL */ ret = mca_coll_ml_task_completion_processing(&task_status, NULL); @@ -555,20 +553,24 @@ static inline __opal_attribute_always_inline__ void mca_coll_ml_convertor_get_send_frag_size(mca_coll_ml_module_t *ml_module, size_t *frag_size, struct full_message_t *message_descriptor) { - size_t ml_fragment_size = ml_module->ml_fragment_size; + size_t fragment_size = *frag_size; opal_convertor_t *dummy_convertor = &message_descriptor->dummy_convertor; /* The last frag needs special service */ - if (ml_fragment_size > - message_descriptor->send_converter_bytes_packed) { + if (fragment_size > + (size_t) message_descriptor->send_converter_bytes_packed) { *frag_size = message_descriptor->send_converter_bytes_packed; message_descriptor->send_converter_bytes_packed = 0; return; } - - *frag_size = ml_fragment_size; - message_descriptor->dummy_conv_position += ml_fragment_size; + if( (message_descriptor->dummy_conv_position + fragment_size) > + message_descriptor->n_bytes_total ) { + message_descriptor->dummy_conv_position = (message_descriptor->dummy_conv_position + fragment_size) + - message_descriptor->n_bytes_total; + } else { + message_descriptor->dummy_conv_position += fragment_size; + } opal_convertor_generic_simple_position(dummy_convertor, &message_descriptor->dummy_conv_position); *frag_size -= dummy_convertor->partial_length; @@ -576,6 +578,60 @@ static inline __opal_attribute_always_inline__ message_descriptor->send_converter_bytes_packed -= (*frag_size); } +static inline __opal_attribute_always_inline__ int +mca_coll_ml_launch_sequential_collective (mca_coll_ml_collective_operation_progress_t *coll_op) +{ + mca_bcol_base_coll_fn_desc_t *bcol_func; + int ifunc, n_fn, ih, ret; + mca_coll_ml_collective_operation_description_t *sched = + coll_op->coll_schedule; + + n_fn = sched->n_fns; + ih = coll_op->sequential_routine.current_active_bcol_fn; + + /* if collectives are already pending just add this one to the list */ + if (opal_list_get_size (&mca_coll_ml_component.sequential_collectives)) { + 
opal_list_append(&mca_coll_ml_component.sequential_collectives, (opal_list_item_t *) coll_op); + + return OMPI_SUCCESS; + } + + for (ifunc = ih; ifunc < n_fn; ifunc++, coll_op->sequential_routine.current_active_bcol_fn++) { + ret = coll_op->sequential_routine.seq_task_setup(coll_op); + if (OMPI_SUCCESS != ret) { + return ret; + } + + bcol_func = (sched->component_functions[ifunc].bcol_function); + ret = bcol_func->coll_fn(&coll_op->variable_fn_params, + (struct coll_ml_function_t *) &sched->component_functions[ifunc].constant_group_data); + + if (BCOL_FN_COMPLETE == ret) { + if (ifunc == n_fn - 1) { + ret = coll_ml_fragment_completion_processing(coll_op); + if (OPAL_UNLIKELY(OMPI_SUCCESS != ret)) { + mca_coll_ml_abort_ml("Failed to run coll_ml_fragment_completion_processing"); + } + + return OMPI_SUCCESS; + } + } else { + if (BCOL_FN_STARTED == ret) { + coll_op->sequential_routine.current_bcol_status = SEQ_TASK_IN_PROG; + } else { + coll_op->sequential_routine.current_bcol_status = SEQ_TASK_PENDING; + } + + ML_VERBOSE(10, ("Adding pending bcol to the progress list to access by ml_progress func-id %d", ifunc)); + opal_list_append(&mca_coll_ml_component.sequential_collectives, (opal_list_item_t *) coll_op); + + break; + } + } + + return OMPI_SUCCESS; +} + END_C_DECLS #endif diff --git a/ompi/mca/coll/ml/coll_ml_mca.c b/ompi/mca/coll/ml/coll_ml_mca.c index 13d6dd370e..0ee75e569c 100644 --- a/ompi/mca/coll/ml/coll_ml_mca.c +++ b/ompi/mca/coll/ml/coll_ml_mca.c @@ -54,6 +54,13 @@ mca_base_var_enum_value_t fragmentation_enable_enum[] = { {-1, NULL} }; +mca_base_var_enum_value_t bcast_algorithms[] = { + {COLL_ML_STATIC_BCAST, "static"}, + {COLL_ML_SEQ_BCAST, "sequential"}, + {COLL_ML_UNKNOWN_BCAST, "unknown-root"}, + {-1, NULL} +}; + /* * utility routine for string parameter registration */ @@ -197,85 +204,76 @@ int mca_coll_ml_register_params(void) /* register openib component parameters */ - CHECK(reg_int("priority", NULL, - "ML component priority" - "(from 0(low) to 90 (high))", 0, &mca_coll_ml_component.ml_priority, 0)); + CHECK(reg_int("priority", NULL, "ML component priority" + "(from 0(low) to 90 (high))", 27, &mca_coll_ml_component.ml_priority, 0)); - CHECK(reg_int("verbose", NULL, - "Output some verbose ML information " + CHECK(reg_int("verbose", NULL, "Output some verbose ML information " "(0 = no output, nonzero = output)", 0, &mca_coll_ml_component.verbose, 0)); - CHECK(reg_int("n_levels", NULL, - "number of levels in the hierarchy ", 1, &mca_coll_ml_component.ml_n_levels, 0)); + CHECK(reg_int("max_comm", NULL, "Maximum number of communicators that can use coll/ml", 24, + (int *) &mca_coll_ml_component.max_comm, 0)); - CHECK(reg_int("max_comm", NULL, - "max of communicators available to run ML", 12, (int *) &mca_coll_ml_component.max_comm, 0)); + CHECK(reg_int("min_comm_size", NULL, "Minimum size of communicator to use coll/ml", 0, + &mca_coll_ml_component.min_comm_size, 0)); - CHECK(reg_int("min_comm_size", NULL, - " min size of comm to be available to run ML", 0, &mca_coll_ml_component.min_comm_size, 0)); + CHECK(reg_int("n_payload_mem_banks", NULL, "Number of payload memory banks", 2, + &mca_coll_ml_component.n_payload_mem_banks, 0)); - CHECK(reg_int("n_payload_mem_banks", NULL, - "number of payload memory banks", 2, &mca_coll_ml_component.n_payload_mem_banks, 0)); - - CHECK(reg_int("n_payload_buffs_per_bank", NULL, - "number of payload buffers per bank", 16, &mca_coll_ml_component.n_payload_buffs_per_bank, 0)); + CHECK(reg_int("n_payload_buffs_per_bank", NULL, "Number of payload 
buffers per bank", 16, + &mca_coll_ml_component.n_payload_buffs_per_bank, 0)); /* RLG: need to handle alignment and size */ - CHECK(reg_ullint("payload_buffer_size", NULL, - "size of payload buffer", 4*1024, &mca_coll_ml_component.payload_buffer_size, 0)); + CHECK(reg_ullint("payload_buffer_size", NULL, "Size of payload buffers", 4*1024, + &mca_coll_ml_component.payload_buffer_size, 0)); /* get the pipeline depth, default is 2 */ - CHECK(reg_int("pipeline_depth", NULL, - "size of fragmentation pipeline", 2, &mca_coll_ml_component.pipeline_depth, 0)); + CHECK(reg_int("pipeline_depth", NULL, "Size of fragmentation pipeline", 2, + &mca_coll_ml_component.pipeline_depth, 0)); - CHECK(reg_int("free_list_init_size", NULL, - " Initial size for free lists in ML", 128, &mca_coll_ml_component.free_list_init_size, 0)); + CHECK(reg_int("free_list_init_size", NULL, "Initial size of free lists in coll/ml", 128, + &mca_coll_ml_component.free_list_init_size, 0)); - CHECK(reg_int("free_list_grow_size", NULL, - " Initial size for free lists in ML", 64, &mca_coll_ml_component.free_list_grow_size, 0)); + CHECK(reg_int("free_list_grow_size", NULL, "Initial size of free lists in coll/ml", 64, + &mca_coll_ml_component.free_list_grow_size, 0)); - CHECK(reg_int("free_list_max_size", NULL, - " Initial size for free lists in ML", -1, &mca_coll_ml_component.free_list_max_size, 0)); + CHECK(reg_int("free_list_max_size", NULL, "Initial size of free lists in coll/ml", -1, + &mca_coll_ml_component.free_list_max_size, 0)); - CHECK(reg_int("use_knomial_allreduce", NULL, - "Use k-nomial Allreduce supports only p2p currently" - , 1, &mca_coll_ml_component.use_knomial_allreduce, 0)); + mca_coll_ml_component.use_knomial_allreduce = 1; - CHECK(reg_bool("use_static_bcast", NULL, - "Use new bcast static algorithm", true, &mca_coll_ml_component.use_static_bcast)); + tmp = mca_base_var_enum_create ("coll_ml_bcast_algorithm", bcast_algorithms, &new_enum); + if (OPAL_SUCCESS != tmp) { + return tmp; + } - CHECK(reg_bool("use_sequential_bcast", NULL, - "Use new bcast static algorithm", false, &mca_coll_ml_component.use_sequential_bcast)); + mca_coll_ml_component.bcast_algorithm = COLL_ML_STATIC_BCAST; + tmp = mca_base_component_var_register (&mca_coll_ml_component.super.collm_version, "bcast_algorithm", + "Algorithm to use for broadcast", MCA_BASE_VAR_TYPE_INT, + new_enum, 0, 0, OPAL_INFO_LVL_9, MCA_BASE_VAR_SCOPE_READONLY, + &mca_coll_ml_component.bcast_algorithm); - CHECK(reg_bool("disable_allgather", NULL, - "Allgather disabling", - false, &mca_coll_ml_component.disable_allgather)); + CHECK(reg_bool("disable_allgather", NULL, "Disable Allgather", false, + &mca_coll_ml_component.disable_allgather)); - CHECK(reg_bool("disable_alltoall", NULL, - "Alltoall disabling", - false, &mca_coll_ml_component.disable_alltoall)); + CHECK(reg_bool("disable_reduce", NULL, "Disable Reduce", false, + &mca_coll_ml_component.disable_reduce)); tmp = mca_base_var_enum_create ("coll_ml_enable_fragmentation_enum", fragmentation_enable_enum, &new_enum); - if (OPAL_SUCCESS != ret) { - return tmp; + if (OPAL_SUCCESS != tmp) { + return tmp; } /* default to auto-enable fragmentation */ mca_coll_ml_component.enable_fragmentation = 2; tmp = mca_base_component_var_register (&mca_coll_ml_component.super.collm_version, "enable_fragmentation", - "Disable/Enable fragmentation for large messages", MCA_BASE_VAR_TYPE_INT, - new_enum, 0, 0, OPAL_INFO_LVL_9, MCA_BASE_VAR_SCOPE_READONLY, - &mca_coll_ml_component.enable_fragmentation); + "Disable/Enable fragmentation for large 
messages", MCA_BASE_VAR_TYPE_INT, + new_enum, 0, 0, OPAL_INFO_LVL_9, MCA_BASE_VAR_SCOPE_READONLY, + &mca_coll_ml_component.enable_fragmentation); if (0 > tmp) { ret = tmp; } OBJ_RELEASE(new_enum); - CHECK(reg_int("use_brucks_smsg_alltoall", NULL, - "Use Bruck's Algo for Small Msg Alltoall" - "1 - Bruck's Algo with RDMA; 2 - Bruck's with Send Recv" - , 0, &mca_coll_ml_component.use_brucks_smsg_alltoall, 0)); - asprintf(&str, "%s/mca-coll-ml.config", opal_install_dirs.ompidatadir); if (NULL == str) { diff --git a/ompi/mca/coll/ml/coll_ml_memsync.c b/ompi/mca/coll/ml/coll_ml_memsync.c index 65b9fb6ea8..cc79924088 100644 --- a/ompi/mca/coll/ml/coll_ml_memsync.c +++ b/ompi/mca/coll/ml/coll_ml_memsync.c @@ -1,6 +1,9 @@ +/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */ /* * Copyright (c) 2009-2012 Oak Ridge National Laboratory. All rights reserved. * Copyright (c) 2009-2012 Mellanox Technologies. All rights reserved. + * Copyright (c) 2013 Los Alamos National Security, LLC. All rights + * reserved. * $COPYRIGHT$ * * Additional copyrights may follow @@ -35,7 +38,7 @@ static int mca_coll_ml_memsync_recycle_memory(mca_coll_ml_collective_operation_p ML_MEMSYNC == coll_op->fragment_data.current_coll_op); ML_VERBOSE(10,("MEMSYNC: bank %d was recycled coll_op %p", bank, coll_op)); - + /* set the bank as free */ ml_memblock->bank_is_busy[bank] = false; @@ -61,11 +64,11 @@ static int mca_coll_ml_memsync_recycle_memory(mca_coll_ml_collective_operation_p } break; case OMPI_ERR_TEMP_OUT_OF_RESOURCE: - ML_VERBOSE(10, ("Already on hte list %p", pending_op)); + ML_VERBOSE(10, ("Already on the list %p", pending_op)); have_resources = false; break; default: - ML_ERROR(("Error happend %d", rc)); + ML_ERROR(("Error happened %d", rc)); return rc; } } @@ -133,8 +136,6 @@ int mca_coll_ml_memsync_intra(mca_coll_ml_module_t *ml_module, int bank_index) ML_VERBOSE(8, ("MEMSYNC start")); - OBJ_RETAIN(ml_module->comm); - if (OPAL_UNLIKELY(0 == opal_list_get_size(&ml_module->active_bcols_list))) { /* Josh's change: In the case where only p2p is active, we have no way * to reset the bank release counters to zero, I am doing that here since it @@ -147,12 +148,11 @@ int mca_coll_ml_memsync_intra(mca_coll_ml_module_t *ml_module, int bank_index) * ptp case. */ mca_coll_ml_collective_operation_progress_t dummy_coll; + dummy_coll.coll_module = (mca_coll_base_module_t *) ml_module; dummy_coll.fragment_data.current_coll_op = ML_MEMSYNC; dummy_coll.full_message.bank_index_to_recycle = bank_index; - dummy_coll.fragment_data.offset_into_user_buffer = 100; /* must be non-zero, - * else assert fails in recycle flow - */ + /* Handling special case when memory syncronization is not required */ rc = mca_coll_ml_memsync_recycle_memory(&dummy_coll); if(OPAL_UNLIKELY(rc != OMPI_SUCCESS)){ @@ -160,6 +160,9 @@ int mca_coll_ml_memsync_intra(mca_coll_ml_module_t *ml_module, int bank_index) return rc; } } else { + /* retain the communicator until the operation is finished. the communicator + * will be released by CHECK_AND_RECYCLE */ + OBJ_RETAIN(ml_module->comm); rc = mca_coll_ml_memsync_launch(ml_module, &req, bank_index); if (OPAL_UNLIKELY(rc != OMPI_SUCCESS)) { diff --git a/ompi/mca/coll/ml/coll_ml_module.c b/ompi/mca/coll/ml/coll_ml_module.c index 40353561eb..7eadc77221 100644 --- a/ompi/mca/coll/ml/coll_ml_module.c +++ b/ompi/mca/coll/ml/coll_ml_module.c @@ -1,9 +1,10 @@ +/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */ /* - * Copyright (c) 2009-2012 Oak Ridge National Laboratory. All rights reserved. 
+ * Copyright (c) 2009-2013 Oak Ridge National Laboratory. All rights reserved. * Copyright (c) 2009-2012 Mellanox Technologies. All rights reserved. - * Copyright (c) 2012 Los Alamos National Security, LLC. - * All rights reserved. - * Copyright (c) 2013 Cisco Systems, Inc. All rights reserved. + * Copyright (c) 2012-2013 Los Alamos National Security, LLC. All rights + * reserved. + * Copyright (c) 2013 Cisco Systems, Inc. All rights reserved. * $COPYRIGHT$ * * Additional copyrights may follow @@ -46,6 +47,8 @@ #include "coll_ml_custom_utils.h" #include "coll_ml_allocation.h" +static int coll_ml_parse_topology (sub_group_params_t *sub_group_meta_data, size_t sub_group_count, + int *list_of_ranks_in_all_subgroups, int level_one_size); /* #define NEW_LEADER_SELECTION */ @@ -66,28 +69,28 @@ struct ranks_proxy_t { }; typedef struct rank_proxy_t rank_proxy_t; -#define PROVIDE_SUFFICIENT_MEMORY(ptr, dummy_ptr, ptr_size, unit_type, in_use, \ - n_to_add,n_to_grow) \ -do { \ - if ((in_use) + (n_to_add) > (ptr_size)) { \ - (dummy_ptr) = (unit_type *) \ - realloc(ptr, sizeof(unit_type) * ((ptr_size) + (n_to_grow))); \ - if (NULL != (dummy_ptr)) { \ - (ptr) = (dummy_ptr); \ - (ptr_size) += (n_to_grow); \ - } \ - } \ -} while (0) +#define PROVIDE_SUFFICIENT_MEMORY(ptr, dummy_ptr, ptr_size, unit_type, in_use, \ + n_to_add,n_to_grow) \ + do { \ + if ((in_use) + (n_to_add) > (ptr_size)) { \ + (dummy_ptr) = (unit_type *) \ + realloc(ptr, sizeof(unit_type) * ((ptr_size) + (n_to_grow))); \ + if (NULL != (dummy_ptr)) { \ + (ptr) = (dummy_ptr); \ + (ptr_size) += (n_to_grow); \ + } \ + } \ + } while (0) /* * Local functions */ static int ml_module_enable(mca_coll_base_module_t *module, - struct ompi_communicator_t *comm); + struct ompi_communicator_t *comm); static int mca_coll_ml_fill_in_route_tab(mca_coll_ml_topology_t *topo, - ompi_communicator_t *comm); + ompi_communicator_t *comm); static void mca_coll_ml_module_construct(mca_coll_ml_module_t *module) @@ -131,7 +134,7 @@ mca_coll_ml_module_construct(mca_coll_ml_module_t *module) topo->topo_ordering_info.num_bcols_need_ordering = 0; memset(topo->hierarchical_algorithms, 0, - BCOL_NUM_OF_FUNCTIONS * sizeof(coll_ml_collective_description_t *)); + BCOL_NUM_OF_FUNCTIONS * sizeof(coll_ml_collective_description_t *)); } for (coll_i = 0; coll_i < ML_NUM_OF_FUNCTIONS; coll_i++) { @@ -148,6 +151,14 @@ mca_coll_ml_module_construct(mca_coll_ml_module_t *module) OBJ_CONSTRUCT(&(module->waiting_for_memory_list), opal_list_t); } +#define ML_RELEASE_FALLBACK(_coll_ml, _coll) \ + do { \ + if (_coll_ml->fallback.coll_ ## _coll ## _module) { \ + OBJ_RELEASE(_coll_ml->fallback.coll_ ## _coll ## _module); \ + _coll_ml->fallback.coll_ ## _coll ## _module = NULL; \ + } \ + } while (0); + static void mca_coll_ml_module_destruct(mca_coll_ml_module_t *module) { @@ -193,13 +204,6 @@ mca_coll_ml_module_destruct(mca_coll_ml_module_t *module) if(NULL != topo->array_of_all_subgroups) { for( k=0 ; k < topo->number_of_all_subgroups ; k++ ) { if(0 < topo->array_of_all_subgroups[k].n_ranks) { - for(i=0 ; i < topo->array_of_all_subgroups[k].n_ranks ; i++ ) - { - if(0 < topo->array_of_all_subgroups[k].rank_data[i].n_connected_subgroups) { - free(topo->array_of_all_subgroups[k].rank_data[i].list_connected_subgroups); - topo->array_of_all_subgroups[k].rank_data[i].list_connected_subgroups=NULL; - } - } free(topo->array_of_all_subgroups[k].rank_data); topo->array_of_all_subgroups[k].rank_data = NULL; } @@ -207,11 +211,6 @@ mca_coll_ml_module_destruct(mca_coll_ml_module_t *module) 
free(topo->array_of_all_subgroups); topo->array_of_all_subgroups = NULL; } - - if(NULL != topo->sort_list) { - free(topo->sort_list); - topo->sort_list=NULL; - } } OBJ_DESTRUCT(&(module->active_bcols_list)); @@ -232,6 +231,16 @@ mca_coll_ml_module_destruct(mca_coll_ml_module_t *module) if (NULL != module->coll_ml_barrier_function) { free(module->coll_ml_barrier_function); } + + /* release saved collectives */ + ML_RELEASE_FALLBACK(module, allreduce); + ML_RELEASE_FALLBACK(module, allgather); + ML_RELEASE_FALLBACK(module, reduce); + ML_RELEASE_FALLBACK(module, ibcast); + ML_RELEASE_FALLBACK(module, iallreduce); + ML_RELEASE_FALLBACK(module, iallgather); + ML_RELEASE_FALLBACK(module, ireduce); + ML_RELEASE_FALLBACK(module, ibcast); } } @@ -248,11 +257,14 @@ static int mca_coll_ml_request_free(ompi_request_t** request) /* this fragement does not hold the message data, so ok to return */ assert(0 == ml_request->pending); - assert(0 == ml_request->fragment_data.offset_into_user_buffer); + //assert(0 == ml_request->fragment_data.offset_into_user_buffer); + assert(&ml_request->full_message == ml_request->fragment_data.message_descriptor); assert(ml_request->dag_description.status_array[0].item.opal_list_item_refcount == 0); ML_VERBOSE(10, ("Releasing Master %p", ml_request)); + /* Mark the request as invalid */ + OMPI_REQUEST_FINI(&ml_request->full_message.super); OMPI_FREE_LIST_RETURN_MT(&(ml_module->coll_ml_collective_descriptors), - (ompi_free_list_item_t *)ml_request); + (ompi_free_list_item_t *)ml_request); /* MPI needs to return with the request object set to MPI_REQUEST_NULL */ @@ -268,7 +280,6 @@ static void mca_coll_ml_collective_operation_progress_construct /* initialize pointer */ desc->dag_description.status_array = NULL; - OBJ_CONSTRUCT(&desc->full_message.super, ompi_request_t); OBJ_CONSTRUCT(&desc->full_message.send_convertor, opal_convertor_t); OBJ_CONSTRUCT(&desc->full_message.recv_convertor, opal_convertor_t); @@ -295,9 +306,9 @@ static void mca_coll_ml_collective_operation_progress_construct /* destructor for collective managment descriptor */ static void mca_coll_ml_collective_operation_progress_destruct - (mca_coll_ml_collective_operation_progress_t *desc) { +(mca_coll_ml_collective_operation_progress_t *desc) { mca_coll_ml_module_t *ml_module = - (mca_coll_ml_module_t *) desc->coll_module; + (mca_coll_ml_module_t *) desc->coll_module; int i, max_dag_size = ml_module->max_dag_size; @@ -310,7 +321,6 @@ static void mca_coll_ml_collective_operation_progress_destruct desc->dag_description.status_array = NULL; } - OBJ_DESTRUCT(&desc->full_message.super); OBJ_DESTRUCT(&desc->full_message.send_convertor); OBJ_DESTRUCT(&desc->full_message.recv_convertor); @@ -322,11 +332,11 @@ static void mca_coll_ml_collective_operation_progress_destruct static void init_ml_fragment_desc(ompi_free_list_item_t *desc , void* ctx); static void init_ml_message_desc(ompi_free_list_item_t *desc , void* ctx) { - mca_coll_ml_module_t *module= (mca_coll_ml_module_t *) ctx; - mca_coll_ml_descriptor_t *msg_desc = (mca_coll_ml_descriptor_t *) desc; + mca_coll_ml_module_t *module= (mca_coll_ml_module_t *) ctx; + mca_coll_ml_descriptor_t *msg_desc = (mca_coll_ml_descriptor_t *) desc; - /* finish setting up the fragment descriptor */ - init_ml_fragment_desc((ompi_free_list_item_t*)&(msg_desc->fragment),module); + /* finish setting up the fragment descriptor */ + init_ml_fragment_desc((ompi_free_list_item_t*)&(msg_desc->fragment),module); } /* initialize the fragment descriptor - can pass in module specific @@ 
-334,16 +344,16 @@ static void init_ml_message_desc(ompi_free_list_item_t *desc , void* ctx) */ static void init_ml_fragment_desc(ompi_free_list_item_t *desc , void* ctx) { - mca_coll_ml_module_t *module= (mca_coll_ml_module_t *) ctx; - mca_coll_ml_fragment_t *frag_desc = (mca_coll_ml_fragment_t *) desc; + mca_coll_ml_module_t *module= (mca_coll_ml_module_t *) ctx; + mca_coll_ml_fragment_t *frag_desc = (mca_coll_ml_fragment_t *) desc; - /* allocated array of function arguments */ - /* RLG - we have a problem if we don't get the memory */ - /* malloc-debug does not like zero allocations */ - if (module->max_fn_calls > 0) { - frag_desc->fn_args = (bcol_function_args_t *) - malloc(sizeof(bcol_function_args_t) * module->max_fn_calls); - } + /* allocated array of function arguments */ + /* RLG - we have a problem if we don't get the memory */ + /* malloc-debug does not like zero allocations */ + if (module->max_fn_calls > 0) { + frag_desc->fn_args = (bcol_function_args_t *) + malloc(sizeof(bcol_function_args_t) * module->max_fn_calls); + } } static void mca_coll_ml_bcol_list_item_construct(mca_coll_ml_bcol_list_item_t *item) @@ -381,14 +391,14 @@ static void generate_active_bcols_list(mca_coll_ml_module_t *ml_module) * for memory synchronization (for instance - ptpcoll )*/ if (NULL == GET_BCOL_SYNC_FN(bcol_module)) { ML_VERBOSE(10,(" No sync function was provided by bcol %s\n", - bcol_module->bcol_component->bcol_version.mca_component_name)); + bcol_module->bcol_component->bcol_version.mca_component_name)); continue; } bcol_was_found = false; for(bcol_item = (mca_coll_ml_bcol_list_item_t *)opal_list_get_first(&ml_module->active_bcols_list); !bcol_was_found && - bcol_item != (mca_coll_ml_bcol_list_item_t *)opal_list_get_end(&ml_module->active_bcols_list); + bcol_item != (mca_coll_ml_bcol_list_item_t *)opal_list_get_end(&ml_module->active_bcols_list); bcol_item = (mca_coll_ml_bcol_list_item_t *)opal_list_get_next((opal_list_item_t *)bcol_item)) { if (bcol_module == bcol_item->bcol_module) { bcol_was_found = true; @@ -438,11 +448,11 @@ static int calculate_buffer_header_size(mca_coll_ml_module_t *ml_module) if (offset < bcol_module->header_size) { offset = bcol_module->header_size; } - } + } /* Set bcol mode bits */ topo->all_bcols_mode &= bcol_module->supported_mode; - } + } } offset = ((offset + BCOL_HEAD_ALIGN - 1) / BCOL_HEAD_ALIGN) * BCOL_HEAD_ALIGN; @@ -501,9 +511,9 @@ static int mca_coll_ml_register_bcols(mca_coll_ml_module_t *ml_module) bcol_module = topo->component_pairs[i].bcol_modules[j]; if (NULL != bcol_module->bcol_memory_init) { ret = bcol_module->bcol_memory_init(ml_module, - bcol_module, - (NULL != bcol_module->network_context) ? - bcol_module->network_context->context_data: NULL); + bcol_module, + (NULL != bcol_module->network_context) ? 
+ bcol_module->network_context->context_data: NULL); if (OMPI_SUCCESS != ret) { ML_ERROR(("Bcol registration failed on ml level!!")); return ret; @@ -537,8 +547,8 @@ static int ml_module_memory_initialization(mca_coll_ml_module_t *ml_module) ML_VERBOSE(10, ("Call for initialize block.\n")); ret = mca_coll_ml_initialize_block(ml_module->payload_block, - nbuffers, nbanks, buf_size, ml_module->data_offset, - NULL); + nbuffers, nbanks, buf_size, ml_module->data_offset, + NULL); if (OMPI_SUCCESS != ret) { return ret; } @@ -559,8 +569,8 @@ static int ml_module_memory_initialization(mca_coll_ml_module_t *ml_module) /* do some sanity checks */ static int check_global_view_of_subgroups( int n_procs_selected, - int n_procs_in, int ll_p1, int* all_selected, - mca_sbgp_base_module_t *module ) + int n_procs_in, int ll_p1, int* all_selected, + mca_sbgp_base_module_t *module ) { /* local variables */ int ret=OMPI_SUCCESS; @@ -597,8 +607,7 @@ static int check_global_view_of_subgroups( int n_procs_selected, } } if( sum != n_procs_selected ) { - fprintf(stderr,"n procs in %d\n",n_procs_in); - ML_VERBOSE(0, ("number of procs in the group unexpeted. Expected %d Got %d\n",n_procs_selected,sum)); + ML_VERBOSE(0, ("number of procs in the group unexpected. Expected %d Got %d\n",n_procs_selected,sum)); ret=OMPI_ERROR; goto exit_ERROR; } @@ -606,7 +615,7 @@ static int check_global_view_of_subgroups( int n_procs_selected, */ for (i = 0; i < n_procs_selected; i++) { if(ll_p1!=all_selected[module->group_list[i]] && - ll_p1!=-all_selected[module->group_list[i]] ) { + ll_p1!=-all_selected[module->group_list[i]] ) { ret=OMPI_ERROR; ML_VERBOSE(0, ("Mismatch in rank list - element #%d - %d \n",i,all_selected[module->group_list[i]])); goto exit_ERROR; @@ -616,130 +625,128 @@ static int check_global_view_of_subgroups( int n_procs_selected, /* return */ return ret; -exit_ERROR: + exit_ERROR: /* return */ return ret; } - static void ml_init_k_nomial_trees(mca_coll_ml_topology_t *topo, int *list_of_ranks_in_all_subgroups, int my_rank_in_list) { int *list_n_connected; int group_size, rank, i, j, knt, offset, k, my_sbgp = 0; - int my_root; - int level_one_knt; + int my_root, level_one_knt; sub_group_params_t *array_of_all_subgroup_ranks = topo-> - array_of_all_subgroups; + array_of_all_subgroups; int num_total_subgroups = topo->number_of_all_subgroups; int n_hier = topo->n_levels; hierarchy_pairs *pair = NULL; mca_coll_ml_leader_offset_info_t *loc_leader = (mca_coll_ml_leader_offset_info_t *) - malloc(sizeof(mca_coll_ml_leader_offset_info_t)*(n_hier+1)); + malloc(sizeof(mca_coll_ml_leader_offset_info_t)*(n_hier+1)); /* first thing I want to know is where does the first level end */ level_one_knt = 0; + while( 0 == array_of_all_subgroup_ranks[level_one_knt].level_in_hierarchy && - level_one_knt < num_total_subgroups){ + level_one_knt < num_total_subgroups){ level_one_knt++; } - /* - fprintf(stderr,"PPP %d %d %d \n", level_one_knt, array_of_all_subgroup_ranks[0].level_in_hierarchy, num_total_subgroups); - */ + + /* fprintf(stderr,"PPP %d %d %d \n", level_one_knt, array_of_all_subgroup_ranks[0].level_in_hierarchy, num_total_subgroups); */ + /* I want to cache this number for unpack*/ array_of_all_subgroup_ranks->level_one_index = level_one_knt; /* determine whether or not ranks are contiguous */ topo->ranks_contiguous = true; - knt = 0; - for( i = 0; i < level_one_knt; i++){ - for( j =0; j < array_of_all_subgroup_ranks[i].n_ranks; j++){ - if(knt != list_of_ranks_in_all_subgroups[knt]){ - topo->ranks_contiguous = false; - i = 
level_one_knt; - break; - } - knt++; - } + for (i = 0, knt = 0 ; i < level_one_knt && topo->ranks_contiguous ; ++i) { + for (j = 0 ; j < array_of_all_subgroup_ranks[i].n_ranks ; ++j, ++knt) { + if (knt != list_of_ranks_in_all_subgroups[knt]) { + topo->ranks_contiguous = false; + break; + } + } } + loc_leader[0].offset = 0; + /* now find my first level offset, and my index in level one */ - knt = 0; - for(i = 0; i < level_one_knt; i++){ + for (i = 0, loc_leader[0].level_one_index = -1 ; i < level_one_knt ; ++i) { offset = array_of_all_subgroup_ranks[i].index_of_first_element; - for( k = 0; k < array_of_all_subgroup_ranks[i].n_ranks; k++){ + for (k = 0 ; k < array_of_all_subgroup_ranks[i].n_ranks ; ++k) { rank = list_of_ranks_in_all_subgroups[k + offset]; - if(rank == my_rank_in_list){ - loc_leader[0].offset = knt; + if (rank == my_rank_in_list) { + loc_leader[0].offset = offset; loc_leader[0].level_one_index = k; i = level_one_knt; break; } } - knt += array_of_all_subgroup_ranks[i].n_ranks; } + /* every rank MUST appear at level 0 */ + assert (loc_leader[0].level_one_index > -1); - - for(i = 0; i < n_hier; i++){ + for (i = 0 ; i < n_hier ; ++i) { pair = &topo->component_pairs[i]; /* find the size of the group */ group_size = pair->subgroup_module->group_size; /* malloc some memory for the new list to cache on the bcol module - */ + */ list_n_connected = (int *) malloc(sizeof(int)*group_size); + /* next thing to do is to find out which subgroup I'm in * at this particular level */ - knt = 0; - for( j = 0; j < num_total_subgroups; j++){ + for (j = 0, knt = 0, my_sbgp = -1 ; j < num_total_subgroups && 0 > my_sbgp ; ++j) { offset = array_of_all_subgroup_ranks[j].index_of_first_element; - for( k = 0; k < array_of_all_subgroup_ranks[j].n_ranks; k++){ + + /* in the 1-level case just skip any group of size 1 and move on + * to the real group. */ + if (1 == n_hier && 1 == array_of_all_subgroup_ranks[j].n_ranks) { + continue; + } + + for (k = 0; k < array_of_all_subgroup_ranks[j].n_ranks; k++) { rank = list_of_ranks_in_all_subgroups[k+offset]; - if(rank == my_rank_in_list){ - knt++; - } - if(knt == (i+1)){ + /* we can not use the level_in_topology flag to determine the + * level since not all levels may be represented so keep a count + * of the number of times this ranks shows up. when it has been + * seen the correct number of times we are done. */ + if (rank == my_rank_in_list && ++knt == (i+1)){ my_sbgp = j; /* tag whether I am a local leader or not at this level */ - if( my_rank_in_list == array_of_all_subgroup_ranks[j].root_rank_in_comm){ - loc_leader[i].leader = true; - } else { - loc_leader[i].leader = false; - } - j = num_total_subgroups; + loc_leader[i].leader = (my_rank_in_list == array_of_all_subgroup_ranks[j].root_rank_in_comm); break; } } } - for( j = 0; j < group_size; j++ ) { + /* should have found a subgroup */ + assert (my_sbgp > -1); + + for (j = 0 ; j < group_size ; ++j) { list_n_connected[j] = array_of_all_subgroup_ranks[my_sbgp]. 
rank_data[j].num_of_ranks_represented; } /* now find all sbgps that the root of this sbgp belongs to - previous to this "my_sbgp" - */ - + * previous to this "my_sbgp" */ my_root = array_of_all_subgroup_ranks[my_sbgp].root_rank_in_comm; - knt=0; - for(j = 0; j < my_sbgp; j++){ - if(array_of_all_subgroup_ranks[j].root_rank_in_comm == - my_root){ - for(k = 1; k < array_of_all_subgroup_ranks[j].n_ranks; - k++){ + + for (j = 0, knt = 0 ; j < my_sbgp ; ++j) { + if (array_of_all_subgroup_ranks[j].root_rank_in_comm == my_root) { + for (k = 1; k < array_of_all_subgroup_ranks[j].n_ranks; ++k) { knt += array_of_all_subgroup_ranks[j].rank_data[k]. num_of_ranks_represented; } } } + /* and then I add one for the root itself */ - list_n_connected[0] = knt+1; - - + list_n_connected[0] = knt + 1; /* now cache this on the bcol module */ pair->bcol_modules[0]->list_n_connected = list_n_connected; @@ -754,34 +761,27 @@ static void ml_init_k_nomial_trees(mca_coll_ml_topology_t *topo, int *list_of_ra if (i > 0) { /* if I'm not the local leader */ if( !loc_leader[i].leader) { - knt = 0; /* then I am not a local leader at this level */ offset = array_of_all_subgroup_ranks[my_sbgp].index_of_first_element; - for( k = 0; k < array_of_all_subgroup_ranks[my_sbgp].n_ranks; k++){ + for (k = 0, knt = 0 ; k < array_of_all_subgroup_ranks[my_sbgp].n_ranks ; ++k) { rank = list_of_ranks_in_all_subgroups[k+offset]; - if(rank == my_rank_in_list){ + if (rank == my_rank_in_list) { break; - } else { - - knt += list_n_connected[k]; } + + knt += list_n_connected[k]; } loc_leader[i].offset = loc_leader[i-1].offset - knt; - pair->bcol_modules[0]->hier_scather_offset = loc_leader[i].offset; - }else{ - /* if I am the local leader, then keep the same offset */ - loc_leader[i].offset = loc_leader[i-1].offset; - pair->bcol_modules[0]->hier_scather_offset = loc_leader[i-1].offset; + } else { + /* if I am the local leader, then keep the same offset */ + loc_leader[i].offset = loc_leader[i-1].offset; } - } else { - - pair->bcol_modules[0]->hier_scather_offset = loc_leader[0].offset; } + pair->bcol_modules[0]->hier_scather_offset = loc_leader[i].offset; + /*setup the tree */ pair->bcol_modules[0]->k_nomial_tree(pair->bcol_modules[0]); - - } /* see if I am in the last subgroup, if I am, @@ -801,151 +801,33 @@ static void ml_init_k_nomial_trees(mca_coll_ml_topology_t *topo, int *list_of_ra if(loc_leader[n_hier - 1].leader){ loc_leader[n_hier].leader = true; } else { - loc_leader[n_hier].leader = false; + loc_leader[n_hier].leader = false; } /* what other goodies do I want to cache on the ml-module? */ topo->hier_layout_info = loc_leader; } -/* for a given rank in a subgroup, find out the number of ranks this subgroup - * represents. It uses a depth-first search to recursively traverse - * subgroups conncted to the subgroup. - */ - -static int ml_compute_number_unique_proxy_ranks( - int subgroup_index, int rank_index, - int *sub_groups_in_lineage,int *len_sub_groups_in_lineage, - sub_group_params_t *array_of_all_subgroup_ranks) -{ - /* local variables */ - int total=0, i_rank, sg_i, sub_grp, depth; - bool found; - - /* Do I represent several subgroups ? */ - if( array_of_all_subgroup_ranks[subgroup_index].rank_data[rank_index]. - n_connected_subgroups ) { - for( sg_i = 0 ; sg_i < - array_of_all_subgroup_ranks[subgroup_index]. - rank_data[rank_index].n_connected_subgroups ; sg_i++ ) { - sub_grp= array_of_all_subgroup_ranks[subgroup_index]. 
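The hier_scather_offset bookkeeping above follows a simple recurrence: a rank that is not the local leader at level i takes its own level i-1 offset and subtracts the number of ranks represented by the subgroup members that precede it. A hypothetical helper showing just that arithmetic (the names mirror the hunk, but the function itself is not part of the patch):

static int non_leader_offset(int my_offset_prev_level,
                             const int *list_n_connected,
                             int my_index_in_subgroup)
{
    int represented_before_me = 0;

    /* ranks represented by the members that come before me in this subgroup */
    for (int k = 0; k < my_index_in_subgroup; k++) {
        represented_before_me += list_n_connected[k];
    }

    return my_offset_prev_level - represented_before_me;
}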
- rank_data[rank_index].list_connected_subgroups[sg_i]; - - /* make sure we don't loop back on ourselves */ - found=false; - for(depth=0 ; depth < *len_sub_groups_in_lineage - ; depth++ ){ - if(sub_groups_in_lineage[depth]==sub_grp) - { - found=true; - break; - } - } - if(found) { - continue; - } - - sub_groups_in_lineage[(*len_sub_groups_in_lineage)]=sub_grp; - (*len_sub_groups_in_lineage)++; - for(i_rank = 0 ; - i_rank < array_of_all_subgroup_ranks[sub_grp].n_ranks ; - i_rank++) { - total+=ml_compute_number_unique_proxy_ranks( - sub_grp, i_rank, sub_groups_in_lineage, - len_sub_groups_in_lineage, array_of_all_subgroup_ranks); - } - (*len_sub_groups_in_lineage)--; - } - } - /* if I am a leaf, count me */ - if( array_of_all_subgroup_ranks[subgroup_index].rank_data[rank_index]. - leaf ) { - total++; - } - - /* return */ - return total; - -} - -static void ml_compute_create_unique_proxy_rank_list( - int subgroup_index, - int *sub_groups_in_lineage,int *len_sub_groups_in_lineage, - sub_group_params_t *array_of_all_subgroup_ranks, - int *current_list_length, int *sorted_rank_list) -{ - /* local variables */ - int i_rank, sg_i, sub_grp, depth; - bool found; - - /* loop over all the element of ths subgroup */ - for(i_rank = 0 ; i_rank < array_of_all_subgroup_ranks[subgroup_index].n_ranks ; - i_rank++) { - if(array_of_all_subgroup_ranks[subgroup_index].rank_data[i_rank].leaf){ - /* found leaf - add to the list */ - sorted_rank_list[(*current_list_length)]= - array_of_all_subgroup_ranks[subgroup_index].rank_data[i_rank].rank; - (*current_list_length)++; - } - if( array_of_all_subgroup_ranks[subgroup_index].rank_data[i_rank]. - n_connected_subgroups ) { - /* loop over all connected subgroups */ - for( sg_i = 0 ; sg_i < - array_of_all_subgroup_ranks[subgroup_index]. - rank_data[i_rank].n_connected_subgroups ; sg_i++ ) { - sub_grp= array_of_all_subgroup_ranks[subgroup_index]. 
- rank_data[i_rank].list_connected_subgroups[sg_i]; - - /* make sure we don't loop back on ourselves */ - found=false; - for(depth=0 ; depth < *len_sub_groups_in_lineage - ; depth++ ){ - if(sub_groups_in_lineage[depth]==sub_grp) - { - found=true; - break; - } - } - if(found) { - continue; - } - - sub_groups_in_lineage[(*len_sub_groups_in_lineage)]=sub_grp; - (*len_sub_groups_in_lineage)++; - ml_compute_create_unique_proxy_rank_list( - sub_grp, sub_groups_in_lineage, - len_sub_groups_in_lineage, array_of_all_subgroup_ranks, - current_list_length, sorted_rank_list); - (*len_sub_groups_in_lineage)--; - } - } - } - return; - -} - static int ml_setup_full_tree_data(mca_coll_ml_topology_t *topo, - ompi_communicator_t *comm, - int my_highest_group_index, int *map_to_comm_ranks, - int *num_total_subgroups, sub_group_params_t **array_of_all_subgroup_ranks, - int **list_of_ranks_in_all_subgroups) + ompi_communicator_t *comm, + int my_highest_group_index, int *map_to_comm_ranks, + int *num_total_subgroups, sub_group_params_t **array_of_all_subgroup_ranks, + int **list_of_ranks_in_all_subgroups) { int ret = OMPI_SUCCESS; - int i, j, k, in_buf, root, my_rank,sum; + int i, in_buf, root, my_rank,sum; int in_num_total_subgroups = *num_total_subgroups; - int i_sg, i_cnt, i_rank, i_offset, i_level, j_sg, j_rank, - j_level, j_root,cnt, rank, rank_cnt; int *scratch_space = NULL; - bool found; + /* figure out who holds all the sub-group information - only those * ranks in the top level know this data at this point */ my_rank = ompi_comm_rank(comm); if( (my_highest_group_index == topo->global_highest_hier_group_index ) - && - ( my_rank == - topo->component_pairs[topo->n_levels-1].subgroup_module->group_list[0]) - ) { + && + ( my_rank == + topo->component_pairs[topo->n_levels-1].subgroup_module->group_list[0]) + ) { in_buf=my_rank; } else { /* since this will be a sum allreduce - contributing 0 will not @@ -953,9 +835,9 @@ static int ml_setup_full_tree_data(mca_coll_ml_topology_t *topo, in_buf=0; } ret = comm_allreduce_pml(&in_buf, &root, 1, MPI_INT, - my_rank, MPI_SUM, - ompi_comm_size(comm), map_to_comm_ranks, - comm); + my_rank, MPI_SUM, + ompi_comm_size(comm), map_to_comm_ranks, + comm); if (OPAL_UNLIKELY(OMPI_SUCCESS != ret)) { ML_VERBOSE(10, ("comm_allreduce_pml failed. root reduction\n")); goto exit_ERROR; @@ -963,8 +845,8 @@ static int ml_setup_full_tree_data(mca_coll_ml_topology_t *topo, /* broadcast the number of groups */ ret=comm_bcast_pml(num_total_subgroups, root, 1, - MPI_INT, my_rank, ompi_comm_size(comm), - map_to_comm_ranks,comm); + MPI_INT, my_rank, ompi_comm_size(comm), + map_to_comm_ranks,comm); if (OPAL_UNLIKELY(OMPI_SUCCESS != ret)) { ML_VERBOSE(10, ("comm_bcast_pml failed. num_total_subgroups bcast\n")); goto exit_ERROR; @@ -976,6 +858,7 @@ static int ml_setup_full_tree_data(mca_coll_ml_topology_t *topo, ret = OMPI_ERR_OUT_OF_RESOURCE; goto exit_ERROR; } + if( my_rank == root ) { sum=0; for(i=0 ; i < (*num_total_subgroups) ; i++ ) { @@ -986,8 +869,8 @@ static int ml_setup_full_tree_data(mca_coll_ml_topology_t *topo, } } ret=comm_bcast_pml(scratch_space, root, 4*(*num_total_subgroups), - MPI_INT, my_rank, ompi_comm_size(comm), - map_to_comm_ranks, comm); + MPI_INT, my_rank, ompi_comm_size(comm), + map_to_comm_ranks, comm); if (OPAL_UNLIKELY(OMPI_SUCCESS != ret)) { ML_VERBOSE(10, ("comm_allreduce_pml failed. 
scratch_space bcast\n")); goto exit_ERROR; @@ -1022,72 +905,20 @@ static int ml_setup_full_tree_data(mca_coll_ml_topology_t *topo, if( in_num_total_subgroups != (*num_total_subgroups) ) { (*list_of_ranks_in_all_subgroups)=(int *) realloc((*list_of_ranks_in_all_subgroups),sizeof(int)*sum); - if (OPAL_UNLIKELY(NULL == (*list_of_ranks_in_all_subgroups))) { - ML_VERBOSE(10, ("Cannot allocate memory *list_of_ranks_in_all_subgroups.\n")); - ret = OMPI_ERR_OUT_OF_RESOURCE; - goto exit_ERROR; - } + if (OPAL_UNLIKELY(NULL == (*list_of_ranks_in_all_subgroups))) { + ML_VERBOSE(10, ("Cannot allocate memory *list_of_ranks_in_all_subgroups.\n")); + ret = OMPI_ERR_OUT_OF_RESOURCE; + goto exit_ERROR; + } } - ret=comm_bcast_pml(*list_of_ranks_in_all_subgroups, root, sum, - MPI_INT, my_rank, ompi_comm_size(comm), - map_to_comm_ranks, comm); + ret = comm_bcast_pml(*list_of_ranks_in_all_subgroups, root, sum, + MPI_INT, my_rank, ompi_comm_size(comm), + map_to_comm_ranks, comm); if (OPAL_UNLIKELY(OMPI_SUCCESS != ret)) { ML_VERBOSE(10, ("Bcast failed for list_of_ranks_in_all_subgroups \n")); goto exit_ERROR; } - /* fill in subgroup ranks */ - for(i=0 ; i < (*num_total_subgroups) ; i++ ) { - int k=(*array_of_all_subgroup_ranks)[i].index_of_first_element; - sum=(*array_of_all_subgroup_ranks)[i].n_ranks; - (*array_of_all_subgroup_ranks)[i].rank_data=(rank_properties_t *) - malloc(sizeof(rank_properties_t)*sum); - if (OPAL_UNLIKELY(NULL == - (*array_of_all_subgroup_ranks)[i].rank_data ) ) { - - ML_VERBOSE(10, ("Cannot allocate memory for rank_data \n")); - ret = OMPI_ERR_OUT_OF_RESOURCE; - goto exit_ERROR; - } - for(j=0 ; j < (*array_of_all_subgroup_ranks)[i].n_ranks ; j++ ) { - (*array_of_all_subgroup_ranks)[i].rank_data[j].rank= - (*list_of_ranks_in_all_subgroups)[k+j]; - /* initial value - an element is not a leaf only at the - * first lowest level that it shows up in the tree */ - (*array_of_all_subgroup_ranks)[i].rank_data[j].leaf=0; - } - } - - /* find the first occurance of a rank in the tree */ - for(rank = 0; rank < ompi_comm_size(comm); rank++) { - for( i=0 ; i < (*num_total_subgroups) ; i++ ) { - for(j=0 ; j < (*array_of_all_subgroup_ranks)[i].n_ranks ; j++ ) { - if( rank == - (*array_of_all_subgroup_ranks)[i].rank_data[j].rank ) { - (*array_of_all_subgroup_ranks)[i].rank_data[j].leaf=1; - goto NextRank; - } - } - } -NextRank: - continue; - } - - /* figure out the index of the root in the subgroup */ - for(i_sg=0 ; i_sg < (*num_total_subgroups); i_sg++) { - int root=(*array_of_all_subgroup_ranks)[i_sg].root_rank_in_comm; - i_cnt=(*array_of_all_subgroup_ranks)[i_sg].n_ranks; - i_offset=(*array_of_all_subgroup_ranks)[i_sg].index_of_first_element; - for(i_rank =0 ; i_rank < i_cnt ; i_rank++ ) { - rank=(*list_of_ranks_in_all_subgroups)[i_offset+i_rank]; - if(rank==root) { - /* this is the root */ - (*array_of_all_subgroup_ranks)[i_sg].root_index=i_rank; - } - } - } - - /* * The data that is needed for a given rooted operation is: * - subgroup,rank information for the source of the data. @@ -1132,265 +963,40 @@ NextRank: * data associated with them. */ - /* initialize the array */ - for(i_sg=0 ; i_sg < (*num_total_subgroups); i_sg++) { - i_cnt=(*array_of_all_subgroup_ranks)[i_sg].n_ranks; - (*array_of_all_subgroup_ranks)[i_sg].n_connected_nodes=0; - for(i_rank =0 ; i_rank < i_cnt ; i_rank++ ) { - (*array_of_all_subgroup_ranks)[i_sg].rank_data[i_rank]. 
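The root's one-shot broadcast above ships four ints per subgroup (4*(*num_total_subgroups) in total) so the other ranks can rebuild the subgroup table locally. Below is a self-contained sketch of that packing; the field order is an assumption made for illustration, only the four-ints-per-subgroup layout comes from the hunk.

/* toy mirror of the sub_group_params_t fields used in this patch */
struct subgroup_meta {
    int root_rank_in_comm;
    int n_ranks;
    int index_of_first_element;
    int level_in_hierarchy;
};

static void pack_subgroup_meta(const struct subgroup_meta *groups, int n_groups,
                               int *scratch_space)
{
    for (int i = 0; i < n_groups; i++) {
        /* assumed record order - the real packing loop is elided in the hunk */
        scratch_space[4 * i + 0] = groups[i].root_rank_in_comm;
        scratch_space[4 * i + 1] = groups[i].n_ranks;
        scratch_space[4 * i + 2] = groups[i].index_of_first_element;
        scratch_space[4 * i + 3] = groups[i].level_in_hierarchy;
    }
}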
- n_connected_subgroups=0; - } - } - - for(i_sg=(*num_total_subgroups)-1; i_sg >= 0 ; i_sg--) { - i_cnt=(*array_of_all_subgroup_ranks)[i_sg].n_ranks; - i_level=(*array_of_all_subgroup_ranks)[i_sg].level_in_hierarchy; - i_offset=(*array_of_all_subgroup_ranks)[i_sg].index_of_first_element; - for(i_rank =0 ; i_rank < i_cnt ; i_rank++ ) { - rank=(*list_of_ranks_in_all_subgroups)[i_offset+i_rank]; - for(j_sg=i_sg-1; j_sg >= 0 ; j_sg--) { - j_level=(*array_of_all_subgroup_ranks)[j_sg].level_in_hierarchy; - j_root=(*array_of_all_subgroup_ranks)[j_sg].root_rank_in_comm; - if(i_level == j_level ) { - /* no overlap ==> not connections between groups at the - * same level - */ - continue; - } - if(rank == j_root ){ - /* do not connect to i_sg, if there is already a connection - * to a subgroup with the same root higher up in the tree */ - found=false; - for( k= i_sg-1 ; k > j_sg ; k-- ) { - if( rank == - (*array_of_all_subgroup_ranks)[k].root_rank_in_comm ) { - found=true; - break; - } - } - if(found) { - /* the is not a vertex */ - continue; - } - /* found vertex */ - (*array_of_all_subgroup_ranks)[i_sg].n_connected_nodes++; - (*array_of_all_subgroup_ranks)[i_sg].rank_data[i_rank]. - n_connected_subgroups++; - - (*array_of_all_subgroup_ranks)[j_sg].n_connected_nodes++; - /* the connection "down" is to the local leader */ - j_rank=(*array_of_all_subgroup_ranks)[j_sg].root_index; - (*array_of_all_subgroup_ranks)[j_sg].rank_data[j_rank]. - n_connected_subgroups++; - - } - } - } - } - /* fill in connected nodes */ - /* allocate memory for lists */ - for(i_sg=0 ; i_sg < (*num_total_subgroups); i_sg++) { - i_cnt=(*array_of_all_subgroup_ranks)[i_sg].n_connected_nodes; - if( i_cnt > 0 ) { - (*array_of_all_subgroup_ranks)[i_sg].list_connected_nodes= - (int *)malloc(sizeof(int)*i_cnt); - if (OPAL_UNLIKELY(NULL == - (*array_of_all_subgroup_ranks)[i_sg].list_connected_nodes)) { - ML_VERBOSE(10, ("Cannot allocate memory for list_connected_nodes - i_cnt %d\n",i_cnt)); - ret = OMPI_ERR_OUT_OF_RESOURCE; - goto exit_ERROR; - } - } else { - (*array_of_all_subgroup_ranks)[i_sg].list_connected_nodes=NULL; - } - /* we will use this as a counter when we fill in the list of ranks */ - (*array_of_all_subgroup_ranks)[i_sg].n_connected_nodes=0; - - i_cnt=(*array_of_all_subgroup_ranks)[i_sg].n_ranks; - i_offset=(*array_of_all_subgroup_ranks)[i_sg].index_of_first_element; - for(i_rank =0 ; i_rank < i_cnt ; i_rank++ ) { - cnt= (*array_of_all_subgroup_ranks)[i_sg].rank_data[i_rank]. - n_connected_subgroups; - if( 0 == cnt) { - /* no memory to allocate */ - (*array_of_all_subgroup_ranks)[i_sg].rank_data[i_rank].list_connected_subgroups=NULL; - continue; - } - (*array_of_all_subgroup_ranks)[i_sg].rank_data[i_rank].list_connected_subgroups= - (int *)malloc(sizeof(int)*cnt); - if (OPAL_UNLIKELY(NULL == - (*array_of_all_subgroup_ranks)[i_sg].rank_data[i_rank].list_connected_subgroups) ) { - ML_VERBOSE(10, ("Cannot allocate memory for rank list_connected_subgroups - cnt %d\n",cnt)); - ret = OMPI_ERR_OUT_OF_RESOURCE; - goto exit_ERROR; - } - /* reset the conuter, so can fill it in on the fly */ - (*array_of_all_subgroup_ranks)[i_sg].rank_data[i_rank]. 
- n_connected_subgroups=0; - } - } - - /* fill in the list of connected nodes */ - for(i_sg=(*num_total_subgroups)-1; i_sg >= 0 ; i_sg--) { - i_cnt=(*array_of_all_subgroup_ranks)[i_sg].n_ranks; - i_level=(*array_of_all_subgroup_ranks)[i_sg].level_in_hierarchy; - i_offset=(*array_of_all_subgroup_ranks)[i_sg].index_of_first_element; - for(i_rank =0 ; i_rank < i_cnt ; i_rank++ ) { - rank=(*list_of_ranks_in_all_subgroups)[i_offset+i_rank]; - for(j_sg=i_sg-1; j_sg >= 0 ; j_sg--) { - j_level=(*array_of_all_subgroup_ranks)[j_sg].level_in_hierarchy; - j_root=(*array_of_all_subgroup_ranks)[j_sg].root_rank_in_comm; - if(i_level == j_level ) { - /* no overlap ==> not connections between groups at the - * same level - */ - continue; - } - if(rank == j_root ){ - /* do not connect to i_sg, if there is already a connection - * to a subgroup with the same root higher up in the tree */ - found=false; - for( k= i_sg-1 ; k > j_sg ; k-- ) { - if( rank == - (*array_of_all_subgroup_ranks)[k].root_rank_in_comm ) { - found=true; - break; - } - } - if(found) { - /* the is not a vertex */ - continue; - } - /* found vertex */ - /* - * connection "down" - */ - cnt=(*array_of_all_subgroup_ranks)[i_sg].n_connected_nodes; - (*array_of_all_subgroup_ranks)[i_sg].list_connected_nodes[cnt] - =j_sg; - (*array_of_all_subgroup_ranks)[i_sg].n_connected_nodes++; - - /* detailed per-rank information */ - cnt=(*array_of_all_subgroup_ranks)[i_sg].rank_data[i_rank]. - n_connected_subgroups; - (*array_of_all_subgroup_ranks)[i_sg].rank_data[i_rank]. - list_connected_subgroups[cnt]=j_sg; - (*array_of_all_subgroup_ranks)[i_sg].rank_data[i_rank]. - n_connected_subgroups++; - - /* connection "up" */ - cnt=(*array_of_all_subgroup_ranks)[j_sg].n_connected_nodes; - (*array_of_all_subgroup_ranks)[j_sg].list_connected_nodes[cnt] - =i_sg; - (*array_of_all_subgroup_ranks)[j_sg].n_connected_nodes++; - - /* detailed per-rank information */ - j_rank=(*array_of_all_subgroup_ranks)[j_sg].root_index; - cnt=(*array_of_all_subgroup_ranks)[j_sg].rank_data[j_rank]. - n_connected_subgroups; - (*array_of_all_subgroup_ranks)[j_sg].rank_data[j_rank]. - list_connected_subgroups[cnt]=i_sg; - (*array_of_all_subgroup_ranks)[j_sg].rank_data[j_rank]. - n_connected_subgroups++; - } - } - } - } - - /* figure out the number of ranks that each rank in the subgroups - * represnt. scratch_space - is large enough for the scratch - * space that we need. + /* this function does a depth first traversal of the tree data and + * builds rank data and ensures that hierarchy level 0 is in the + * correct order for collective algorithms with per-rank data. */ - for( i=0 ; i < (*num_total_subgroups) ; i++ ) { - for(j=0 ; j < (*array_of_all_subgroup_ranks)[i].n_ranks ; j++ ) { - scratch_space[0]=i; - cnt=1; - (*array_of_all_subgroup_ranks)[i].rank_data[j].num_of_ranks_represented= - ml_compute_number_unique_proxy_ranks(i,j, - scratch_space,&cnt, *array_of_all_subgroup_ranks); - } - } + coll_ml_parse_topology (*array_of_all_subgroup_ranks, *num_total_subgroups, + *list_of_ranks_in_all_subgroups, ompi_comm_size (comm)); - /* compute the sort list when I am root */ - topo->sort_list=(int *) - malloc(sizeof(int) * ompi_comm_size(comm)); - if (OPAL_UNLIKELY(NULL == topo->sort_list)) { - ML_VERBOSE(10, ("Cannot allocate memory for sort_list.\n")); - ret = OMPI_ERR_OUT_OF_RESOURCE; - goto exit_ERROR; - } - - /* find subgroup index, and rank within that subgroup where I am - * a leaf. 
- */ - i_rank = -1; - i_level = -1; - found = false; - for(i = 0; i < (*num_total_subgroups); i++ ) { - for(j = 0; j < (*array_of_all_subgroup_ranks)[i].n_ranks; j++ ) { - if((ompi_comm_rank(comm) == - (*array_of_all_subgroup_ranks)[i].rank_data[j].rank) - && - (*array_of_all_subgroup_ranks)[i].rank_data[j].leaf){ - found = true; - /* rank */ - i_rank = j; - /* subgroup index */ - i_level = i; - break; - } - } - if(found){ - break; - } - } - assert(found); - - scratch_space[0] = i_level; - cnt = 1; - rank_cnt = 0; - ml_compute_create_unique_proxy_rank_list( - i_level, scratch_space, &cnt, *array_of_all_subgroup_ranks, - &rank_cnt, topo->sort_list); + /* The list of ranks in all subgroups is the same as the old sort list. This is the same + * order needed for both scatter and gather. */ + topo->sort_list = (*list_of_ranks_in_all_subgroups); /* return */ - if(scratch_space) { + exit_ERROR: + if (scratch_space) { free(scratch_space); - }; - scratch_space=NULL; - - return ret; - -exit_ERROR: - if(scratch_space) { - free(scratch_space); - }; - scratch_space=NULL; - - for(i_sg=0 ; i_sg < (*num_total_subgroups)-1; i_sg++) { - if((*array_of_all_subgroup_ranks)[i_sg].list_connected_nodes){ - free((*array_of_all_subgroup_ranks)[i_sg].list_connected_nodes); - (*array_of_all_subgroup_ranks)[i_sg].list_connected_nodes=NULL; - } } + return ret; } static int get_new_subgroup_data (int32_t *all_selected, int size_of_all_selected, - sub_group_params_t **sub_group_meta_data, - int *size_of_sub_group_meta_data, - int **list_of_ranks_in_all_subgroups, - int *size_of_list_of_ranks_in_all_subgroups, - int *num_ranks_in_list_of_ranks_in_all_subgroups, - int *num_total_subgroups, - int *map_to_comm_ranks, int level_in_hierarchy - ) { + sub_group_params_t **sub_group_meta_data, + int *size_of_sub_group_meta_data, + int **list_of_ranks_in_all_subgroups, + int *size_of_list_of_ranks_in_all_subgroups, + int *num_ranks_in_list_of_ranks_in_all_subgroups, + int *num_total_subgroups, + int *map_to_comm_ranks, int level_in_hierarchy + ) { /* local data */ int rc=OMPI_SUCCESS; int rank_in_list,old_sg_size=(*num_total_subgroups); int sg_index, array_id, offset, sg_id; - bool found_sg; sub_group_params_t *dummy1 = NULL; int32_t **dummy2 = NULL; int32_t *dummy3 = NULL; @@ -1422,35 +1028,32 @@ static int get_new_subgroup_data (int32_t *all_selected, int size_of_all_selecte /* loop over existing groups, and see if this is a member of a new group * or if this group has already been found. 
*/ - found_sg=false; - sg_id=-1; - for( sg_index = old_sg_size ; sg_index < (*num_total_subgroups) ; - sg_index++ ) { - if( (*sub_group_meta_data)[sg_index].root_rank_in_comm == - sg_root) { + for (sg_index = old_sg_size, sg_id = -1 ; sg_index < (*num_total_subgroups) ; sg_index++) { + if ((*sub_group_meta_data)[sg_index].root_rank_in_comm == sg_root) { /* add rank to the list */ (*sub_group_meta_data)[sg_index].n_ranks++; - sg_id=sg_index; - found_sg=true; + sg_id = sg_index; break; } } - if( !found_sg) { + + if (-1 == sg_id) { /* did not find existing sub-group, create new one */ /* intialize new subgroup */ PROVIDE_SUFFICIENT_MEMORY((*sub_group_meta_data), dummy1, - (*size_of_sub_group_meta_data), - sub_group_params_t, (*num_total_subgroups), 1, 5); + (*size_of_sub_group_meta_data), + sub_group_params_t, (*num_total_subgroups), 1, 5); /* do this for the temporary memory slots */ PROVIDE_SUFFICIENT_MEMORY(temp, dummy2, - knt1, int32_t *, knt2, 1, 5); + knt1, int32_t *, knt2, 1, 5); if (OPAL_UNLIKELY(NULL == (*sub_group_meta_data))) { ML_VERBOSE(10, ("Cannot allocate memory for sub_group_meta_data.\n")); rc = OMPI_ERR_OUT_OF_RESOURCE; goto exit_ERROR; } - (*sub_group_meta_data)[(*num_total_subgroups)].root_rank_in_comm=sg_root; - (*sub_group_meta_data)[(*num_total_subgroups)].n_ranks=1; + (*sub_group_meta_data)[(*num_total_subgroups)].root_rank_in_comm = sg_root; + (*sub_group_meta_data)[(*num_total_subgroups)].n_ranks = 1; + /* no need for this here - use a temporary ptr */ temp[knt2]= (int *)malloc(sizeof(int)*size_of_all_selected); @@ -1459,22 +1062,14 @@ static int get_new_subgroup_data (int32_t *all_selected, int size_of_all_selecte rc = OMPI_ERR_OUT_OF_RESOURCE; goto exit_ERROR; } - sg_id=(*num_total_subgroups); - (*num_total_subgroups)++; - knt2++; - knt3 = knt2; + sg_id = (*num_total_subgroups)++; + knt3 = ++knt2; } else { knt3 = sg_id - old_sg_size + 1; } - array_id=(*sub_group_meta_data)[sg_id].n_ranks-1; - temp[knt3-1][array_id] = current_rank_in_comm; - /* JSL This fixes a nasty memory bug thay vexed us for 3 hours */ - /* XXX */ - /* - (*sub_group_meta_data)[sg_id].list_ranks[array_id]= - current_rank_in_comm; - */ + array_id = (*sub_group_meta_data)[sg_id].n_ranks-1; + temp[knt3-1][array_id] = current_rank_in_comm; } /* linearize the data - one rank will ship this to all the other @@ -1482,9 +1077,9 @@ static int get_new_subgroup_data (int32_t *all_selected, int size_of_all_selecte */ /* make sure there is enough memory to hold the list */ PROVIDE_SUFFICIENT_MEMORY((*list_of_ranks_in_all_subgroups),dummy3, - (*size_of_list_of_ranks_in_all_subgroups), - int, (*num_ranks_in_list_of_ranks_in_all_subgroups), - size_of_all_selected,size_of_all_selected); + (*size_of_list_of_ranks_in_all_subgroups), + int, (*num_ranks_in_list_of_ranks_in_all_subgroups), + size_of_all_selected,size_of_all_selected); if (OPAL_UNLIKELY(NULL == (*list_of_ranks_in_all_subgroups))) { ML_VERBOSE(10, ("Cannot allocate memory for list_of_ranks_in_all_subgroups.\n")); rc = OMPI_ERR_OUT_OF_RESOURCE; @@ -1498,7 +1093,7 @@ static int get_new_subgroup_data (int32_t *all_selected, int size_of_all_selecte (*sub_group_meta_data)[sg_id].index_of_first_element=offset; for( array_id=0 ; array_id < (*sub_group_meta_data)[sg_id].n_ranks ; - array_id++ ) { + array_id++ ) { (*list_of_ranks_in_all_subgroups)[offset+array_id]= temp[sg_id-old_sg_size][array_id]; } @@ -1508,18 +1103,129 @@ static int get_new_subgroup_data (int32_t *all_selected, int size_of_all_selecte /* this causes problems on XT5 starting at 6144 cores */ 
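The loop rewritten above drops the found_sg flag in favor of a sentinel sg_id: scan only the subgroups created during this hierarchy level, and treat -1 as "no subgroup rooted at sg_root yet, allocate one". A minimal standalone sketch of that find-or-create scan (hypothetical helper, shown for the control flow only):

static int find_subgroup_by_root(const int *subgroup_roots, int first_new_index,
                                 int num_subgroups, int sg_root)
{
    for (int sg = first_new_index; sg < num_subgroups; sg++) {
        if (subgroup_roots[sg] == sg_root) {
            return sg;    /* this root already owns a subgroup at this level */
        }
    }
    return -1;            /* caller must allocate a new sub_group_params_t entry */
}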
free(temp[sg_id-old_sg_size]); } + /* clean up temporary storage */ if(NULL != temp) { free(temp); - temp = NULL; } /* return */ + exit_ERROR: return rc; +} -exit_ERROR: - return rc; +static int topo_parse (sub_group_params_t *sub_group_meta_data, int index, int *dst, int *src, int *dst_offset) +{ + int src_offset = sub_group_meta_data[index].index_of_first_element; + int total_ranks_represented = 0, ranks_represented; + if (0 == sub_group_meta_data[index].level_in_hierarchy) { + ML_VERBOSE(10, ("Copying data for index %d to %d. Ranks at this level: %d\n", index, *dst_offset, + sub_group_meta_data[index].n_ranks)); + + /* move level one subgroup data */ + memmove (dst + *dst_offset, src + src_offset, sizeof (int) * sub_group_meta_data[index].n_ranks); + + /* update the offset of this subgroup since it may have been moved */ + sub_group_meta_data[index].index_of_first_element = *dst_offset; + *dst_offset += sub_group_meta_data[index].n_ranks; + } + + ML_VERBOSE(10, ("Subgroup %d has %d ranks. level = %d\n", index, sub_group_meta_data[index].n_ranks, + sub_group_meta_data[index].level_in_hierarchy)); + + /* fill in subgroup ranks */ + sub_group_meta_data[index].rank_data=(rank_properties_t *) + malloc(sizeof(rank_properties_t) * sub_group_meta_data[index].n_ranks); + if (OPAL_UNLIKELY(NULL == sub_group_meta_data[index].rank_data)) { + ML_VERBOSE(10, ("Cannot allocate memory for rank_data \n")); + return OMPI_ERR_OUT_OF_RESOURCE; + } + + /* recurse on all subgroups */ + for (int j = 0 ; j < sub_group_meta_data[index].n_ranks ; ++j) { + int rank = src[j + src_offset]; + int next_level; + + /* determine if this rank is the root of the subgroup */ + if (rank == sub_group_meta_data[index].root_rank_in_comm) { + sub_group_meta_data[index].root_index = j; + } + + sub_group_meta_data[index].rank_data[j].leaf = true; + sub_group_meta_data[index].rank_data[j].rank = rank; + + if (sub_group_meta_data[index].level_in_hierarchy) { + ML_VERBOSE(10, ("Looking for subgroup containing %d as root\n", rank)); + + for (next_level = index - 1 ; next_level >= 0 ; --next_level) { + if (rank == sub_group_meta_data[next_level].root_rank_in_comm) { + ML_VERBOSE(10, ("Subgroup %d has root %d\n", next_level, rank)); + break; + } + } + + /* all ranks are represented in the lowest level. 
this subgroup is not at the lowest level + * so it must be a root at a lower level */ + assert (next_level >= 0); + + /* not a leaf node */ + sub_group_meta_data[index].rank_data[j].leaf = false; + ranks_represented = topo_parse (sub_group_meta_data, next_level, dst, src, dst_offset); + if (0 > ranks_represented) { + return ranks_represented; + } + sub_group_meta_data[index].rank_data[j].num_of_ranks_represented = ranks_represented; + + total_ranks_represented += ranks_represented; + } else { + /* leaf node */ + sub_group_meta_data[index].rank_data[j].leaf = true; + sub_group_meta_data[index].rank_data[j].num_of_ranks_represented = 1; + + total_ranks_represented++; + } + + ML_VERBOSE(10, ("Group %d, level %d, index %d, rank %d represents %d ranks\n", index, + sub_group_meta_data[index].level_in_hierarchy, j, rank, + sub_group_meta_data[index].rank_data[j].num_of_ranks_represented)); + } + + return total_ranks_represented; +} + +/* put level one in leaf order */ +static int coll_ml_parse_topology (sub_group_params_t *sub_group_meta_data, size_t sub_group_count, + int *list_of_ranks_in_all_subgroups, int level_one_size) +{ + int *tmp_data; + int offset, rc; + + tmp_data = calloc (level_one_size, sizeof (int)); + if (NULL == tmp_data) { + return OMPI_ERR_OUT_OF_RESOURCE; + } + + /* do a DFS parse of the topology and ensure that level 1 is in the correct scatter/gather order */ + offset = 0; + rc = topo_parse (sub_group_meta_data, sub_group_count - 1, tmp_data, list_of_ranks_in_all_subgroups, &offset); + if (0 > rc) { + free (tmp_data); + return rc; + } + + /* all ranks in level one should be represented in the re-order buffer */ + assert (offset == level_one_size); + + /* copy re-ordered level 1 (0) */ + if (0 != offset) { + /* copy new level one data back into the list of all subgroups */ + memmove (list_of_ranks_in_all_subgroups, tmp_data, sizeof (int) * offset); + } + + free (tmp_data); + + return OMPI_SUCCESS; } static int append_new_network_context(hierarchy_pairs *pair) @@ -1573,9 +1279,9 @@ static int ml_module_set_small_msg_thresholds(mca_coll_ml_module_t *ml_module) for (j = 0; j < BCOL_NUM_OF_FUNCTIONS; ++j) { if (ml_module->small_message_thresholds[j] > - bcol_module->small_message_thresholds[j]) { + bcol_module->small_message_thresholds[j]) { ml_module->small_message_thresholds[j] = - bcol_module->small_message_thresholds[j]; + bcol_module->small_message_thresholds[j]; } } } @@ -1610,7 +1316,7 @@ static int ml_module_set_small_msg_thresholds(mca_coll_ml_module_t *ml_module) } static int mca_coll_ml_read_allbcols_settings(mca_coll_ml_module_t *ml_module, - int n_hierarchies) + int n_hierarchies) { int i, j, ret = OMPI_SUCCESS; @@ -1680,9 +1386,9 @@ static int mca_coll_ml_read_allbcols_settings(mca_coll_ml_module_t *ml_module, * participating at this level */ ret = comm_allreduce_pml(bcols_in_use, bcols_in_use_all_ranks, - n_hierarchies, MPI_INT, ompi_comm_rank(ml_module->comm), - MPI_MAX, ompi_comm_size(ml_module->comm), - ranks_map, ml_module->comm); + n_hierarchies, MPI_INT, ompi_comm_rank(ml_module->comm), + MPI_MAX, ompi_comm_size(ml_module->comm), + ranks_map, ml_module->comm); if (OPAL_UNLIKELY(OMPI_SUCCESS != ret)) { ML_VERBOSE(10, ("comm_allreduce_pml failed. 
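coll_ml_parse_topology() above is a small instance of a general pattern: do a depth-first walk, emit the level-0 entries into a scratch buffer in visit (leaf) order, then copy the scratch buffer back over the original prefix so that the list itself can serve as the scatter/gather sort list. A stripped-down sketch of that pattern, with all names hypothetical:

#include <stdlib.h>
#include <string.h>

static int reorder_prefix(int *data, int prefix_len,
                          int (*dfs_fill)(int *dst, void *ctx), void *ctx)
{
    int *tmp = calloc(prefix_len, sizeof(int));
    if (NULL == tmp) {
        return -1;                        /* plays the role of OMPI_ERR_OUT_OF_RESOURCE */
    }

    /* the callback appends level-0 entries in DFS (leaf) order and returns
     * how many it wrote; in the patch this role is played by topo_parse() */
    int written = dfs_fill(tmp, ctx);
    if (written == prefix_len) {
        memmove(data, tmp, sizeof(int) * prefix_len);
    }

    free(tmp);
    return (written == prefix_len) ? 0 : -1;
}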
bcols_in_use reduction\n")); goto exit_ERROR; @@ -1793,21 +1499,21 @@ static int mca_coll_ml_read_allbcols_settings(mca_coll_ml_module_t *ml_module, continue; } if (bcol_component->max_frag_size < (int)ml_module->fragment_size) - { - /* frag size set too large */ - ml_module->fragment_size = bcol_component->max_frag_size; - } + { + /* frag size set too large */ + ml_module->fragment_size = bcol_component->max_frag_size; + } } /* for non-contigous data - just use the ML buffers */ ml_module->ml_fragment_size = ml_module->fragment_size; } ML_VERBOSE(10, ("Seting payload size to %d %d [%d %d]", - ml_module->ml_fragment_size, length_ml_payload, - mca_coll_ml_component.payload_buffer_size, - ml_module->data_offset)); + ml_module->ml_fragment_size, length_ml_payload, + mca_coll_ml_component.payload_buffer_size, + ml_module->data_offset)); -exit_ERROR: + exit_ERROR: if (NULL != ranks_map) { free(ranks_map); } @@ -1832,8 +1538,8 @@ static int ml_discover_hierarchy(mca_coll_ml_module_t *ml_module) if ((size_bcol_list != size_sbgp_list) || size_sbgp_list < 1 || size_bcol_list < 1) { ML_ERROR(("Error: (size of mca_bcol_base_components_in_use = %d)" - " != (size of mca_sbgp_base_components_in_use = %d) or zero.\n", - size_bcol_list, size_sbgp_list)); + " != (size of mca_sbgp_base_components_in_use = %d) or zero.\n", + size_bcol_list, size_sbgp_list)); return OMPI_ERROR; } @@ -1852,7 +1558,7 @@ static int ml_discover_hierarchy(mca_coll_ml_module_t *ml_module) /* Do loop over all supported hiearchies. To Do. We would like to have mca parameter that will allow control list of topolgies that user would like use. Right now we will run - */ + */ for (i = 0; i < COLL_ML_TOPO_MAX; i++) { if (COLL_ML_TOPO_ENABLED == ml_module->topo_list[i].status) { ret = mca_coll_ml_component.topo_discovery_fn[i](ml_module, n_hierarchies); @@ -1907,9 +1613,9 @@ static int ml_discover_hierarchy(mca_coll_ml_module_t *ml_module) } ret = comm_allreduce_pml(&ret, &i, - 1, MPI_INT, ompi_comm_rank(ml_module->comm), - MPI_MIN, ompi_comm_size(ml_module->comm), comm_ranks, - ml_module->comm); + 1, MPI_INT, ompi_comm_rank(ml_module->comm), + MPI_MIN, ompi_comm_size(ml_module->comm), comm_ranks, + ml_module->comm); if (OMPI_SUCCESS != ret) { ML_ERROR(("comm_allreduce - failed to collect max_comm data")); @@ -1922,8 +1628,8 @@ static int ml_discover_hierarchy(mca_coll_ml_module_t *ml_module) } static int mca_coll_ml_tree_hierarchy_discovery(mca_coll_ml_module_t *ml_module, - mca_coll_ml_topology_t *topo, int n_hierarchies, - const char *exclude_sbgp_name, const char *include_sbgp_name) + mca_coll_ml_topology_t *topo, int n_hierarchies, + const char *exclude_sbgp_name, const char *include_sbgp_name) { /* local variables */ char *ptr_output = NULL; @@ -1933,8 +1639,8 @@ static int mca_coll_ml_tree_hierarchy_discovery(mca_coll_ml_module_t *ml_module, mca_sbgp_base_module_t *module = NULL; ompi_proc_t **procs = NULL, - **copy_procs = NULL, - *my_proc = NULL; + **copy_procs = NULL, + *my_proc = NULL; const mca_sbgp_base_component_2_0_0_t *sbgp_component = NULL; @@ -1950,7 +1656,7 @@ static int mca_coll_ml_tree_hierarchy_discovery(mca_coll_ml_module_t *ml_module, int *map_to_comm_ranks = NULL, *bcols_in_use = NULL; int32_t *all_selected = NULL, - *index_proc_selected = NULL; + *index_proc_selected = NULL; short all_reduce_buffer2_in[2]; short all_reduce_buffer2_out[2]; @@ -1988,10 +1694,9 @@ static int mca_coll_ml_tree_hierarchy_discovery(mca_coll_ml_module_t *ml_module, ** obtain list of procs */ procs = 
ml_module->comm->c_local_group->grp_proc_pointers; - /* create private copy for manipulation */ copy_procs = (ompi_proc_t **) calloc(ompi_comm_size(ml_module->comm), - sizeof(ompi_proc_t *)); + sizeof(ompi_proc_t *)); if (OPAL_UNLIKELY(NULL == copy_procs)) { ML_VERBOSE(10, ("Cannot allocate memory.\n")); ret = OMPI_ERR_OUT_OF_RESOURCE; @@ -2067,18 +1772,18 @@ static int mca_coll_ml_tree_hierarchy_discovery(mca_coll_ml_module_t *ml_module, while ((opal_list_item_t *) sbgp_cli != opal_list_get_end(&mca_sbgp_base_components_in_use)){ /* - ** obtain the list of ranks in the current level - */ + ** obtain the list of ranks in the current level + */ sbgp_component = (mca_sbgp_base_component_2_0_0_t *) sbgp_cli->component.cli_component; /* Skip excluded levels */ if (NULL != exclude_sbgp_name) { - + ML_VERBOSE(10,("EXCLUDE compare %s to %s", include_sbgp_name, - sbgp_component->sbgp_version.mca_component_name)); + sbgp_component->sbgp_version.mca_component_name)); if(0 == strcmp(exclude_sbgp_name, - sbgp_component->sbgp_version.mca_component_name)) { + sbgp_component->sbgp_version.mca_component_name)) { /* take the next element */ sbgp_cli = (sbgp_base_component_keyval_t *) opal_list_get_next((opal_list_item_t *) sbgp_cli); bcol_cli = (mca_base_component_list_item_t *) opal_list_get_next((opal_list_item_t *) bcol_cli); @@ -2088,9 +1793,9 @@ static int mca_coll_ml_tree_hierarchy_discovery(mca_coll_ml_module_t *ml_module, if (NULL != include_sbgp_name) { ML_VERBOSE(10,("INCLUDE compare %s to %s", include_sbgp_name, - sbgp_component->sbgp_version.mca_component_name)); + sbgp_component->sbgp_version.mca_component_name)); if(0 != strcmp(include_sbgp_name, - sbgp_component->sbgp_version.mca_component_name)) { + sbgp_component->sbgp_version.mca_component_name)) { /* take the next element */ sbgp_cli = (sbgp_base_component_keyval_t *) opal_list_get_next((opal_list_item_t *) sbgp_cli); bcol_cli = (mca_base_component_list_item_t *) opal_list_get_next((opal_list_item_t *) bcol_cli); @@ -2103,32 +1808,32 @@ static int mca_coll_ml_tree_hierarchy_discovery(mca_coll_ml_module_t *ml_module, /* discover subgroup */ ML_VERBOSE(10, ("Discover subgroup: hier level - %d.\n", i_hier)); module = sbgp_component->select_procs(copy_procs, n_procs_in, - ml_module->comm, - sbgp_cli->key_value, &ptr_output); + ml_module->comm, + sbgp_cli->key_value, &ptr_output); if (NULL == module) { /* no module created */ n_procs_selected = 0; - /* We must continue and participate in the allgather. - * It's not clear that one can enter this conditional - * during "normal" execution. We need to review - * all modules. - */ + /* We must continue and participate in the allgather. + * It's not clear that one can enter this conditional + * during "normal" execution. We need to review + * all modules. 
+ */ - /* THE CODE SNIPPET COMMENTED OUT BELOW IS DANGEROUS CODE THAT - * COULD RESULT IN A HANG - THE "CONTINUE" STATEMENT MAY RESULT IN + /* THE CODE SNIPPET COMMENTED OUT BELOW IS DANGEROUS CODE THAT + * COULD RESULT IN A HANG - THE "CONTINUE" STATEMENT MAY RESULT IN * RANKS BYPASSING THE ALLGATHER IN NON-SYMMETRIC CASES */ /* - sbgp_cli = (sbgp_base_component_keyval_t *) opal_list_get_next((opal_list_item_t *) sbgp_cli); - bcol_cli = (mca_base_component_list_item_t *) opal_list_get_next((opal_list_item_t *) bcol_cli); - continue; + sbgp_cli = (sbgp_base_component_keyval_t *) opal_list_get_next((opal_list_item_t *) sbgp_cli); + bcol_cli = (mca_base_component_list_item_t *) opal_list_get_next((opal_list_item_t *) bcol_cli); + continue; */ - } else if ( - (1 == module->group_size) && ( module->group_size != n_procs_in) ) - { - /* we bypass groups of lenth 1, unless those are the only ones - * remaining */ + + /* Skipping subgroups of size one will cause these processes to be missed in list of level one + * indices. */ + } else if (NULL == module->group_list || (1 == module->group_size && i_hier)) { + /* bypass modules that have no group_list */ n_procs_selected = 0; OBJ_RELEASE(module); module=NULL; @@ -2152,7 +1857,7 @@ static int mca_coll_ml_tree_hierarchy_discovery(mca_coll_ml_module_t *ml_module, for (group_index = 0; group_index < n_procs_selected; group_index++) { /* set my rank within the group */ if (map_to_comm_ranks[module->group_list[group_index]] == - ompi_comm_rank(ml_module->comm)) { + ompi_comm_rank(ml_module->comm)) { my_rank_in_subgroup=group_index; module->my_index = group_index; /* currently the indecies are still given in terms of @@ -2165,13 +1870,14 @@ static int mca_coll_ml_tree_hierarchy_discovery(mca_coll_ml_module_t *ml_module, /* I am contributing to this subgroup */ #ifdef NEW_LEADER_SELECTION +#if 0 int lleader_index; /* Select the local leader */ lleader_index = coll_ml_select_leader(ml_module,module, map_to_comm_ranks, - copy_procs,n_procs_selected); + copy_procs,n_procs_selected); local_leader = module->group_list[lleader_index]; - +#endif #else /* local leader is rank within list or remaining ranks */ @@ -2179,7 +1885,7 @@ static int mca_coll_ml_tree_hierarchy_discovery(mca_coll_ml_module_t *ml_module, #endif ML_VERBOSE(10,("The local leader selected for hierarchy %d is %d \n", - i_hier, local_leader)); + i_hier, local_leader)); if(local_leader == my_rank_in_remaining_list ) { @@ -2201,8 +1907,8 @@ static int mca_coll_ml_tree_hierarchy_discovery(mca_coll_ml_module_t *ml_module, /* gather the information from all the other remaining ranks */ ML_VERBOSE(10, ("Call for comm_allreduce_pml.\n")); ret = comm_allgather_pml(&in_allgather_value, - all_selected, 1, MPI_INT, my_rank_in_list, - n_procs_in, map_to_comm_ranks ,ml_module->comm); + all_selected, 1, MPI_INT, my_rank_in_list, + n_procs_in, map_to_comm_ranks ,ml_module->comm); if (OPAL_UNLIKELY(OMPI_SUCCESS != ret)) { ML_VERBOSE(10, ("comm_allreduce_pml failed.\n")); goto exit_ERROR; @@ -2211,7 +1917,7 @@ static int mca_coll_ml_tree_hierarchy_discovery(mca_coll_ml_module_t *ml_module, /* do some sanity checks */ if( -1 != my_rank_in_subgroup ) { ret = check_global_view_of_subgroups(n_procs_selected, - n_procs_in, ll_p1, all_selected, module ); + n_procs_in, ll_p1, all_selected, module ); if (OPAL_UNLIKELY(OMPI_SUCCESS != ret)) { ML_VERBOSE(10, ("check_global_view_of_subgroups failed.\n")); goto exit_ERROR; @@ -2219,9 +1925,9 @@ static int mca_coll_ml_tree_hierarchy_discovery(mca_coll_ml_module_t *ml_module, 
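The new else-if condition above changes which subgroup modules get bypassed: anything without a group_list, plus singletons, but singletons only above level 0 (as the new comment warns, dropping them at level 0 would leave ranks out of the level-one index list). A small predicate restating that rule; group_list, group_size and i_hier follow the hunk, the helper itself is hypothetical:

#include <stdbool.h>
#include <stddef.h>

static bool bypass_subgroup(const int *group_list, int group_size, int i_hier)
{
    if (NULL == group_list) {
        return true;    /* module exposes no usable rank list */
    }

    /* singletons are skipped only above level 0, so that level 0 still
     * covers every rank in the communicator */
    return (1 == group_size) && (0 != i_hier);
}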
} /* - ** change the list of procs stored on the module to ranks within - ** the communicator. - */ + ** change the list of procs stored on the module to ranks within + ** the communicator. + */ ML_VERBOSE(10, ("Change the list of procs; hier level - %d.\n", i_hier)); for (group_index = 0; group_index < n_procs_selected; group_index++) { @@ -2237,12 +1943,18 @@ static int mca_coll_ml_tree_hierarchy_discovery(mca_coll_ml_module_t *ml_module, */ /*XXX*/ ret = get_new_subgroup_data(all_selected, n_procs_in, - &array_of_all_subgroup_ranks, - &size_of_array_of_all_subgroup_ranks, - &list_of_ranks_in_all_subgroups, - &size_of_list_of_ranks_in_all_subgroups, - &cum_number_ranks_in_all_subgroups, - &num_total_subgroups, map_to_comm_ranks,i_hier); + &array_of_all_subgroup_ranks, + &size_of_array_of_all_subgroup_ranks, + &list_of_ranks_in_all_subgroups, + &size_of_list_of_ranks_in_all_subgroups, + &cum_number_ranks_in_all_subgroups, + &num_total_subgroups, map_to_comm_ranks,i_hier); + + /* The way initialization is currently written *all* ranks MUST appear + * in the first level (0) of the hierarchy. If any rank is not in the first + * level then the calculation of gather/scatter offsets will be wrong. + * NTH: DO NOT REMOVE this assert until this changes! */ + assert (i_hier || cum_number_ranks_in_all_subgroups == n_procs_in); if( OMPI_SUCCESS != ret ) { ML_VERBOSE(10, (" Error: get_new_subgroup_data returned %d \n",ret)); @@ -2252,8 +1964,8 @@ static int mca_coll_ml_tree_hierarchy_discovery(mca_coll_ml_module_t *ml_module, /* am I done ? */ i_am_done=0; if ( (all_selected[my_rank_in_list] == ll_p1) && - /* if I was not a member of any group, still need to continue */ - n_procs_selected ){ + /* if I was not a member of any group, still need to continue */ + n_procs_selected ){ i_am_done = 1; } /* get my rank in the list */ @@ -2296,8 +2008,7 @@ static int mca_coll_ml_tree_hierarchy_discovery(mca_coll_ml_module_t *ml_module, /* create bcol modules */ ML_VERBOSE(10, ("Create bcol modules.\n")); - pair->bcol_modules = pair->bcol_component->collm_comm_query( - module, &pair->num_bcol_modules); + pair->bcol_modules = pair->bcol_component->collm_comm_query(module, &pair->num_bcol_modules); /* failed to create a new module */ if (OPAL_UNLIKELY(NULL == pair->bcol_modules)) { ML_VERBOSE(10, ("Failed to create new modules.\n")); @@ -2328,6 +2039,9 @@ static int mca_coll_ml_tree_hierarchy_discovery(mca_coll_ml_module_t *ml_module, /* set the bcol id */ pair->bcol_modules[i]->bcol_id = (int16_t) i_hier; + + /* Set bcol mode bits */ + topo->all_bcols_mode &= (( mca_bcol_base_module_t *) pair->bcol_modules[i])->supported_mode; } /* @@ -2346,28 +2060,29 @@ static int mca_coll_ml_tree_hierarchy_discovery(mca_coll_ml_module_t *ml_module, } /* if n_remain is 1, and the communicator size is not 1, and module - ** is not NULL, I am done - */ + ** is not NULL, I am done + */ if ((1 == n_remain) && (1 < original_group_size) && - (NULL != module)) { + (NULL != module)) { i_am_done = 1; } - n_procs_in = n_remain; - /* am I done ? 
*/ if (1 == i_am_done) { /* nothing more to do */ goto SelectionDone; } + n_procs_in = n_remain; + /* take the next element */ sbgp_cli = (sbgp_base_component_keyval_t *) opal_list_get_next((opal_list_item_t *) sbgp_cli); bcol_cli = (mca_base_component_list_item_t *) opal_list_get_next((opal_list_item_t *) bcol_cli); + i_hier++; } - SelectionDone: + SelectionDone: if (topo->topo_ordering_info.num_bcols_need_ordering > 0) { for (j = 0; j < n_hier; ++j) { @@ -2380,106 +2095,53 @@ static int mca_coll_ml_tree_hierarchy_discovery(mca_coll_ml_module_t *ml_module, } } - /* - * The memory allocation in this debug code is broken, - * it is why we keep it disabled by default even for debug mode, - * but it is good to have this information - */ -#if (OPAL_ENABLE_DEBUG) -#define COLL_ML_HIER_BUFF_SIZE (1024*1024) - { - int ii, jj; - char buff[COLL_ML_HIER_BUFF_SIZE]; - char *output = buff; - - memset(buff, 0, COLL_ML_HIER_BUFF_SIZE); - for (ii = 0; ii < n_hier; ++ii) { - module = topo->component_pairs[ii].subgroup_module; - if (NULL != module) { - sprintf(output, "\nsbgp num %d, num of bcol modules %d, my rank in this comm %d, ranks: ", - ii + 1, topo->component_pairs[ii].num_bcol_modules, ompi_comm_rank(ml_module->comm)); - - output = buff + strlen(buff); - assert(COLL_ML_HIER_BUFF_SIZE + buff > output); - - for(jj = 0; jj < module->group_size; ++jj) { - sprintf(output, " %d", module->group_list[jj]); - - output = buff + strlen(buff); - assert(COLL_ML_HIER_BUFF_SIZE + buff > output); - } - - sprintf(output, "\nbcol modules: "); - - output = buff + strlen(buff); - assert(COLL_ML_HIER_BUFF_SIZE + buff > output); - - for(jj = 0; jj < topo->component_pairs[ii].num_bcol_modules; ++jj) { - sprintf(output, " %p", (void *)topo->component_pairs[ii].bcol_modules[jj]); - - output = buff + strlen(buff); - assert(COLL_ML_HIER_BUFF_SIZE + buff > output); - } - - } else { - sprintf(output, "\nsbgp num %d, sbgp module is NULL", ii + 1); - - output = buff + strlen(buff); - assert(COLL_ML_HIER_BUFF_SIZE + buff > output); - } - } - - ML_VERBOSE(10, ("\nn_hier = %d\ncommunicator %p, ML module %p%s.\n", - n_hier, ml_module->comm, ml_module, buff)); - } -#endif - /* If I was not done, it means that we skipped all subgroups and no hierarchy was build */ - if (0 == i_am_done) { - if (NULL != include_sbgp_name || NULL != exclude_sbgp_name) { - /* User explicitly asked for specific type of topology, which generates empty group */ - ML_ERROR(("ML topology configuration explicitly requested to %s subgroup %s. " - "Such configuration results in a creation of empty groups. As a result, ML framework can't " - "configure requested collective operations. ML framework will be disabled.", - NULL != include_sbgp_name ? "include only" : "exclude", - NULL != include_sbgp_name ? include_sbgp_name : exclude_sbgp_name - )); - ret = OMPI_ERROR; - goto exit_ERROR; - } - ML_VERBOSE(10, ("Empty hierarchy...")); - ret = OMPI_SUCCESS; + /* If I was not done, it means that we skipped all subgroups and no hierarchy was build */ + if (0 == i_am_done) { + if (NULL != include_sbgp_name || NULL != exclude_sbgp_name) { + /* User explicitly asked for specific type of topology, which generates empty group */ + ML_ERROR(("ML topology configuration explicitly requested to %s subgroup %s. " + "Such configuration results in a creation of empty groups. As a result, ML framework can't " + "configure requested collective operations. ML framework will be disabled.", + NULL != include_sbgp_name ? "include only" : "exclude", + NULL != include_sbgp_name ? 
include_sbgp_name : exclude_sbgp_name + )); + ret = OMPI_ERROR; goto exit_ERROR; } + ML_VERBOSE(10, ("Empty hierarchy...")); + ret = OMPI_SUCCESS; + goto exit_ERROR; + } - topo->n_levels = n_hier; + topo->n_levels = n_hier; - /* Find lowest and highest index of the groups in this communicator. - ** This will be needed in deciding where in the hierarchical collective - ** sequence of calls these particular groups belong. - ** It is done with one allreduce call to save allreduce overhead. - */ - all_reduce_buffer2_in[0] = (short)my_lowest_group_index; - all_reduce_buffer2_in[1] = (short)-my_highest_group_index; - /* restore map to ranks for the original communicator */ - for (i = 0; i < ompi_comm_size(ml_module->comm); i++) { - map_to_comm_ranks[i] = i; - } + /* Find lowest and highest index of the groups in this communicator. + ** This will be needed in deciding where in the hierarchical collective + ** sequence of calls these particular groups belong. + ** It is done with one allreduce call to save allreduce overhead. + */ + all_reduce_buffer2_in[0] = (short)my_lowest_group_index; + all_reduce_buffer2_in[1] = (short)-my_highest_group_index; + /* restore map to ranks for the original communicator */ + for (i = 0; i < ompi_comm_size(ml_module->comm); i++) { + map_to_comm_ranks[i] = i; + } - ret = comm_allreduce_pml(all_reduce_buffer2_in, all_reduce_buffer2_out, - 2, MPI_SHORT, ompi_comm_rank(ml_module->comm), - MPI_MIN, original_group_size, - map_to_comm_ranks, ml_module->comm); - if (OPAL_UNLIKELY(OMPI_SUCCESS != ret)) { - ML_VERBOSE(10, ("comm_allreduce_pml failed. all_reduce_buffer2_in reduction\n")); - goto exit_ERROR; - } + ret = comm_allreduce_pml(all_reduce_buffer2_in, all_reduce_buffer2_out, + 2, MPI_SHORT, ompi_comm_rank(ml_module->comm), + MPI_MIN, original_group_size, + map_to_comm_ranks, ml_module->comm); + if (OPAL_UNLIKELY(OMPI_SUCCESS != ret)) { + ML_VERBOSE(10, ("comm_allreduce_pml failed. 
all_reduce_buffer2_in reduction\n")); + goto exit_ERROR; + } - topo->global_lowest_hier_group_index = all_reduce_buffer2_out[0]; - topo->global_highest_hier_group_index = -all_reduce_buffer2_out[1]; + topo->global_lowest_hier_group_index = all_reduce_buffer2_out[0]; + topo->global_highest_hier_group_index = -all_reduce_buffer2_out[1]; - ML_VERBOSE(10, ("The lowest index and highest index was successfully found.\n")); + ML_VERBOSE(10, ("The lowest index and highest index was successfully found.\n")); - ML_VERBOSE(10, ("ml_discover_hierarchy done, n_levels %d lowest_group_index %d highest_group_index %d," + ML_VERBOSE(10, ("ml_discover_hierarchy done, n_levels %d lowest_group_index %d highest_group_index %d," " original_group_size %d my_lowest_group_index %d my_highest_group_index %d", topo->n_levels, topo->global_lowest_hier_group_index, topo->global_highest_hier_group_index, @@ -2487,79 +2149,79 @@ static int mca_coll_ml_tree_hierarchy_discovery(mca_coll_ml_module_t *ml_module, my_lowest_group_index, my_highest_group_index)); - /* - * setup detailed subgroup information - */ - ret = ml_setup_full_tree_data(topo, ml_module->comm, my_highest_group_index, - map_to_comm_ranks,&num_total_subgroups,&array_of_all_subgroup_ranks, - &list_of_ranks_in_all_subgroups); + /* + * setup detailed subgroup information + */ + ret = ml_setup_full_tree_data(topo, ml_module->comm, my_highest_group_index, + map_to_comm_ranks,&num_total_subgroups,&array_of_all_subgroup_ranks, + &list_of_ranks_in_all_subgroups); - if (OPAL_UNLIKELY(OMPI_SUCCESS != ret)) { - ML_VERBOSE(10, ("comm_allreduce_pml failed: bcols_in_use reduction %d \n",ret)); + if (OPAL_UNLIKELY(OMPI_SUCCESS != ret)) { + ML_VERBOSE(10, ("comm_allreduce_pml failed: bcols_in_use reduction %d \n",ret)); + goto exit_ERROR; + } + + /* cache the ML hierarchical description on the tree */ + topo->number_of_all_subgroups = num_total_subgroups; + topo->array_of_all_subgroups = array_of_all_subgroup_ranks; + + ml_init_k_nomial_trees(topo, list_of_ranks_in_all_subgroups, ompi_comm_rank(ml_module->comm)); + /* Set the route table if know-root type of algorithms is used */ + if (COLL_ML_STATIC_BCAST == mca_coll_ml_component.bcast_algorithm) { + ret = mca_coll_ml_fill_in_route_tab(topo, ml_module->comm); + if (OMPI_SUCCESS != ret) { + ML_ERROR(("mca_coll_ml_fill_in_route_tab returned an error.\n")); goto exit_ERROR; } + } - /* cache the ML hierarchical description on the tree */ - topo->number_of_all_subgroups = num_total_subgroups; - topo->array_of_all_subgroups = array_of_all_subgroup_ranks; + /* + ** If all ranks are selected, there will be a single rank that remains - + ** the root of the last group. Check to make sure that all ranks are + ** selected, and if not, return an error. We can't handle the collectives + ** correctly with this module. + */ - ml_init_k_nomial_trees(topo, list_of_ranks_in_all_subgroups, ompi_comm_rank(ml_module->comm)); - /* Set the route table if know-root type of algorithms is used */ - if (mca_coll_ml_component.use_static_bcast) { - ret = mca_coll_ml_fill_in_route_tab(topo, ml_module->comm); - if (OMPI_SUCCESS != ret) { - ML_ERROR(("mca_coll_ml_fill_in_route_tab returned an error.\n")); - goto exit_ERROR; - } - } + exit_ERROR: - /* - ** If all ranks are selected, there will be a single rank that remains - - ** the root of the last group. Check to make sure that all ranks are - ** selected, and if not, return an error. We can't handle the collectives - ** correctly with this module. 
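The allreduce above uses the usual trick for getting a global minimum and a global maximum out of a single reduction: reduce (x, -y) with MPI_MIN and negate the second result. A standalone illustration with plain MPI calls (the patch itself goes through comm_allreduce_pml with MPI_SHORT, which is omitted here for brevity):

#include <mpi.h>

static void global_min_max(MPI_Comm comm, int my_lowest, int my_highest,
                           int *global_lowest, int *global_highest)
{
    int in[2]  = { my_lowest, -my_highest };
    int out[2] = { 0, 0 };

    /* MPI_MIN over (x, -y) yields (min x, -(max y)) */
    MPI_Allreduce(in, out, 2, MPI_INT, MPI_MIN, comm);

    *global_lowest  = out[0];
    *global_highest = -out[1];
}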
- */ + ML_VERBOSE(10, ("Discovery done\n")); -exit_ERROR: + /* free temp resources */ + if (NULL != all_selected) { + free(all_selected); + all_selected = NULL; + } - ML_VERBOSE(10, ("Discovery done\n")); + if (NULL != copy_procs) { + free(copy_procs); + copy_procs = NULL; + } - /* free temp resources */ - if (NULL != all_selected) { - free(all_selected); - all_selected = NULL; - } + if (NULL != map_to_comm_ranks) { + free(map_to_comm_ranks); + map_to_comm_ranks = NULL; + } - if (NULL != copy_procs) { - free(copy_procs); - copy_procs = NULL; - } + if (NULL != index_proc_selected) { + free(index_proc_selected); + index_proc_selected = NULL; + } - if (NULL != map_to_comm_ranks) { - free(map_to_comm_ranks); - map_to_comm_ranks = NULL; - } + if (NULL != bcols_in_use) { + free(bcols_in_use); + bcols_in_use = NULL; + } - if (NULL != index_proc_selected) { - free(index_proc_selected); - index_proc_selected = NULL; - } + if (NULL != list_of_ranks_in_all_subgroups) { + free(list_of_ranks_in_all_subgroups); + list_of_ranks_in_all_subgroups = NULL; + } - if (NULL != bcols_in_use) { - free(bcols_in_use); - bcols_in_use = NULL; - } - - if (NULL != list_of_ranks_in_all_subgroups) { - free(list_of_ranks_in_all_subgroups); - list_of_ranks_in_all_subgroups = NULL; - } - - return ret; + return ret; } void mca_coll_ml_allreduce_matrix_init(mca_coll_ml_module_t *ml_module, - const mca_bcol_base_component_2_0_0_t *bcol_component) + const mca_bcol_base_component_2_0_0_t *bcol_component) { int op, dt, et; @@ -2567,22 +2229,22 @@ void mca_coll_ml_allreduce_matrix_init(mca_coll_ml_module_t *ml_module, for (dt = 0; dt < OMPI_DATATYPE_MAX_PREDEFINED; ++dt) { for (et = 0; et < BCOL_NUM_OF_ELEM_TYPES; ++et) { ml_module->allreduce_matrix[op][dt][et] = - bcol_component->coll_support(op, dt, et); + bcol_component->coll_support(op, dt, et); } } } } int mca_coll_ml_fulltree_hierarchy_discovery(mca_coll_ml_module_t *ml_module, - int n_hierarchies) + int n_hierarchies) { return mca_coll_ml_tree_hierarchy_discovery(ml_module, - &ml_module->topo_list[COLL_ML_HR_FULL], - n_hierarchies, NULL, NULL); + &ml_module->topo_list[COLL_ML_HR_FULL], + n_hierarchies, NULL, NULL); } int mca_coll_ml_allreduce_hierarchy_discovery(mca_coll_ml_module_t *ml_module, - int n_hierarchies) + int n_hierarchies) { mca_base_component_list_item_t *bcol_cli; const mca_bcol_base_component_2_0_0_t *bcol_component; @@ -2591,38 +2253,38 @@ int mca_coll_ml_allreduce_hierarchy_discovery(mca_coll_ml_module_t *ml_module, const mca_sbgp_base_component_2_0_0_t *sbgp_component; sbgp_cli = (sbgp_base_component_keyval_t *) - opal_list_get_first(&mca_sbgp_base_components_in_use); + opal_list_get_first(&mca_sbgp_base_components_in_use); for (bcol_cli = (mca_base_component_list_item_t *) - opal_list_get_first(&mca_bcol_base_components_in_use); - (opal_list_item_t *) bcol_cli != - opal_list_get_end(&mca_bcol_base_components_in_use); - bcol_cli = (mca_base_component_list_item_t *) - opal_list_get_next((opal_list_item_t *) bcol_cli), - sbgp_cli = (sbgp_base_component_keyval_t *) - opal_list_get_next((opal_list_item_t *) sbgp_cli)) { + opal_list_get_first(&mca_bcol_base_components_in_use); + (opal_list_item_t *) bcol_cli != + opal_list_get_end(&mca_bcol_base_components_in_use); + bcol_cli = (mca_base_component_list_item_t *) + opal_list_get_next((opal_list_item_t *) bcol_cli), + sbgp_cli = (sbgp_base_component_keyval_t *) + opal_list_get_next((opal_list_item_t *) sbgp_cli)) { bcol_component = (mca_bcol_base_component_2_0_0_t *) bcol_cli->cli_component; if (NULL != 
bcol_component->coll_support_all_types && - !bcol_component->coll_support_all_types(BCOL_ALLREDUCE)) { + !bcol_component->coll_support_all_types(BCOL_ALLREDUCE)) { mca_base_component_list_item_t *bcol_cli_next; const mca_bcol_base_component_2_0_0_t *bcol_component_next; bcol_cli_next = (mca_base_component_list_item_t *) - opal_list_get_next((opal_list_item_t *) bcol_cli); + opal_list_get_next((opal_list_item_t *) bcol_cli); mca_coll_ml_component.need_allreduce_support = true; mca_coll_ml_allreduce_matrix_init(ml_module, bcol_component); sbgp_component = (mca_sbgp_base_component_2_0_0_t *) - sbgp_cli->component.cli_component; + sbgp_cli->component.cli_component; ML_VERBOSE(10, ("Topology build: sbgp %s will be excluded.", - sbgp_component->sbgp_version.mca_component_name)); + sbgp_component->sbgp_version.mca_component_name)); /* If there isn't additional component supports all types => print warning */ if (1 == opal_list_get_size(&mca_bcol_base_components_in_use) || - (opal_list_item_t *) bcol_cli_next == - opal_list_get_end(&mca_bcol_base_components_in_use)) { + (opal_list_item_t *) bcol_cli_next == + opal_list_get_end(&mca_bcol_base_components_in_use)) { ML_ERROR(("\n--------------------------------------------------------------------------------\n" "The BCOL component %s doesn't support \n" "all possible tuples (OPERATION X DATATYPE) for Allreduce \n" @@ -2635,28 +2297,28 @@ int mca_coll_ml_allreduce_hierarchy_discovery(mca_coll_ml_module_t *ml_module, sbgp_component->sbgp_version.mca_component_name)); } else { bcol_component_next = (mca_bcol_base_component_2_0_0_t *) - bcol_cli_next->cli_component; + bcol_cli_next->cli_component; if (NULL != bcol_component_next->coll_support_all_types && - !bcol_component_next->coll_support_all_types(BCOL_ALLREDUCE)) { + !bcol_component_next->coll_support_all_types(BCOL_ALLREDUCE)) { ML_ERROR(("\n--------------------------------------------------------------------------------\n" - "The BCOL component %s doesn't support \n" - "all possible tuples for Allreduce. \n" - "While you did provid an additional %s bcol component for alternative topology building, \n" - "this component also lacks support for all tuples. \n" - "As a result, ML Allreduce's behavior is undefined. \n" - "You must provide a component that supports all possible tuples, e.g. \n" - "\"--mca bcol_base_string %s,ptpcoll --mca sbgp_base_subgroups_string %s,p2p\n", - bcol_component->bcol_version.mca_component_name, - bcol_component_next->bcol_version.mca_component_name, - bcol_component->bcol_version.mca_component_name, - sbgp_component->sbgp_version.mca_component_name)); + "The BCOL component %s doesn't support \n" + "all possible tuples for Allreduce. \n" + "While you did provid an additional %s bcol component for alternative topology building, \n" + "this component also lacks support for all tuples. \n" + "As a result, ML Allreduce's behavior is undefined. \n" + "You must provide a component that supports all possible tuples, e.g. 
\n" + "\"--mca bcol_base_string %s,ptpcoll --mca sbgp_base_subgroups_string %s,p2p\n", + bcol_component->bcol_version.mca_component_name, + bcol_component_next->bcol_version.mca_component_name, + bcol_component->bcol_version.mca_component_name, + sbgp_component->sbgp_version.mca_component_name)); } } return mca_coll_ml_tree_hierarchy_discovery(ml_module, - &ml_module->topo_list[COLL_ML_HR_ALLREDUCE], - n_hierarchies, sbgp_component->sbgp_version.mca_component_name, NULL); + &ml_module->topo_list[COLL_ML_HR_ALLREDUCE], + n_hierarchies, sbgp_component->sbgp_version.mca_component_name, NULL); } } @@ -2664,27 +2326,27 @@ int mca_coll_ml_allreduce_hierarchy_discovery(mca_coll_ml_module_t *ml_module, } int mca_coll_ml_fulltree_exclude_basesmsocket_hierarchy_discovery(mca_coll_ml_module_t *ml_module, - int n_hierarchies) + int n_hierarchies) { return mca_coll_ml_tree_hierarchy_discovery(ml_module, - &ml_module->topo_list[COLL_ML_HR_NBS], - n_hierarchies, "basesmsocket", NULL); + &ml_module->topo_list[COLL_ML_HR_NBS], + n_hierarchies, "basesmsocket", NULL); } int mca_coll_ml_fulltree_ptp_only_hierarchy_discovery(mca_coll_ml_module_t *ml_module, - int n_hierarchies) + int n_hierarchies) { return mca_coll_ml_tree_hierarchy_discovery(ml_module, - &ml_module->topo_list[COLL_ML_HR_SINGLE_PTP], - n_hierarchies, NULL, "p2p"); + &ml_module->topo_list[COLL_ML_HR_SINGLE_PTP], + n_hierarchies, NULL, "p2p"); } int mca_coll_ml_fulltree_iboffload_only_hierarchy_discovery(mca_coll_ml_module_t *ml_module, - int n_hierarchies) + int n_hierarchies) { return mca_coll_ml_tree_hierarchy_discovery(ml_module, - &ml_module->topo_list[COLL_ML_HR_SINGLE_IBOFFLOAD], - n_hierarchies, NULL, "ibnet"); + &ml_module->topo_list[COLL_ML_HR_SINGLE_IBOFFLOAD], + n_hierarchies, NULL, "ibnet"); } #define IS_RECHABLE 1 @@ -2722,7 +2384,7 @@ static int mca_coll_ml_fill_in_route_tab(mca_coll_ml_topology_t *topo, ompi_comm } topo->route_vector = (mca_coll_ml_route_info_t *) - calloc(comm_size, sizeof(mca_coll_ml_route_info_t)); + calloc(comm_size, sizeof(mca_coll_ml_route_info_t)); if (NULL == topo->route_vector) { ML_VERBOSE(10, ("Cannot allocate memory.\n")); rc = OMPI_ERR_OUT_OF_RESOURCE; @@ -2748,12 +2410,12 @@ static int mca_coll_ml_fill_in_route_tab(mca_coll_ml_topology_t *topo, ompi_comm } rc = comm_allreduce_pml(all_reachable_ranks, - route_table[level], - comm_size, - MPI_INT, sbgp_group->my_index, - MPI_MAX, sbgp_group->group_size, - sbgp_group->group_list, - comm); + route_table[level], + comm_size, + MPI_INT, sbgp_group->my_index, + MPI_MAX, sbgp_group->group_size, + sbgp_group->group_list, + comm); if (OMPI_SUCCESS != rc) { ML_VERBOSE(10, ("comm_allreduce failed.\n")); goto exit_ERROR; @@ -2761,7 +2423,7 @@ static int mca_coll_ml_fill_in_route_tab(mca_coll_ml_topology_t *topo, ompi_comm for (i = 0; i < comm_size; ++i) { if (IS_NOT_RECHABLE != - route_table[level][i]) { + route_table[level][i]) { all_reachable_ranks[i] = IS_RECHABLE; } } @@ -2773,7 +2435,7 @@ static int mca_coll_ml_fill_in_route_tab(mca_coll_ml_topology_t *topo, ompi_comm reach them through leader of my upper layer */ for (i = 0; i < comm_size; ++i) { if (IS_NOT_RECHABLE == - route_table[level - 1][i]) { + route_table[level - 1][i]) { route_table[level - 1][i] = 0; } } @@ -2843,8 +2505,8 @@ static int mca_coll_ml_fill_in_route_tab(mca_coll_ml_topology_t *topo, ompi_comm for(ii = 0; ii < comm_size; ++ii) { sprintf(output, " (%d, %d)", - topo->route_vector[ii].level, - topo->route_vector[ii].rank); + topo->route_vector[ii].level, + topo->route_vector[ii].rank); 
output = buff + strlen(buff); assert(COLL_ML_ROUTE_BUFF_SIZE + buff > output); @@ -2862,7 +2524,7 @@ static int mca_coll_ml_fill_in_route_tab(mca_coll_ml_topology_t *topo, ompi_comm return OMPI_SUCCESS; -exit_ERROR: + exit_ERROR: ML_VERBOSE(10, ("Exit with error status - %d.\n", rc)); if (NULL != route_table) { @@ -2891,7 +2553,7 @@ static void init_coll_func_pointers(mca_coll_ml_module_t *ml_module) mca_coll_base_module_2_0_0_t *coll_base = &ml_module->super; int iboffload_used = - mca_coll_ml_check_if_bcol_is_used("iboffload", ml_module, COLL_ML_TOPO_MAX); + mca_coll_ml_check_if_bcol_is_used("iboffload", ml_module, COLL_ML_TOPO_MAX); /* initialize coll component function pointers */ coll_base->coll_module_enable = ml_module_enable; @@ -2901,29 +2563,26 @@ static void init_coll_func_pointers(mca_coll_ml_module_t *ml_module) coll_base->coll_allgather = NULL; coll_base->coll_iallgather = NULL; } else { - coll_base->coll_allgather = NULL; - coll_base->coll_iallgather = NULL; + coll_base->coll_allgather = mca_coll_ml_allgather; + coll_base->coll_iallgather = mca_coll_ml_allgather_nb; } coll_base->coll_allgatherv = NULL; if (mca_coll_ml_component.use_knomial_allreduce) { if (true == mca_coll_ml_component.need_allreduce_support) { - coll_base->coll_allreduce = NULL; + coll_base->coll_allreduce = mca_coll_ml_allreduce_dispatch; + coll_base->coll_iallreduce = mca_coll_ml_allreduce_dispatch_nb; } else { - coll_base->coll_allreduce = NULL; + coll_base->coll_allreduce = mca_coll_ml_allreduce; + coll_base->coll_iallreduce = mca_coll_ml_allreduce_nb; } } else { coll_base->coll_allreduce = NULL; } - if (mca_coll_ml_component.disable_alltoall) { - coll_base->coll_alltoall = NULL; - coll_base->coll_ialltoall = NULL; - } else { - coll_base->coll_alltoall = NULL; - coll_base->coll_ialltoall = NULL; - } + coll_base->coll_alltoall = NULL; + coll_base->coll_ialltoall = NULL; coll_base->coll_alltoallv = NULL; coll_base->coll_alltoallw = NULL; @@ -2931,7 +2590,7 @@ static void init_coll_func_pointers(mca_coll_ml_module_t *ml_module) coll_base->coll_barrier = mca_coll_ml_barrier_intra; /* Use the sequential broadcast */ - if (mca_coll_ml_component.use_sequential_bcast) { + if (COLL_ML_SEQ_BCAST == mca_coll_ml_component.bcast_algorithm) { coll_base->coll_bcast = mca_coll_ml_bcast_sequential_root; } else { coll_base->coll_bcast = mca_coll_ml_parallel_bcast; @@ -2939,6 +2598,9 @@ static void init_coll_func_pointers(mca_coll_ml_module_t *ml_module) coll_base->coll_exscan = NULL; coll_base->coll_gather = NULL; + /* + coll_base->coll_gather = mca_coll_ml_gather; + */ /* Current iboffload/ptpcoll version have no support for gather */ if (iboffload_used || mca_coll_ml_check_if_bcol_is_used("ptpcoll", ml_module, COLL_ML_TOPO_MAX)) { @@ -2947,8 +2609,11 @@ static void init_coll_func_pointers(mca_coll_ml_module_t *ml_module) coll_base->coll_gatherv = NULL; - - coll_base->coll_reduce = NULL; + if (mca_coll_ml_component.disable_reduce) { + coll_base->coll_reduce = NULL; + } else { + coll_base->coll_reduce = mca_coll_ml_reduce; + } coll_base->coll_reduce_scatter = NULL; coll_base->coll_scan = NULL; coll_base->coll_scatter = NULL; @@ -2958,7 +2623,6 @@ static void init_coll_func_pointers(mca_coll_ml_module_t *ml_module) coll_base->coll_scatterv = NULL; coll_base->coll_iallgatherv = NULL; - coll_base->coll_iallreduce = NULL; coll_base->coll_ialltoallv = NULL; coll_base->coll_ialltoallw = NULL; coll_base->coll_ibarrier = mca_coll_ml_ibarrier_intra; @@ -2967,7 +2631,7 @@ static void init_coll_func_pointers(mca_coll_ml_module_t 
*ml_module) coll_base->coll_iexscan = NULL; coll_base->coll_igather = NULL; coll_base->coll_igatherv = NULL; - coll_base->coll_ireduce = NULL; + coll_base->coll_ireduce = mca_coll_ml_reduce_nb; coll_base->coll_ireduce_scatter = NULL; coll_base->coll_iscan = NULL; coll_base->coll_iscatter = NULL; @@ -2993,11 +2657,11 @@ static int init_lists(mca_coll_ml_module_t *ml_module) length = sizeof(mca_coll_ml_descriptor_t); ret = ompi_free_list_init_ex_new(&(ml_module->message_descriptors), length, - opal_cache_line_size, OBJ_CLASS(mca_coll_ml_descriptor_t), - length_payload, 0, - num_elements, max_elements, elements_per_alloc, - NULL, - init_ml_message_desc, ml_module); + opal_cache_line_size, OBJ_CLASS(mca_coll_ml_descriptor_t), + length_payload, 0, + num_elements, max_elements, elements_per_alloc, + NULL, + init_ml_message_desc, ml_module); if (OPAL_UNLIKELY(OMPI_SUCCESS != ret)) { ML_ERROR(("ompi_free_list_init_ex_new exit with error")); return ret; @@ -3013,11 +2677,11 @@ static int init_lists(mca_coll_ml_module_t *ml_module) /*length_payload=sizeof(something);*/ length = sizeof(mca_coll_ml_fragment_t); ret = ompi_free_list_init_ex_new(&(ml_module->fragment_descriptors), length, - opal_cache_line_size, OBJ_CLASS(mca_coll_ml_fragment_t), - length_payload, 0, - num_elements, max_elements, elements_per_alloc, - NULL, - init_ml_fragment_desc, ml_module); + opal_cache_line_size, OBJ_CLASS(mca_coll_ml_fragment_t), + length_payload, 0, + num_elements, max_elements, elements_per_alloc, + NULL, + init_ml_fragment_desc, ml_module); if (OMPI_SUCCESS != ret) { ML_ERROR(("ompi_free_list_init_ex_new exit with error")); return ret; @@ -3042,16 +2706,16 @@ static int check_for_max_supported_ml_modules(struct ompi_communicator_t *comm) } ret = comm_allreduce_pml(&cs->max_comm, &cs->max_comm, - 1 , MPI_INT, ompi_comm_rank(comm), - MPI_MIN, ompi_comm_size(comm), comm_ranks, - comm); + 1 , MPI_INT, ompi_comm_rank(comm), + MPI_MIN, ompi_comm_size(comm), comm_ranks, + comm); if (OMPI_SUCCESS != ret) { ML_ERROR(("comm_allreduce - failed to collect max_comm data")); return ret; } if (0 >= cs->max_comm || - ompi_comm_size(comm) < cs->min_comm_size) { + ompi_comm_size(comm) < cs->min_comm_size) { return OMPI_ERROR; } else { --cs->max_comm; @@ -3063,18 +2727,18 @@ static int check_for_max_supported_ml_modules(struct ompi_communicator_t *comm) } #if OPAL_ENABLE_DEBUG -#define DEBUG_ML_COMM_QUERY() \ - do { \ - static int verbosity_level = 5; \ - static int module_num = 0; \ - ML_VERBOSE(10, ("ML module - %p num %d for comm - %p, " \ - "comm size - %d, ML component prio - %d.\n", \ - ml_module, ++module_num, comm, ompi_comm_size(comm), *priority)); \ - /* For now I want to always print that we enter ML - \ +#define DEBUG_ML_COMM_QUERY() \ + do { \ + static int verbosity_level = 5; \ + static int module_num = 0; \ + ML_VERBOSE(10, ("ML module - %p num %d for comm - %p, " \ + "comm size - %d, ML component prio - %d.\n", \ + ml_module, ++module_num, comm, ompi_comm_size(comm), *priority)); \ + /* For now I want to always print that we enter ML - \ at the past there was an issue that we did not enter ML and actually run with tuned. \ - Still I do not want to print it for each module - only for the first. */ \ - ML_VERBOSE(verbosity_level, ("ML module - %p was successfully created", ml_module)); \ - verbosity_level = 10; \ + Still I do not want to print it for each module - only for the first. 
*/ \ + ML_VERBOSE(verbosity_level, ("ML module - %p was successfully created", ml_module)); \ + verbosity_level = 10; \ } while(0) #else @@ -3087,14 +2751,14 @@ static int mca_coll_ml_need_multi_topo(int bcol_collective) const mca_bcol_base_component_2_0_0_t *bcol_component; for (bcol_cli = (mca_base_component_list_item_t *) - opal_list_get_first(&mca_bcol_base_components_in_use); - (opal_list_item_t *) bcol_cli != - opal_list_get_end(&mca_bcol_base_components_in_use); - bcol_cli = (mca_base_component_list_item_t *) - opal_list_get_next((opal_list_item_t *) bcol_cli)) { + opal_list_get_first(&mca_bcol_base_components_in_use); + (opal_list_item_t *) bcol_cli != + opal_list_get_end(&mca_bcol_base_components_in_use); + bcol_cli = (mca_base_component_list_item_t *) + opal_list_get_next((opal_list_item_t *) bcol_cli)) { bcol_component = (mca_bcol_base_component_2_0_0_t *) bcol_cli->cli_component; if (NULL != bcol_component->coll_support_all_types && - !bcol_component->coll_support_all_types(bcol_collective)) { + !bcol_component->coll_support_all_types(bcol_collective)) { return true; } } @@ -3109,11 +2773,11 @@ static int setup_bcast_table(mca_coll_ml_module_t *module) bool has_zero_copy; /* setup bcast index table */ - if (cm->use_static_bcast) { + if (COLL_ML_STATIC_BCAST == cm->bcast_algorithm) { module->bcast_fn_index_table[0] = ML_BCAST_SMALL_DATA_KNOWN; - has_zero_copy = !!(MCA_BCOL_BASE_ZERO_COPY & - module->coll_ml_bcast_functions[ML_BCAST_LARGE_DATA_KNOWN]->topo_info->all_bcols_mode); + has_zero_copy = !!(MCA_BCOL_BASE_ZERO_COPY & + module->coll_ml_bcast_functions[ML_BCAST_LARGE_DATA_KNOWN]->topo_info->all_bcols_mode); if (1 == cm->enable_fragmentation || (2 == cm->enable_fragmentation && !has_zero_copy)) { module->bcast_fn_index_table[1] = ML_BCAST_SMALL_DATA_KNOWN; @@ -3127,14 +2791,14 @@ static int setup_bcast_table(mca_coll_ml_module_t *module) } else { module->bcast_fn_index_table[0] = ML_BCAST_SMALL_DATA_UNKNOWN; - if (NULL == module->coll_ml_bcast_functions[ML_BCAST_LARGE_DATA_UNKNOWN]) { - ML_ERROR(("ML couldn't be used: because the mca param coll_ml_use_static_bcast was set " - "to zero and no function is available.")); - return OMPI_ERROR; - } + if (NULL == module->coll_ml_bcast_functions[ML_BCAST_LARGE_DATA_UNKNOWN]) { + ML_ERROR(("ML couldn't be used: because the mca param coll_ml_bcast_algorithm was not set " + "to static and no function is available.")); + return OMPI_ERROR; + } - has_zero_copy = !!(MCA_BCOL_BASE_ZERO_COPY & - module->coll_ml_bcast_functions[ML_BCAST_LARGE_DATA_UNKNOWN]->topo_info->all_bcols_mode); + has_zero_copy = !!(MCA_BCOL_BASE_ZERO_COPY & + module->coll_ml_bcast_functions[ML_BCAST_LARGE_DATA_UNKNOWN]->topo_info->all_bcols_mode); if (1 == cm->enable_fragmentation || (2 == cm->enable_fragmentation && !has_zero_copy)) { module->bcast_fn_index_table[1] = ML_BCAST_SMALL_DATA_UNKNOWN; @@ -3200,6 +2864,10 @@ static void setup_default_topology_map(mca_coll_ml_module_t *ml_module) ml_module->collectives_topology_map[ML_ALLREDUCE][ML_LARGE_DATA_EXTRA_TOPO_ALLREDUCE] = COLL_ML_HR_ALLREDUCE; } + ml_module->collectives_topology_map[ML_REDUCE][ML_SMALL_DATA_REDUCE] = COLL_ML_HR_FULL; + ml_module->collectives_topology_map[ML_REDUCE][ML_LARGE_DATA_REDUCE] = COLL_ML_HR_FULL; + + ml_module->collectives_topology_map[ML_SCATTER][ML_SCATTER_SMALL_DATA_KNOWN] = COLL_ML_HR_FULL; ml_module->collectives_topology_map[ML_SCATTER][ML_SCATTER_N_DATASIZE_BINS] = COLL_ML_HR_FULL; ml_module->collectives_topology_map[ML_SCATTER][ML_SCATTER_SMALL_DATA_UNKNOWN] = COLL_ML_HR_FULL; @@ 
-3229,8 +2897,8 @@ static void load_cached_config(mca_coll_ml_module_t *ml_module) } /* Pasha: In future I would suggest to convert this configuration to some sophisticated mca parameter or - even configuration file. On this stage of project I will set it statically and later we will change it - to run time parameter */ + even configuration file. On this stage of project I will set it statically and later we will change it + to run time parameter */ static void setup_topology_coll_map(mca_coll_ml_module_t *ml_module) { /* Load default topology setup */ @@ -3266,6 +2934,12 @@ mca_coll_ml_comm_query(struct ompi_communicator_t *comm, int *priority) return NULL; } + if (!ompi_rte_proc_is_bound) { + /* do not enable coll/ml unless this process is bound (for now) */ + *priority = -1; + return NULL; + } + /** * If it is inter-communicator and size is less than 2 we have specialized modules * to handle the intra collective communications. @@ -3330,8 +3004,8 @@ mca_coll_ml_comm_query(struct ompi_communicator_t *comm, int *priority) /* gvm Disabled for debuggin */ ret = mca_coll_ml_build_filtered_fn_table(ml_module); if (OMPI_SUCCESS != ret) { - ML_ERROR(("mca_coll_ml_build_filtered_fn_table returned an error.\n")); - goto CLEANUP; + ML_ERROR(("mca_coll_ml_build_filtered_fn_table returned an error.\n")); + goto CLEANUP; } /* Generate active bcols list */ @@ -3379,10 +3053,10 @@ mca_coll_ml_comm_query(struct ompi_communicator_t *comm, int *priority) } ml_module->brucks_buffer_threshold_const = - (comm_size / 2 + comm_size % 2) * (log_comm_size) ; + (comm_size / 2 + comm_size % 2) * (log_comm_size) ; - ml_module->log_comm_size = log_comm_size; + ml_module->log_comm_size = log_comm_size; } if (iboffload_was_requested) { @@ -3401,7 +3075,7 @@ mca_coll_ml_comm_query(struct ompi_communicator_t *comm, int *priority) !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! 
Create connection for service barrier and memory address exchange for ml buffers and asyc service barrier - */ + */ ret = mca_coll_ml_memsync_intra(ml_module, 0); if (OMPI_SUCCESS != ret) { goto CLEANUP; @@ -3414,7 +3088,7 @@ mca_coll_ml_comm_query(struct ompi_communicator_t *comm, int *priority) return &(ml_module->super); -CLEANUP: + CLEANUP: /* Vasily: RLG: Need to cleanup free lists */ if (NULL != ml_module) { OBJ_RELEASE(ml_module); @@ -3423,19 +3097,47 @@ CLEANUP: return NULL; } +/* copied slightly modified from coll/hcoll */ +#define ML_SAVE_FALLBACK(_coll_ml, _coll) \ + do { \ + _coll_ml->fallback.coll_ ## _coll = comm->c_coll.coll_ ## _coll; \ + _coll_ml->fallback.coll_ ## _coll ## _module = comm->c_coll.coll_ ## _coll ## _module; \ + if (comm->c_coll.coll_ ## _coll && comm->c_coll.coll_ ## _coll ## _module) { \ + OBJ_RETAIN(_coll_ml->fallback.coll_ ## _coll ## _module); \ + } \ + } while(0) + +static void ml_save_fallback_colls (mca_coll_ml_module_t *coll_ml, + struct ompi_communicator_t *comm) +{ + memset (&coll_ml->fallback, 0, sizeof (coll_ml->fallback)); + /* save lower-priority collectives to handle cases not yet handled + * by coll/ml */ + ML_SAVE_FALLBACK(coll_ml, allreduce); + ML_SAVE_FALLBACK(coll_ml, allgather); + ML_SAVE_FALLBACK(coll_ml, reduce); + ML_SAVE_FALLBACK(coll_ml, ibcast); + ML_SAVE_FALLBACK(coll_ml, iallreduce); + ML_SAVE_FALLBACK(coll_ml, iallgather); + ML_SAVE_FALLBACK(coll_ml, ireduce); + ML_SAVE_FALLBACK(coll_ml, ibcast); +} + /* * Init module on the communicator */ static int ml_module_enable(mca_coll_base_module_t *module, - struct ompi_communicator_t *comm) + struct ompi_communicator_t *comm) { /* local variables */ char output_buffer[2 * MPI_MAX_OBJECT_NAME]; + ml_save_fallback_colls ((mca_coll_ml_module_t *) module, comm); + memset(&output_buffer[0], 0, sizeof(output_buffer)); snprintf(output_buffer, sizeof(output_buffer), "%s (cid %d)", comm->c_name, - comm->c_contextid); + comm->c_contextid); ML_VERBOSE(10, ("coll:ml:enable: new communicator: %s.\n", output_buffer)); @@ -3449,7 +3151,6 @@ OBJ_CLASS_INSTANCE(mca_coll_ml_module_t, mca_coll_ml_module_destruct); OBJ_CLASS_INSTANCE(mca_coll_ml_collective_operation_progress_t, - ompi_request_t, - mca_coll_ml_collective_operation_progress_construct, - mca_coll_ml_collective_operation_progress_destruct); - + ompi_request_t, + mca_coll_ml_collective_operation_progress_construct, + mca_coll_ml_collective_operation_progress_destruct); diff --git a/ompi/mca/coll/ml/coll_ml_reduce.c b/ompi/mca/coll/ml/coll_ml_reduce.c new file mode 100644 index 0000000000..1a7d1c2e40 --- /dev/null +++ b/ompi/mca/coll/ml/coll_ml_reduce.c @@ -0,0 +1,528 @@ +/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */ +/* + * Copyright (c) 2009-2012 Oak Ridge National Laboratory. All rights reserved. + * Copyright (c) 2009-2012 Mellanox Technologies. All rights reserved. 
+ * $COPYRIGHT$ + * + * Additional copyrights may follow + * + * $HEADER$ + */ +/** @file */ + +#include "ompi_config.h" + +#include "ompi/constants.h" +#include "opal/threads/mutex.h" +#include "ompi/communicator/communicator.h" +#include "ompi/mca/coll/coll.h" +#include "ompi/mca/bcol/bcol.h" +#include "opal/sys/atomic.h" +#include "ompi/mca/coll/ml/coll_ml.h" +#include "ompi/mca/coll/ml/coll_ml_allocation.h" +#include "ompi/mca/coll/ml/coll_ml_inlines.h" +#define REDUCE_SMALL_MESSAGE_THRESHOLD 2048 + +static int mca_coll_ml_reduce_unpack(mca_coll_ml_collective_operation_progress_t *coll_op) +{ + int ret; + /* need to put in more */ + int count = coll_op->variable_fn_params.count; + ompi_datatype_t *dtype = coll_op->variable_fn_params.dtype; + + void *dest = (void *)((uintptr_t)coll_op->full_message.dest_user_addr + + (uintptr_t)coll_op->fragment_data.offset_into_user_buffer); + void *src = (void *)((uintptr_t)coll_op->fragment_data.buffer_desc->data_addr + + (size_t)coll_op->variable_fn_params.rbuf_offset); + + ret = ompi_datatype_copy_content_same_ddt(dtype, (int32_t) count, (char *) dest, + (char *) src); + if (ret < 0) { + return OMPI_ERROR; + } + + if (coll_op->variable_fn_params.root_flag) { + ML_VERBOSE(1,("In reduce unpack %d", + *(int *)((unsigned char*) src))); + } + + ML_VERBOSE(10, ("sbuf addr %p, sbuf offset %d, sbuf val %lf, rbuf addr %p, rbuf offset %d, rbuf val %lf.", + coll_op->variable_fn_params.sbuf, coll_op->variable_fn_params.sbuf_offset, + *(double *) ((unsigned char *) coll_op->variable_fn_params.sbuf + + (size_t) coll_op->variable_fn_params.sbuf_offset), + coll_op->variable_fn_params.rbuf, coll_op->variable_fn_params.rbuf_offset, + *(double *) ((unsigned char *) coll_op->variable_fn_params.rbuf + + (size_t) coll_op->variable_fn_params.rbuf_offset))); + + return OMPI_SUCCESS; +} + + +static int +mca_coll_ml_reduce_task_setup (mca_coll_ml_collective_operation_progress_t *coll_op) +{ + int fn_idx, h_level, next_h_level, my_index; + mca_sbgp_base_module_t *sbgp; + mca_coll_ml_topology_t *topo = coll_op->coll_schedule->topo_info; + + fn_idx = coll_op->sequential_routine.current_active_bcol_fn; + h_level = coll_op->coll_schedule->component_functions[fn_idx].h_level; + next_h_level = (fn_idx < coll_op->coll_schedule->n_fns - 1) ? + coll_op->coll_schedule->component_functions[fn_idx+1].h_level : -1; + sbgp = topo->component_pairs[h_level].subgroup_module; + my_index = sbgp->my_index; + + if (coll_op->variable_fn_params.root_flag) { + ML_VERBOSE(1,("In task completion Data in receiver buffer %d ", + *(int *)((unsigned char*) coll_op->variable_fn_params.rbuf + + coll_op->variable_fn_params.rbuf_offset))); + } + + /* determine the root for this level of the hierarchy */ + if (coll_op->coll_schedule->topo_info->route_vector[coll_op->global_root].level == next_h_level || + coll_op->global_root == sbgp->group_list[my_index]) { + /* I am the global root or I will be talking to the global root in the next round. 
*/ + coll_op->variable_fn_params.root = my_index; + } else if (coll_op->coll_schedule->topo_info->route_vector[coll_op->global_root].level == h_level) { + /* the root is in this level of my hierarchy */ + coll_op->variable_fn_params.root = coll_op->coll_schedule->topo_info->route_vector[coll_op->global_root].rank; + } else { + coll_op->variable_fn_params.root = 0; + } + + /* Set the route vector for this root */ + coll_op->variable_fn_params.root_route = + &coll_op->coll_schedule->topo_info->route_vector[sbgp->group_list[coll_op->variable_fn_params.root]]; + + /* Am I the root of this hierarchy? */ + coll_op->variable_fn_params.root_flag = (my_index == coll_op->variable_fn_params.root); + + /* For hierarchy switch btw source and destination buffer + * No need to make this switch for the first call .. + * */ + if (0 < fn_idx) { + int tmp_offset = coll_op->variable_fn_params.sbuf_offset; + coll_op->variable_fn_params.sbuf_offset = + coll_op->variable_fn_params.rbuf_offset; + coll_op->variable_fn_params.rbuf_offset = tmp_offset; + } + + return OMPI_SUCCESS; +} + +static int mca_coll_ml_reduce_frag_progress(mca_coll_ml_collective_operation_progress_t *coll_op) +{ + /* local variables */ + void *buf; + + size_t dt_size; + int ret, frag_len, count; + + ptrdiff_t lb, extent; + + ml_payload_buffer_desc_t *src_buffer_desc; + mca_coll_ml_collective_operation_progress_t *new_op; + + mca_coll_ml_module_t *ml_module = OP_ML_MODULE(coll_op); + + ret = ompi_datatype_get_extent(coll_op->variable_fn_params.dtype, &lb, &extent); + if (ret < 0) { + return OMPI_ERROR; + } + + dt_size = (size_t) extent; + + /* Keep the pipeline filled with fragments */ + while (coll_op->fragment_data.message_descriptor->n_active < + coll_op->fragment_data.message_descriptor->pipeline_depth) { + /* If an active fragment happens to have completed the collective during + * a hop into the progress engine, then don't launch a new fragment, + * instead break and return. + */ + if (coll_op->fragment_data.message_descriptor->n_bytes_scheduled + == coll_op->fragment_data.message_descriptor->n_bytes_total) { + break; + } + + /* Get an ml buffer */ + src_buffer_desc = mca_coll_ml_alloc_buffer(OP_ML_MODULE(coll_op)); + if (NULL == src_buffer_desc) { + /* If there exist outstanding fragments, then break out + * and let an active fragment deal with this later, + * there are no buffers available. + */ + if (0 < coll_op->fragment_data.message_descriptor->n_active) { + return OMPI_SUCCESS; + } else { + /* It is useless to call progress from here, since + * ml progress can't be executed as result ml memsync + * call will not be completed and no memory will be + * recycled. 
So we put the element on the list, and we will + * progress it later when memsync will recycle some memory*/ + + /* The fragment is already on list and + * the we still have no ml resources + * Return busy */ + if (coll_op->pending & REQ_OUT_OF_MEMORY) { + ML_VERBOSE(10,("Out of resources %p", coll_op)); + return OMPI_ERR_TEMP_OUT_OF_RESOURCE; + } + + coll_op->pending |= REQ_OUT_OF_MEMORY; + opal_list_append(&((OP_ML_MODULE(coll_op))->waiting_for_memory_list), + (opal_list_item_t *)coll_op); + ML_VERBOSE(10,("Out of resources %p adding to pending queue", coll_op)); + return OMPI_ERR_TEMP_OUT_OF_RESOURCE; + } + } + + /* Get a new collective descriptor and initialize it */ + new_op = mca_coll_ml_alloc_op_prog_single_frag_dag(ml_module, + ml_module->coll_ml_reduce_functions[ML_SMALL_DATA_REDUCE], + coll_op->fragment_data.message_descriptor->src_user_addr, + coll_op->fragment_data.message_descriptor->dest_user_addr, + coll_op->fragment_data.message_descriptor->n_bytes_total, + coll_op->fragment_data.message_descriptor->n_bytes_scheduled); + + ML_VERBOSE(1,(" In Reduce fragment progress %d %d ", + coll_op->fragment_data.message_descriptor->n_bytes_total, + coll_op->fragment_data.message_descriptor->n_bytes_scheduled)); + MCA_COLL_IBOFFLOAD_SET_ML_BUFFER_INFO(new_op, + src_buffer_desc->buffer_index, src_buffer_desc); + + new_op->fragment_data.current_coll_op = coll_op->fragment_data.current_coll_op; + new_op->fragment_data.message_descriptor = coll_op->fragment_data.message_descriptor; + + /* set the task setup callback */ + new_op->sequential_routine.seq_task_setup = mca_coll_ml_reduce_task_setup; + /* We need this address for pointer arithmetic in memcpy */ + buf = (void*)coll_op->fragment_data.message_descriptor->src_user_addr; + /* calculate the number of data types in this packet */ + count = (coll_op->fragment_data.message_descriptor->n_bytes_total - + coll_op->fragment_data.message_descriptor->n_bytes_scheduled < + ((size_t) OP_ML_MODULE(coll_op)->small_message_thresholds[BCOL_REDUCE]/4 )? 
+ (coll_op->fragment_data.message_descriptor->n_bytes_total - + coll_op->fragment_data.message_descriptor->n_bytes_scheduled) / dt_size : + (size_t) coll_op->variable_fn_params.count); + + /* calculate the fragment length */ + frag_len = count * dt_size; + + ret = ompi_datatype_copy_content_same_ddt(coll_op->variable_fn_params.dtype, count, + (char *) src_buffer_desc->data_addr, (char *) ((uintptr_t) buf + (uintptr_t) + coll_op->fragment_data.message_descriptor->n_bytes_scheduled)); + if (ret < 0) { + return OMPI_ERROR; + } + + /* if root unpack the data */ + if (ompi_comm_rank(ml_module->comm) == coll_op->global_root ) { + new_op->process_fn = mca_coll_ml_reduce_unpack; + new_op->variable_fn_params.root_flag = true; + } else { + new_op->process_fn = NULL; + new_op->variable_fn_params.root_flag = false; + } + + new_op->variable_fn_params.root_route = coll_op->variable_fn_params.root_route; + + /* Setup fragment specific data */ + new_op->fragment_data.message_descriptor->n_bytes_scheduled += frag_len; + new_op->fragment_data.buffer_desc = src_buffer_desc; + new_op->fragment_data.fragment_size = frag_len; + (new_op->fragment_data.message_descriptor->n_active)++; + + /* Set in Reduce Buffer arguments */ + ML_SET_VARIABLE_PARAMS_BCAST(new_op, OP_ML_MODULE(new_op), count, + coll_op->variable_fn_params.dtype, src_buffer_desc, + 0, (ml_module->payload_block->size_buffer - + ml_module->data_offset)/2, frag_len, + src_buffer_desc->data_addr); + + new_op->variable_fn_params.buffer_size = frag_len; + new_op->variable_fn_params.sbuf = src_buffer_desc->data_addr; + new_op->variable_fn_params.rbuf = src_buffer_desc->data_addr; + new_op->variable_fn_params.root = coll_op->variable_fn_params.root; + new_op->global_root = coll_op->global_root; + new_op->variable_fn_params.op = coll_op->variable_fn_params.op; + new_op->variable_fn_params.hier_factor = coll_op->variable_fn_params.hier_factor; + new_op->sequential_routine.current_bcol_status = SEQ_TASK_PENDING; + MCA_COLL_ML_SET_NEW_FRAG_ORDER_INFO(new_op); + + ML_VERBOSE(10,("FFFF Contig + fragmentation [0-sk, 1-lk, 3-su, 4-lu] %d %d %d\n", + new_op->variable_fn_params.buffer_size, + new_op->fragment_data.fragment_size, + new_op->fragment_data.message_descriptor->n_bytes_scheduled)); + /* initialize first coll */ + new_op->sequential_routine.seq_task_setup(new_op); + + /* append this collective !! 
*/ + OPAL_THREAD_LOCK(&(mca_coll_ml_component.sequential_collectives_mutex)); + opal_list_append(&mca_coll_ml_component.sequential_collectives, + (opal_list_item_t *)new_op); + OPAL_THREAD_UNLOCK(&(mca_coll_ml_component.sequential_collectives_mutex)); + + } + + return OMPI_SUCCESS; +} + +static inline __opal_attribute_always_inline__ +int parallel_reduce_start (void *sbuf, void *rbuf, int count, + struct ompi_datatype_t *dtype, struct ompi_op_t *op, + int root, + struct ompi_communicator_t *comm, + mca_coll_ml_module_t *ml_module, + ompi_request_t **req, + int small_data_reduce, + int large_data_reduce) { + ptrdiff_t lb, extent; + size_t pack_len, dt_size; + ml_payload_buffer_desc_t *src_buffer_desc = NULL; + mca_coll_ml_collective_operation_progress_t * coll_op = NULL; + bool contiguous = ompi_datatype_is_contiguous_memory_layout(dtype, count); + mca_coll_ml_component_t *cm = &mca_coll_ml_component; + int ret, n_fragments = 1, frag_len, + pipeline_depth, n_dts_per_frag, rank; + + if (MPI_IN_PLACE == sbuf) { + sbuf = rbuf; + } + + ret = ompi_datatype_get_extent(dtype, &lb, &extent); + if (ret < 0) { + return OMPI_ERROR; + } + + rank = ompi_comm_rank (comm); + + dt_size = (size_t) extent; + pack_len = count * dt_size; + + /* We use a separate recieve and send buffer so only half the buffer is usable. */ + if (pack_len < (size_t) ml_module->small_message_thresholds[BCOL_REDUCE] / 4) { + /* The len of the message can not be larger than ML buffer size */ + assert(pack_len <= ml_module->payload_block->size_buffer); + + src_buffer_desc = mca_coll_ml_alloc_buffer(ml_module); + + ML_VERBOSE(10,("Using small data reduce (threshold = %d)", + REDUCE_SMALL_MESSAGE_THRESHOLD)); + while (NULL == src_buffer_desc) { + opal_progress(); + src_buffer_desc = mca_coll_ml_alloc_buffer(ml_module); + } + + coll_op = mca_coll_ml_alloc_op_prog_single_frag_dag(ml_module, + ml_module->coll_ml_reduce_functions[small_data_reduce], + sbuf, rbuf, pack_len, 0); + + MCA_COLL_IBOFFLOAD_SET_ML_BUFFER_INFO(coll_op, + src_buffer_desc->buffer_index, src_buffer_desc); + + coll_op->variable_fn_params.rbuf = src_buffer_desc->data_addr; + coll_op->variable_fn_params.sbuf = src_buffer_desc->data_addr; + coll_op->variable_fn_params.buffer_index = src_buffer_desc->buffer_index; + coll_op->variable_fn_params.src_desc = src_buffer_desc; + coll_op->variable_fn_params.count = count; + + ret = ompi_datatype_copy_content_same_ddt(dtype, count, + (void *) (uintptr_t) src_buffer_desc->data_addr, (char *) sbuf); + if (ret < 0){ + return OMPI_ERROR; + } + + } else if (cm->enable_fragmentation || !contiguous) { + ML_VERBOSE(1,("Using Fragmented Reduce ")); + + /* fragment the data */ + /* check for retarded application programming decisions */ + if (dt_size > (size_t) ml_module->small_message_thresholds[BCOL_REDUCE] / 4) { + ML_ERROR(("Sorry, but we don't support datatypes that large")); + return OMPI_ERROR; + } + + /* calculate the number of data types that can fit per ml-buffer */ + n_dts_per_frag = ml_module->small_message_thresholds[BCOL_REDUCE] / (4 * dt_size); + + /* calculate the number of fragments */ + n_fragments = (count + n_dts_per_frag - 1) / n_dts_per_frag; /* round up */ + + /* calculate the actual pipeline depth */ + pipeline_depth = n_fragments < cm->pipeline_depth ? 
n_fragments : cm->pipeline_depth; + + /* calculate the fragment size */ + frag_len = n_dts_per_frag * dt_size; + + /* allocate an ml buffer */ + src_buffer_desc = mca_coll_ml_alloc_buffer(ml_module); + while (NULL == src_buffer_desc) { + opal_progress(); + src_buffer_desc = mca_coll_ml_alloc_buffer(ml_module); + } + + coll_op = mca_coll_ml_alloc_op_prog_single_frag_dag(ml_module, + ml_module->coll_ml_reduce_functions[small_data_reduce], + sbuf,rbuf, + pack_len, + 0 /* offset for first pack */); + + MCA_COLL_IBOFFLOAD_SET_ML_BUFFER_INFO(coll_op, + src_buffer_desc->buffer_index, src_buffer_desc); + + + coll_op->variable_fn_params.sbuf = (void *) src_buffer_desc->data_addr; + coll_op->variable_fn_params.rbuf = (void *) src_buffer_desc->data_addr; + + coll_op->fragment_data.message_descriptor->n_active = 1; + coll_op->full_message.n_bytes_scheduled = frag_len; + coll_op->full_message.fragment_launcher = mca_coll_ml_reduce_frag_progress; + coll_op->full_message.pipeline_depth = pipeline_depth; + coll_op->fragment_data.current_coll_op = small_data_reduce; + coll_op->fragment_data.fragment_size = frag_len; + + coll_op->variable_fn_params.count = n_dts_per_frag; /* seems fishy */ + coll_op->variable_fn_params.buffer_size = frag_len; + coll_op->variable_fn_params.src_desc = src_buffer_desc; + /* copy into the ml-buffer */ + ret = ompi_datatype_copy_content_same_ddt(dtype, n_dts_per_frag, + (char *) src_buffer_desc->data_addr, (char *) sbuf); + if (ret < 0) { + return OMPI_ERROR; + } + } else { + ML_VERBOSE(1,("Using zero-copy ptp reduce")); + coll_op = mca_coll_ml_alloc_op_prog_single_frag_dag(ml_module, + ml_module->coll_ml_reduce_functions[large_data_reduce], + sbuf, rbuf, pack_len, 0); + + coll_op->variable_fn_params.userbuf = + coll_op->variable_fn_params.sbuf = sbuf; + + coll_op->variable_fn_params.rbuf = rbuf; + + /* The ML buffer is used for testing. Later, when we + * switch to use knem/mmap/portals this should be replaced + * appropriately + */ + src_buffer_desc = mca_coll_ml_alloc_buffer(ml_module); + while (NULL == src_buffer_desc) { + opal_progress(); + src_buffer_desc = mca_coll_ml_alloc_buffer(ml_module); + } + + coll_op->variable_fn_params.buffer_index = src_buffer_desc->buffer_index; + coll_op->variable_fn_params.src_desc = src_buffer_desc; + coll_op->variable_fn_params.count = count; + } + + coll_op->process_fn = (rank != root) ? NULL : mca_coll_ml_reduce_unpack; + + /* Set common parts */ + coll_op->fragment_data.buffer_desc = src_buffer_desc; + coll_op->variable_fn_params.dtype = dtype; + coll_op->variable_fn_params.op = op; + + /* NTH: the root, root route, and root flag are set in the task setup */ + + /* Fill in the function arguments */ + coll_op->variable_fn_params.sbuf_offset = 0; + coll_op->variable_fn_params.rbuf_offset = (ml_module->payload_block->size_buffer - + ml_module->data_offset)/2; + + /* Keep track of the global root of this operation */ + coll_op->global_root = root; + + coll_op->variable_fn_params.sequence_num = + OPAL_THREAD_ADD64(&(ml_module->collective_sequence_num), 1); + coll_op->sequential_routine.current_active_bcol_fn = 0; + /* set the task setup callback */ + coll_op->sequential_routine.seq_task_setup = mca_coll_ml_reduce_task_setup; + + /* Reduce requires the schedule to be fixed. If we use other (changing) schedule, + the operation might result in different result. 
*/ + coll_op->coll_schedule->component_functions = coll_op->coll_schedule-> + comp_fn_arr[coll_op->coll_schedule->topo_info->route_vector[root].level]; + + /* Launch the collective */ + ret = mca_coll_ml_launch_sequential_collective (coll_op); + if (OMPI_SUCCESS != ret) { + ML_VERBOSE(10, ("Failed to launch reduce collective")); + return ret; + } + + *req = &coll_op->full_message.super; + + return OMPI_SUCCESS; +} + + +int mca_coll_ml_reduce(void *sbuf, void *rbuf, int count, + struct ompi_datatype_t *dtype, struct ompi_op_t *op, + int root, struct ompi_communicator_t *comm, + mca_coll_base_module_t *module) { + + mca_coll_ml_module_t *ml_module = (mca_coll_ml_module_t*)module; + int ret = OMPI_SUCCESS; + ompi_request_t *req; + + if (OPAL_UNLIKELY(!ompi_op_is_commute(op))) { + fprintf (stderr, "Falling back for reduce\n"); + /* coll/ml does not handle non-communative operations at this time. fallback + * on another collective module */ + return ml_module->fallback.coll_reduce (sbuf, rbuf, count, dtype, op, root, comm, + ml_module->fallback.coll_reduce_module); + } + + ML_VERBOSE(10,("Calling Ml Reduce ")); + ret = parallel_reduce_start(sbuf, rbuf, count, dtype, op, + root, comm, (mca_coll_ml_module_t *)module, + &req, ML_SMALL_DATA_REDUCE, + ML_LARGE_DATA_REDUCE); + if (OPAL_UNLIKELY(ret != OMPI_SUCCESS)) { + ML_VERBOSE(10, ("Failed to launch")); + return ret; + } + + /* Blocking reduce */ + ret = ompi_request_wait(&req, MPI_STATUS_IGNORE); + + ML_VERBOSE(10, ("Blocking Reduce is done")); + + return ret; +} + + +int mca_coll_ml_reduce_nb(void *sbuf, void *rbuf, int count, + struct ompi_datatype_t *dtype, struct ompi_op_t *op, + int root, struct ompi_communicator_t *comm, + ompi_request_t **req, + mca_coll_base_module_t *module) { + + int ret = OMPI_SUCCESS; + mca_coll_ml_module_t *ml_module = (mca_coll_ml_module_t*)module; + + if (OPAL_UNLIKELY(!ompi_op_is_commute(op))) { + fprintf (stderr, "Falling back for ireduce\n"); + /* coll/ml does not handle non-communative operations at this time. 
fallback + * on another collective module */ + return ml_module->fallback.coll_ireduce (sbuf, rbuf, count, dtype, op, root, comm, req, + ml_module->fallback.coll_ireduce_module); + } + + ML_VERBOSE(10,("Calling Ml Reduce ")); + ret = parallel_reduce_start(sbuf, rbuf, count, dtype, op, + root, comm, ml_module, + req, ML_SMALL_DATA_REDUCE, + ML_LARGE_DATA_REDUCE); + if (OPAL_UNLIKELY(ret != OMPI_SUCCESS)) { + ML_VERBOSE(10, ("Failed to launch")); + return ret; + } + + + ML_VERBOSE(10, ("Non-blocking Reduce is done")); + + return OMPI_SUCCESS; + +} diff --git a/ompi/mca/coll/ml/coll_ml_resource_affinity.c b/ompi/mca/coll/ml/coll_ml_resource_affinity.c new file mode 100644 index 0000000000..e3bea016ad --- /dev/null +++ b/ompi/mca/coll/ml/coll_ml_resource_affinity.c @@ -0,0 +1,147 @@ +#include "opal/mca/carto/carto.h" +#include "opal/mca/carto/base/base.h" +#include "opal/util/output.h" +#include "opal/class/opal_graph.h" +#include "opal/mca/paffinity/base/base.h" +#include "ompi/constants.h" + +#include "orte/mca/ess/ess.h" +#include "coll_ml_resource_affinity.h" + +int get_dev_distance_for_all_procs(opal_carto_graph_t *graph, const char *device) +{ + opal_paffinity_base_cpu_set_t cpus; + opal_carto_base_node_t *device_node; + int min_distance = -1, i, num_processors; + + if(opal_paffinity_base_get_processor_info(&num_processors) != OMPI_SUCCESS) { + num_processors = 100; /* Choose something big enough */ + } + + device_node = opal_carto_base_find_node(graph, device); + + /* no topology info for device found. Assume that it is close */ + if(NULL == device_node) + return 0; + + OPAL_PAFFINITY_CPU_ZERO(cpus); + opal_paffinity_base_get(&cpus); + + for (i = 0; i < num_processors; i++) { + opal_carto_base_node_t *slot_node; + int distance, socket, core; + char *slot; + + if(!OPAL_PAFFINITY_CPU_ISSET(i, cpus)) + continue; + + opal_paffinity_base_get_map_to_socket_core(i, &socket, &core); + asprintf(&slot, "socket%d", socket); + + slot_node = opal_carto_base_find_node(graph, slot); + + free(slot); + + if(NULL == slot_node) + return 0; + + distance = opal_carto_base_spf(graph, slot_node, device_node); + + if(distance < 0) + return 0; + + if(min_distance < 0 || min_distance > distance) + min_distance = distance; + } + + return min_distance; +} + +int get_dev_distance_proc(opal_carto_graph_t *graph, + const char *device,int rank, struct ompi_proc_t *proc){ + opal_paffinity_base_cpu_set_t cpus; + opal_carto_base_node_t *device_node; + opal_carto_base_node_t *slot_node; + int distance, socket, core; + char *slot; + int process_id; + int nrank; + + nrank = orte_ess.get_node_rank(&(proc->proc_name)); + + opal_paffinity_base_get_physical_processor_id(nrank, &process_id); + + device_node = opal_carto_base_find_node(graph, device); + + /* no topology info for device found. 
Assume that it is close */ + if(NULL == device_node) + return 0; + + OPAL_PAFFINITY_CPU_ZERO(cpus); + opal_paffinity_base_get(&cpus); + + + + opal_paffinity_base_get_map_to_socket_core(process_id, &socket, &core); + asprintf(&slot, "socket%d", socket); + ML_VERBOSE(10,("The socket address is %d\n",socket)); + + slot_node = opal_carto_base_find_node(graph, slot); + + free(slot); + + if(NULL == slot_node) + return -1; + + distance = opal_carto_base_spf(graph, slot_node, device_node); + + if(distance < 0) + return -1; + + return distance; + +} + +int coll_ml_select_leader(mca_coll_ml_module_t *ml_module, + mca_sbgp_base_module_t *sbgp_module, + int *rank_in_comm, + struct ompi_proc_t ** procs, + int nprocs){ + + int rank, dist1, dist2,dist; + int min_dist = 10000; + int i,leader = 10000; + struct ompi_proc_t *proc = NULL; + + for (i=0; i < nprocs; i++) { + rank = rank_in_comm[sbgp_module->group_list[i]]; + proc = procs[sbgp_module->group_list[i]]; + dist1 = get_dev_distance_proc(ml_module->sm_graph,"mem0",rank,proc); + dist2 = get_dev_distance_proc(ml_module->ib_graph,"mthca0",rank,proc); + + dist = dist1 + dist2; + + ML_VERBOSE(10,("The distance for proc %d dist1 %d, dist2 %d \n",i,dist1,dist2)); + if ((dist < min_dist) || ((dist == min_dist) && (i < leader))) { + leader = i; + min_dist = dist; + } + } + + return leader; +} + + +int coll_ml_construct_resource_graphs(mca_coll_ml_module_t *ml_module){ + + opal_carto_base_get_host_graph(&ml_module->sm_graph,"Memory"); + opal_carto_base_get_host_graph(&ml_module->ib_graph,"Infiniband"); + + /* debug + opal_graph_print(ml_module->sm_graph); + */ + return 0; + +} diff --git a/ompi/mca/coll/ml/coll_ml_resource_affinity.h b/ompi/mca/coll/ml/coll_ml_resource_affinity.h new file mode 100644 index 0000000000..c64c214ee0 --- /dev/null +++ b/ompi/mca/coll/ml/coll_ml_resource_affinity.h @@ -0,0 +1,19 @@ +#include "opal/mca/carto/carto.h" +#include "opal/mca/carto/base/base.h" +#include "opal/util/output.h" +#include "opal/class/opal_graph.h" +#include "coll_ml.h" + + +/* Get the host graph for SM and Infiniband */ +int discover_on_node_resources(const char device); +int get_dev_distance_for_all_procs(opal_carto_graph_t *graph, + const char *device); +int get_dev_distance_proc(opal_carto_graph_t *graph, + const char *device,int rank,struct ompi_proc_t *proc); +int coll_ml_select_leader(mca_coll_ml_module_t *ml_module, + mca_sbgp_base_module_t *sbgp_module, + int *rank_in_comm, + struct ompi_proc_t ** procs, + int nprocs); +int coll_ml_construct_resource_graphs(mca_coll_ml_module_t *ml_module); diff --git a/ompi/mca/coll/ml/coll_ml_select.c b/ompi/mca/coll/ml/coll_ml_select.c index 5c4091fd87..ae838828b2 100644 --- a/ompi/mca/coll/ml/coll_ml_select.c +++ b/ompi/mca/coll/ml/coll_ml_select.c @@ -138,7 +138,7 @@ static int add_to_invoke_table(mca_bcol_base_module_t *bcol_module, for (i=range_min; i<=range_max; i++) { bcol_module->filtered_fns_table[data_src_type][waiting_semantic][bcoll_type][i][j][k] = fn_filtered; - ML_VERBOSE(11, ("Putting functions %d %d %d %d %p", bcoll_type, i, j, k, fn_filtered)); + ML_VERBOSE(21, ("Putting functions %d %d %d %d %p", bcoll_type, i, j, k, fn_filtered)); } } } @@ -323,10 +323,8 @@ int mca_select_bcol_function(mca_bcol_base_module_t *bcol_module, bcol_fn_arguments->dtype); if ((BCOL_ALLREDUCE == bcoll_type) || (BCOL_REDUCE == bcoll_type)) { /* needs to be resolved, the op structure has changed, there is no field called "op_type" */ - /* fn_filtered =
bcol_module->filtered_fns_table[data_src_type][waiting_type][bcoll_type][msg_range][bcol_fn_arguments->dtype->id][bcol_fn_arguments->op->op_type]; - */ } else { fn_filtered = diff --git a/ompi/mca/coll/ml/mca-coll-ml.config b/ompi/mca/coll/ml/mca-coll-ml.config index 81f517966d..bdf43792b0 100644 --- a/ompi/mca/coll/ml/mca-coll-ml.config +++ b/ompi/mca/coll/ml/mca-coll-ml.config @@ -127,6 +127,28 @@ hierarchy = full_hr algorithm = ML_LARGE_DATA_ALLREDUCE hierarchy = full_hr +[REDUCE] + +# scatter supports: ML_SCATTER_SMALL_DATA_SEQUENTIAL +algorithm = ML_SMALL_DATA_REDUCE +hierarchy = full_hr + +# scatter supports: ML_SCATTER_SMALL_DATA_SEQUENTIAL +algorithm = ML_LARGE_DATA_REDUCE +hierarchy = full_hr + +[IREDUCE] + +# scatter supports: ML_SCATTER_SMALL_DATA_SEQUENTIAL +algorithm = ML_SMALL_DATA_REDUCE +hierarchy = full_hr + +# scatter supports: ML_SCATTER_SMALL_DATA_SEQUENTIAL +algorithm = ML_LARGE_DATA_REDUCE +hierarchy = full_hr + + + [SCATTER] # scatter supports: ML_SCATTER_SMALL_DATA_SEQUENTIAL diff --git a/ompi/mca/sbgp/basesmsocket/sbgp_basesmsocket_component.c b/ompi/mca/sbgp/basesmsocket/sbgp_basesmsocket_component.c index 5f02747de4..769d843392 100644 --- a/ompi/mca/sbgp/basesmsocket/sbgp_basesmsocket_component.c +++ b/ompi/mca/sbgp/basesmsocket/sbgp_basesmsocket_component.c @@ -282,6 +282,10 @@ static mca_sbgp_base_module_t *mca_sbgp_basesmsocket_select_procs(struct ompi_pr module->super.group_net = OMPI_SBGP_SOCKET; /* test to see if process is bound */ + /* this needs to change. Process may have been bound by an + * some entity other than OPAL. Just because the binding + * policy isn't set doesn't mean it's not bound + */ if( OPAL_BIND_TO_NONE == OPAL_GET_BINDING_POLICY(opal_hwloc_binding_policy) ) { /* pa affinity not set, so socket index will be set to -1 */ @@ -331,7 +335,7 @@ static mca_sbgp_base_module_t *mca_sbgp_basesmsocket_select_procs(struct ompi_pr end debug print*/ /* if no other local procs found skip to end */ - if( 1 >= cnt ) { + if( 1 > cnt ) { goto NoLocalPeers; } @@ -419,7 +423,7 @@ static mca_sbgp_base_module_t *mca_sbgp_basesmsocket_select_procs(struct ompi_pr #if 0 /*debug print*/ - + /* { int ii; fprintf(stderr,"Ranks per socket: %d\n",cnt); @@ -429,8 +433,8 @@ static mca_sbgp_base_module_t *mca_sbgp_basesmsocket_select_procs(struct ompi_pr fprintf(stderr,"\n"); fflush(stderr); } + */ #endif - /* end debug*/ diff --git a/ompi/mca/sbgp/basesmuma/sbgp_basesmuma_component.c b/ompi/mca/sbgp/basesmuma/sbgp_basesmuma_component.c index 0532d8e1a3..6fcffedb88 100644 --- a/ompi/mca/sbgp/basesmuma/sbgp_basesmuma_component.c +++ b/ompi/mca/sbgp/basesmuma/sbgp_basesmuma_component.c @@ -1,6 +1,9 @@ +/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */ /* * Copyright (c) 2009-2012 Oak Ridge National Laboratory. All rights reserved. * Copyright (c) 2009-2012 Mellanox Technologies. All rights reserved. + * Copyright (c) 2013 Los Alamos National Security, LLC. All rights + * reserved. 
* $COPYRIGHT$ * * Additional copyrights may follow @@ -146,7 +149,7 @@ static mca_sbgp_base_module_t *mca_sbgp_basesmuma_select_procs(struct ompi_proc_ ) { /* local variables */ - int cnt,proc,local; + int cnt,proc,local,last_local_proc; mca_sbgp_basesmuma_module_t *module; module=OBJ_NEW(mca_sbgp_basesmuma_module_t); @@ -157,27 +160,21 @@ static mca_sbgp_base_module_t *mca_sbgp_basesmuma_select_procs(struct ompi_proc_ module->super.group_comm = comm; module->super.group_list = NULL; module->super.group_net = OMPI_SBGP_MUMA; - cnt=0; - for( proc=0 ; proc < n_procs_in ; proc++) { - /* debug - if( odd ) { - if( !(1&proc) ) - continue; - } else { - if( (1&proc) ) - continue; - } - end debug */ - local=OPAL_PROC_ON_LOCAL_NODE(procs[proc]->proc_flags); - if( local ) { + for (proc = 0, cnt = 0, last_local_proc = 0 ; proc < n_procs_in ; ++proc) { + local = OPAL_PROC_ON_LOCAL_NODE(procs[proc]->proc_flags); + if (local) { + last_local_proc = proc; cnt++; } } /* if no other local procs found skip to end */ + if( 2 > cnt ) { /* There's always at least one - namely myself */ - assert( 1 == cnt); - module->super.group_size=1; + assert(1 == cnt); + module->super.group_size = 1; + module->super.group_list = (int *) malloc (sizeof (int)); + module->super.group_list[0] = last_local_proc; /* let ml handle this case */ goto OneLocalPeer; } @@ -190,21 +187,11 @@ static mca_sbgp_base_module_t *mca_sbgp_basesmuma_select_procs(struct ompi_proc_ goto Error; } } - cnt=0; - for( proc=0 ; proc < n_procs_in ; proc++) { - /* debug - if( odd ) { - if( !(1&proc) ) - continue; - } else { - if( (1&proc) ) - continue; - } - end debug */ - local=OPAL_PROC_ON_LOCAL_NODE(procs[proc]->proc_flags); + + for (proc = 0, cnt = 0 ; proc < n_procs_in ; ++proc) { + local = OPAL_PROC_ON_LOCAL_NODE(procs[proc]->proc_flags); if( local ) { - module->super.group_list[cnt]=proc; - cnt++; + module->super.group_list[cnt++] = proc; } } OneLocalPeer: diff --git a/ompi/mca/sbgp/p2p/sbgp_p2p_component.c b/ompi/mca/sbgp/p2p/sbgp_p2p_component.c index 4029d82edb..1f506ea712 100644 --- a/ompi/mca/sbgp/p2p/sbgp_p2p_component.c +++ b/ompi/mca/sbgp/p2p/sbgp_p2p_component.c @@ -1,6 +1,9 @@ +/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */ /* * Copyright (c) 2009-2012 Oak Ridge National Laboratory. All rights reserved. * Copyright (c) 2009-2012 Mellanox Technologies. All rights reserved. + * Copyright (c) 2014 Los Alamos National Security, LLC. All rights + * reserved. 
* $COPYRIGHT$ * * Additional copyrights may follow @@ -141,103 +144,73 @@ static mca_sbgp_base_module_t * mca_sbgp_p2p_select_procs(struct ompi_proc_t ** ) { /* local variables */ - int cnt,proc; + int cnt, proc, my_rank; mca_sbgp_p2p_module_t *module; - int my_rank,i_btl; - - module=OBJ_NEW(mca_sbgp_p2p_module_t); - if (!module ) { - return NULL; - } - module->super.group_size=0; - module->super.group_comm = comm; - module->super.group_net = OMPI_SBGP_P2P; /* find my rank in the group */ - my_rank=-1; - for( proc=0 ; proc < n_procs_in ; proc++) { - if(ompi_proc_local() == procs[proc]) { - my_rank=proc; + for (my_rank = -1, proc = 0 ; proc < n_procs_in ; ++proc) { + if (ompi_proc_local() == procs[proc]) { + my_rank = proc; } } /* I am not in the list - so will form no local subgroup */ - if( 0 > my_rank ){ - return NULL; - } - - /* count the number of ranks in the group */ - cnt=0; - for( proc=0 ; proc < n_procs_in ; proc++) { - mca_bml_base_endpoint_t* endpoint = - (mca_bml_base_endpoint_t*) procs[proc]->proc_endpoints[OMPI_PROC_ENDPOINT_TAG_BML]; - - if(my_rank == proc ) { - cnt++; - continue; - } - /* loop over btls */ - for( i_btl=0 ; i_btl < (int) mca_bml_base_btl_array_get_size(&(endpoint->btl_eager)) ; i_btl++ ) { - if(key) { - /* I am checking for specific btl */ - if( strcmp(endpoint->btl_eager.bml_btls[i_btl].btl->btl_component->btl_version.mca_component_name,key)) { - cnt++; - break; - - } - } else { - /* I will take any btl */ - cnt++; - break; - } - } + if (0 > my_rank) { + return NULL; } -/* debug -fprintf(stderr," AAA cnt %d n_procs_in %d \n",cnt,n_procs_in); -fflush(stderr); - end debug */ + + module = OBJ_NEW(mca_sbgp_p2p_module_t); + if (!module ) { + return NULL; + } + + module->super.group_size = 0; + module->super.group_comm = comm; + module->super.group_net = OMPI_SBGP_P2P; /* allocate resources */ - module->super.group_size=cnt; - if( cnt > 0 ) { - module->super.group_list=(int *)malloc(sizeof(int)* - module->super.group_size); - if( NULL == module->super.group_list ) { - goto Error; - } + module->super.group_list = (int *) calloc (n_procs_in, sizeof (int)); + if (NULL == module->super.group_list) { + goto Error; } - cnt=0; - for( proc=0 ; proc < n_procs_in ; proc++) { - mca_bml_base_endpoint_t* endpoint = + for (cnt = 0, proc = 0 ; proc < n_procs_in ; ++proc) { +#if defined(OMPI_PROC_ENDPOINT_TAG_BML) + mca_bml_base_endpoint_t* endpoint = (mca_bml_base_endpoint_t*) procs[proc]->proc_endpoints[OMPI_PROC_ENDPOINT_TAG_BML]; +#endif - if(my_rank == proc ) { - module->super.group_list[cnt]=proc; - cnt++; + if (my_rank == proc || !key) { + module->super.group_list[cnt++] = proc; continue; } - /* loop over btls */ - for( i_btl=0 ; - i_btl < (int) mca_bml_base_btl_array_get_size(&(endpoint->btl_eager)) ; - i_btl++ ) { - if(key) { +#if defined(OMPI_PROC_ENDPOINT_TAG_BML) + if (NULL != endpoint) { + int num_btls = mca_bml_base_btl_array_get_size(&(endpoint->btl_eager)); + /* loop over btls */ + + for (int i_btl = 0 ; num_btls ; ++i_btl) { /* I am checking for specific btl */ - if( strcmp(endpoint->btl_eager.bml_btls[i_btl].btl-> - btl_component->btl_version.mca_component_name,key)) { - module->super.group_list[cnt]=proc; - cnt++; + if (strcmp(endpoint->btl_eager.bml_btls[i_btl].btl-> + btl_component->btl_version.mca_component_name, key)) { + module->super.group_list[cnt++] = proc; break; - } - } else { - /* I will take any btl */ - module->super.group_list[cnt]=proc; - cnt++; - break; } } +#endif + } + + if (0 == cnt) { + goto Error; + } + + module->super.group_size = cnt; + 
module->super.group_list = (int *) realloc (module->super.group_list, sizeof (int) * cnt); + if (NULL == module->super.group_list) { + /* Shouldn't ever happen */ + goto Error; } /* successful return */ @@ -246,9 +219,9 @@ fflush(stderr); /* return with error */ Error: /* clean up */ - if( NULL != module->super.group_list ) { - free(module->super.group_list); - module->super.group_list=NULL; + if (NULL != module->super.group_list) { + free (module->super.group_list); + module->super.group_list = NULL; } OBJ_RELEASE(module); diff --git a/ompi/op/op.c b/ompi/op/op.c index 6d5e2cd9e9..97cd708eb0 100644 --- a/ompi/op/op.c +++ b/ompi/op/op.c @@ -257,6 +257,25 @@ int ompi_op_init(void) add_intrinsic(&ompi_mpi_op_replace.op, OMPI_OP_BASE_FORTRAN_REPLACE, FLAGS, "MPI_REPLACE")) { return OMPI_ERROR; + }else{ +/* This code is placed back here to support + * HCOL allreduce at the moment. It is a part of bgate repository only. This conflict with OMPI v1.7 + * is to be resolved some other way. + * */ + ompi_mpi_op_null.op.op_type = OMPI_OP_NULL; + ompi_mpi_op_max.op.op_type = OMPI_OP_MAX; + ompi_mpi_op_min.op.op_type = OMPI_OP_MIN; + ompi_mpi_op_sum.op.op_type = OMPI_OP_SUM; + ompi_mpi_op_prod.op.op_type = OMPI_OP_PROD; + ompi_mpi_op_land.op.op_type = OMPI_OP_LAND; + ompi_mpi_op_band.op.op_type = OMPI_OP_BAND; + ompi_mpi_op_lor.op.op_type = OMPI_OP_LOR; + ompi_mpi_op_bor.op.op_type = OMPI_OP_BOR; + ompi_mpi_op_lxor.op.op_type = OMPI_OP_LXOR; + ompi_mpi_op_bxor.op.op_type = OMPI_OP_BXOR; + ompi_mpi_op_maxloc.op.op_type = OMPI_OP_MAXLOC; + ompi_mpi_op_minloc.op.op_type = OMPI_OP_MINLOC; + ompi_mpi_op_replace.op.op_type = OMPI_OP_REPLACE; } /* All done */ diff --git a/ompi/op/op.h b/ompi/op/op.h index 93c80cf1b4..b6e2b84d74 100644 --- a/ompi/op/op.h +++ b/ompi/op/op.h @@ -109,6 +109,29 @@ typedef void (ompi_op_java_handler_fn_t)(void *, void *, int *, #define OMPI_OP_FLAGS_COMMUTE 0x0040 + + +/* This enum as well as op_type field in ompi_op_t is placed back here to support + * HCOL allreduce at the moment. It is a part of bgate repository only. This conflict with OMPI v1.7 + * is to be resolved some other way. + * */ +enum ompi_op_type { + OMPI_OP_NULL, + OMPI_OP_MAX, + OMPI_OP_MIN, + OMPI_OP_SUM, + OMPI_OP_PROD, + OMPI_OP_LAND, + OMPI_OP_BAND, + OMPI_OP_LOR, + OMPI_OP_BOR, + OMPI_OP_LXOR, + OMPI_OP_BXOR, + OMPI_OP_MAXLOC, + OMPI_OP_MINLOC, + OMPI_OP_REPLACE, + OMPI_OP_NUM_OF_TYPES +}; /** * Back-end type of MPI_Op */ @@ -119,6 +142,8 @@ struct ompi_op_t { /** Name, for debugging purposes */ char o_name[MPI_MAX_OBJECT_NAME]; + enum ompi_op_type op_type; + /** Flags about the op */ uint32_t o_flags;
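For reference, a minimal sketch (not part of this patch) of an MPI program that exercises the blocking reduce path added in coll_ml_reduce.c. It assumes Open MPI is built with this patch applied, that the processes are bound (comm_query now disqualifies coll/ml for unbound processes), and that coll/ml wins collective-component selection at runtime, e.g. by raising its priority through the usual MCA priority parameter (assumed here to be coll_ml_priority). MPI_SUM is commutative, so the call stays in mca_coll_ml_reduce(); a non-commutative user-defined op would instead take the saved fallback module.

    /* reduce_demo.c - hypothetical test program, not part of this diff */
    #include <stdio.h>
    #include <mpi.h>

    int main(int argc, char **argv)
    {
        int rank, size, sum = 0;

        MPI_Init(&argc, &argv);
        MPI_Comm_rank(MPI_COMM_WORLD, &rank);
        MPI_Comm_size(MPI_COMM_WORLD, &size);

        /* commutative op: handled by the new coll/ml reduce when that module is active */
        MPI_Reduce(&rank, &sum, 1, MPI_INT, MPI_SUM, 0, MPI_COMM_WORLD);

        if (0 == rank) {
            printf("sum of ranks = %d (expected %d)\n", sum, size * (size - 1) / 2);
        }

        MPI_Finalize();
        return 0;
    }

A possible invocation, again assuming the parameter name above: mpirun -n 4 --bind-to core --mca coll_ml_priority 90 ./reduce_demo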