
bml: Add support for dynamically calling add_procs

This commit contains the following changes:

 - bml: add a function to add a single process. This function is
   intended to remove the need to maintain an opal_bitmap_t, which is
   irrelevant for a single proc. BTLs will need to be updated to
   either 1) ignore the return code from opal_bitmap_set_bit, or
   2) not call the function if the reachability bitmap is NULL (see
   the sketch after the commit metadata below).

 - bml: add an inline accessor function for getting the bml endpoint
   of a peer proc. This function will either 1) return the cached bml
   endpoint, or 2) create the endpoint and call add_proc with all
   available BTL modules (a usage sketch follows the first file
   below).

Signed-off-by: Nathan Hjelm <hjelmn@lanl.gov>
This commit is contained in:
Nathan Hjelm 2015-08-31 14:57:55 -06:00
parent 6ddb8e8b9b
commit 6fa6513003
3 changed files with 382 additions and 235 deletions
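For illustration, a minimal sketch (not part of this commit; example_btl_add_procs and example_create_endpoint are hypothetical names) of the BTL-side pattern the first bullet describes: guard the bitmap update so a NULL reachability bitmap is handled.

#include "opal/class/opal_bitmap.h"
#include "opal/constants.h"
#include "opal/mca/btl/btl.h"

/* stand-in for the BTL's own endpoint setup code (hypothetical) */
static struct mca_btl_base_endpoint_t *example_create_endpoint (struct mca_btl_base_module_t *btl,
                                                                struct opal_proc_t *proc);

static int example_btl_add_procs (struct mca_btl_base_module_t *btl, size_t nprocs,
                                  struct opal_proc_t **procs,
                                  struct mca_btl_base_endpoint_t **peers,
                                  struct opal_bitmap_t *reachable)
{
    for (size_t i = 0 ; i < nprocs ; ++i) {
        peers[i] = example_create_endpoint (btl, procs[i]);
        if (NULL != peers[i] && NULL != reachable) {
            /* option 2 from the commit message: skip the call entirely when
             * reachable is NULL; the (void) cast is option 1, ignoring the
             * return code */
            (void) opal_bitmap_set_bit (reachable, (int) i);
        }
    }
    return OPAL_SUCCESS;
}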

ompi/mca/bml/base/base.h

@@ -1,3 +1,4 @@
/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */
/*
* Copyright (c) 2004-2007 The Trustees of Indiana University and Indiana
* University Research and Technology
@@ -10,6 +11,8 @@
* Copyright (c) 2004-2006 The Regents of the University of California.
* All rights reserved.
* Copyright (c) 2009 Cisco Systems, Inc. All rights reserved.
* Copyright (c) 2015 Los Alamos National Security, LLC. All rights
* reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
@@ -25,6 +28,7 @@
#include "ompi/mca/mca.h"
#include "opal/mca/base/mca_base_framework.h"
#include "ompi/mca/bml/bml.h"
#include "ompi/proc/proc.h"
/*
@@ -60,6 +64,14 @@ OMPI_DECLSPEC extern mca_bml_base_component_t mca_bml_component;
OMPI_DECLSPEC extern mca_bml_base_module_t mca_bml;
OMPI_DECLSPEC extern mca_base_framework_t ompi_bml_base_framework;
static inline struct mca_bml_base_endpoint_t *mca_bml_base_get_endpoint (struct ompi_proc_t *proc) {
if (OPAL_UNLIKELY(NULL == proc->proc_endpoints[OMPI_PROC_ENDPOINT_TAG_BML])) {
mca_bml.bml_add_proc (proc);
}
return (struct mca_bml_base_endpoint_t *) proc->proc_endpoints[OMPI_PROC_ENDPOINT_TAG_BML];
}
END_C_DECLS
#endif /* MCA_BML_BASE_H */
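A usage sketch for the new accessor (hedged; example_first_eager_btl is a hypothetical caller, not in the commit). The accessor returns the cached endpoint or creates it on first use; a NULL result means no active BTL reaches the peer.

#include "ompi/mca/bml/bml.h"
#include "ompi/mca/bml/base/base.h"

static mca_bml_base_btl_t *example_first_eager_btl (ompi_proc_t *proc)
{
    mca_bml_base_endpoint_t *endpoint = mca_bml_base_get_endpoint (proc);
    if (OPAL_UNLIKELY(NULL == endpoint)) {
        return NULL;  /* peer unreachable by any active BTL */
    }
    /* with the mca_bml_base_btl_array_get_index change in the next file,
     * an out-of-range index (e.g. an empty eager array) returns NULL */
    return mca_bml_base_btl_array_get_index (&endpoint->btl_eager, 0);
}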

ompi/mca/bml/bml.h

@@ -160,14 +160,11 @@ static inline bool mca_bml_base_btl_array_remove( mca_bml_base_btl_array_t* array,
*/
static inline mca_bml_base_btl_t* mca_bml_base_btl_array_get_index(mca_bml_base_btl_array_t* array, size_t item_index)
{
#if OPAL_ENABLE_DEBUG
if(item_index >= array->arr_size) {
opal_output(0, "mca_bml_base_btl_array_get_index: invalid array index %lu >= %lu",
(unsigned long)item_index, (unsigned long)array->arr_size);
return 0;
}
#endif
return &array->bml_btls[item_index];
if (item_index < array->arr_size) {
return &array->bml_btls[item_index];
}
return NULL;
}
/**
@@ -441,7 +438,7 @@ typedef int (*mca_bml_base_module_finalize_fn_t)( void );
* @return OMPI_SUCCESS or error status on failure.
*
* The mca_bml_base_module_add_procs_fn_t() is called by the PML to
* determine the set of BMLs that should be used to reach each process.
* determine the set of BTLs that should be used to reach each process.
* Any addressing information exported by the peer via the mca_base_modex_send()
* function should be available during this call via the corresponding
* mca_base_modex_recv() function. The BML may utilize this information to
@@ -465,6 +462,25 @@ typedef int (*mca_bml_base_module_add_procs_fn_t)(
struct opal_bitmap_t* reachable
);
/**
* PML->BML notification of change in the process list.
*
* @param proc (IN) Process
* @return OMPI_SUCCESS or error status on failure.
*
* The mca_bml_base_module_add_proc_fn_t() is called by the PML to
* determine the set of BTLs that should be used to reach the given process.
* Any addressing information exported by the peer via the mca_base_modex_send()
* function should be available during this call via the corresponding
* mca_base_modex_recv() function. The BML may utilize this information to
* determine reachability of each peer process.
*
* \note This function will return OMPI_ERR_UNREACH if the process can not
* be reached by a currently active BTL. This is not a fatal error, and the
* calling layer is free to continue using the BML interface.
*/
typedef int (*mca_bml_base_module_add_proc_fn_t) (struct ompi_proc_t *proc);
/**
* Notification of change to the process list.
*
@@ -559,6 +575,7 @@ struct mca_bml_base_module_t {
mca_bml_base_component_t* bml_component; /**< pointer back to the BML component structure */
/* BML function table */
mca_bml_base_module_add_proc_fn_t bml_add_proc;
mca_bml_base_module_add_procs_fn_t bml_add_procs;
mca_bml_base_module_del_procs_fn_t bml_del_procs;
mca_bml_base_module_add_btl_fn_t bml_add_btl;

ompi/mca/bml/r2/bml_r2.c

@@ -10,7 +10,7 @@
* University of Stuttgart. All rights reserved.
* Copyright (c) 2004-2006 The Regents of the University of California.
* All rights reserved.
* Copyright (c) 2007-2014 Los Alamos National Security, LLC. All rights
* Copyright (c) 2007-2015 Los Alamos National Security, LLC. All rights
* reserved.
* Copyright (c) 2008-2015 Cisco Systems, Inc. All rights reserved.
* Copyright (c) 2013 Intel, Inc. All rights reserved
@@ -144,6 +144,293 @@ static void mca_bml_r2_calculate_bandwidth_latency (mca_bml_base_btl_array_t *btl_array,
}
}
static mca_bml_base_endpoint_t *mca_bml_r2_allocate_endpoint (ompi_proc_t *proc) {
mca_bml_base_endpoint_t *bml_endpoint;
/* allocate bml specific proc data */
bml_endpoint = OBJ_NEW(mca_bml_base_endpoint_t);
if (NULL == bml_endpoint) {
opal_output(0, "mca_bml_r2_add_procs: unable to allocate resources");
return NULL;
}
/* preallocate space in array for max number of r2s */
mca_bml_base_btl_array_reserve(&bml_endpoint->btl_eager, mca_bml_r2.num_btl_modules);
mca_bml_base_btl_array_reserve(&bml_endpoint->btl_send, mca_bml_r2.num_btl_modules);
mca_bml_base_btl_array_reserve(&bml_endpoint->btl_rdma, mca_bml_r2.num_btl_modules);
bml_endpoint->btl_max_send_size = -1;
bml_endpoint->btl_proc = proc;
proc->proc_endpoints[OMPI_PROC_ENDPOINT_TAG_BML] = bml_endpoint;
bml_endpoint->btl_flags_or = 0;
return bml_endpoint;
}
static void mca_bml_r2_register_progress (mca_btl_base_module_t *btl)
{
if (NULL != btl->btl_component->btl_progress) {
bool found = false;
for (size_t p = 0 ; p < mca_bml_r2.num_btl_progress ; ++p) {
if(mca_bml_r2.btl_progress[p] == btl->btl_component->btl_progress) {
found = true;
break;
}
}
if (found == false) {
mca_bml_r2.btl_progress[mca_bml_r2.num_btl_progress++] =
btl->btl_component->btl_progress;
opal_progress_register (btl->btl_component->btl_progress);
}
}
}
static int mca_bml_r2_endpoint_add_btl (struct ompi_proc_t *proc, mca_bml_base_endpoint_t *bml_endpoint,
mca_btl_base_module_t *btl, struct mca_btl_base_endpoint_t *btl_endpoint)
{
mca_bml_base_btl_t* bml_btl = NULL;
int btl_flags = btl->btl_flags;
bool btl_in_use = false;
size_t size;
/* NTH: these flags should have been sanitized by the btl. Once that is verified these
* checks can be safely removed. */
if ((btl_flags & MCA_BTL_FLAGS_PUT) && (NULL == btl->btl_put)) {
opal_output(0, "mca_bml_r2_add_procs: The PUT flag is specified for"
" the %s BTL without any PUT function attached. Discard the flag !",
btl->btl_component->btl_version.mca_component_name);
btl_flags ^= MCA_BTL_FLAGS_PUT;
}
if ((btl_flags & MCA_BTL_FLAGS_GET) && (NULL == btl->btl_get)) {
opal_output(0, "mca_bml_r2_add_procs: The GET flag is specified for"
" the %s BTL without any GET function attached. Discard the flag !",
btl->btl_component->btl_version.mca_component_name);
btl_flags ^= MCA_BTL_FLAGS_GET;
}
if ((btl_flags & (MCA_BTL_FLAGS_PUT | MCA_BTL_FLAGS_GET | MCA_BTL_FLAGS_SEND)) == 0) {
/* If no protocol is specified, we have 2 choices: either ignore the BTL
* as we don't know which protocol to use, or assume that all
* BTLs support the send protocol. This is really a btl error as
* these flags should have been sanitized by the btl. */
btl_flags |= MCA_BTL_FLAGS_SEND;
}
if (btl_flags & MCA_BTL_FLAGS_SEND) {
    /* don't allow an additional BTL with a lower exclusivity ranking */
    size = mca_bml_base_btl_array_get_size (&bml_endpoint->btl_send);
    bml_btl = mca_bml_base_btl_array_get_index (&bml_endpoint->btl_send, size - 1);
    if (NULL == bml_btl || bml_btl->btl->btl_exclusivity <= btl->btl_exclusivity) {
        /* this btl has higher exclusivity than an existing btl or none exists */
        opal_output_verbose(1, opal_btl_base_framework.framework_output,
                            "mca: bml: Using %s btl for send to %s on node %s",
                            btl->btl_component->btl_version.mca_component_name,
                            OMPI_NAME_PRINT(&proc->super.proc_name),
                            proc->super.proc_hostname);
        /* cache the endpoint on the proc */
        bml_btl = mca_bml_base_btl_array_insert (&bml_endpoint->btl_send);
        bml_btl->btl = btl;
        bml_btl->btl_endpoint = btl_endpoint;
        bml_btl->btl_weight = 0;
        bml_btl->btl_flags = btl_flags;
        /* calculate the bitwise OR of the btl flags */
        bml_endpoint->btl_flags_or |= bml_btl->btl_flags;
        btl_in_use = true;
    } else {
        opal_output_verbose(20, opal_btl_base_framework.framework_output,
                            "mca: bml: Not using %s btl for send to %s on node %s "
                            "because %s btl has higher exclusivity (%d > %d)",
                            btl->btl_component->btl_version.mca_component_name,
                            OMPI_NAME_PRINT(&proc->super.proc_name), proc->super.proc_hostname,
                            bml_btl->btl->btl_component->btl_version.mca_component_name,
                            bml_btl->btl->btl_exclusivity,
                            btl->btl_exclusivity);
    }
}
/* always add rdma endpoints */
if ((btl_flags & MCA_BTL_FLAGS_RDMA) &&
!((proc->super.proc_arch != ompi_proc_local_proc->super.proc_arch) &&
(0 == (btl->btl_flags & MCA_BTL_FLAGS_HETEROGENEOUS_RDMA)))) {
mca_bml_base_btl_t *bml_btl_rdma = mca_bml_base_btl_array_insert(&bml_endpoint->btl_rdma);
bml_btl_rdma->btl = btl;
bml_btl_rdma->btl_endpoint = btl_endpoint;
bml_btl_rdma->btl_weight = 0;
bml_btl_rdma->btl_flags = btl_flags;
if (bml_endpoint->btl_pipeline_send_length < btl->btl_rdma_pipeline_send_length) {
bml_endpoint->btl_pipeline_send_length = btl->btl_rdma_pipeline_send_length;
}
if (bml_endpoint->btl_send_limit < btl->btl_min_rdma_pipeline_size) {
bml_endpoint->btl_send_limit = btl->btl_min_rdma_pipeline_size;
}
btl_in_use = true;
}
return btl_in_use ? OMPI_SUCCESS : OMPI_ERR_NOT_AVAILABLE;
}
static void mca_bml_r2_compute_endpoint_metrics (mca_bml_base_endpoint_t *bml_endpoint)
{
double total_bandwidth = 0;
uint32_t latency;
size_t n_send, n_rdma;
/* (1) determine the total bandwidth available across all btls
* note that we need to do this here, as we may already have btls configured
* (2) determine the highest priority ranking for latency
* (3) compute the maximum amount of bytes that can be send without any
* weighting. Once the left over is smaller than this number we will
* start using the weight to compute the correct amount.
*/
n_send = mca_bml_base_btl_array_get_size (&bml_endpoint->btl_send);
n_rdma = mca_bml_base_btl_array_get_size (&bml_endpoint->btl_rdma);
/* sort BTLs in descending order according to bandwidth value */
qsort (bml_endpoint->btl_send.bml_btls, n_send,
sizeof(mca_bml_base_btl_t), btl_bandwidth_compare);
bml_endpoint->btl_rdma_index = 0;
mca_bml_r2_calculate_bandwidth_latency (&bml_endpoint->btl_send, &total_bandwidth, &latency);
/* (1) set the weight of each btl as a percentage of overall bandwidth
* (2) copy all btl instances at the highest priority ranking into the
* list of btls used for first fragments
*/
for (size_t n_index = 0 ; n_index < n_send ; ++n_index) {
mca_bml_base_btl_t *bml_btl =
mca_bml_base_btl_array_get_index(&bml_endpoint->btl_send, n_index);
mca_btl_base_module_t *btl = bml_btl->btl;
/* compute weighting factor for this r2 */
if(btl->btl_bandwidth > 0) {
bml_btl->btl_weight = (float)(btl->btl_bandwidth / total_bandwidth);
} else {
bml_btl->btl_weight = (float)(1.0 / n_send);
}
/* check to see if this r2 is already in the array of r2s
* used for first fragments - if not add it.
*/
if(btl->btl_latency == latency) {
mca_bml_base_btl_t* bml_btl_new =
mca_bml_base_btl_array_insert(&bml_endpoint->btl_eager);
*bml_btl_new = *bml_btl;
}
/* set endpoint max send size as min of available btls */
if (bml_endpoint->btl_max_send_size > btl->btl_max_send_size)
bml_endpoint->btl_max_send_size = btl->btl_max_send_size;
}
/* sort BTLs in descending order according to bandwidth value */
qsort(bml_endpoint->btl_rdma.bml_btls, n_rdma,
sizeof(mca_bml_base_btl_t), btl_bandwidth_compare);
mca_bml_r2_calculate_bandwidth_latency (&bml_endpoint->btl_rdma, &total_bandwidth, &latency);
/* set rdma btl weights */
for (size_t n_index = 0 ; n_index < n_rdma ; ++n_index) {
mca_bml_base_btl_t *bml_btl =
mca_bml_base_btl_array_get_index(&bml_endpoint->btl_rdma, n_index);
/* compute weighting factor for this r2 */
if (bml_btl->btl->btl_bandwidth > 0.0) {
bml_btl->btl_weight = (float)(bml_btl->btl->btl_bandwidth / total_bandwidth);
} else {
bml_btl->btl_weight = (float)(1.0 / n_rdma);
}
}
}
static int mca_bml_r2_add_proc (struct ompi_proc_t *proc)
{
mca_bml_base_endpoint_t *bml_endpoint;
/* set to true once at least one btl is in use */
bool btl_in_use = false;
int rc;
if (OPAL_UNLIKELY(NULL == proc)) {
return OMPI_ERR_BAD_PARAM;
}
/* check if this endpoint is already set up */
if (NULL != proc->proc_endpoints[OMPI_PROC_ENDPOINT_TAG_BML]) {
OBJ_RETAIN(proc);
return OMPI_SUCCESS;
}
/* add btls if not already done */
if (OMPI_SUCCESS != (rc = mca_bml_r2_add_btls())) {
return rc;
}
bml_endpoint = mca_bml_r2_allocate_endpoint (proc);
if (OPAL_UNLIKELY(NULL == bml_endpoint)) {
return OMPI_ERR_OUT_OF_RESOURCE;
}
for (int p_index = 0 ; p_index < mca_bml_r2.num_btl_modules ; ++p_index) {
mca_btl_base_module_t *btl = mca_bml_r2.btl_modules[p_index];
struct mca_btl_base_endpoint_t *btl_endpoint = NULL;
/* if the r2 can reach the destination proc it sets the
* corresponding bit (proc index) in the reachable bitmap
* and can return addressing information for each proc
* that is passed back to the r2 on data transfer calls
*/
rc = btl->btl_add_procs (btl, 1, (opal_proc_t **) &proc, &btl_endpoint, NULL);
if (OMPI_SUCCESS != rc || NULL == btl_endpoint) {
/* This BTL encountered an error while adding the proc. Continue in case some
* other BTL can take care of this task. */
continue;
}
rc = mca_bml_r2_endpoint_add_btl (proc, bml_endpoint, btl, btl_endpoint);
if (OMPI_SUCCESS != rc) {
btl->btl_del_procs (btl, 1, (opal_proc_t **) &proc, &btl_endpoint);
} else {
mca_bml_r2_register_progress (btl);
btl_in_use = true;
}
}
if (!btl_in_use) {
/* no btl is available for this proc */
if (mca_bml_r2.show_unreach_errors) {
opal_show_help ("help-mca-bml-r2.txt", "unreachable proc", true,
OMPI_NAME_PRINT(&(ompi_proc_local_proc->super.proc_name)),
(NULL != ompi_proc_local_proc->super.proc_hostname ?
ompi_proc_local_proc->super.proc_hostname : "unknown!"),
OMPI_NAME_PRINT(&(proc->super.proc_name)),
(NULL != proc->super.proc_hostname ?
proc->super.proc_hostname : "unknown!"),
btl_names);
}
return OMPI_ERR_UNREACH;
}
/* compute metrics for registered btls */
mca_bml_r2_compute_endpoint_metrics (bml_endpoint);
return OMPI_SUCCESS;
}
/*
* For each proc setup a datastructure that indicates the BTLs
* that can be used to reach the destination.
@@ -154,7 +441,7 @@ static int mca_bml_r2_add_procs( size_t nprocs,
struct ompi_proc_t** procs,
struct opal_bitmap_t* reachable )
{
size_t p, p_index, n_new_procs = 0;
size_t n_new_procs = 0;
struct mca_btl_base_endpoint_t ** btl_endpoints = NULL;
struct ompi_proc_t** new_procs = NULL;
int rc, ret = OMPI_SUCCESS;
@@ -170,7 +457,7 @@ static int mca_bml_r2_add_procs( size_t nprocs,
/* Select only the procs that don't yet have the BML proc struct. This prevents
* us from calling btl->add_procs several times on the same destination proc.
*/
for(p_index = 0; p_index < nprocs; p_index++) {
for (size_t p_index = 0 ; p_index < nprocs ; ++p_index) {
struct ompi_proc_t* proc = procs[p_index];
if(NULL != proc->proc_endpoints[OMPI_PROC_ENDPOINT_TAG_BML]) {
@@ -203,10 +490,9 @@ static int mca_bml_r2_add_procs( size_t nprocs,
return OMPI_ERR_OUT_OF_RESOURCE;
}
for(p_index = 0; p_index < mca_bml_r2.num_btl_modules; p_index++) {
mca_btl_base_module_t* btl = mca_bml_r2.btl_modules[p_index];
for (size_t p_index = 0 ; p_index < mca_bml_r2.num_btl_modules ; ++p_index) {
mca_btl_base_module_t *btl = mca_bml_r2.btl_modules[p_index];
int btl_inuse = 0;
int btl_flags;
/* if the r2 can reach the destination proc it sets the
* corresponding bit (proc index) in the reachable bitmap
@@ -217,240 +503,69 @@ static int mca_bml_r2_add_procs( size_t nprocs,
memset(btl_endpoints, 0, nprocs *sizeof(struct mca_btl_base_endpoint_t*));
rc = btl->btl_add_procs(btl, n_new_procs, (opal_proc_t**)new_procs, btl_endpoints, reachable);
if(OMPI_SUCCESS != rc) {
/* This BTL has troubles adding the nodes. Let's continue maybe some other BTL
* can take care of this task.
*/
if (OMPI_SUCCESS != rc) {
/* This BTL encountered an error while adding procs. Continue in case some other
* BTL(s) can be used. */
continue;
}
/* for each proc that is reachable */
for( p = 0; p < n_new_procs; p++ ) {
if(opal_bitmap_is_set_bit(reachable, p)) {
ompi_proc_t *proc = new_procs[p];
mca_bml_base_endpoint_t * bml_endpoint =
(mca_bml_base_endpoint_t*) proc->proc_endpoints[OMPI_PROC_ENDPOINT_TAG_BML];
mca_bml_base_btl_t* bml_btl = NULL;
size_t size;
if(NULL == bml_endpoint) {
/* allocate bml specific proc data */
bml_endpoint = OBJ_NEW(mca_bml_base_endpoint_t);
if (NULL == bml_endpoint) {
opal_output(0, "mca_bml_r2_add_procs: unable to allocate resources");
free(btl_endpoints);
free(new_procs);
return OMPI_ERR_OUT_OF_RESOURCE;
}
/* preallocate space in array for max number of r2s */
mca_bml_base_btl_array_reserve(&bml_endpoint->btl_eager, mca_bml_r2.num_btl_modules);
mca_bml_base_btl_array_reserve(&bml_endpoint->btl_send, mca_bml_r2.num_btl_modules);
mca_bml_base_btl_array_reserve(&bml_endpoint->btl_rdma, mca_bml_r2.num_btl_modules);
bml_endpoint->btl_max_send_size = -1;
bml_endpoint->btl_proc = proc;
proc->proc_endpoints[OMPI_PROC_ENDPOINT_TAG_BML] = bml_endpoint;
bml_endpoint->btl_flags_or = 0;
}
btl_flags = btl->btl_flags;
if( (btl_flags & MCA_BTL_FLAGS_PUT) && (NULL == btl->btl_put) ) {
opal_output(0, "mca_bml_r2_add_procs: The PUT flag is specified for"
" the %s BTL without any PUT function attached. Discard the flag !",
btl->btl_component->btl_version.mca_component_name);
btl_flags ^= MCA_BTL_FLAGS_PUT;
}
if( (btl_flags & MCA_BTL_FLAGS_GET) && (NULL == btl->btl_get) ) {
opal_output(0, "mca_bml_r2_add_procs: The GET flag is specified for"
" the %s BTL without any GET function attached. Discard the flag !",
btl->btl_component->btl_version.mca_component_name);
btl_flags ^= MCA_BTL_FLAGS_GET;
}
if( (btl_flags & (MCA_BTL_FLAGS_PUT | MCA_BTL_FLAGS_GET | MCA_BTL_FLAGS_SEND)) == 0 ) {
/**
* If no protocol specified, we have 2 choices: we ignore the BTL
* as we don't know which protocol to use, or we suppose that all
* BTLs support the send protocol.
*/
btl_flags |= MCA_BTL_FLAGS_SEND;
}
/* dont allow an additional BTL with a lower exclusivity ranking */
size = mca_bml_base_btl_array_get_size(&bml_endpoint->btl_send);
if(size > 0) {
bml_btl = mca_bml_base_btl_array_get_index(&bml_endpoint->btl_send, size-1);
/* skip this btl if the exclusivity is less than the previous only if the btl does not provide full rdma (for one-sided) */
if(bml_btl->btl->btl_exclusivity > btl->btl_exclusivity && ((btl_flags & MCA_BTL_FLAGS_RDMA) != MCA_BTL_FLAGS_RDMA)) {
btl->btl_del_procs(btl, 1, (opal_proc_t**)&proc, &btl_endpoints[p]);
opal_output_verbose(MCA_BASE_VERBOSE_INFO, ompi_bml_base_framework.framework_output,
"mca: bml: Not using %s btl to %s on node %s "
"because %s btl has higher exclusivity (%d > %d)",
btl->btl_component->btl_version.mca_component_name,
OMPI_NAME_PRINT(&proc->super.proc_name), proc->super.proc_hostname,
bml_btl->btl->btl_component->btl_version.mca_component_name,
bml_btl->btl->btl_exclusivity,
btl->btl_exclusivity);
continue;
}
}
opal_output_verbose(MCA_BASE_VERBOSE_INFO, ompi_bml_base_framework.framework_output,
"mca: bml: Using %s btl to %s on node %s",
btl->btl_component->btl_version.mca_component_name,
OMPI_NAME_PRINT(&proc->super.proc_name),
proc->super.proc_hostname);
/* cache the endpoint on the proc */
if (NULL == bml_btl || (bml_btl->btl->btl_exclusivity <= btl->btl_exclusivity)) {
bml_btl = mca_bml_base_btl_array_insert(&bml_endpoint->btl_send);
bml_btl->btl = btl;
bml_btl->btl_endpoint = btl_endpoints[p];
bml_btl->btl_weight = 0;
bml_btl->btl_flags = btl_flags;
/**
* calculate the bitwise OR of the btl flags
*/
bml_endpoint->btl_flags_or |= bml_btl->btl_flags;
}
/* always add rdma endpoints */
if ((btl_flags & MCA_BTL_FLAGS_RDMA) &&
!((proc->super.proc_arch != ompi_proc_local_proc->super.proc_arch) &&
(0 == (btl->btl_flags & MCA_BTL_FLAGS_HETEROGENEOUS_RDMA)))) {
mca_bml_base_btl_t *bml_btl_rdma = mca_bml_base_btl_array_insert(&bml_endpoint->btl_rdma);
bml_btl_rdma->btl = btl;
bml_btl_rdma->btl_endpoint = btl_endpoints[p];
bml_btl_rdma->btl_weight = 0;
bml_btl_rdma->btl_flags = btl_flags;
if (bml_endpoint->btl_pipeline_send_length < btl->btl_rdma_pipeline_send_length) {
bml_endpoint->btl_pipeline_send_length = btl->btl_rdma_pipeline_send_length;
}
if (bml_endpoint->btl_send_limit < btl->btl_min_rdma_pipeline_size) {
bml_endpoint->btl_send_limit = btl->btl_min_rdma_pipeline_size;
}
}
/* This BTL is in use, allow the progress registration */
btl_inuse++;
for (size_t p = 0 ; p < n_new_procs ; ++p) {
if (!opal_bitmap_is_set_bit(reachable, p)) {
continue;
}
ompi_proc_t *proc = new_procs[p];
mca_bml_base_endpoint_t *bml_endpoint =
(mca_bml_base_endpoint_t *) proc->proc_endpoints[OMPI_PROC_ENDPOINT_TAG_BML];
mca_bml_base_btl_t *bml_btl = NULL;
size_t size;
if (NULL == bml_endpoint) {
bml_endpoint = mca_bml_r2_allocate_endpoint (proc);
if (NULL == bml_endpoint) {
free(btl_endpoints);
free(new_procs);
return OPAL_ERR_OUT_OF_RESOURCE;
}
}
rc = mca_bml_r2_endpoint_add_btl (proc, bml_endpoint, btl, btl_endpoints[p]);
if (OMPI_SUCCESS != rc) {
btl->btl_del_procs(btl, 1, (opal_proc_t**)&proc, &btl_endpoints[p]);
continue;
}
/* This BTL is in use, allow the progress registration */
btl_inuse++;
}
if(btl_inuse > 0 && NULL != btl->btl_component->btl_progress) {
size_t p;
bool found = false;
for( p = 0; p < mca_bml_r2.num_btl_progress; p++ ) {
if(mca_bml_r2.btl_progress[p] == btl->btl_component->btl_progress) {
found = true;
break;
}
}
if(found == false) {
mca_bml_r2.btl_progress[mca_bml_r2.num_btl_progress] =
btl->btl_component->btl_progress;
mca_bml_r2.num_btl_progress++;
opal_progress_register( btl->btl_component->btl_progress );
}
if (btl_inuse) {
mca_bml_r2_register_progress (btl);
}
}
free(btl_endpoints);
/* iterate back through procs and compute metrics for registered r2s */
for(p=0; p<n_new_procs; p++) {
ompi_proc_t *proc = new_procs[p];
mca_bml_base_endpoint_t* bml_endpoint =
(mca_bml_base_endpoint_t*) proc->proc_endpoints[OMPI_PROC_ENDPOINT_TAG_BML];
double total_bandwidth = 0;
uint32_t latency;
size_t n_send, n_rdma;
for (size_t p = 0; p < n_new_procs ; ++p) {
mca_bml_base_endpoint_t *bml_endpoint =
(mca_bml_base_endpoint_t *) new_procs[p]->proc_endpoints[OMPI_PROC_ENDPOINT_TAG_BML];
/* skip over procs w/ no btl's registered */
if(NULL == bml_endpoint) {
continue;
}
/* (1) determine the total bandwidth available across all btls
* note that we need to do this here, as we may already have btls configured
* (2) determine the highest priority ranking for latency
* (3) compute the maximum amount of bytes that can be send without any
* weighting. Once the left over is smaller than this number we will
* start using the weight to compute the correct amount.
*/
n_send = mca_bml_base_btl_array_get_size(&bml_endpoint->btl_send);
n_rdma = mca_bml_base_btl_array_get_size(&bml_endpoint->btl_rdma);
/* sort BTLs in descending order according to bandwidth value */
qsort(bml_endpoint->btl_send.bml_btls, n_send,
sizeof(mca_bml_base_btl_t), btl_bandwidth_compare);
bml_endpoint->btl_rdma_index = 0;
mca_bml_r2_calculate_bandwidth_latency (&bml_endpoint->btl_send, &total_bandwidth, &latency);
/* (1) set the weight of each btl as a percentage of overall bandwidth
* (2) copy all btl instances at the highest priority ranking into the
* list of btls used for first fragments
*/
for (size_t n_index = 0 ; n_index < n_send ; ++n_index) {
mca_bml_base_btl_t* bml_btl =
mca_bml_base_btl_array_get_index(&bml_endpoint->btl_send, n_index);
mca_btl_base_module_t *btl = bml_btl->btl;
/* compute weighting factor for this r2 */
if(btl->btl_bandwidth > 0) {
bml_btl->btl_weight = (float)(btl->btl_bandwidth / total_bandwidth);
} else {
bml_btl->btl_weight = (float)(1.0 / n_send);
}
/* check to see if this r2 is already in the array of r2s
* used for first fragments - if not add it.
*/
if(btl->btl_latency == latency) {
mca_bml_base_btl_t* bml_btl_new =
mca_bml_base_btl_array_insert(&bml_endpoint->btl_eager);
*bml_btl_new = *bml_btl;
}
/* set endpoint max send size as min of available btls */
if(bml_endpoint->btl_max_send_size > btl->btl_max_send_size)
bml_endpoint->btl_max_send_size = btl->btl_max_send_size;
}
/* sort BTLs in descending order according to bandwidth value */
qsort(bml_endpoint->btl_rdma.bml_btls, n_rdma,
sizeof(mca_bml_base_btl_t), btl_bandwidth_compare);
mca_bml_r2_calculate_bandwidth_latency (&bml_endpoint->btl_rdma, &total_bandwidth, &latency);
/* set rdma btl weights */
for (size_t n_index = 0 ; n_index < n_rdma ; ++n_index) {
mca_bml_base_btl_t *bml_btl =
mca_bml_base_btl_array_get_index(&bml_endpoint->btl_rdma, n_index);
/* compute weighting factor for this r2 */
if (bml_btl->btl->btl_bandwidth > 0.0) {
bml_btl->btl_weight = (float)(bml_btl->btl->btl_bandwidth / total_bandwidth);
} else {
bml_btl->btl_weight = (float)(1.0 / n_rdma);
}
if (NULL != bml_endpoint) {
mca_bml_r2_compute_endpoint_metrics (bml_endpoint);
}
}
/* see if we have a connection to everyone else */
for(p = 0; p < n_new_procs; p++) {
for(size_t p = 0; p < n_new_procs ; ++p) {
ompi_proc_t *proc = new_procs[p];
if (NULL == proc->proc_endpoints[OMPI_PROC_ENDPOINT_TAG_BML]) {
ret = OMPI_ERR_UNREACH;
if (mca_bml_r2.show_unreach_errors) {
opal_show_help("help-mca-bml-r2.txt",
"unreachable proc",
true,
opal_show_help("help-mca-bml-r2.txt", "unreachable proc", true,
OMPI_NAME_PRINT(&(ompi_proc_local_proc->super.proc_name)),
(NULL != ompi_proc_local_proc->super.proc_hostname ?
ompi_proc_local_proc->super.proc_hostname : "unknown!"),
@@ -459,6 +574,7 @@ static int mca_bml_r2_add_procs( size_t nprocs,
proc->super.proc_hostname : "unknown!"),
btl_names);
}
break;
}
}
@@ -476,7 +592,6 @@ static int mca_bml_r2_add_procs( size_t nprocs,
static int mca_bml_r2_del_procs(size_t nprocs,
struct ompi_proc_t** procs)
{
size_t p;
int rc;
struct ompi_proc_t** del_procs = (struct ompi_proc_t**)
malloc(nprocs * sizeof(struct ompi_proc_t*));
@@ -486,26 +601,27 @@ static int mca_bml_r2_del_procs(size_t nprocs,
return OMPI_ERR_OUT_OF_RESOURCE;
}
for(p = 0; p < nprocs; p++) {
for (size_t p = 0 ; p < nprocs ; ++p) {
ompi_proc_t *proc = procs[p];
/* We must check that there are 2 references to the proc (not 1). The
* first reference belongs to ompi/proc the second belongs to the bml
* since we retained it. We will release that reference at the end of
* the loop below. */
if(((opal_object_t*)proc)->obj_reference_count == 2) {
if (((opal_object_t*)proc)->obj_reference_count == 2 &&
NULL != proc->proc_endpoints[OMPI_PROC_ENDPOINT_TAG_BML]) {
del_procs[n_del_procs++] = proc;
}
}
for(p = 0; p < n_del_procs; p++) {
for (size_t p = 0 ; p < n_del_procs ; ++p) {
ompi_proc_t *proc = del_procs[p];
mca_bml_base_endpoint_t* bml_endpoint =
(mca_bml_base_endpoint_t*) proc->proc_endpoints[OMPI_PROC_ENDPOINT_TAG_BML];
size_t f_index, f_size;
size_t f_size;
/* notify each btl that the proc is going away */
f_size = mca_bml_base_btl_array_get_size(&bml_endpoint->btl_send);
for(f_index = 0; f_index < f_size; f_index++) {
for (size_t f_index = 0 ; f_index < f_size ; ++f_index) {
mca_bml_base_btl_t* bml_btl = mca_bml_base_btl_array_get_index(&bml_endpoint->btl_send, f_index);
mca_btl_base_module_t* btl = bml_btl->btl;
@@ -521,10 +637,12 @@ static int mca_bml_r2_del_procs(size_t nprocs,
*/
}
proc->proc_endpoints[OMPI_PROC_ENDPOINT_TAG_BML] = NULL;
OBJ_RELEASE(proc);
/* do any required cleanup */
OBJ_RELEASE(bml_endpoint);
proc->proc_endpoints[OMPI_PROC_ENDPOINT_TAG_BML] = NULL;
}
free(del_procs);
@@ -835,6 +953,7 @@ int mca_bml_r2_component_fini(void)
mca_bml_r2_module_t mca_bml_r2 = {
.super = {
.bml_component = &mca_bml_r2_component,
.bml_add_proc = mca_bml_r2_add_proc,
.bml_add_procs = mca_bml_r2_add_procs,
.bml_del_procs = mca_bml_r2_del_procs,
.bml_add_btl = mca_bml_r2_add_btl,
@@ -843,8 +962,7 @@ mca_bml_r2_module_t mca_bml_r2 = {
.bml_register = mca_bml_r2_register,
.bml_register_error = mca_bml_r2_register_error,
.bml_finalize = mca_bml_r2_finalize,
.bml_ft_event = mca_bml_r2_ft_event
}
.bml_ft_event = mca_bml_r2_ft_event,
},
};
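
To make the weighting rule in mca_bml_r2_compute_endpoint_metrics concrete, a standalone sketch with assumed bandwidth values (the numbers are illustrative, not from this diff): each BTL is weighted by btl_bandwidth / total_bandwidth, and zero-bandwidth BTLs fall back to an even 1/n share.

#include <stdio.h>

int main (void)
{
    /* assumed: two send BTLs, e.g. a 40 Gb/s fabric and 10 Gb/s Ethernet */
    double bandwidth[] = { 40000.0, 10000.0 };
    double total = 0.0;
    for (int i = 0 ; i < 2 ; ++i) {
        total += bandwidth[i];
    }
    for (int i = 0 ; i < 2 ; ++i) {
        /* mirrors bml_btl->btl_weight = btl_bandwidth / total_bandwidth */
        printf ("btl %d weight = %.2f\n", i, bandwidth[i] / total);
    }
    return 0;  /* prints 0.80 and 0.20 */
}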