diff --git a/ompi/communicator/comm_cid.c b/ompi/communicator/comm_cid.c index 59e812a5fd..1789f714fc 100644 --- a/ompi/communicator/comm_cid.c +++ b/ompi/communicator/comm_cid.c @@ -139,54 +139,8 @@ static opal_mutex_t ompi_cid_lock; static opal_list_t ompi_registered_comms; -/* This variable is zero (false) if all processes in MPI_COMM_WORLD - * did not require MPI_THREAD_MULTIPLE support, and is 1 (true) as - * soon as at least one process requested support for THREAD_MULTIPLE */ -static int ompi_comm_world_thread_level_mult=0; - - int ompi_comm_cid_init (void) { -#if OMPI_ENABLE_THREAD_MULTIPLE - ompi_proc_t **procs, *thisproc; - uint8_t thread_level; - uint8_t *tlpointer; - int ret; - size_t i, size, numprocs; - - /** Note that the following call only returns processes - * with the same jobid. This is on purpose, since - * we switch for the dynamic communicators anyway - * to the original (slower) cid allocation algorithm. - */ - procs = ompi_proc_world ( &numprocs ); - - for ( i=0; i<numprocs; i++ ) { - thisproc = procs[i]; - - OPAL_MODEX_RECV_STRING(ret, "MPI_THREAD_LEVEL", &thisproc->super.proc_name, - (uint8_t**)&tlpointer, &size); - if (OMPI_SUCCESS == ret) { - thread_level = *((uint8_t *) tlpointer); - if ( OMPI_THREADLEVEL_IS_MULTIPLE (thread_level) ) { - ompi_comm_world_thread_level_mult = 1; - break; - } - } else if (OMPI_ERR_NOT_IMPLEMENTED == ret) { - if (ompi_mpi_thread_multiple) { - ompi_comm_world_thread_level_mult = 1; - } - break; - } else { - return ret; - } - } - free(procs); -#else - ompi_comm_world_thread_level_mult = 0; // silence compiler warning if not used -#endif - return OMPI_SUCCESS; } diff --git a/ompi/communicator/comm_init.c b/ompi/communicator/comm_init.c index 2b761d93df..a7f302bbd4 100644 --- a/ompi/communicator/comm_init.c +++ b/ompi/communicator/comm_init.c @@ -13,7 +13,7 @@ * Copyright (c) 2006-2010 University of Houston. All rights reserved. * Copyright (c) 2007-2012 Cisco Systems, Inc. All rights reserved. * Copyright (c) 2009 Sun Microsystems, Inc. All rights reserved. - * Copyright (c) 2012-2014 Los Alamos National Security, LLC. + * Copyright (c) 2012-2015 Los Alamos National Security, LLC. * All rights reserved. * Copyright (c) 2011-2013 Inria. All rights reserved. * Copyright (c) 2011-2013 Universite Bordeaux 1 @@ -102,12 +102,26 @@ int ompi_comm_init(void) OBJ_CONSTRUCT(&ompi_mpi_comm_world, ompi_communicator_t); assert(ompi_mpi_comm_world.comm.c_f_to_c_index == 0); group = OBJ_NEW(ompi_group_t); - group->grp_proc_pointers = ompi_proc_world(&size); - group->grp_proc_count = (int)size; + + size = ompi_process_info.num_procs; + group->grp_proc_pointers = (ompi_proc_t **) calloc (size, sizeof (ompi_proc_t *)); + group->grp_proc_count = size; + + for (size_t i = 0 ; i < size ; ++i) { + opal_process_name_t name = {.vpid = i, .jobid = OMPI_PROC_MY_NAME->jobid}; + /* look for existing ompi_proc_t that matches this name */ + group->grp_proc_pointers[i] = (ompi_proc_t *) ompi_proc_lookup (name); + if (NULL == group->grp_proc_pointers[i]) { + /* set sentinel value */ + group->grp_proc_pointers[i] = (ompi_proc_t *) ompi_proc_name_to_sentinel (name); + } else { + OBJ_RETAIN (group->grp_proc_pointers[i]); + } + } + OMPI_GROUP_SET_INTRINSIC (group); OMPI_GROUP_SET_DENSE (group); ompi_set_group_rank(group, ompi_proc_local()); - ompi_group_increment_proc_count (group); ompi_mpi_comm_world.comm.c_contextid = 0; ompi_mpi_comm_world.comm.c_id_start_index = 4; diff --git a/ompi/dpm/dpm.c b/ompi/dpm/dpm.c index a9e7f6c960..2d298127a6 100644 --- a/ompi/dpm/dpm.c +++ b/ompi/dpm/dpm.c @@ -13,7 +13,7 @@ * Copyright (c) 2007-2015 Cisco Systems, Inc. All rights reserved. 
* Copyright (c) 2006-2009 University of Houston. All rights reserved. * Copyright (c) 2009 Sun Microsystems, Inc. All rights reserved. - * Copyright (c) 2011-2013 Los Alamos National Security, LLC. All rights + * Copyright (c) 2011-2015 Los Alamos National Security, LLC. All rights * reserved. * Copyright (c) 2013-2015 Intel, Inc. All rights reserved * Copyright (c) 2014-2015 Research Organization for Information Science @@ -1293,6 +1293,22 @@ static int disconnect_waitall (int count, ompi_dpm_disconnect_obj **objs) /**********************************************************************/ /**********************************************************************/ /**********************************************************************/ +static bool ompi_dpm_group_is_dyn (ompi_group_t *group, ompi_jobid_t thisjobid) +{ + int size = group ? ompi_group_size (group) : 0; + + for (int i = 1 ; i < size ; ++i) { + opal_process_name_t name = ompi_group_get_proc_name (group, i); + + if (thisjobid != ((ompi_process_name_t *) &name)->jobid) { + /* at least one is different */ + return true; + } + } + + return false; +} + /* All we want to do in this function is determine if the number of * jobids in the local and/or remote group is > 1. This tells us to * set the disconnect flag. We don't actually care what the true @@ -1300,56 +1316,30 @@ static int disconnect_waitall (int count, ompi_dpm_disconnect_obj **objs) */ void ompi_dpm_mark_dyncomm(ompi_communicator_t *comm) { - int i; - int size, rsize; - bool found=false; + bool found; ompi_jobid_t thisjobid; - ompi_group_t *grp=NULL; - ompi_proc_t *proc = NULL; /* special case for MPI_COMM_NULL */ if (comm == MPI_COMM_NULL) { return; } - size = ompi_comm_size(comm); - rsize = ompi_comm_remote_size(comm); + thisjobid = ompi_group_get_proc_name (comm->c_local_group, 0).jobid; /* loop over all processes in local group and check for * a different jobid */ - grp = comm->c_local_group; - proc = ompi_group_peer_lookup(grp,0); - thisjobid = ((ompi_process_name_t*)&proc->super.proc_name)->jobid; - - for (i=1; i< size; i++) { - proc = ompi_group_peer_lookup(grp,i); - if (thisjobid != ((ompi_process_name_t*)&proc->super.proc_name)->jobid) { - /* at least one is different */ - found = true; - goto complete; - } + found = ompi_dpm_group_is_dyn (comm->c_local_group, thisjobid); + if (!found) { + /* if inter-comm, loop over all processes in remote_group + * and see if any are different from thisjobid + */ + found = ompi_dpm_group_is_dyn (comm->c_remote_group, thisjobid); } - /* if inter-comm, loop over all processes in remote_group - * and see if any are different from thisjobid - */ - grp = comm->c_remote_group; - for (i=0; i< rsize; i++) { - proc = ompi_group_peer_lookup(grp,i); - if (thisjobid != ((ompi_process_name_t*)&proc->super.proc_name)->jobid) { - /* at least one is different */ - found = true; - break; - } - } - - complete: /* if a different jobid was found, set the disconnect flag*/ if (found) { ompi_comm_num_dyncomm++; OMPI_COMM_SET_DYNAMIC(comm); } - - return; } diff --git a/ompi/group/group.c b/ompi/group/group.c index d489028d58..fe0b60c246 100644 --- a/ompi/group/group.c +++ b/ompi/group/group.c @@ -14,7 +14,7 @@ * Copyright (c) 2007 Cisco Systems, Inc. All rights reserved. * Copyright (c) 2012 Oak Ridge National Labs. All rights reserved. * Copyright (c) 2012-2013 Inria. All rights reserved. - * Copyright (c) 2013 Los Alamos National Security, LLC. All rights + * Copyright (c) 2013-2015 Los Alamos National Security, LLC. All rights * reserved. 
* Copyright (c) 2015 Research Organization for Information Science * and Technology (RIST). All rights reserved. @@ -49,16 +49,14 @@ int ompi_group_translate_ranks ( ompi_group_t *group1, ompi_group_t *group2, int *ranks2) { - int rank, proc, proc2; - struct ompi_proc_t *proc1_pointer, *proc2_pointer; - if ( MPI_GROUP_EMPTY == group1 || MPI_GROUP_EMPTY == group2 ) { - for (proc = 0; proc < n_ranks ; proc++) { + for (int proc = 0; proc < n_ranks ; ++proc) { ranks2[proc] = MPI_UNDEFINED; } return MPI_SUCCESS; } +#if OMPI_GROUP_SPARSE /* * If we are translating from a parent to a child that uses the sparse format * or vice versa, we use the translate ranks function corresponding to the @@ -80,8 +78,11 @@ int ompi_group_translate_ranks ( ompi_group_t *group1, (group1,n_ranks,ranks1,group2,ranks2); } + /* unknown sparse group type */ + assert (0); } - else if( group2->grp_parent_group_ptr == group1 ) { /* from parent to child*/ + + if( group2->grp_parent_group_ptr == group1 ) { /* from parent to child*/ if(OMPI_GROUP_IS_SPORADIC(group2)) { return ompi_group_translate_ranks_sporadic (group1,n_ranks,ranks1,group2,ranks2); @@ -95,28 +96,32 @@ int ompi_group_translate_ranks ( ompi_group_t *group1, (group1,n_ranks,ranks1,group2,ranks2); } + /* unknown sparse group type */ + assert (0); } - else { - /* loop over all ranks */ - for (proc = 0; proc < n_ranks; proc++) { - rank=ranks1[proc]; - if ( MPI_PROC_NULL == rank) { - ranks2[proc] = MPI_PROC_NULL; - } - else { - proc1_pointer = ompi_group_peer_lookup(group1 ,rank); - /* initialize to no "match" */ - ranks2[proc] = MPI_UNDEFINED; - for (proc2 = 0; proc2 < group2->grp_proc_count; proc2++) { - proc2_pointer= ompi_group_peer_lookup(group2, proc2); - if ( proc1_pointer == proc2_pointer) { - ranks2[proc] = proc2; - break; - } - } /* end proc2 loop */ - } /* end proc loop */ +#endif + + /* loop over all ranks */ + for (int proc = 0; proc < n_ranks; ++proc) { + struct ompi_proc_t *proc1_pointer, *proc2_pointer; + int rank = ranks1[proc]; + + if ( MPI_PROC_NULL == rank) { + ranks2[proc] = MPI_PROC_NULL; + continue; } - } + + proc1_pointer = ompi_group_get_proc_ptr_raw (group1, rank); + /* initialize to no "match" */ + ranks2[proc] = MPI_UNDEFINED; + for (int proc2 = 0; proc2 < group2->grp_proc_count; ++proc2) { + proc2_pointer = ompi_group_get_proc_ptr_raw (group2, proc2); + if ( proc1_pointer == proc2_pointer) { + ranks2[proc] = proc2; + break; + } + } /* end proc2 loop */ + } /* end proc loop */ return MPI_SUCCESS; } @@ -168,25 +173,6 @@ int ompi_group_dump (ompi_group_t* group) return OMPI_SUCCESS; } -/* - * This is the function that iterates through the sparse groups to the dense group - * to reach the process pointer - */ -ompi_proc_t* ompi_group_get_proc_ptr (ompi_group_t* group , int rank) -{ - int ranks1,ranks2; - do { - if(OMPI_GROUP_IS_DENSE(group)) { - return group->grp_proc_pointers[rank]; - } - ranks1 = rank; - ompi_group_translate_ranks( group, 1, &ranks1, - group->grp_parent_group_ptr,&ranks2); - rank = ranks2; - group = group->grp_parent_group_ptr; - } while (1); -} - int ompi_group_minloc ( int list[] , int length ) { int i,index,min; @@ -568,3 +554,23 @@ int ompi_group_compare(ompi_group_t *group1, return return_value; } + +bool ompi_group_have_remote_peers (ompi_group_t *group) +{ + for (size_t i = 0 ; i < group->grp_proc_count ; ++i) { + ompi_proc_t *proc = NULL; +#if OMPI_GROUP_SPARSE + proc = ompi_group_peer_lookup (group, i); +#else + if ((intptr_t) group->grp_proc_pointers[i] < 0) { + return true; + } + proc = 
group->grp_proc_pointers[i]; +#endif + if (!OPAL_PROC_ON_LOCAL_NODE(proc->super.proc_flags)) { + return true; + } + } + + return false; +} diff --git a/ompi/group/group.h b/ompi/group/group.h index 797f52933c..0f8871fb7f 100644 --- a/ompi/group/group.h +++ b/ompi/group/group.h @@ -14,7 +14,7 @@ * Copyright (c) 2007-2012 Cisco Systems, Inc. All rights reserved. * Copyright (c) 2009 Sun Microsystems, Inc. All rights reserved. * Copyright (c) 2012 Oak Ridge National Labs. All rights reserved. - * Copyright (c) 2013 Los Alamos National Security, LLC. All rights + * Copyright (c) 2013-2015 Los Alamos National Security, LLC. All rights * reserved. * $COPYRIGHT$ * @@ -252,8 +252,6 @@ int ompi_group_free (ompi_group_t **group); /** * Functions to handle process pointers for sparse group formats */ -OMPI_DECLSPEC ompi_proc_t* ompi_group_get_proc_ptr (ompi_group_t* group , int rank); - int ompi_group_translate_ranks_sporadic ( ompi_group_t *group1, int n_ranks, const int *ranks1, ompi_group_t *group2, @@ -324,25 +322,93 @@ int ompi_group_calc_bmap ( int n, int orig_size , const int *ranks ); */ int ompi_group_minloc (int list[], int length); +/** + * @brief Helper function for retrieving the proc of a group member in a dense group + * + * This function exists to handle the translation of sentinel group members to real + * ompi_proc_t's. If a sentinel value is found and allocate is true then this function + * looks for an existing ompi_proc_t using ompi_proc_for_name which will allocate an + * ompi_proc_t if one does not exist. If allocate is false then sentinel values translate + * to NULL. + */ +static inline struct ompi_proc_t *ompi_group_dense_lookup (ompi_group_t *group, const int peer_id, const bool allocate) +{ +#if OPAL_ENABLE_DEBUG + if (peer_id >= group->grp_proc_count) { + opal_output(0, "ompi_group_dense_lookup: invalid peer index (%d)", peer_id); + return (struct ompi_proc_t *) NULL; + } +#endif + + if (OPAL_UNLIKELY((intptr_t) group->grp_proc_pointers[peer_id] < 0)) { + if (!allocate) { + return NULL; + } + + /* replace sentinel value with an actual ompi_proc_t */ + group->grp_proc_pointers[peer_id] = + (ompi_proc_t *) ompi_proc_for_name (ompi_proc_sentinel_to_name ((intptr_t) group->grp_proc_pointers[peer_id])); + OBJ_RETAIN(group->grp_proc_pointers[peer_id]); + } + + return group->grp_proc_pointers[peer_id]; +} + +/* + * This is the function that iterates through the sparse groups to the dense group + * to reach the process pointer + */ +static inline ompi_proc_t *ompi_group_get_proc_ptr (ompi_group_t *group, int rank, const bool allocate) +{ +#if OMPI_GROUP_SPARSE + do { + if (OMPI_GROUP_IS_DENSE(group)) { + return ompi_group_dense_lookup (group, rank, allocate); + } + int ranks1 = rank; + ompi_group_translate_ranks (group, 1, &ranks1, group->grp_parent_group_ptr, &rank); + group = group->grp_parent_group_ptr; + } while (1); +#else + return ompi_group_dense_lookup (group, rank, allocate); +#endif +} + +/** + * @brief Get the raw proc pointer from the group + * + * This function will either return an ompi_proc_t if one exists (either stored in the group + * or cached in the proc hash table) or a sentinel value representing the proc. This + * differs from ompi_group_get_proc_ptr() which returns the ompi_proc_t or NULL. 
+ */ +ompi_proc_t *ompi_group_get_proc_ptr_raw (ompi_group_t *group, int rank); + +static inline opal_process_name_t ompi_group_get_proc_name (ompi_group_t *group, int rank) +{ + ompi_proc_t *proc = ompi_group_get_proc_ptr_raw (group, rank); + if ((intptr_t) proc < 0) { + return ompi_proc_sentinel_to_name ((intptr_t) proc); + } + + return proc->super.proc_name; +} + /** * Inline function to check if sparse groups are enabled and return the direct access * to the proc pointer, otherwise the lookup function */ static inline struct ompi_proc_t* ompi_group_peer_lookup(ompi_group_t *group, int peer_id) { -#if OPAL_ENABLE_DEBUG - if (peer_id >= group->grp_proc_count) { - opal_output(0, "ompi_group_lookup_peer: invalid peer index (%d)", peer_id); - return (struct ompi_proc_t *) NULL; - } -#endif -#if OMPI_GROUP_SPARSE - return ompi_group_get_proc_ptr (group, peer_id); -#else - return group->grp_proc_pointers[peer_id]; -#endif + return ompi_group_get_proc_ptr (group, peer_id, true); } +static inline struct ompi_proc_t *ompi_group_peer_lookup_existing (ompi_group_t *group, int peer_id) +{ + return ompi_group_get_proc_ptr (group, peer_id, false); +} + +bool ompi_group_have_remote_peers (ompi_group_t *group); + /** * Function to print the group info */ diff --git a/ompi/group/group_init.c b/ompi/group/group_init.c index 67e5af61e4..5352493c4f 100644 --- a/ompi/group/group_init.c +++ b/ompi/group/group_init.c @@ -210,14 +210,13 @@ ompi_group_t *ompi_group_allocate_bmap(int orig_group_size , int group_size) */ void ompi_group_increment_proc_count(ompi_group_t *group) { - int proc; ompi_proc_t * proc_pointer; - for (proc = 0; proc < group->grp_proc_count; proc++) { - proc_pointer = ompi_group_peer_lookup(group,proc); - OBJ_RETAIN(proc_pointer); + for (int proc = 0 ; proc < group->grp_proc_count ; ++proc) { + proc_pointer = ompi_group_peer_lookup_existing (group, proc); + if (proc_pointer) { + OBJ_RETAIN(proc_pointer); + } } - - return; } /* @@ -226,14 +225,13 @@ void ompi_group_increment_proc_count(ompi_group_t *group) void ompi_group_decrement_proc_count(ompi_group_t *group) { - int proc; ompi_proc_t * proc_pointer; - for (proc = 0; proc < group->grp_proc_count; proc++) { - proc_pointer = ompi_group_peer_lookup(group,proc); - OBJ_RELEASE(proc_pointer); + for (int proc = 0 ; proc < group->grp_proc_count ; ++proc) { + proc_pointer = ompi_group_peer_lookup_existing (group, proc); + if (proc_pointer) { + OBJ_RELEASE(proc_pointer); + } } - - return; } /* @@ -255,9 +253,6 @@ static void ompi_group_construct(ompi_group_t *new_group) /* default the sparse values for groups */ new_group->grp_parent_group_ptr = NULL; - - /* return */ - return; } @@ -300,9 +295,6 @@ static void ompi_group_destruct(ompi_group_t *group) opal_pointer_array_set_item(&ompi_group_f_to_c_table, group->grp_f_to_c_index, NULL); } - - /* return */ - return; } diff --git a/ompi/group/group_plist.c b/ompi/group/group_plist.c index ebf2f1a85a..0f17422e22 100644 --- a/ompi/group/group_plist.c +++ b/ompi/group/group_plist.c @@ -12,7 +12,7 @@ * All rights reserved. * Copyright (c) 2006-2007 University of Houston. All rights reserved. * Copyright (c) 2007 Cisco Systems, Inc. All rights reserved. - * Copyright (c) 2013 Los Alamos National Security, LLC. All rights + * Copyright (c) 2013-2015 Los Alamos National Security, LLC. All rights * reserved. 
* $COPYRIGHT$ * @@ -29,6 +29,66 @@ #include +static int ompi_group_dense_overlap (ompi_group_t *group1, ompi_group_t *group2, opal_bitmap_t *bitmap) +{ + ompi_proc_t *proc1_pointer, *proc2_pointer; + int rc, overlap_count; + + overlap_count = 0; + + for (int proc1 = 0 ; proc1 < group1->grp_proc_count ; ++proc1) { + proc1_pointer = ompi_group_get_proc_ptr_raw (group1, proc1); + + /* check to see if this proc is in group2 */ + for (int proc2 = 0 ; proc2 < group2->grp_proc_count ; ++proc2) { + proc2_pointer = ompi_group_get_proc_ptr_raw (group2, proc2); + if( proc1_pointer == proc2_pointer ) { + rc = opal_bitmap_set_bit (bitmap, proc2); + if (OPAL_SUCCESS != rc) { + return rc; + } + ++overlap_count; + + break; + } + } /* end proc2 loop */ + } /* end proc1 loop */ + + return overlap_count; +} + +static struct ompi_proc_t *ompi_group_dense_lookup_raw (ompi_group_t *group, const int peer_id) +{ + if (OPAL_UNLIKELY((intptr_t) group->grp_proc_pointers[peer_id] < 0)) { + ompi_proc_t *proc = + (ompi_proc_t *) ompi_proc_lookup (ompi_proc_sentinel_to_name ((intptr_t) group->grp_proc_pointers[peer_id])); + if (NULL != proc) { + /* replace sentinel value with an actual ompi_proc_t */ + group->grp_proc_pointers[peer_id] = proc; + /* retain the proc */ + OBJ_RETAIN(group->grp_proc_pointers[peer_id]); + } + } + + return group->grp_proc_pointers[peer_id]; +} + +ompi_proc_t *ompi_group_get_proc_ptr_raw (ompi_group_t *group, int rank) +{ +#if OMPI_GROUP_SPARSE + do { + if (OMPI_GROUP_IS_DENSE(group)) { + return ompi_group_dense_lookup_raw (group, rank); + } + int ranks1 = rank; + ompi_group_translate_ranks (group, 1, &ranks1, group->grp_parent_group_ptr, &rank); + group = group->grp_parent_group_ptr; + } while (1); +#else + return ompi_group_dense_lookup_raw (group, rank); +#endif +} + int ompi_group_calc_plist ( int n , const int *ranks ) { return sizeof(char *) * n ; } @@ -37,9 +97,8 @@ int ompi_group_incl_plist(ompi_group_t* group, int n, const int *ranks, ompi_group_t **new_group) { /* local variables */ - int proc,my_group_rank; + int my_group_rank; ompi_group_t *group_pointer, *new_group_pointer; - ompi_proc_t *my_proc_pointer; group_pointer = (ompi_group_t *)group; @@ -56,9 +115,9 @@ } /* put group elements in the list */ - for (proc = 0; proc < n; proc++) { + for (int proc = 0; proc < n; proc++) { new_group_pointer->grp_proc_pointers[proc] = - ompi_group_peer_lookup(group_pointer,ranks[proc]); + ompi_group_get_proc_ptr_raw (group_pointer, ranks[proc]); } /* end proc loop */ /* increment proc reference counters */ @@ -67,10 +126,8 @@ /* find my rank */ my_group_rank=group_pointer->grp_my_rank; if (MPI_UNDEFINED != my_group_rank) { - my_proc_pointer=ompi_group_peer_lookup (group_pointer,my_group_rank); - ompi_set_group_rank(new_group_pointer,my_proc_pointer); - } - else { + ompi_set_group_rank(new_group_pointer, ompi_proc_local_proc); + } else { new_group_pointer->grp_my_rank = MPI_UNDEFINED; } @@ -87,114 +144,77 @@ int ompi_group_union (ompi_group_t* group1, ompi_group_t* group2, ompi_group_t **new_group) { /* local variables */ - int new_group_size, proc1, proc2, found_in_group; - int my_group_rank, cnt; - ompi_group_t *group1_pointer, *group2_pointer, *new_group_pointer; - ompi_proc_t *proc1_pointer, *proc2_pointer, *my_proc_pointer = NULL; - - group1_pointer = (ompi_group_t *) group1; - group2_pointer = (ompi_group_t *) group2; + int new_group_size, cnt, 
rc, overlap_count; + ompi_group_t *new_group_pointer; + ompi_proc_t *proc2_pointer; + opal_bitmap_t bitmap; /* * form union */ /* get new group size */ - new_group_size = group1_pointer->grp_proc_count; + OBJ_CONSTRUCT(&bitmap, opal_bitmap_t); + rc = opal_bitmap_init (&bitmap, 32); + if (OPAL_SUCCESS != rc) { + return rc; + } /* check group2 elements to see if they need to be included in the list */ - for (proc2 = 0; proc2 < group2_pointer->grp_proc_count; proc2++) { - proc2_pointer = ompi_group_peer_lookup(group2_pointer,proc2); - - /* check to see if this proc2 is alread in the group */ - found_in_group = 0; - for (proc1 = 0; proc1 < group1_pointer->grp_proc_count; proc1++) { - proc1_pointer = ompi_group_peer_lookup(group1_pointer,proc1); - - if (proc1_pointer == proc2_pointer) { - /* proc2 is in group1 - don't double count */ - found_in_group = 1; - break; - } - } /* end proc1 loop */ - - if (found_in_group) { - continue; - } - - new_group_size++; - } /* end proc loop */ + overlap_count = ompi_group_dense_overlap (group1, group2, &bitmap); + if (0 > overlap_count) { + OBJ_DESTRUCT(&bitmap); + return overlap_count; + } + new_group_size = group1->grp_proc_count + group2->grp_proc_count - overlap_count; if ( 0 == new_group_size ) { *new_group = MPI_GROUP_EMPTY; OBJ_RETAIN(MPI_GROUP_EMPTY); + OBJ_DESTRUCT(&bitmap); return MPI_SUCCESS; } /* get new group struct */ new_group_pointer = ompi_group_allocate(new_group_size); if (NULL == new_group_pointer) { + OBJ_DESTRUCT(&bitmap); return MPI_ERR_GROUP; } /* fill in the new group list */ /* put group1 elements in the list */ - for (proc1 = 0; proc1 < group1_pointer->grp_proc_count; proc1++) { + for (int proc1 = 0; proc1 < group1->grp_proc_count; ++proc1) { new_group_pointer->grp_proc_pointers[proc1] = - ompi_group_peer_lookup(group1_pointer,proc1); + ompi_group_get_proc_ptr_raw (group1, proc1); } - cnt = group1_pointer->grp_proc_count; + cnt = group1->grp_proc_count; /* check group2 elements to see if they need to be included in the list */ - for (proc2 = 0; proc2 < group2_pointer->grp_proc_count; proc2++) { - proc2_pointer = ompi_group_peer_lookup(group2_pointer,proc2); - - /* check to see if this proc2 is alread in the group */ - found_in_group = 0; - for (proc1 = 0; proc1 < group1_pointer->grp_proc_count; proc1++) { - proc1_pointer = ompi_group_peer_lookup(group1_pointer,proc1); - - if (proc1_pointer == proc2_pointer) { - /* proc2 is in group1 - don't double count */ - found_in_group = 1; - break; - } - } /* end proc1 loop */ - - if (found_in_group) { + for (int proc2 = 0; proc2 < group2->grp_proc_count; ++proc2) { + if (opal_bitmap_is_set_bit (&bitmap, proc2)) { continue; } - new_group_pointer->grp_proc_pointers[cnt] = - ompi_group_peer_lookup(group2_pointer,proc2); - cnt++; + proc2_pointer = ompi_group_get_proc_ptr_raw (group2, proc2); + new_group_pointer->grp_proc_pointers[cnt++] = proc2_pointer; } /* end proc loop */ + OBJ_DESTRUCT(&bitmap); + /* increment proc reference counters */ ompi_group_increment_proc_count(new_group_pointer); /* find my rank */ - my_group_rank = group1_pointer->grp_my_rank; - if (MPI_UNDEFINED == my_group_rank) { - my_group_rank = group2_pointer->grp_my_rank; - if ( MPI_UNDEFINED != my_group_rank) { - my_proc_pointer = ompi_group_peer_lookup(group2_pointer,my_group_rank); - } + if (MPI_UNDEFINED != group1->grp_my_rank || MPI_UNDEFINED != group2->grp_my_rank) { + ompi_set_group_rank(new_group_pointer, ompi_proc_local_proc); } else { - my_proc_pointer = ompi_group_peer_lookup(group1_pointer,my_group_rank); - } - - if 
( MPI_UNDEFINED == my_group_rank ) { new_group_pointer->grp_my_rank = MPI_UNDEFINED; } - else { - ompi_set_group_rank(new_group_pointer, my_proc_pointer); - } *new_group = (MPI_Group) new_group_pointer; - return OMPI_SUCCESS; } @@ -206,96 +226,65 @@ int ompi_group_difference(ompi_group_t* group1, ompi_group_t* group2, ompi_group_t **new_group) { /* local varibles */ - int new_group_size, proc1, proc2, found_in_group2, cnt; - int my_group_rank; - ompi_group_t *group1_pointer, *group2_pointer, *new_group_pointer; - ompi_proc_t *proc1_pointer, *proc2_pointer, *my_proc_pointer = NULL; - - - group1_pointer=(ompi_group_t *)group1; - group2_pointer=(ompi_group_t *)group2; + int new_group_size, overlap_count, rc; + ompi_group_t *new_group_pointer; + ompi_proc_t *proc1_pointer; + opal_bitmap_t bitmap; /* * form union */ /* get new group size */ - new_group_size=0; + OBJ_CONSTRUCT(&bitmap, opal_bitmap_t); + rc = opal_bitmap_init (&bitmap, 32); + if (OPAL_SUCCESS != rc) { + return rc; + } - /* loop over group1 members */ - for( proc1=0; proc1 < group1_pointer->grp_proc_count; proc1++ ) { - proc1_pointer = ompi_group_peer_lookup(group1_pointer,proc1); - /* check to see if this proc is in group2 */ - found_in_group2=0; - for( proc2=0 ; proc2 < group2_pointer->grp_proc_count ; proc2++ ) { - proc2_pointer = ompi_group_peer_lookup(group2_pointer,proc2); - if( proc1_pointer == proc2_pointer ) { - found_in_group2=true; - break; - } - } /* end proc1 loop */ - if(found_in_group2) { - continue; - } - new_group_size++; - } /* end proc loop */ + /* check group2 elements to see if they need to be included in the list */ + overlap_count = ompi_group_dense_overlap (group2, group1, &bitmap); + if (0 > overlap_count) { + OBJ_DESTRUCT(&bitmap); + return overlap_count; + } + new_group_size = group1->grp_proc_count - overlap_count; if ( 0 == new_group_size ) { *new_group = MPI_GROUP_EMPTY; OBJ_RETAIN(MPI_GROUP_EMPTY); + OBJ_DESTRUCT(&bitmap); return MPI_SUCCESS; } /* allocate a new ompi_group_t structure */ - new_group_pointer=ompi_group_allocate(new_group_size); + new_group_pointer = ompi_group_allocate(new_group_size); if( NULL == new_group_pointer ) { + OBJ_DESTRUCT(&bitmap); return MPI_ERR_GROUP; } /* fill in group list */ - cnt=0; /* loop over group1 members */ - for( proc1=0; proc1 < group1_pointer->grp_proc_count; proc1++ ) { - proc1_pointer = ompi_group_peer_lookup(group1_pointer,proc1); - /* check to see if this proc is in group2 */ - found_in_group2=0; - for( proc2=0 ; proc2 < group2_pointer->grp_proc_count ; proc2++ ) { - proc2_pointer = ompi_group_peer_lookup(group2_pointer,proc2); - if( proc1_pointer == proc2_pointer ) { - found_in_group2=true; - break; - } - } /* end proc1 loop */ - if(found_in_group2) { + for (int proc1 = 0, cnt = 0 ; proc1 < group1->grp_proc_count ; ++proc1) { + if (opal_bitmap_is_set_bit (&bitmap, proc1)) { continue; } - new_group_pointer->grp_proc_pointers[cnt] = - ompi_group_peer_lookup(group1_pointer,proc1); - - cnt++; + proc1_pointer = ompi_group_get_proc_ptr_raw (group1, proc1); + new_group_pointer->grp_proc_pointers[cnt++] = proc1_pointer; } /* end proc loop */ + OBJ_DESTRUCT(&bitmap); + /* increment proc reference counters */ ompi_group_increment_proc_count(new_group_pointer); /* find my rank */ - my_group_rank=group1_pointer->grp_my_rank; - if ( MPI_UNDEFINED != my_group_rank ) { - my_proc_pointer = ompi_group_peer_lookup(group1_pointer,my_group_rank); - } - else { - my_group_rank=group2_pointer->grp_my_rank; - if ( MPI_UNDEFINED != my_group_rank ) { - my_proc_pointer = 
ompi_group_peer_lookup(group2_pointer,my_group_rank); - } - } - - if ( MPI_UNDEFINED == my_group_rank ) { + if (MPI_UNDEFINED == group1->grp_my_rank || MPI_UNDEFINED != group2->grp_my_rank) { new_group_pointer->grp_my_rank = MPI_UNDEFINED; - } - else { - ompi_set_group_rank(new_group_pointer,my_proc_pointer); + } else { + ompi_set_group_rank(new_group_pointer, ompi_proc_local_proc); } *new_group = (MPI_Group)new_group_pointer; diff --git a/ompi/group/group_set_rank.c b/ompi/group/group_set_rank.c index 8529970ae7..16b8401743 100644 --- a/ompi/group/group_set_rank.c +++ b/ompi/group/group_set_rank.c @@ -1,3 +1,4 @@ +/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */ /* * Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana * University Research and Technology @@ -10,6 +11,8 @@ * Copyright (c) 2004-2005 The Regents of the University of California. * All rights reserved. * Copyright (c) 2006-2007 University of Houston. All rights reserved. + * Copyright (c) 2015 Los Alamos National Security, LLC. All rights + * reserved. * $COPYRIGHT$ * * Additional copyrights may follow @@ -38,12 +41,10 @@ void ompi_set_group_rank(ompi_group_t *group, struct ompi_proc_t *proc_pointer) for (proc = 0; proc < group->grp_proc_count; proc++) { /* check and see if this proc pointer matches proc_pointer */ - if (ompi_group_peer_lookup(group,proc) == proc_pointer) { + if (ompi_group_peer_lookup_existing (group, proc) == proc_pointer) { group->grp_my_rank = proc; - } + break; + } } /* end proc loop */ } - - /* return */ - return; } diff --git a/ompi/mca/bml/base/base.h b/ompi/mca/bml/base/base.h index 27cd1e568c..595deaf72a 100644 --- a/ompi/mca/bml/base/base.h +++ b/ompi/mca/bml/base/base.h @@ -1,3 +1,4 @@ +/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */ /* * Copyright (c) 2004-2007 The Trustees of Indiana University and Indiana * University Research and Technology @@ -10,6 +11,8 @@ * Copyright (c) 2004-2006 The Regents of the University of California. * All rights reserved. * Copyright (c) 2009 Cisco Systems, Inc. All rights reserved. + * Copyright (c) 2015 Los Alamos National Security, LLC. All rights + * reserved. 
* $COPYRIGHT$ * * Additional copyrights may follow @@ -25,6 +28,7 @@ #include "ompi/mca/mca.h" #include "opal/mca/base/mca_base_framework.h" #include "ompi/mca/bml/bml.h" +#include "ompi/proc/proc.h" /* @@ -60,6 +64,14 @@ OMPI_DECLSPEC extern mca_bml_base_component_t mca_bml_component; OMPI_DECLSPEC extern mca_bml_base_module_t mca_bml; OMPI_DECLSPEC extern mca_base_framework_t ompi_bml_base_framework; +static inline struct mca_bml_base_endpoint_t *mca_bml_base_get_endpoint (struct ompi_proc_t *proc) { + if (OPAL_UNLIKELY(NULL == proc->proc_endpoints[OMPI_PROC_ENDPOINT_TAG_BML])) { + mca_bml.bml_add_proc (proc); + } + + return (struct mca_bml_base_endpoint_t *) proc->proc_endpoints[OMPI_PROC_ENDPOINT_TAG_BML]; +} + END_C_DECLS #endif /* MCA_BML_BASE_H */ diff --git a/ompi/mca/bml/bml.h b/ompi/mca/bml/bml.h index 41e8496066..df731a64a0 100644 --- a/ompi/mca/bml/bml.h +++ b/ompi/mca/bml/bml.h @@ -160,14 +160,11 @@ static inline bool mca_bml_base_btl_array_remove( mca_bml_base_btl_array_t* arra */ static inline mca_bml_base_btl_t* mca_bml_base_btl_array_get_index(mca_bml_base_btl_array_t* array, size_t item_index) { -#if OPAL_ENABLE_DEBUG - if(item_index >= array->arr_size) { - opal_output(0, "mca_bml_base_btl_array_get_index: invalid array index %lu >= %lu", - (unsigned long)item_index, (unsigned long)array->arr_size); - return 0; + if (item_index < array->arr_size) { + return &array->bml_btls[item_index]; } -#endif - return &array->bml_btls[item_index]; + + return NULL; } /** @@ -441,7 +438,7 @@ typedef int (*mca_bml_base_module_finalize_fn_t)( void ); * @return OMPI_SUCCESS or error status on failure. * * The mca_bml_base_module_add_procs_fn_t() is called by the PML to - * determine the set of BMLs that should be used to reach each process. + * determine the set of BTLs that should be used to reach each process. * Any addressing information exported by the peer via the mca_base_modex_send() * function should be available during this call via the corresponding * mca_base_modex_recv() function. The BML may utilize this information to @@ -465,6 +462,25 @@ typedef int (*mca_bml_base_module_add_procs_fn_t)( struct opal_bitmap_t* reachable ); +/** + * PML->BML notification of change in the process list. + * + * @param proc (IN) Process + * @return OMPI_SUCCESS or error status on failure. + * + * The mca_bml_base_module_add_proc_fn_t() is called by the PML to + * determine the set of BTLs that should be used to reach each process. + * Any addressing information exported by the peer via the mca_base_modex_send() + * function should be available during this call via the corresponding + * mca_base_modex_recv() function. The BML may utilize this information to + * determine reachability of each peer process. + * + * \note This function will return OMPI_ERR_UNREACH if the process can not + * be reached by a currently active BTL. This is not a fatal error, and the + * calling layer is free to continue using the BML interface. + */ +typedef int (*mca_bml_base_module_add_proc_fn_t) (struct ompi_proc_t *proc); + /** * Notification of change to the process list. 
* @@ -559,6 +575,7 @@ struct mca_bml_base_module_t { mca_bml_base_component_t* bml_component; /**< pointer back to the BML component structure */ /* BML function table */ + mca_bml_base_module_add_proc_fn_t bml_add_proc; mca_bml_base_module_add_procs_fn_t bml_add_procs; mca_bml_base_module_del_procs_fn_t bml_del_procs; mca_bml_base_module_add_btl_fn_t bml_add_btl; diff --git a/ompi/mca/bml/r2/bml_r2.c b/ompi/mca/bml/r2/bml_r2.c index 345facd037..182b0da00d 100644 --- a/ompi/mca/bml/r2/bml_r2.c +++ b/ompi/mca/bml/r2/bml_r2.c @@ -10,7 +10,7 @@ * University of Stuttgart. All rights reserved. * Copyright (c) 2004-2006 The Regents of the University of California. * All rights reserved. - * Copyright (c) 2007-2014 Los Alamos National Security, LLC. All rights + * Copyright (c) 2007-2015 Los Alamos National Security, LLC. All rights * reserved. * Copyright (c) 2008-2015 Cisco Systems, Inc. All rights reserved. * Copyright (c) 2013 Intel, Inc. All rights reserved @@ -144,6 +144,293 @@ static void mca_bml_r2_calculate_bandwidth_latency (mca_bml_base_btl_array_t *bt } } +static mca_bml_base_endpoint_t *mca_bml_r2_allocate_endpoint (ompi_proc_t *proc) { + mca_bml_base_endpoint_t *bml_endpoint; + + /* allocate bml specific proc data */ + bml_endpoint = OBJ_NEW(mca_bml_base_endpoint_t); + if (NULL == bml_endpoint) { + opal_output(0, "mca_bml_r2_add_procs: unable to allocate resources"); + return NULL; + } + + /* preallocate space in array for max number of r2s */ + mca_bml_base_btl_array_reserve(&bml_endpoint->btl_eager, mca_bml_r2.num_btl_modules); + mca_bml_base_btl_array_reserve(&bml_endpoint->btl_send, mca_bml_r2.num_btl_modules); + mca_bml_base_btl_array_reserve(&bml_endpoint->btl_rdma, mca_bml_r2.num_btl_modules); + bml_endpoint->btl_max_send_size = -1; + bml_endpoint->btl_proc = proc; + proc->proc_endpoints[OMPI_PROC_ENDPOINT_TAG_BML] = bml_endpoint; + + bml_endpoint->btl_flags_or = 0; + return bml_endpoint; +} + +static void mca_bml_r2_register_progress (mca_btl_base_module_t *btl) +{ + if (NULL != btl->btl_component->btl_progress) { + bool found = false; + + for (size_t p = 0 ; p < mca_bml_r2.num_btl_progress ; ++p) { + if(mca_bml_r2.btl_progress[p] == btl->btl_component->btl_progress) { + found = true; + break; + } + } + + if (found == false) { + mca_bml_r2.btl_progress[mca_bml_r2.num_btl_progress++] = + btl->btl_component->btl_progress; + opal_progress_register (btl->btl_component->btl_progress); + } + } +} + +static int mca_bml_r2_endpoint_add_btl (struct ompi_proc_t *proc, mca_bml_base_endpoint_t *bml_endpoint, + mca_btl_base_module_t *btl, struct mca_btl_base_endpoint_t *btl_endpoint) +{ + mca_bml_base_btl_t* bml_btl = NULL; + int btl_flags = btl->btl_flags; + bool btl_in_use = false; + size_t size; + + /* NTH: these flags should have been sanitized by the btl. Once that is verified these + * checks can be safely removed. */ + if ((btl_flags & MCA_BTL_FLAGS_PUT) && (NULL == btl->btl_put)) { + opal_output(0, "mca_bml_r2_add_procs: The PUT flag is specified for" + " the %s BTL without any PUT function attached. Discard the flag !", + btl->btl_component->btl_version.mca_component_name); + btl_flags ^= MCA_BTL_FLAGS_PUT; + } + if ((btl_flags & MCA_BTL_FLAGS_GET) && (NULL == btl->btl_get)) { + opal_output(0, "mca_bml_r2_add_procs: The GET flag is specified for" + " the %s BTL without any GET function attached. 
Discard the flag !", + btl->btl_component->btl_version.mca_component_name); + btl_flags ^= MCA_BTL_FLAGS_GET; + } + + if ((btl_flags & (MCA_BTL_FLAGS_PUT | MCA_BTL_FLAGS_GET | MCA_BTL_FLAGS_SEND)) == 0) { + /* If no protocol specified, we have 2 choices: we ignore the BTL + * as we don't know which protocl to use, or we suppose that all + * BTLs support the send protocol. This is really a btl error as + * these flags should have been sanitized by the btl. */ + btl_flags |= MCA_BTL_FLAGS_SEND; + } + + if (btl_flags & MCA_BTL_FLAGS_SEND) { + /* dont allow an additional BTL with a lower exclusivity ranking */ + bml_btl = mca_bml_base_btl_array_get_index (&bml_endpoint->btl_send, size - 1); + size = mca_bml_base_btl_array_get_size (&bml_endpoint->btl_send); + + if (!bml_btl || bml_btl->btl->btl_exclusivity < btl->btl_exclusivity) { + /* this btl has higher exclusivity than an existing btl or none exists */ + + opal_output_verbose(1, opal_btl_base_framework.framework_output, + "mca: bml: Using %s btl for send to %s on node %s", + btl->btl_component->btl_version.mca_component_name, + OMPI_NAME_PRINT(&proc->super.proc_name), + proc->super.proc_hostname); + + /* cache the endpoint on the proc */ + if (NULL == bml_btl || (bml_btl->btl->btl_exclusivity <= btl->btl_exclusivity)) { + bml_btl = mca_bml_base_btl_array_insert (&bml_endpoint->btl_send); + bml_btl->btl = btl; + bml_btl->btl_endpoint = btl_endpoint; + bml_btl->btl_weight = 0; + bml_btl->btl_flags = btl_flags; + + /** + * calculate the bitwise OR of the btl flags + */ + bml_endpoint->btl_flags_or |= bml_btl->btl_flags; + } else { + opal_output_verbose(20, opal_btl_base_framework.framework_output, + "mca: bml: Not using %s btl for send to %s on node %s " + "because %s btl has higher exclusivity (%d > %d)", + btl->btl_component->btl_version.mca_component_name, + OMPI_NAME_PRINT(&proc->super.proc_name), proc->super.proc_hostname, + bml_btl->btl->btl_component->btl_version.mca_component_name, + bml_btl->btl->btl_exclusivity, + btl->btl_exclusivity); + } + + btl_in_use = true; + } + } + + /* always add rdma endpoints */ + if ((btl_flags & MCA_BTL_FLAGS_RDMA) && + !((proc->super.proc_arch != ompi_proc_local_proc->super.proc_arch) && + (0 == (btl->btl_flags & MCA_BTL_FLAGS_HETEROGENEOUS_RDMA)))) { + mca_bml_base_btl_t *bml_btl_rdma = mca_bml_base_btl_array_insert(&bml_endpoint->btl_rdma); + + bml_btl_rdma->btl = btl; + bml_btl_rdma->btl_endpoint = btl_endpoint; + bml_btl_rdma->btl_weight = 0; + bml_btl_rdma->btl_flags = btl_flags; + + if (bml_endpoint->btl_pipeline_send_length < btl->btl_rdma_pipeline_send_length) { + bml_endpoint->btl_pipeline_send_length = btl->btl_rdma_pipeline_send_length; + } + + if (bml_endpoint->btl_send_limit < btl->btl_min_rdma_pipeline_size) { + bml_endpoint->btl_send_limit = btl->btl_min_rdma_pipeline_size; + } + + btl_in_use = true; + } + + return btl_in_use ? OMPI_SUCCESS : OMPI_ERR_NOT_AVAILABLE; +} + +static void mca_bml_r2_compute_endpoint_metrics (mca_bml_base_endpoint_t *bml_endpoint) +{ + double total_bandwidth = 0; + uint32_t latency; + size_t n_send, n_rdma; + + /* (1) determine the total bandwidth available across all btls + * note that we need to do this here, as we may already have btls configured + * (2) determine the highest priority ranking for latency + * (3) compute the maximum amount of bytes that can be send without any + * weighting. Once the left over is smaller than this number we will + * start using the weight to compute the correct amount. 
+ */ + n_send = mca_bml_base_btl_array_get_size (&bml_endpoint->btl_send); + n_rdma = mca_bml_base_btl_array_get_size (&bml_endpoint->btl_rdma); + + /* sort BTLs in descending order according to bandwidth value */ + qsort (bml_endpoint->btl_send.bml_btls, n_send, + sizeof(mca_bml_base_btl_t), btl_bandwidth_compare); + + bml_endpoint->btl_rdma_index = 0; + + mca_bml_r2_calculate_bandwidth_latency (&bml_endpoint->btl_send, &total_bandwidth, &latency); + + /* (1) set the weight of each btl as a percentage of overall bandwidth + * (2) copy all btl instances at the highest priority ranking into the + * list of btls used for first fragments + */ + for (size_t n_index = 0 ; n_index < n_send ; ++n_index) { + mca_bml_base_btl_t *bml_btl = + mca_bml_base_btl_array_get_index(&bml_endpoint->btl_send, n_index); + mca_btl_base_module_t *btl = bml_btl->btl; + + /* compute weighting factor for this r2 */ + if(btl->btl_bandwidth > 0) { + bml_btl->btl_weight = (float)(btl->btl_bandwidth / total_bandwidth); + } else { + bml_btl->btl_weight = (float)(1.0 / n_send); + } + + /* check to see if this r2 is already in the array of r2s + * used for first fragments - if not add it. + */ + if(btl->btl_latency == latency) { + mca_bml_base_btl_t* bml_btl_new = + mca_bml_base_btl_array_insert(&bml_endpoint->btl_eager); + *bml_btl_new = *bml_btl; + } + + /* set endpoint max send size as min of available btls */ + if (bml_endpoint->btl_max_send_size > btl->btl_max_send_size) + bml_endpoint->btl_max_send_size = btl->btl_max_send_size; + } + + /* sort BTLs in descending order according to bandwidth value */ + qsort(bml_endpoint->btl_rdma.bml_btls, n_rdma, + sizeof(mca_bml_base_btl_t), btl_bandwidth_compare); + + mca_bml_r2_calculate_bandwidth_latency (&bml_endpoint->btl_rdma, &total_bandwidth, &latency); + + /* set rdma btl weights */ + for (size_t n_index = 0 ; n_index < n_rdma ; ++n_index) { + mca_bml_base_btl_t *bml_btl = + mca_bml_base_btl_array_get_index(&bml_endpoint->btl_rdma, n_index); + + /* compute weighting factor for this r2 */ + if (bml_btl->btl->btl_bandwidth > 0.0) { + bml_btl->btl_weight = (float)(bml_btl->btl->btl_bandwidth / total_bandwidth); + } else { + bml_btl->btl_weight = (float)(1.0 / n_rdma); + } + } +} + +static int mca_bml_r2_add_proc (struct ompi_proc_t *proc) +{ + mca_bml_base_endpoint_t *bml_endpoint; + /* at least one btl is in use */ + bool btl_in_use = false; + int rc; + + if (OPAL_UNLIKELY(NULL == proc)) { + return OMPI_ERR_BAD_PARAM; + } + + /* check if this endpoint is already set up */ + if (NULL != proc->proc_endpoints[OMPI_PROC_ENDPOINT_TAG_BML]) { + OBJ_RETAIN(proc); + return OMPI_SUCCESS; + } + + /* add btls if not already done */ + if (OMPI_SUCCESS != (rc = mca_bml_r2_add_btls())) { + return rc; + } + + bml_endpoint = mca_bml_r2_allocate_endpoint (proc); + if (OPAL_UNLIKELY(NULL == bml_endpoint)) { + return OMPI_ERR_OUT_OF_RESOURCE; + } + + for (int p_index = 0 ; p_index < mca_bml_r2.num_btl_modules ; ++p_index) { + mca_btl_base_module_t *btl = mca_bml_r2.btl_modules[p_index]; + struct mca_btl_base_endpoint_t *btl_endpoint = NULL; + + /* if the r2 can reach the destination proc it sets the + * corresponding bit (proc index) in the reachable bitmap + * and can return addressing information for each proc + * that is passed back to the r2 on data transfer calls + */ + rc = btl->btl_add_procs (btl, 1, (opal_proc_t **) &proc, &btl_endpoint, NULL); + if (OMPI_SUCCESS != rc || NULL == btl_endpoint) { + /* This BTL has troubles adding the nodes. 
Let's continue maybe some other BTL + * can take care of this task. */ + continue; + } + + rc = mca_bml_r2_endpoint_add_btl (proc, bml_endpoint, btl, btl_endpoint); + if (OMPI_SUCCESS != rc) { + btl->btl_del_procs (btl, 1, (opal_proc_t **) &proc, &btl_endpoint); + } else { + mca_bml_r2_register_progress (btl); + btl_in_use = true; + } + } + + if (!btl_in_use) { + /* no btl is available for this proc */ + if (mca_bml_r2.show_unreach_errors) { + opal_show_help ("help-mca-bml-r2.txt", "unreachable proc", true, + OMPI_NAME_PRINT(&(ompi_proc_local_proc->super.proc_name)), + (NULL != ompi_proc_local_proc->super.proc_hostname ? + ompi_proc_local_proc->super.proc_hostname : "unknown!"), + OMPI_NAME_PRINT(&(proc->super.proc_name)), + (NULL != proc->super.proc_hostname ? + proc->super.proc_hostname : "unknown!"), + btl_names); + } + + return OMPI_ERR_UNREACH; + } + + /* compute metrics for registered btls */ + mca_bml_r2_compute_endpoint_metrics (bml_endpoint); + + return OMPI_SUCCESS; +} + /* * For each proc setup a datastructure that indicates the BTLs * that can be used to reach the destination. @@ -154,7 +441,7 @@ static int mca_bml_r2_add_procs( size_t nprocs, struct ompi_proc_t** procs, struct opal_bitmap_t* reachable ) { - size_t p, p_index, n_new_procs = 0; + size_t n_new_procs = 0; struct mca_btl_base_endpoint_t ** btl_endpoints = NULL; struct ompi_proc_t** new_procs = NULL; int rc, ret = OMPI_SUCCESS; @@ -170,7 +457,7 @@ static int mca_bml_r2_add_procs( size_t nprocs, /* Select only the procs that don't yet have the BML proc struct. This prevent * us from calling btl->add_procs several times on the same destination proc. */ - for(p_index = 0; p_index < nprocs; p_index++) { + for (size_t p_index = 0 ; p_index < nprocs ; ++p_index) { struct ompi_proc_t* proc = procs[p_index]; if(NULL != proc->proc_endpoints[OMPI_PROC_ENDPOINT_TAG_BML]) { @@ -203,10 +490,9 @@ static int mca_bml_r2_add_procs( size_t nprocs, return OMPI_ERR_OUT_OF_RESOURCE; } - for(p_index = 0; p_index < mca_bml_r2.num_btl_modules; p_index++) { - mca_btl_base_module_t* btl = mca_bml_r2.btl_modules[p_index]; + for (size_t p_index = 0 ; p_index < mca_bml_r2.num_btl_modules ; ++p_index) { + mca_btl_base_module_t *btl = mca_bml_r2.btl_modules[p_index]; int btl_inuse = 0; - int btl_flags; /* if the r2 can reach the destination proc it sets the * corresponding bit (proc index) in the reachable bitmap @@ -217,240 +503,69 @@ static int mca_bml_r2_add_procs( size_t nprocs, memset(btl_endpoints, 0, nprocs *sizeof(struct mca_btl_base_endpoint_t*)); rc = btl->btl_add_procs(btl, n_new_procs, (opal_proc_t**)new_procs, btl_endpoints, reachable); - if(OMPI_SUCCESS != rc) { - /* This BTL has troubles adding the nodes. Let's continue maybe some other BTL - * can take care of this task. - */ + if (OMPI_SUCCESS != rc) { + /* This BTL encountered an error while adding procs. Continue in case some other + * BTL(s) can be used. 
*/ continue; } /* for each proc that is reachable */ - for( p = 0; p < n_new_procs; p++ ) { - if(opal_bitmap_is_set_bit(reachable, p)) { - ompi_proc_t *proc = new_procs[p]; - mca_bml_base_endpoint_t * bml_endpoint = - (mca_bml_base_endpoint_t*) proc->proc_endpoints[OMPI_PROC_ENDPOINT_TAG_BML]; - mca_bml_base_btl_t* bml_btl = NULL; - size_t size; - - if(NULL == bml_endpoint) { - /* allocate bml specific proc data */ - bml_endpoint = OBJ_NEW(mca_bml_base_endpoint_t); - if (NULL == bml_endpoint) { - opal_output(0, "mca_bml_r2_add_procs: unable to allocate resources"); - free(btl_endpoints); - free(new_procs); - return OMPI_ERR_OUT_OF_RESOURCE; - } - - /* preallocate space in array for max number of r2s */ - mca_bml_base_btl_array_reserve(&bml_endpoint->btl_eager, mca_bml_r2.num_btl_modules); - mca_bml_base_btl_array_reserve(&bml_endpoint->btl_send, mca_bml_r2.num_btl_modules); - mca_bml_base_btl_array_reserve(&bml_endpoint->btl_rdma, mca_bml_r2.num_btl_modules); - bml_endpoint->btl_max_send_size = -1; - bml_endpoint->btl_proc = proc; - proc->proc_endpoints[OMPI_PROC_ENDPOINT_TAG_BML] = bml_endpoint; - - bml_endpoint->btl_flags_or = 0; - } - - btl_flags = btl->btl_flags; - if( (btl_flags & MCA_BTL_FLAGS_PUT) && (NULL == btl->btl_put) ) { - opal_output(0, "mca_bml_r2_add_procs: The PUT flag is specified for" - " the %s BTL without any PUT function attached. Discard the flag !", - btl->btl_component->btl_version.mca_component_name); - btl_flags ^= MCA_BTL_FLAGS_PUT; - } - if( (btl_flags & MCA_BTL_FLAGS_GET) && (NULL == btl->btl_get) ) { - opal_output(0, "mca_bml_r2_add_procs: The GET flag is specified for" - " the %s BTL without any GET function attached. Discard the flag !", - btl->btl_component->btl_version.mca_component_name); - btl_flags ^= MCA_BTL_FLAGS_GET; - } - - if( (btl_flags & (MCA_BTL_FLAGS_PUT | MCA_BTL_FLAGS_GET | MCA_BTL_FLAGS_SEND)) == 0 ) { - /** - * If no protocol specified, we have 2 choices: we ignore the BTL - * as we don't know which protocl to use, or we suppose that all - * BTLs support the send protocol. 
- */ - btl_flags |= MCA_BTL_FLAGS_SEND; - } - - /* dont allow an additional BTL with a lower exclusivity ranking */ - size = mca_bml_base_btl_array_get_size(&bml_endpoint->btl_send); - if(size > 0) { - bml_btl = mca_bml_base_btl_array_get_index(&bml_endpoint->btl_send, size-1); - /* skip this btl if the exclusivity is less than the previous only if the btl does not provide full rdma (for one-sided) */ - if(bml_btl->btl->btl_exclusivity > btl->btl_exclusivity && ((btl_flags & MCA_BTL_FLAGS_RDMA) != MCA_BTL_FLAGS_RDMA)) { - btl->btl_del_procs(btl, 1, (opal_proc_t**)&proc, &btl_endpoints[p]); - opal_output_verbose(MCA_BASE_VERBOSE_INFO, ompi_bml_base_framework.framework_output, - "mca: bml: Not using %s btl to %s on node %s " - "because %s btl has higher exclusivity (%d > %d)", - btl->btl_component->btl_version.mca_component_name, - OMPI_NAME_PRINT(&proc->super.proc_name), proc->super.proc_hostname, - bml_btl->btl->btl_component->btl_version.mca_component_name, - bml_btl->btl->btl_exclusivity, - btl->btl_exclusivity); - continue; - } - } - opal_output_verbose(MCA_BASE_VERBOSE_INFO, ompi_bml_base_framework.framework_output, - "mca: bml: Using %s btl to %s on node %s", - btl->btl_component->btl_version.mca_component_name, - OMPI_NAME_PRINT(&proc->super.proc_name), - proc->super.proc_hostname); - - /* cache the endpoint on the proc */ - if (NULL == bml_btl || (bml_btl->btl->btl_exclusivity <= btl->btl_exclusivity)) { - bml_btl = mca_bml_base_btl_array_insert(&bml_endpoint->btl_send); - bml_btl->btl = btl; - bml_btl->btl_endpoint = btl_endpoints[p]; - bml_btl->btl_weight = 0; - bml_btl->btl_flags = btl_flags; - - /** - * calculate the bitwise OR of the btl flags - */ - bml_endpoint->btl_flags_or |= bml_btl->btl_flags; - } - - /* always add rdma endpoints */ - if ((btl_flags & MCA_BTL_FLAGS_RDMA) && - !((proc->super.proc_arch != ompi_proc_local_proc->super.proc_arch) && - (0 == (btl->btl_flags & MCA_BTL_FLAGS_HETEROGENEOUS_RDMA)))) { - mca_bml_base_btl_t *bml_btl_rdma = mca_bml_base_btl_array_insert(&bml_endpoint->btl_rdma); - - bml_btl_rdma->btl = btl; - bml_btl_rdma->btl_endpoint = btl_endpoints[p]; - bml_btl_rdma->btl_weight = 0; - bml_btl_rdma->btl_flags = btl_flags; - - if (bml_endpoint->btl_pipeline_send_length < btl->btl_rdma_pipeline_send_length) { - bml_endpoint->btl_pipeline_send_length = btl->btl_rdma_pipeline_send_length; - } - - if (bml_endpoint->btl_send_limit < btl->btl_min_rdma_pipeline_size) { - bml_endpoint->btl_send_limit = btl->btl_min_rdma_pipeline_size; - } - } - - /* This BTL is in use, allow the progress registration */ - btl_inuse++; + for (size_t p = 0 ; p < n_new_procs ; ++p) { + if (!opal_bitmap_is_set_bit(reachable, p)) { + continue; } + + ompi_proc_t *proc = new_procs[p]; + mca_bml_base_endpoint_t *bml_endpoint = + (mca_bml_base_endpoint_t *) proc->proc_endpoints[OMPI_PROC_ENDPOINT_TAG_BML]; + mca_bml_base_btl_t *bml_btl = NULL; + size_t size; + + if (NULL == bml_endpoint) { + bml_endpoint = mca_bml_r2_allocate_endpoint (proc); + if (NULL == bml_endpoint) { + free(btl_endpoints); + free(new_procs); + return OPAL_ERR_OUT_OF_RESOURCE; + } + } + + rc = mca_bml_r2_endpoint_add_btl (proc, bml_endpoint, btl, btl_endpoints[p]); + if (OMPI_SUCCESS != rc) { + btl->btl_del_procs(btl, 1, (opal_proc_t**)&proc, &btl_endpoints[p]); + continue; + } + + /* This BTL is in use, allow the progress registration */ + btl_inuse++; } - if(btl_inuse > 0 && NULL != btl->btl_component->btl_progress) { - size_t p; - bool found = false; - for( p = 0; p < mca_bml_r2.num_btl_progress; p++ ) { - 
if(mca_bml_r2.btl_progress[p] == btl->btl_component->btl_progress) { - found = true; - break; - } - } - if(found == false) { - mca_bml_r2.btl_progress[mca_bml_r2.num_btl_progress] = - btl->btl_component->btl_progress; - mca_bml_r2.num_btl_progress++; - opal_progress_register( btl->btl_component->btl_progress ); - } + if (btl_inuse) { + mca_bml_r2_register_progress (btl); } } + free(btl_endpoints); /* iterate back through procs and compute metrics for registered r2s */ - for(p=0; pproc_endpoints[OMPI_PROC_ENDPOINT_TAG_BML]; - double total_bandwidth = 0; - uint32_t latency; - size_t n_send, n_rdma; + for (size_t p = 0; p < n_new_procs ; ++p) { + mca_bml_base_endpoint_t *bml_endpoint = + (mca_bml_base_endpoint_t *) new_procs[p]->proc_endpoints[OMPI_PROC_ENDPOINT_TAG_BML]; /* skip over procs w/ no btl's registered */ - if(NULL == bml_endpoint) { - continue; - } - - /* (1) determine the total bandwidth available across all btls - * note that we need to do this here, as we may already have btls configured - * (2) determine the highest priority ranking for latency - * (3) compute the maximum amount of bytes that can be send without any - * weighting. Once the left over is smaller than this number we will - * start using the weight to compute the correct amount. - */ - n_send = mca_bml_base_btl_array_get_size(&bml_endpoint->btl_send); - n_rdma = mca_bml_base_btl_array_get_size(&bml_endpoint->btl_rdma); - - /* sort BTLs in descending order according to bandwidth value */ - qsort(bml_endpoint->btl_send.bml_btls, n_send, - sizeof(mca_bml_base_btl_t), btl_bandwidth_compare); - - bml_endpoint->btl_rdma_index = 0; - - mca_bml_r2_calculate_bandwidth_latency (&bml_endpoint->btl_send, &total_bandwidth, &latency); - - /* (1) set the weight of each btl as a percentage of overall bandwidth - * (2) copy all btl instances at the highest priority ranking into the - * list of btls used for first fragments - */ - for (size_t n_index = 0 ; n_index < n_send ; ++n_index) { - mca_bml_base_btl_t* bml_btl = - mca_bml_base_btl_array_get_index(&bml_endpoint->btl_send, n_index); - mca_btl_base_module_t *btl = bml_btl->btl; - - /* compute weighting factor for this r2 */ - if(btl->btl_bandwidth > 0) { - bml_btl->btl_weight = (float)(btl->btl_bandwidth / total_bandwidth); - } else { - bml_btl->btl_weight = (float)(1.0 / n_send); - } - - /* check to see if this r2 is already in the array of r2s - * used for first fragments - if not add it. 
- */ - if(btl->btl_latency == latency) { - mca_bml_base_btl_t* bml_btl_new = - mca_bml_base_btl_array_insert(&bml_endpoint->btl_eager); - *bml_btl_new = *bml_btl; - } - - /* set endpoint max send size as min of available btls */ - if(bml_endpoint->btl_max_send_size > btl->btl_max_send_size) - bml_endpoint->btl_max_send_size = btl->btl_max_send_size; - } - - /* sort BTLs in descending order according to bandwidth value */ - qsort(bml_endpoint->btl_rdma.bml_btls, n_rdma, - sizeof(mca_bml_base_btl_t), btl_bandwidth_compare); - - mca_bml_r2_calculate_bandwidth_latency (&bml_endpoint->btl_rdma, &total_bandwidth, &latency); - - /* set rdma btl weights */ - for (size_t n_index = 0 ; n_index < n_rdma ; ++n_index) { - mca_bml_base_btl_t *bml_btl = - mca_bml_base_btl_array_get_index(&bml_endpoint->btl_rdma, n_index); - - /* compute weighting factor for this r2 */ - if (bml_btl->btl->btl_bandwidth > 0.0) { - bml_btl->btl_weight = (float)(bml_btl->btl->btl_bandwidth / total_bandwidth); - } else { - bml_btl->btl_weight = (float)(1.0 / n_rdma); - } + if (NULL != bml_endpoint) { + mca_bml_r2_compute_endpoint_metrics (bml_endpoint); } } /* see if we have a connection to everyone else */ - for(p = 0; p < n_new_procs; p++) { + for(size_t p = 0; p < n_new_procs ; ++p) { ompi_proc_t *proc = new_procs[p]; if (NULL == proc->proc_endpoints[OMPI_PROC_ENDPOINT_TAG_BML]) { ret = OMPI_ERR_UNREACH; if (mca_bml_r2.show_unreach_errors) { - opal_show_help("help-mca-bml-r2.txt", - "unreachable proc", - true, + opal_show_help("help-mca-bml-r2.txt", "unreachable proc", true, OMPI_NAME_PRINT(&(ompi_proc_local_proc->super.proc_name)), (NULL != ompi_proc_local_proc->super.proc_hostname ? ompi_proc_local_proc->super.proc_hostname : "unknown!"), @@ -459,6 +574,7 @@ static int mca_bml_r2_add_procs( size_t nprocs, proc->super.proc_hostname : "unknown!"), btl_names); } + break; } } @@ -476,7 +592,6 @@ static int mca_bml_r2_add_procs( size_t nprocs, static int mca_bml_r2_del_procs(size_t nprocs, struct ompi_proc_t** procs) { - size_t p; int rc; struct ompi_proc_t** del_procs = (struct ompi_proc_t**) malloc(nprocs * sizeof(struct ompi_proc_t*)); @@ -486,26 +601,27 @@ static int mca_bml_r2_del_procs(size_t nprocs, return OMPI_ERR_OUT_OF_RESOURCE; } - for(p = 0; p < nprocs; p++) { + for (size_t p = 0 ; p < nprocs ; ++p) { ompi_proc_t *proc = procs[p]; /* We much check that there are 2 references to the proc (not 1). The * first reference belongs to ompi/proc the second belongs to the bml * since we retained it. We will release that reference at the end of * the loop below. 
*/ - if(((opal_object_t*)proc)->obj_reference_count == 2) { + if (((opal_object_t*)proc)->obj_reference_count == 2 && + NULL != proc->proc_endpoints[OMPI_PROC_ENDPOINT_TAG_BML]) { del_procs[n_del_procs++] = proc; } } - for(p = 0; p < n_del_procs; p++) { + for (size_t p = 0 ; p < n_del_procs ; ++p) { ompi_proc_t *proc = del_procs[p]; mca_bml_base_endpoint_t* bml_endpoint = (mca_bml_base_endpoint_t*) proc->proc_endpoints[OMPI_PROC_ENDPOINT_TAG_BML]; - size_t f_index, f_size; + size_t f_size; /* notify each btl that the proc is going away */ f_size = mca_bml_base_btl_array_get_size(&bml_endpoint->btl_send); - for(f_index = 0; f_index < f_size; f_index++) { + for (size_t f_index = 0 ; f_index < f_size ; ++f_index) { mca_bml_base_btl_t* bml_btl = mca_bml_base_btl_array_get_index(&bml_endpoint->btl_send, f_index); mca_btl_base_module_t* btl = bml_btl->btl; @@ -521,10 +637,12 @@ static int mca_bml_r2_del_procs(size_t nprocs, */ } + proc->proc_endpoints[OMPI_PROC_ENDPOINT_TAG_BML] = NULL; + OBJ_RELEASE(proc); + /* do any required cleanup */ OBJ_RELEASE(bml_endpoint); - proc->proc_endpoints[OMPI_PROC_ENDPOINT_TAG_BML] = NULL; } free(del_procs); @@ -835,6 +953,7 @@ int mca_bml_r2_component_fini(void) mca_bml_r2_module_t mca_bml_r2 = { .super = { .bml_component = &mca_bml_r2_component, + .bml_add_proc = mca_bml_r2_add_proc, .bml_add_procs = mca_bml_r2_add_procs, .bml_del_procs = mca_bml_r2_del_procs, .bml_add_btl = mca_bml_r2_add_btl, @@ -843,8 +962,7 @@ mca_bml_r2_module_t mca_bml_r2 = { .bml_register = mca_bml_r2_register, .bml_register_error = mca_bml_r2_register_error, .bml_finalize = mca_bml_r2_finalize, - .bml_ft_event = mca_bml_r2_ft_event - } - + .bml_ft_event = mca_bml_r2_ft_event, + }, }; diff --git a/ompi/mca/coll/fca/coll_fca_module.c b/ompi/mca/coll/fca/coll_fca_module.c index 2c3922cf34..cda756dfa5 100644 --- a/ompi/mca/coll/fca/coll_fca_module.c +++ b/ompi/mca/coll/fca/coll_fca_module.c @@ -35,25 +35,6 @@ int mca_coll_fca_init_query(bool enable_progress_threads, return OMPI_SUCCESS; } -static int have_remote_peers(ompi_group_t *group, size_t size, int *local_peers) -{ - ompi_proc_t *proc; - size_t i; - int ret; - - *local_peers = 0; - ret = 0; - for (i = 0; i < size; ++i) { - proc = ompi_group_peer_lookup(group, i); - if (OPAL_PROC_ON_LOCAL_NODE(proc->super.proc_flags)) { - ++*local_peers; - } else { - ret = 1; - } - } - return ret; -} - static inline ompi_proc_t* __local_rank_lookup(ompi_communicator_t *comm, int rank) { return ompi_group_peer_lookup(comm->c_local_group, rank); @@ -618,7 +599,7 @@ mca_coll_fca_comm_query(struct ompi_communicator_t *comm, int *priority) if (size < mca_coll_fca_component.fca_np) goto exit; - if (!have_remote_peers(comm->c_local_group, size, &local_peers) || OMPI_COMM_IS_INTER(comm)) + if (!ompi_group_have_remote_peers(comm->c_local_group) || OMPI_COMM_IS_INTER(comm)) goto exit; fca_module = OBJ_NEW(mca_coll_fca_module_t); diff --git a/ompi/mca/coll/sm/coll_sm_module.c b/ompi/mca/coll/sm/coll_sm_module.c index 37a7cbdc2d..4739217bc1 100644 --- a/ompi/mca/coll/sm/coll_sm_module.c +++ b/ompi/mca/coll/sm/coll_sm_module.c @@ -74,7 +74,6 @@ uint32_t mca_coll_sm_one = 1; */ static int sm_module_enable(mca_coll_base_module_t *module, struct ompi_communicator_t *comm); -static bool have_local_peers(ompi_group_t *group, size_t size); static int bootstrap_comm(ompi_communicator_t *comm, mca_coll_sm_module_t *module); static int mca_coll_sm_module_disable(mca_coll_base_module_t *module, @@ -172,8 +171,7 @@ mca_coll_sm_comm_query(struct ompi_communicator_t *comm, 
int *priority) /* If we're intercomm, or if there's only one process in the communicator, or if not all the processes in the communicator are on this node, then we don't want to run */ - if (OMPI_COMM_IS_INTER(comm) || 1 == ompi_comm_size(comm) || - !have_local_peers(comm->c_local_group, ompi_comm_size(comm))) { + if (OMPI_COMM_IS_INTER(comm) || 1 == ompi_comm_size(comm) || ompi_group_have_remote_peers (comm->c_local_group)) { opal_output_verbose(10, ompi_coll_base_framework.framework_output, "coll:sm:comm_query (%d/%s): intercomm, comm is too small, or not all peers local; disqualifying myself", comm->c_contextid, comm->c_name); return NULL; @@ -490,23 +488,6 @@ int ompi_coll_sm_lazy_enable(mca_coll_base_module_t *module, return OMPI_SUCCESS; } - -static bool have_local_peers(ompi_group_t *group, size_t size) -{ - size_t i; - ompi_proc_t *proc; - - for (i = 0; i < size; ++i) { - proc = ompi_group_peer_lookup(group,i); - if (!OPAL_PROC_ON_LOCAL_NODE(proc->super.proc_flags)) { - return false; - } - } - - return true; -} - - static int bootstrap_comm(ompi_communicator_t *comm, mca_coll_sm_module_t *module) { diff --git a/ompi/mca/mtl/psm/mtl_psm.h b/ompi/mca/mtl/psm/mtl_psm.h index 36aedbfcc5..52a590b3d3 100644 --- a/ompi/mca/mtl/psm/mtl_psm.h +++ b/ompi/mca/mtl/psm/mtl_psm.h @@ -1,3 +1,4 @@ +/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */ /* * Copyright (c) 2004-2006 The Trustees of Indiana University and Indiana * University Research and Technology @@ -10,6 +11,8 @@ * Copyright (c) 2004-2006 The Regents of the University of California. * All rights reserved. * Copyright (c) 2006 QLogic Corporation. All rights reserved. + * Copyright (c) 2015 Los Alamos National Security, LLC. All rights + * reserved. * $COPYRIGHT$ * * Additional copyrights may follow @@ -23,6 +26,7 @@ #include "ompi/mca/pml/pml.h" #include "ompi/mca/mtl/mtl.h" #include "ompi/mca/mtl/base/base.h" +#include "ompi/proc/proc.h" #include "opal/datatype/opal_convertor.h" #include <psm.h> #include <psm_mq.h> diff --git a/ompi/mca/mtl/psm/mtl_psm_endpoint.h b/ompi/mca/mtl/psm/mtl_psm_endpoint.h index 83a1ecfa8f..b08e9fdbc4 100644 --- a/ompi/mca/mtl/psm/mtl_psm_endpoint.h +++ b/ompi/mca/mtl/psm/mtl_psm_endpoint.h @@ -1,3 +1,4 @@ +/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */ /* * Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana * University Research and Technology @@ -10,6 +11,8 @@ * Copyright (c) 2004-2006 The Regents of the University of California. * All rights reserved. * Copyright (c) 2006 QLogic Corporation. All rights reserved. + * Copyright (c) 2015 Los Alamos National Security, LLC. All rights + * reserved.
* $COPYRIGHT$ * * Additional copyrights may follow @@ -54,5 +57,14 @@ struct mca_mtl_psm_endpoint_t { typedef struct mca_mtl_psm_endpoint_t mca_mtl_psm_endpoint_t; OBJ_CLASS_DECLARATION(mca_mtl_psm_endpoint); +static inline mca_mtl_psm_endpoint_t *ompi_mtl_psm_get_endpoint (struct mca_mtl_base_module_t* mtl, ompi_proc_t *ompi_proc) +{ + if (OPAL_UNLIKELY(NULL == ompi_proc->proc_endpoints[OMPI_PROC_ENDPOINT_TAG_MTL])) { + ompi_mtl_psm_add_procs (mtl, 1, &ompi_proc); + } + + return ompi_proc->proc_endpoints[OMPI_PROC_ENDPOINT_TAG_MTL]; +} + END_C_DECLS #endif diff --git a/ompi/mca/mtl/psm/mtl_psm_send.c b/ompi/mca/mtl/psm/mtl_psm_send.c index ddedd65265..c30801b1fb 100644 --- a/ompi/mca/mtl/psm/mtl_psm_send.c +++ b/ompi/mca/mtl/psm/mtl_psm_send.c @@ -1,3 +1,4 @@ +/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */ /* * Copyright (c) 2004-2006 The Trustees of Indiana University and Indiana * University Research and Technology @@ -10,6 +11,8 @@ * Copyright (c) 2004-2006 The Regents of the University of California. * All rights reserved. * Copyright (c) 2006 QLogic Corporation. All rights reserved. + * Copyright (c) 2015 Los Alamos National Security, LLC. All rights + * reserved. * $COPYRIGHT$ * * Additional copyrights may follow @@ -42,7 +45,7 @@ ompi_mtl_psm_send(struct mca_mtl_base_module_t* mtl, int ret; size_t length; ompi_proc_t* ompi_proc = ompi_comm_peer_lookup( comm, dest ); - mca_mtl_psm_endpoint_t* psm_endpoint = (mca_mtl_psm_endpoint_t*) ompi_proc->proc_endpoints[OMPI_PROC_ENDPOINT_TAG_MTL]; + mca_mtl_psm_endpoint_t* psm_endpoint = ompi_mtl_psm_get_endpoint (mtl, ompi_proc); assert(mtl == &ompi_mtl_psm.super); @@ -94,7 +97,7 @@ ompi_mtl_psm_isend(struct mca_mtl_base_module_t* mtl, mca_mtl_psm_request_t * mtl_psm_request = (mca_mtl_psm_request_t*) mtl_request; size_t length; ompi_proc_t* ompi_proc = ompi_comm_peer_lookup( comm, dest ); - mca_mtl_psm_endpoint_t* psm_endpoint = (mca_mtl_psm_endpoint_t*)ompi_proc->proc_endpoints[OMPI_PROC_ENDPOINT_TAG_MTL]; + mca_mtl_psm_endpoint_t* psm_endpoint = ompi_mtl_psm_get_endpoint (mtl, ompi_proc); assert(mtl == &ompi_mtl_psm.super); diff --git a/ompi/mca/mtl/psm2/mtl_psm2.h b/ompi/mca/mtl/psm2/mtl_psm2.h index b48e07a039..44152656bf 100644 --- a/ompi/mca/mtl/psm2/mtl_psm2.h +++ b/ompi/mca/mtl/psm2/mtl_psm2.h @@ -1,3 +1,4 @@ +/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */ /* * Copyright (c) 2004-2006 The Trustees of Indiana University and Indiana * University Research and Technology @@ -11,6 +12,8 @@ * All rights reserved. * Copyright (c) 2006 QLogic Corporation. All rights reserved. * Copyright (c) 2015 Intel, Inc. All rights reserved + * Copyright (c) 2015 Los Alamos National Security, LLC. All rights + * reserved. 
* $COPYRIGHT$ * * Additional copyrights may follow @@ -24,6 +27,7 @@ #include "ompi/mca/pml/pml.h" #include "ompi/mca/mtl/mtl.h" #include "ompi/mca/mtl/base/base.h" +#include "ompi/proc/proc.h" #include "opal/datatype/opal_convertor.h" #include <psm2.h> #include <psm2_mq.h> diff --git a/ompi/mca/mtl/psm2/mtl_psm2_endpoint.h b/ompi/mca/mtl/psm2/mtl_psm2_endpoint.h index e3233db352..aeb6bccadc 100644 --- a/ompi/mca/mtl/psm2/mtl_psm2_endpoint.h +++ b/ompi/mca/mtl/psm2/mtl_psm2_endpoint.h @@ -55,5 +55,14 @@ struct mca_mtl_psm2_endpoint_t { typedef struct mca_mtl_psm2_endpoint_t mca_mtl_psm2_endpoint_t; OBJ_CLASS_DECLARATION(mca_mtl_psm2_endpoint); +static inline mca_mtl_psm2_endpoint_t *ompi_mtl_psm2_get_endpoint (struct mca_mtl_base_module_t* mtl, ompi_proc_t *ompi_proc) +{ + if (OPAL_UNLIKELY(NULL == ompi_proc->proc_endpoints[OMPI_PROC_ENDPOINT_TAG_MTL])) { + ompi_mtl_psm2_add_procs (mtl, 1, &ompi_proc); + } + + return ompi_proc->proc_endpoints[OMPI_PROC_ENDPOINT_TAG_MTL]; +} + END_C_DECLS #endif diff --git a/ompi/mca/mtl/psm2/mtl_psm2_send.c b/ompi/mca/mtl/psm2/mtl_psm2_send.c index 76fb5a1cd0..73cf769873 100644 --- a/ompi/mca/mtl/psm2/mtl_psm2_send.c +++ b/ompi/mca/mtl/psm2/mtl_psm2_send.c @@ -43,7 +43,7 @@ ompi_mtl_psm2_send(struct mca_mtl_base_module_t* mtl, int ret; size_t length; ompi_proc_t* ompi_proc = ompi_comm_peer_lookup( comm, dest ); - mca_mtl_psm2_endpoint_t* psm_endpoint = (mca_mtl_psm2_endpoint_t*) ompi_proc->proc_endpoints[OMPI_PROC_ENDPOINT_TAG_MTL]; + mca_mtl_psm2_endpoint_t* psm_endpoint = ompi_mtl_psm2_get_endpoint (mtl, ompi_proc); assert(mtl == &ompi_mtl_psm2.super); @@ -95,7 +95,7 @@ ompi_mtl_psm2_isend(struct mca_mtl_base_module_t* mtl, mca_mtl_psm2_request_t * mtl_psm2_request = (mca_mtl_psm2_request_t*) mtl_request; size_t length; ompi_proc_t* ompi_proc = ompi_comm_peer_lookup( comm, dest ); - mca_mtl_psm2_endpoint_t* psm_endpoint = (mca_mtl_psm2_endpoint_t*)ompi_proc->proc_endpoints[OMPI_PROC_ENDPOINT_TAG_MTL]; + mca_mtl_psm2_endpoint_t* psm_endpoint = ompi_mtl_psm2_get_endpoint (mtl, ompi_proc); assert(mtl == &ompi_mtl_psm2.super); diff --git a/ompi/mca/osc/portals4/osc_portals4.h b/ompi/mca/osc/portals4/osc_portals4.h index c403683627..fcba31ffad 100644 --- a/ompi/mca/osc/portals4/osc_portals4.h +++ b/ompi/mca/osc/portals4/osc_portals4.h @@ -299,7 +299,7 @@ ompi_osc_portals4_get_peer(ompi_osc_portals4_module_t *module, int rank) static inline ptl_process_t ompi_osc_portals4_get_peer_group(struct ompi_group_t *group, int rank) { - ompi_proc_t *proc = ompi_group_get_proc_ptr(group, rank); + ompi_proc_t *proc = ompi_group_get_proc_ptr(group, rank, true); return *((ptl_process_t*) proc->proc_endpoints[OMPI_PROC_ENDPOINT_TAG_PORTALS4]); } diff --git a/ompi/mca/osc/sm/osc_sm_component.c b/ompi/mca/osc/sm/osc_sm_component.c index b51e7d1c92..c8c940d535 100644 --- a/ompi/mca/osc/sm/osc_sm_component.c +++ b/ompi/mca/osc/sm/osc_sm_component.c @@ -134,10 +134,8 @@ check_win_ok(ompi_communicator_t *comm, int flavor) return OMPI_ERR_NOT_SUPPORTED; } - for (i = 0 ; i < ompi_comm_size(comm) ; ++i) { - if (!OPAL_PROC_ON_LOCAL_NODE(ompi_comm_peer_lookup(comm, i)->super.proc_flags)) { - return OMPI_ERR_RMA_SHARED; - } + if (ompi_group_have_remote_peers (comm->c_local_group)) { + return OMPI_ERR_RMA_SHARED; } return OMPI_SUCCESS; diff --git a/ompi/mca/pml/ob1/pml_ob1.c b/ompi/mca/pml/ob1/pml_ob1.c index cee5cd3756..55de7d150f 100644 --- a/ompi/mca/pml/ob1/pml_ob1.c +++ b/ompi/mca/pml/ob1/pml_ob1.c @@ -191,11 +191,9 @@ int mca_pml_ob1_add_comm(ompi_communicator_t* comm) { /* allocate pml specific comm
data */ mca_pml_ob1_comm_t* pml_comm = OBJ_NEW(mca_pml_ob1_comm_t); - opal_list_item_t *item, *next_item; - mca_pml_ob1_recv_frag_t* frag; + mca_pml_ob1_recv_frag_t *frag, *next_frag; mca_pml_ob1_comm_proc_t* pml_proc; mca_pml_ob1_match_hdr_t* hdr; - int i; if (NULL == pml_comm) { return OMPI_ERR_OUT_OF_RESOURCE; @@ -210,16 +208,8 @@ int mca_pml_ob1_add_comm(ompi_communicator_t* comm) mca_pml_ob1_comm_init_size(pml_comm, comm->c_remote_group->grp_proc_count); comm->c_pml_comm = pml_comm; - for( i = 0; i < comm->c_remote_group->grp_proc_count; i++ ) { - pml_comm->procs[i].ompi_proc = ompi_group_peer_lookup(comm->c_remote_group,i); - OBJ_RETAIN(pml_comm->procs[i].ompi_proc); - } /* Grab all related messages from the non_existing_communicator pending queue */ - for( item = opal_list_get_first(&mca_pml_ob1.non_existing_communicator_pending); - item != opal_list_get_end(&mca_pml_ob1.non_existing_communicator_pending); - item = next_item ) { - frag = (mca_pml_ob1_recv_frag_t*)item; - next_item = opal_list_get_next(item); + OPAL_LIST_FOREACH_SAFE(frag, next_frag, &mca_pml_ob1.non_existing_communicator_pending, mca_pml_ob1_recv_frag_t) { hdr = &frag->hdr.hdr_match; /* Is this fragment for the current communicator ? */ @@ -229,8 +219,8 @@ int mca_pml_ob1_add_comm(ompi_communicator_t* comm) /* As we now know we work on a fragment for this communicator * we should remove it from the * non_existing_communicator_pending list. */ - opal_list_remove_item( &mca_pml_ob1.non_existing_communicator_pending, - item ); + opal_list_remove_item (&mca_pml_ob1.non_existing_communicator_pending, + (opal_list_item_t *) frag); add_fragment_to_unexpected: @@ -249,7 +239,7 @@ int mca_pml_ob1_add_comm(ompi_communicator_t* comm) * We just have to push the fragment into the unexpected list of the corresponding * proc, or into the out-of-order (cant_match) list. */ - pml_proc = &(pml_comm->procs[hdr->hdr_src]); + pml_proc = mca_pml_ob1_peer_lookup(comm, hdr->hdr_src); if( ((uint16_t)hdr->hdr_seq) == ((uint16_t)pml_proc->expected_sequence) ) { /* We're now expecting the next sequence number. */ @@ -283,12 +273,6 @@ int mca_pml_ob1_add_comm(ompi_communicator_t* comm) int mca_pml_ob1_del_comm(ompi_communicator_t* comm) { - mca_pml_ob1_comm_t* pml_comm = comm->c_pml_comm; - int i; - - for( i = 0; i < comm->c_remote_group->grp_proc_count; i++ ) { - OBJ_RELEASE(pml_comm->procs[i].ompi_proc); - } OBJ_RELEASE(comm->c_pml_comm); comm->c_pml_comm = NULL; return OMPI_SUCCESS; @@ -303,9 +287,9 @@ int mca_pml_ob1_del_comm(ompi_communicator_t* comm) int mca_pml_ob1_add_procs(ompi_proc_t** procs, size_t nprocs) { + mca_btl_base_selected_module_t *sm; opal_bitmap_t reachable; int rc; - opal_list_item_t *item; if(nprocs == 0) return OMPI_SUCCESS; @@ -347,11 +331,7 @@ int mca_pml_ob1_add_procs(ompi_proc_t** procs, size_t nprocs) BTLs requires iterating over the procs, as the BML does not expose all currently in use btls. 
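* Every eager fragment must be able to carry at least a PML match header, which is why each BTL's btl_eager_limit is checked against sizeof(mca_pml_ob1_hdr_t) below.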
*/ - for (item = opal_list_get_first(&mca_btl_base_modules_initialized) ; - item != opal_list_get_end(&mca_btl_base_modules_initialized) ; - item = opal_list_get_next(item)) { - mca_btl_base_selected_module_t *sm = - (mca_btl_base_selected_module_t*) item; + OPAL_LIST_FOREACH(sm, &mca_btl_base_modules_initialized, mca_btl_base_selected_module_t) { if (sm->btl_module->btl_eager_limit < sizeof(mca_pml_ob1_hdr_t)) { opal_show_help("help-mpi-pml-ob1.txt", "eager_limit_too_small", true, @@ -589,13 +569,19 @@ int mca_pml_ob1_dump(struct ompi_communicator_t* comm, int verbose) /* iterate through all procs on communicator */ for( i = 0; i < (int)pml_comm->num_procs; i++ ) { - mca_pml_ob1_comm_proc_t* proc = &pml_comm->procs[i]; + mca_pml_ob1_comm_proc_t* proc = pml_comm->procs[i]; + + if (NULL == proc) { + continue; + } + mca_bml_base_endpoint_t* ep = (mca_bml_base_endpoint_t*)proc->ompi_proc->proc_endpoints[OMPI_PROC_ENDPOINT_TAG_BML]; size_t n; opal_output(0, "[Rank %d] expected_seq %d ompi_proc %p send_seq %d\n", i, proc->expected_sequence, (void*) proc->ompi_proc, proc->send_sequence); + /* dump all receive queues */ if( opal_list_get_size(&proc->specific_receives) ) { opal_output(0, "expected specific receives\n"); diff --git a/ompi/mca/pml/ob1/pml_ob1_comm.c b/ompi/mca/pml/ob1/pml_ob1_comm.c index d66bebef5d..9ab64c4614 100644 --- a/ompi/mca/pml/ob1/pml_ob1_comm.c +++ b/ompi/mca/pml/ob1/pml_ob1_comm.c @@ -40,14 +40,15 @@ static void mca_pml_ob1_comm_proc_destruct(mca_pml_ob1_comm_proc_t* proc) OBJ_DESTRUCT(&proc->frags_cant_match); OBJ_DESTRUCT(&proc->specific_receives); OBJ_DESTRUCT(&proc->unexpected_frags); + if (proc->ompi_proc) { + OBJ_RELEASE(proc->ompi_proc); + } } -static OBJ_CLASS_INSTANCE( - mca_pml_ob1_comm_proc_t, - opal_object_t, - mca_pml_ob1_comm_proc_construct, - mca_pml_ob1_comm_proc_destruct); +OBJ_CLASS_INSTANCE(mca_pml_ob1_comm_proc_t, opal_object_t, + mca_pml_ob1_comm_proc_construct, + mca_pml_ob1_comm_proc_destruct); static void mca_pml_ob1_comm_construct(mca_pml_ob1_comm_t* comm) @@ -63,11 +64,16 @@ static void mca_pml_ob1_comm_construct(mca_pml_ob1_comm_t* comm) static void mca_pml_ob1_comm_destruct(mca_pml_ob1_comm_t* comm) { - size_t i; - for(i=0; inum_procs; i++) - OBJ_DESTRUCT((&comm->procs[i])); - if(NULL != comm->procs) + if (NULL != comm->procs) { + for (size_t i = 0; i < comm->num_procs; ++i) { + if (comm->procs[i]) { + OBJ_RELEASE(comm->procs[i]); + } + } + free(comm->procs); + } + OBJ_DESTRUCT(&comm->wild_receives); OBJ_DESTRUCT(&comm->matching_lock); } @@ -80,18 +86,13 @@ OBJ_CLASS_INSTANCE( mca_pml_ob1_comm_destruct); -int mca_pml_ob1_comm_init_size(mca_pml_ob1_comm_t* comm, size_t size) +int mca_pml_ob1_comm_init_size (mca_pml_ob1_comm_t* comm, size_t size) { - size_t i; - /* send message sequence-number support - sender side */ - comm->procs = (mca_pml_ob1_comm_proc_t*)malloc(sizeof(mca_pml_ob1_comm_proc_t)*size); + comm->procs = (mca_pml_ob1_comm_proc_t **) calloc(size, sizeof (mca_pml_ob1_comm_proc_t *)); if(NULL == comm->procs) { return OMPI_ERR_OUT_OF_RESOURCE; } - for(i=0; iprocs+i, mca_pml_ob1_comm_proc_t); - } comm->num_procs = size; return OMPI_SUCCESS; } diff --git a/ompi/mca/pml/ob1/pml_ob1_comm.h b/ompi/mca/pml/ob1/pml_ob1_comm.h index 411310575d..7ef54c63e6 100644 --- a/ompi/mca/pml/ob1/pml_ob1_comm.h +++ b/ompi/mca/pml/ob1/pml_ob1_comm.h @@ -24,6 +24,7 @@ #include "opal/threads/mutex.h" #include "opal/class/opal_list.h" #include "ompi/proc/proc.h" +#include "ompi/communicator/communicator.h" BEGIN_C_DECLS @@ -42,6 +43,7 @@ struct 
mca_pml_ob1_comm_proc_t { }; typedef struct mca_pml_ob1_comm_proc_t mca_pml_ob1_comm_proc_t; +OBJ_CLASS_DECLARATION(mca_pml_ob1_comm_proc_t); /** * Cached on ompi_communicator_t to hold queues/state @@ -56,7 +58,7 @@ struct mca_pml_comm_t { #endif opal_mutex_t matching_lock; /**< matching lock */ opal_list_t wild_receives; /**< queue of unmatched wild (source process not specified) receives */ - mca_pml_ob1_comm_proc_t* procs; + mca_pml_ob1_comm_proc_t **procs; size_t num_procs; size_t last_probed; }; @@ -64,6 +66,18 @@ typedef struct mca_pml_comm_t mca_pml_ob1_comm_t; OBJ_CLASS_DECLARATION(mca_pml_ob1_comm_t); +static inline mca_pml_ob1_comm_proc_t *mca_pml_ob1_peer_lookup (struct ompi_communicator_t *comm, int rank) +{ + mca_pml_ob1_comm_t *pml_comm = (mca_pml_ob1_comm_t *)comm->c_pml_comm; + + if (OPAL_UNLIKELY(NULL == pml_comm->procs[rank])) { + pml_comm->procs[rank] = OBJ_NEW(mca_pml_ob1_comm_proc_t); + pml_comm->procs[rank]->ompi_proc = ompi_comm_peer_lookup (comm, rank); + OBJ_RETAIN(pml_comm->procs[rank]->ompi_proc); + } + + return pml_comm->procs[rank]; +} /** * Initialize an instance of mca_pml_ob1_comm_t based on the communicator size. diff --git a/ompi/mca/pml/ob1/pml_ob1_component.c b/ompi/mca/pml/ob1/pml_ob1_component.c index f3a3434a1c..0e17d1e64f 100644 --- a/ompi/mca/pml/ob1/pml_ob1_component.c +++ b/ompi/mca/pml/ob1/pml_ob1_component.c @@ -144,9 +144,12 @@ static int mca_pml_ob1_get_unex_msgq_size (const struct mca_base_pvar_t *pvar, v int i; for (i = 0 ; i < comm_size ; ++i) { - pml_proc = pml_comm->procs + i; - - values[i] = opal_list_get_size (&pml_proc->unexpected_frags); + pml_proc = pml_comm->procs[i]; + if (pml_proc) { + values[i] = opal_list_get_size (&pml_proc->unexpected_frags); + } else { + values[i] = 0; + } } return OMPI_SUCCESS; @@ -162,9 +165,13 @@ static int mca_pml_ob1_get_posted_recvq_size (const struct mca_base_pvar_t *pvar int i; for (i = 0 ; i < comm_size ; ++i) { - pml_proc = pml_comm->procs + i; + pml_proc = pml_comm->procs[i]; - values[i] = opal_list_get_size (&pml_proc->specific_receives); + if (pml_proc) { + values[i] = opal_list_get_size (&pml_proc->specific_receives); + } else { + values[i] = 0; + } } return OMPI_SUCCESS; diff --git a/ompi/mca/pml/ob1/pml_ob1_irecv.c b/ompi/mca/pml/ob1/pml_ob1_irecv.c index 787a6e0139..16ffcf4f21 100644 --- a/ompi/mca/pml/ob1/pml_ob1_irecv.c +++ b/ompi/mca/pml/ob1/pml_ob1_irecv.c @@ -148,7 +148,6 @@ mca_pml_ob1_imrecv( void *buf, int src, tag; ompi_communicator_t *comm; mca_pml_ob1_comm_proc_t* proc; - mca_pml_ob1_comm_t* ob1_comm; uint64_t seq; /* get the request from the message and the frag from the request @@ -158,7 +157,6 @@ mca_pml_ob1_imrecv( void *buf, src = recvreq->req_recv.req_base.req_ompi.req_status.MPI_SOURCE; tag = recvreq->req_recv.req_base.req_ompi.req_status.MPI_TAG; comm = (*message)->comm; - ob1_comm = recvreq->req_recv.req_base.req_comm->c_pml_comm; seq = recvreq->req_recv.req_base.req_sequence; /* make the request a recv request again */ @@ -196,7 +194,7 @@ mca_pml_ob1_imrecv( void *buf, /* Note - sequence number already assigned */ recvreq->req_recv.req_base.req_sequence = seq; - proc = &ob1_comm->procs[recvreq->req_recv.req_base.req_peer]; + proc = mca_pml_ob1_peer_lookup (comm, recvreq->req_recv.req_base.req_peer); recvreq->req_recv.req_base.req_proc = proc->ompi_proc; prepare_recv_req_converter(recvreq); @@ -243,7 +241,6 @@ mca_pml_ob1_mrecv( void *buf, int src, tag, rc; ompi_communicator_t *comm; mca_pml_ob1_comm_proc_t* proc; - mca_pml_ob1_comm_t* ob1_comm; uint64_t seq; /* get the 
request from the message and the frag from the request @@ -254,7 +251,6 @@ mca_pml_ob1_mrecv( void *buf, src = recvreq->req_recv.req_base.req_ompi.req_status.MPI_SOURCE; tag = recvreq->req_recv.req_base.req_ompi.req_status.MPI_TAG; seq = recvreq->req_recv.req_base.req_sequence; - ob1_comm = recvreq->req_recv.req_base.req_comm->c_pml_comm; /* make the request a recv request again */ /* The old request kept pointers to comm and the char datatype. @@ -290,7 +286,7 @@ mca_pml_ob1_mrecv( void *buf, /* Note - sequence number already assigned */ recvreq->req_recv.req_base.req_sequence = seq; - proc = &ob1_comm->procs[recvreq->req_recv.req_base.req_peer]; + proc = mca_pml_ob1_peer_lookup (comm, recvreq->req_recv.req_base.req_peer); recvreq->req_recv.req_base.req_proc = proc->ompi_proc; prepare_recv_req_converter(recvreq); diff --git a/ompi/mca/pml/ob1/pml_ob1_isend.c b/ompi/mca/pml/ob1/pml_ob1_isend.c index a25a7250b2..5de0c89d8a 100644 --- a/ompi/mca/pml/ob1/pml_ob1_isend.c +++ b/ompi/mca/pml/ob1/pml_ob1_isend.c @@ -126,15 +126,14 @@ int mca_pml_ob1_isend(const void *buf, ompi_communicator_t * comm, ompi_request_t ** request) { - mca_pml_ob1_comm_t* ob1_comm = comm->c_pml_comm; + mca_pml_ob1_comm_proc_t *ob1_proc = mca_pml_ob1_peer_lookup (comm, dst); mca_pml_ob1_send_request_t *sendreq = NULL; - ompi_proc_t *dst_proc = ompi_comm_peer_lookup (comm, dst); - mca_bml_base_endpoint_t* endpoint = (mca_bml_base_endpoint_t*) - dst_proc->proc_endpoints[OMPI_PROC_ENDPOINT_TAG_BML]; + ompi_proc_t *dst_proc = ob1_proc->ompi_proc; + mca_bml_base_endpoint_t* endpoint = mca_bml_base_get_endpoint (dst_proc); int16_t seqn; int rc; - seqn = (uint16_t) OPAL_THREAD_ADD32(&ob1_comm->procs[dst].send_sequence, 1); + seqn = (uint16_t) OPAL_THREAD_ADD32(&ob1_proc->send_sequence, 1); if (MCA_PML_BASE_SEND_SYNCHRONOUS != sendmode) { rc = mca_pml_ob1_send_inline (buf, count, datatype, dst, tag, seqn, dst_proc, @@ -176,10 +175,9 @@ int mca_pml_ob1_send(const void *buf, mca_pml_base_send_mode_t sendmode, ompi_communicator_t * comm) { - mca_pml_ob1_comm_t* ob1_comm = comm->c_pml_comm; - ompi_proc_t *dst_proc = ompi_comm_peer_lookup (comm, dst); - mca_bml_base_endpoint_t* endpoint = (mca_bml_base_endpoint_t*) - dst_proc->proc_endpoints[OMPI_PROC_ENDPOINT_TAG_BML]; + mca_pml_ob1_comm_proc_t *ob1_proc = mca_pml_ob1_peer_lookup (comm, dst); + ompi_proc_t *dst_proc = ob1_proc->ompi_proc; + mca_bml_base_endpoint_t* endpoint = mca_bml_base_get_endpoint (dst_proc); mca_pml_ob1_send_request_t *sendreq = NULL; int16_t seqn; int rc; @@ -202,7 +200,7 @@ int mca_pml_ob1_send(const void *buf, return OMPI_ERR_UNREACH; } - seqn = (uint16_t) OPAL_THREAD_ADD32(&ob1_comm->procs[dst].send_sequence, 1); + seqn = (uint16_t) OPAL_THREAD_ADD32(&ob1_proc->send_sequence, 1); /** * The immediate send will not have a request, so they are diff --git a/ompi/mca/pml/ob1/pml_ob1_recvfrag.c b/ompi/mca/pml/ob1/pml_ob1_recvfrag.c index 16a7636e55..797d276b61 100644 --- a/ompi/mca/pml/ob1/pml_ob1_recvfrag.c +++ b/ompi/mca/pml/ob1/pml_ob1_recvfrag.c @@ -143,7 +143,7 @@ void mca_pml_ob1_recv_frag_callback_match(mca_btl_base_module_t* btl, comm = (mca_pml_ob1_comm_t *)comm_ptr->c_pml_comm; /* source sequence number */ - proc = &comm->procs[hdr->hdr_src]; + proc = mca_pml_ob1_peer_lookup (comm_ptr, hdr->hdr_src); /* We generate the MSG_ARRIVED event as soon as the PML is aware * of a matching fragment arrival. 
Independent of whether it is received @@ -650,7 +650,7 @@ static int mca_pml_ob1_recv_frag_match( mca_btl_base_module_t *btl, /* source sequence number */ frag_msg_seq = hdr->hdr_seq; - proc = &comm->procs[hdr->hdr_src]; + proc = mca_pml_ob1_peer_lookup (comm_ptr, hdr->hdr_src); /** * We generate the MSG_ARRIVED event as soon as the PML is aware of a matching diff --git a/ompi/mca/pml/ob1/pml_ob1_recvreq.c b/ompi/mca/pml/ob1/pml_ob1_recvreq.c index 792ae45a9c..fdbc130973 100644 --- a/ompi/mca/pml/ob1/pml_ob1_recvreq.c +++ b/ompi/mca/pml/ob1/pml_ob1_recvreq.c @@ -100,7 +100,8 @@ static int mca_pml_ob1_recv_request_free(struct ompi_request_t** request) static int mca_pml_ob1_recv_request_cancel(struct ompi_request_t* ompi_request, int complete) { mca_pml_ob1_recv_request_t* request = (mca_pml_ob1_recv_request_t*)ompi_request; - mca_pml_ob1_comm_t* comm = request->req_recv.req_base.req_comm->c_pml_comm; + ompi_communicator_t *comm = request->req_recv.req_base.req_comm; + mca_pml_ob1_comm_t *ob1_comm = comm->c_pml_comm; if( true == request->req_match_received ) { /* way too late to cancel this one */ assert( OMPI_ANY_TAG != ompi_request->req_status.MPI_TAG ); /* it has been matched, hasn't it? */ @@ -108,11 +109,11 @@ static int mca_pml_ob1_recv_request_cancel(struct ompi_request_t* ompi_request, } /* The rest should be protected behind the match logic lock */ - OPAL_THREAD_LOCK(&comm->matching_lock); + OPAL_THREAD_LOCK(&ob1_comm->matching_lock); if( request->req_recv.req_base.req_peer == OMPI_ANY_SOURCE ) { - opal_list_remove_item( &comm->wild_receives, (opal_list_item_t*)request ); + opal_list_remove_item( &ob1_comm->wild_receives, (opal_list_item_t*)request ); } else { - mca_pml_ob1_comm_proc_t* proc = comm->procs + request->req_recv.req_base.req_peer; + mca_pml_ob1_comm_proc_t* proc = mca_pml_ob1_peer_lookup (comm, request->req_recv.req_base.req_peer); opal_list_remove_item(&proc->specific_receives, (opal_list_item_t*)request); } PERUSE_TRACE_COMM_EVENT( PERUSE_COMM_REQ_REMOVE_FROM_POSTED_Q, @@ -122,7 +123,7 @@ static int mca_pml_ob1_recv_request_cancel(struct ompi_request_t* ompi_request, * to true. Otherwise, the request will never be freed.
*/ request->req_recv.req_base.req_pml_complete = true; - OPAL_THREAD_UNLOCK(&comm->matching_lock); + OPAL_THREAD_UNLOCK(&ob1_comm->matching_lock); OPAL_THREAD_LOCK(&ompi_request_lock); ompi_request->req_status._cancelled = true; @@ -260,7 +261,7 @@ static int mca_pml_ob1_recv_request_ack( ompi_proc_t* proc = (ompi_proc_t*)recvreq->req_recv.req_base.req_proc; mca_bml_base_endpoint_t* bml_endpoint = NULL; - bml_endpoint = (mca_bml_base_endpoint_t*) proc->proc_endpoints[OMPI_PROC_ENDPOINT_TAG_BML]; + bml_endpoint = mca_bml_base_get_endpoint (proc); /* by default copy everything */ recvreq->req_send_offset = bytes_received; @@ -654,7 +655,7 @@ void mca_pml_ob1_recv_request_progress_rget( mca_pml_ob1_recv_request_t* recvreq } /* lookup bml datastructures */ - bml_endpoint = (mca_bml_base_endpoint_t*)recvreq->req_recv.req_base.req_proc->proc_endpoints[OMPI_PROC_ENDPOINT_TAG_BML]; + bml_endpoint = mca_bml_base_get_endpoint (recvreq->req_recv.req_base.req_proc); rdma_bml = mca_bml_base_btl_array_find(&bml_endpoint->btl_rdma, btl); #if OPAL_CUDA_SUPPORT @@ -1079,8 +1080,11 @@ static mca_pml_ob1_recv_frag_t* recv_req_match_specific_proc( const mca_pml_ob1_recv_request_t *req, mca_pml_ob1_comm_proc_t *proc ) { + if (NULL == proc) { + return NULL; + } + opal_list_t* unexpected_frags = &proc->unexpected_frags; - opal_list_item_t *i; mca_pml_ob1_recv_frag_t* frag; int tag = req->req_recv.req_base.req_tag; @@ -1088,20 +1092,12 @@ recv_req_match_specific_proc( const mca_pml_ob1_recv_request_t *req, return NULL; if( OMPI_ANY_TAG == tag ) { - for (i = opal_list_get_first(unexpected_frags); - i != opal_list_get_end(unexpected_frags); - i = opal_list_get_next(i)) { - frag = (mca_pml_ob1_recv_frag_t*)i; - + OPAL_LIST_FOREACH(frag, unexpected_frags, mca_pml_ob1_recv_frag_t) { if( frag->hdr.hdr_match.hdr_tag >= 0 ) return frag; } } else { - for (i = opal_list_get_first(unexpected_frags); - i != opal_list_get_end(unexpected_frags); - i = opal_list_get_next(i)) { - frag = (mca_pml_ob1_recv_frag_t*)i; - + OPAL_LIST_FOREACH(frag, unexpected_frags, mca_pml_ob1_recv_frag_t) { if( frag->hdr.hdr_match.hdr_tag == tag ) return frag; } @@ -1118,7 +1114,7 @@ recv_req_match_wild( mca_pml_ob1_recv_request_t* req, mca_pml_ob1_comm_proc_t **p) { mca_pml_ob1_comm_t* comm = req->req_recv.req_base.req_comm->c_pml_comm; - mca_pml_ob1_comm_proc_t* proc = comm->procs; + mca_pml_ob1_comm_proc_t **procp = comm->procs; size_t i; /* @@ -1133,10 +1129,10 @@ recv_req_match_wild( mca_pml_ob1_recv_request_t* req, mca_pml_ob1_recv_frag_t* frag; /* loop over messages from the current proc */ - if((frag = recv_req_match_specific_proc(req, &proc[i]))) { - *p = &proc[i]; + if((frag = recv_req_match_specific_proc(req, procp[i]))) { + *p = procp[i]; comm->last_probed = i; - req->req_recv.req_base.req_proc = proc[i].ompi_proc; + req->req_recv.req_base.req_proc = procp[i]->ompi_proc; prepare_recv_req_converter(req); return frag; /* match found */ } @@ -1145,10 +1141,10 @@ recv_req_match_wild( mca_pml_ob1_recv_request_t* req, mca_pml_ob1_recv_frag_t* frag; /* loop over messages from the current proc */ - if((frag = recv_req_match_specific_proc(req, &proc[i]))) { - *p = &proc[i]; + if((frag = recv_req_match_specific_proc(req, procp[i]))) { + *p = procp[i]; comm->last_probed = i; - req->req_recv.req_base.req_proc = proc[i].ompi_proc; + req->req_recv.req_base.req_proc = procp[i]->ompi_proc; prepare_recv_req_converter(req); return frag; /* match found */ } @@ -1161,7 +1157,8 @@ recv_req_match_wild( mca_pml_ob1_recv_request_t* req, void 
mca_pml_ob1_recv_req_start(mca_pml_ob1_recv_request_t *req) { - mca_pml_ob1_comm_t* comm = req->req_recv.req_base.req_comm->c_pml_comm; + ompi_communicator_t *comm = req->req_recv.req_base.req_comm; + mca_pml_ob1_comm_t *ob1_comm = comm->c_pml_comm; mca_pml_ob1_comm_proc_t* proc; mca_pml_ob1_recv_frag_t* frag; opal_list_t *queue; @@ -1179,7 +1176,7 @@ void mca_pml_ob1_recv_req_start(mca_pml_ob1_recv_request_t *req) MCA_PML_BASE_RECV_START(&req->req_recv.req_base); - OPAL_THREAD_LOCK(&comm->matching_lock); + OPAL_THREAD_LOCK(&ob1_comm->matching_lock); /** * The lapse of time between the ACTIVATE event and the SEARCH_UNEX one includes * the cost of the request lock. @@ -1188,12 +1185,12 @@ void mca_pml_ob1_recv_req_start(mca_pml_ob1_recv_request_t *req) &(req->req_recv.req_base), PERUSE_RECV); /* assign sequence number */ - req->req_recv.req_base.req_sequence = comm->recv_sequence++; + req->req_recv.req_base.req_sequence = ob1_comm->recv_sequence++; /* attempt to match posted recv */ if(req->req_recv.req_base.req_peer == OMPI_ANY_SOURCE) { frag = recv_req_match_wild(req, &proc); - queue = &comm->wild_receives; + queue = &ob1_comm->wild_receives; #if !OPAL_ENABLE_HETEROGENEOUS_SUPPORT /* As we are in a homogeneous environment we know that all remote * architectures are exactly the same as the local one. Therefore, @@ -1206,7 +1203,7 @@ void mca_pml_ob1_recv_req_start(mca_pml_ob1_recv_request_t *req) } #endif /* !OPAL_ENABLE_HETEROGENEOUS_SUPPORT */ } else { - proc = &comm->procs[req->req_recv.req_base.req_peer]; + proc = mca_pml_ob1_peer_lookup (comm, req->req_recv.req_base.req_peer); req->req_recv.req_base.req_proc = proc->ompi_proc; frag = recv_req_match_specific_proc(req, proc); queue = &proc->specific_receives; @@ -1221,7 +1218,7 @@ void mca_pml_ob1_recv_req_start(mca_pml_ob1_recv_request_t *req) it when the message comes in.
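* The target queue was selected above: wild_receives for MPI_ANY_SOURCE requests, otherwise the peer's specific_receives list.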
*/ append_recv_req_to_queue(queue, req); req->req_match_received = false; - OPAL_THREAD_UNLOCK(&comm->matching_lock); + OPAL_THREAD_UNLOCK(&ob1_comm->matching_lock); } else { if(OPAL_LIKELY(!IS_PROB_REQ(req))) { PERUSE_TRACE_COMM_EVENT(PERUSE_COMM_REQ_MATCH_UNEX, @@ -1239,7 +1236,7 @@ void mca_pml_ob1_recv_req_start(mca_pml_ob1_recv_request_t *req) opal_list_remove_item(&proc->unexpected_frags, (opal_list_item_t*)frag); - OPAL_THREAD_UNLOCK(&comm->matching_lock); + OPAL_THREAD_UNLOCK(&ob1_comm->matching_lock); switch(hdr->hdr_common.hdr_type) { case MCA_PML_OB1_HDR_TYPE_MATCH: @@ -1269,14 +1266,14 @@ void mca_pml_ob1_recv_req_start(mca_pml_ob1_recv_request_t *req) restarted with this request during mrecv */ opal_list_remove_item(&proc->unexpected_frags, (opal_list_item_t*)frag); - OPAL_THREAD_UNLOCK(&comm->matching_lock); + OPAL_THREAD_UNLOCK(&ob1_comm->matching_lock); req->req_recv.req_base.req_addr = frag; mca_pml_ob1_recv_request_matched_probe(req, frag->btl, frag->segments, frag->num_segments); } else { - OPAL_THREAD_UNLOCK(&comm->matching_lock); + OPAL_THREAD_UNLOCK(&ob1_comm->matching_lock); mca_pml_ob1_recv_request_matched_probe(req, frag->btl, frag->segments, frag->num_segments); } diff --git a/ompi/mca/pml/ob1/pml_ob1_recvreq.h b/ompi/mca/pml/ob1/pml_ob1_recvreq.h index a9e54f6c66..71fb8c3d5b 100644 --- a/ompi/mca/pml/ob1/pml_ob1_recvreq.h +++ b/ompi/mca/pml/ob1/pml_ob1_recvreq.h @@ -433,8 +433,7 @@ static inline int mca_pml_ob1_recv_request_ack_send(ompi_proc_t* proc, { size_t i; mca_bml_base_btl_t* bml_btl; - mca_bml_base_endpoint_t* endpoint = - (mca_bml_base_endpoint_t*)proc->proc_endpoints[OMPI_PROC_ENDPOINT_TAG_BML]; + mca_bml_base_endpoint_t* endpoint = mca_bml_base_get_endpoint (proc); for(i = 0; i < mca_bml_base_btl_array_get_size(&endpoint->btl_eager); i++) { bml_btl = mca_bml_base_btl_array_get_next(&endpoint->btl_eager); diff --git a/ompi/mca/pml/ob1/pml_ob1_sendreq.h b/ompi/mca/pml/ob1/pml_ob1_sendreq.h index d9d3bb13a8..9659dabb0b 100644 --- a/ompi/mca/pml/ob1/pml_ob1_sendreq.h +++ b/ompi/mca/pml/ob1/pml_ob1_sendreq.h @@ -480,16 +480,16 @@ mca_pml_ob1_send_request_start_seq (mca_pml_ob1_send_request_t* sendreq, mca_bml static inline int mca_pml_ob1_send_request_start( mca_pml_ob1_send_request_t* sendreq ) { - mca_bml_base_endpoint_t* endpoint = (mca_bml_base_endpoint_t*) - sendreq->req_send.req_base.req_proc->proc_endpoints[OMPI_PROC_ENDPOINT_TAG_BML]; - mca_pml_ob1_comm_t* comm = sendreq->req_send.req_base.req_comm->c_pml_comm; + mca_bml_base_endpoint_t *endpoint = mca_bml_base_get_endpoint (sendreq->req_send.req_base.req_proc); + ompi_communicator_t *comm = sendreq->req_send.req_base.req_comm; + mca_pml_ob1_comm_proc_t *ob1_proc = mca_pml_ob1_peer_lookup (comm, sendreq->req_send.req_base.req_peer); int32_t seqn; if (OPAL_UNLIKELY(NULL == endpoint)) { return OMPI_ERR_UNREACH; } - seqn = OPAL_THREAD_ADD32(&comm->procs[sendreq->req_send.req_base.req_peer].send_sequence, 1); + seqn = OPAL_THREAD_ADD32(&ob1_proc->send_sequence, 1); return mca_pml_ob1_send_request_start_seq (sendreq, endpoint, seqn); } diff --git a/ompi/proc/proc.c b/ompi/proc/proc.c index 9747fd7da1..98c4d2a70d 100644 --- a/ompi/proc/proc.c +++ b/ompi/proc/proc.c @@ -1,3 +1,4 @@ +/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */ /* * Copyright (c) 2004-2006 The Trustees of Indiana University and Indiana * University Research and Technology @@ -10,7 +11,7 @@ * Copyright (c) 2004-2006 The Regents of the University of California. * All rights reserved. 
* Copyright (c) 2006-2014 Cisco Systems, Inc. All rights reserved. - * Copyright (c) 2012 Los Alamos National Security, LLC. All rights + * Copyright (c) 2012-2015 Los Alamos National Security, LLC. All rights * reserved. * Copyright (c) 2013-2015 Intel, Inc. All rights reserved * Copyright (c) 2014 Research Organization for Information Science @@ -43,6 +44,8 @@ static opal_list_t ompi_proc_list; static opal_mutex_t ompi_proc_lock; +static opal_hash_table_t ompi_proc_hash; + ompi_proc_t* ompi_proc_local_proc = NULL; static void ompi_proc_construct(ompi_proc_t* proc); @@ -83,49 +86,223 @@ void ompi_proc_destruct(ompi_proc_t* proc) } OPAL_THREAD_LOCK(&ompi_proc_lock); opal_list_remove_item(&ompi_proc_list, (opal_list_item_t*)proc); + opal_hash_table_remove_value_ptr (&ompi_proc_hash, &proc->super.proc_name, sizeof (proc->super.proc_name)); OPAL_THREAD_UNLOCK(&ompi_proc_lock); } +/** + * Allocate a new ompi_proc_t for the given jobid/vpid + * + * @param[in] jobid Job identifier + * @param[in] vpid Process identifier + * @param[out] procp New ompi_proc_t structure + * + * This function allocates a new ompi_proc_t and inserts it into + * the process list and hash table. + */ +static int ompi_proc_allocate (ompi_jobid_t jobid, ompi_vpid_t vpid, ompi_proc_t **procp) { + ompi_proc_t *proc = OBJ_NEW(ompi_proc_t); + + opal_list_append(&ompi_proc_list, (opal_list_item_t*)proc); + + OMPI_CAST_RTE_NAME(&proc->super.proc_name)->jobid = jobid; + OMPI_CAST_RTE_NAME(&proc->super.proc_name)->vpid = vpid; + + opal_hash_table_set_value_ptr (&ompi_proc_hash, &proc->super.proc_name, sizeof (proc->super.proc_name), + proc); + + *procp = proc; + + return OMPI_SUCCESS; +} + +/** + * Finish setting up an ompi_proc_t + * + * @param[in] proc ompi process structure + * + * This function contains the core code of ompi_proc_complete_init() and + * ompi_proc_refresh(). The tasks performed by this function include + * retrieving the hostname (if below the modex cutoff), determining the + * remote architecture, and calculating the locality of the process.
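+ *
+ * Callers serialize access through ompi_proc_lock, as both
+ * ompi_proc_complete_init() and ompi_proc_for_name() do. A minimal
+ * usage sketch (names as used in this patch):
+ *
+ *   ompi_proc_t *proc;
+ *   ret = ompi_proc_allocate (name.jobid, name.vpid, &proc);
+ *   if (OMPI_SUCCESS == ret) {
+ *       ret = ompi_proc_complete_init_single (proc);
+ *   }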
+ */ +static int ompi_proc_complete_init_single (ompi_proc_t *proc) +{ + uint16_t u16, *u16ptr; + int ret; + + u16ptr = &u16; + + if (OMPI_CAST_RTE_NAME(&proc->super.proc_name)->vpid == OMPI_PROC_MY_NAME->vpid) { + /* nothing else to do */ + return OMPI_SUCCESS; + } + + /* get the locality information - all RTEs are required + * to provide this information at startup */ + OPAL_MODEX_RECV_VALUE(ret, OPAL_PMIX_LOCALITY, &proc->super.proc_name, &u16ptr, OPAL_UINT16); + if (OPAL_SUCCESS != ret) { + proc->super.proc_flags = OPAL_PROC_NON_LOCAL; + } else { + proc->super.proc_flags = u16; + } + + /* we can retrieve the hostname at no cost because it + * was provided at startup */ + OPAL_MODEX_RECV_VALUE(ret, OPAL_PMIX_HOSTNAME, &proc->super.proc_name, + (char**)&(proc->super.proc_hostname), OPAL_STRING); + if (OPAL_SUCCESS != ret) { + return ret; + } +#if OPAL_ENABLE_HETEROGENEOUS_SUPPORT + /* get the remote architecture - this might force a modex except + * for those environments where the RM provides it */ + { + uint32_t *ui32ptr; + ui32ptr = &(proc->super.proc_arch); + OPAL_MODEX_RECV_VALUE(ret, OPAL_PMIX_ARCH, &proc->super.proc_name, + (void**)&ui32ptr, OPAL_UINT32); + if (OPAL_SUCCESS == ret) { + /* if arch is different than mine, create a new convertor for this proc */ + if (proc->super.proc_arch != opal_local_arch) { + OBJ_RELEASE(proc->super.proc_convertor); + proc->super.proc_convertor = opal_convertor_create(proc->super.proc_arch, 0); + } + } else if (OMPI_ERR_NOT_IMPLEMENTED == ret) { + proc->super.proc_arch = opal_local_arch; + } else { + return ret; + } + } +#else + /* must be same arch as my own */ + proc->super.proc_arch = opal_local_arch; +#endif + + return OMPI_SUCCESS; +} + +opal_proc_t *ompi_proc_lookup (const opal_process_name_t proc_name) +{ + ompi_proc_t *proc = NULL; + int ret; + + /* try to lookup the value in the hash table */ + ret = opal_hash_table_get_value_ptr (&ompi_proc_hash, &proc_name, sizeof (proc_name), (void **) &proc); + + if (OPAL_SUCCESS == ret) { + return &proc->super; + } + + return NULL; +} + +opal_proc_t *ompi_proc_for_name (const opal_process_name_t proc_name) +{ + ompi_proc_t *proc = NULL; + int ret; + + /* try to lookup the value in the hash table */ + ret = opal_hash_table_get_value_ptr (&ompi_proc_hash, &proc_name, sizeof (proc_name), (void **) &proc); + if (OPAL_SUCCESS == ret) { + return &proc->super; + } + + OPAL_THREAD_LOCK(&ompi_proc_lock); + do { + /* double-check that another competing thread has not added this proc */ + ret = opal_hash_table_get_value_ptr (&ompi_proc_hash, &proc_name, sizeof (proc_name), (void **) &proc); + if (OPAL_SUCCESS == ret) { + break; + } + + /* allocate a new ompi_proc_t object for the process and insert it into the process table */ + ret = ompi_proc_allocate (proc_name.jobid, proc_name.vpid, &proc); + if (OPAL_UNLIKELY(OMPI_SUCCESS != ret)) { + /* allocation fail */ + break; + } + + /* finish filling in the important proc data fields */ + ret = ompi_proc_complete_init_single (proc); + if (OPAL_UNLIKELY(OMPI_SUCCESS != ret)) { + proc = NULL; + break; + } + } while (0); + OPAL_THREAD_UNLOCK(&ompi_proc_lock); + + return (opal_proc_t *) proc; +} int ompi_proc_init(void) { - ompi_vpid_t i; -#if OPAL_ENABLE_HETEROGENEOUS_SUPPORT + int opal_proc_hash_init_size = (ompi_process_info.num_procs < ompi_add_procs_cutoff) ? 
ompi_process_info.num_procs : + 1024; + ompi_proc_t *proc; int ret; -#endif OBJ_CONSTRUCT(&ompi_proc_list, opal_list_t); OBJ_CONSTRUCT(&ompi_proc_lock, opal_mutex_t); + OBJ_CONSTRUCT(&ompi_proc_hash, opal_hash_table_t); - /* create proc structures and find self */ - for( i = 0; i < ompi_process_info.num_procs; i++ ) { - ompi_proc_t *proc = OBJ_NEW(ompi_proc_t); - opal_list_append(&ompi_proc_list, (opal_list_item_t*)proc); + ret = opal_hash_table_init (&ompi_proc_hash, opal_proc_hash_init_size); + if (OPAL_SUCCESS != ret) { + return ret; + } - OMPI_CAST_RTE_NAME(&proc->super.proc_name)->jobid = OMPI_PROC_MY_NAME->jobid; - OMPI_CAST_RTE_NAME(&proc->super.proc_name)->vpid = i; + /* create a proc for the local process */ + ret = ompi_proc_allocate (OMPI_PROC_MY_NAME->jobid, OMPI_PROC_MY_NAME->vpid, &proc); + if (OMPI_SUCCESS != ret) { + return OMPI_ERR_OUT_OF_RESOURCE; + } - if (i == OMPI_PROC_MY_NAME->vpid) { - ompi_proc_local_proc = proc; - proc->super.proc_flags = OPAL_PROC_ALL_LOCAL; - proc->super.proc_hostname = strdup(ompi_process_info.nodename); - proc->super.proc_arch = opal_local_arch; - /* Register the local proc with OPAL */ - opal_proc_local_set(&proc->super); + /* set local process data */ + ompi_proc_local_proc = proc; + proc->super.proc_flags = OPAL_PROC_ALL_LOCAL; + proc->super.proc_hostname = strdup(ompi_process_info.nodename); + proc->super.proc_arch = opal_local_arch; + /* Register the local proc with OPAL */ + opal_proc_local_set(&proc->super); #if OPAL_ENABLE_HETEROGENEOUS_SUPPORT - /* add our arch to the modex */ - OPAL_MODEX_SEND_VALUE(ret, OPAL_PMIX_GLOBAL, - OPAL_PMIX_ARCH, &opal_local_arch, OPAL_UINT32); - if (OPAL_SUCCESS != ret) { + /* add our arch to the modex */ + OPAL_MODEX_SEND_VALUE(ret, OPAL_PMIX_GLOBAL, + OPAL_PMIX_ARCH, &opal_local_arch, OPAL_UINT32); + if (OPAL_SUCCESS != ret) { + return ret; + } +#endif + + if (ompi_process_info.num_procs < ompi_add_procs_cutoff) { + /* create proc structures for all other procs in our job */ + for (ompi_vpid_t i = 0 ; i < ompi_process_info.num_procs ; ++i ) { + if (i == OMPI_PROC_MY_NAME->vpid) { + continue; + } + + ret = ompi_proc_allocate (OMPI_PROC_MY_NAME->jobid, i, &proc); + if (OMPI_SUCCESS != ret) { return ret; } -#endif } } return OMPI_SUCCESS; } +static int ompi_proc_compare_vid (opal_list_item_t **a, opal_list_item_t **b) +{ + ompi_proc_t *proca = (ompi_proc_t *) *a; + ompi_proc_t *procb = (ompi_proc_t *) *b; + + if (proca->super.proc_name.vpid > procb->super.proc_name.vpid) { + return 1; + } else { + return -1; + } + + /* they should never be equal */ +} /** * The process creation is split into two steps.
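* The first step, ompi_proc_init(), only allocates the proc structures and registers them in the process list and hash table.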
The second step @@ -140,58 +317,47 @@ int ompi_proc_complete_init(void) { ompi_proc_t *proc; int ret, errcode = OMPI_SUCCESS; - uint16_t u16, *u16ptr; OPAL_THREAD_LOCK(&ompi_proc_lock); - u16ptr = &u16; OPAL_LIST_FOREACH(proc, &ompi_proc_list, ompi_proc_t) { - if (OMPI_CAST_RTE_NAME(&proc->super.proc_name)->vpid != OMPI_PROC_MY_NAME->vpid) { - /* get the locality information - all RTEs are required - * to provide this information at startup */ - OPAL_MODEX_RECV_VALUE(ret, OPAL_PMIX_LOCALITY, &proc->super.proc_name, &u16ptr, OPAL_UINT16); - if (OPAL_SUCCESS != ret) { - proc->super.proc_flags = OPAL_PROC_NON_LOCAL; - } else { - proc->super.proc_flags = u16; - } - - /* we can retrieve the hostname at no cost because it - * was provided at startup */ - OPAL_MODEX_RECV_VALUE(ret, OPAL_PMIX_HOSTNAME, &proc->super.proc_name, - (char**)&(proc->super.proc_hostname), OPAL_STRING); - if (OPAL_SUCCESS != ret) { - /* we can live without it */ - proc->super.proc_hostname = NULL; - } -#if OPAL_ENABLE_HETEROGENEOUS_SUPPORT - /* get the remote architecture - this might force a modex except - * for those environments where the RM provides it */ - { - uint32_t *ui32ptr; - ui32ptr = &(proc->super.proc_arch); - OPAL_MODEX_RECV_VALUE(ret, OPAL_PMIX_ARCH, &proc->super.proc_name, - (void**)&ui32ptr, OPAL_UINT32); - if (OPAL_SUCCESS == ret) { - /* if arch is different than mine, create a new convertor for this proc */ - if (proc->super.proc_arch != opal_local_arch) { - OBJ_RELEASE(proc->super.proc_convertor); - proc->super.proc_convertor = opal_convertor_create(proc->super.proc_arch, 0); - } - } else if (OMPI_ERR_NOT_IMPLEMENTED == ret) { - proc->super.proc_arch = opal_local_arch; - } else { - errcode = ret; - break; - } - } -#else - /* must be same arch as my own */ - proc->super.proc_arch = opal_local_arch; -#endif + ret = ompi_proc_complete_init_single (proc); + if (OPAL_UNLIKELY(OMPI_SUCCESS != ret)) { + errcode = ret; + break; } } OPAL_THREAD_UNLOCK(&ompi_proc_lock); + + if (ompi_process_info.num_procs >= ompi_add_procs_cutoff) { + uint16_t u16, *u16ptr; + + u16ptr = &u16; + + /* find and add all local processes */ + for (ompi_vpid_t i = 0 ; i < ompi_process_info.num_procs ; ++i ) { + opal_process_name_t proc_name = {.vpid = i, .jobid = OMPI_PROC_MY_NAME->jobid}; + uint16_t locality = OPAL_PROC_NON_LOCAL; + + if (OMPI_PROC_MY_NAME->vpid == i) { + continue; + } + + /* the runtime is required to fill in locality for all local processes by this + * point. 
only local processes will have locality set */ + OPAL_MODEX_RECV_VALUE(ret, OPAL_PMIX_LOCALITY, &proc_name, &u16ptr, OPAL_UINT16); + if (OPAL_SUCCESS == ret) { + locality = u16; + } + + if (OPAL_PROC_NON_LOCAL != locality) { + (void) ompi_proc_for_name (proc_name); + } + } + } + + opal_list_sort (&ompi_proc_list, ompi_proc_compare_vid); + return errcode; } @@ -227,6 +393,7 @@ int ompi_proc_finalize (void) /* now destruct the list and thread lock */ OBJ_DESTRUCT(&ompi_proc_list); OBJ_DESTRUCT(&ompi_proc_lock); + OBJ_DESTRUCT(&ompi_proc_hash); return OMPI_SUCCESS; } @@ -248,9 +415,7 @@ ompi_proc_t** ompi_proc_world(size_t *size) /* First count how many match this jobid */ OPAL_THREAD_LOCK(&ompi_proc_lock); - for (proc = (ompi_proc_t*)opal_list_get_first(&ompi_proc_list); - proc != (ompi_proc_t*)opal_list_get_end(&ompi_proc_list); - proc = (ompi_proc_t*)opal_list_get_next(proc)) { + OPAL_LIST_FOREACH(proc, &ompi_proc_list, ompi_proc_t) { if (OPAL_EQUAL == ompi_rte_compare_name_fields(mask, OMPI_CAST_RTE_NAME(&proc->super.proc_name), &my_name)) { ++count; } @@ -265,9 +430,7 @@ ompi_proc_t** ompi_proc_world(size_t *size) /* now save only the procs that match this jobid */ count = 0; - for (proc = (ompi_proc_t*)opal_list_get_first(&ompi_proc_list); - proc != (ompi_proc_t*)opal_list_get_end(&ompi_proc_list); - proc = (ompi_proc_t*)opal_list_get_next(proc)) { + OPAL_LIST_FOREACH(proc, &ompi_proc_list, ompi_proc_t) { if (OPAL_EQUAL == ompi_rte_compare_name_fields(mask, &proc->super.proc_name, &my_name)) { /* DO NOT RETAIN THIS OBJECT - the reference count on this * object will be adjusted by external callers. The intent @@ -305,9 +468,7 @@ ompi_proc_t** ompi_proc_all(size_t* size) } OPAL_THREAD_LOCK(&ompi_proc_lock); - for(proc = (ompi_proc_t*)opal_list_get_first(&ompi_proc_list); - proc != (ompi_proc_t*)opal_list_get_end(&ompi_proc_list); - proc = (ompi_proc_t*)opal_list_get_next(proc)) { + OPAL_LIST_FOREACH(proc, &ompi_proc_list, ompi_proc_t) { /* We know this isn't consistent with the behavior in ompi_proc_world, * but we are leaving the RETAIN for now because the code using this function * assumes that the results need to be released when done. 
It will @@ -349,9 +510,7 @@ ompi_proc_t * ompi_proc_find ( const ompi_process_name_t * name ) /* return the proc-struct which matches this jobid+process id */ mask = OMPI_RTE_CMP_JOBID | OMPI_RTE_CMP_VPID; OPAL_THREAD_LOCK(&ompi_proc_lock); - for(proc = (ompi_proc_t*)opal_list_get_first(&ompi_proc_list); - proc != (ompi_proc_t*)opal_list_get_end(&ompi_proc_list); - proc = (ompi_proc_t*)opal_list_get_next(proc)) { + OPAL_LIST_FOREACH(proc, &ompi_proc_list, ompi_proc_t) { if (OPAL_EQUAL == ompi_rte_compare_name_fields(mask, &proc->super.proc_name, name)) { rproc = proc; break; @@ -366,21 +525,14 @@ ompi_proc_t * ompi_proc_find ( const ompi_process_name_t * name ) int ompi_proc_refresh(void) { ompi_proc_t *proc = NULL; - opal_list_item_t *item = NULL; ompi_vpid_t i = 0; int ret=OMPI_SUCCESS; - uint16_t u16, *u16ptr; OPAL_THREAD_LOCK(&ompi_proc_lock); - for( item = opal_list_get_first(&ompi_proc_list), i = 0; - item != opal_list_get_end(&ompi_proc_list); - item = opal_list_get_next(item), ++i ) { - proc = (ompi_proc_t*)item; - + OPAL_LIST_FOREACH(proc, &ompi_proc_list, ompi_proc_t) { /* Does not change: proc->super.proc_name.vpid */ OMPI_CAST_RTE_NAME(&proc->super.proc_name)->jobid = OMPI_PROC_MY_NAME->jobid; - u16ptr = &u16; /* Make sure to clear the local flag before we set it below */ proc->super.proc_flags = 0; @@ -392,56 +544,10 @@ int ompi_proc_refresh(void) proc->super.proc_arch = opal_local_arch; opal_proc_local_set(&proc->super); } else { - /* get the locality information - all RTEs are required - * to provide this information at startup */ - OPAL_MODEX_RECV_VALUE(ret, OPAL_PMIX_LOCALITY, &proc->super.proc_name, &u16ptr, OPAL_UINT16); - if (OPAL_SUCCESS != ret) { - proc->super.proc_flags = OPAL_PROC_NON_LOCAL; - } else { - proc->super.proc_flags = u16; + ret = ompi_proc_complete_init_single (proc); + if (OPAL_UNLIKELY(OMPI_SUCCESS != ret)) { + break; } - - if (ompi_process_info.num_procs < ompi_direct_modex_cutoff) { - /* IF the number of procs falls below the specified cutoff, - * then we assume the job is small enough that retrieving - * the hostname (which will typically cause retrieval of - * ALL modex info for this proc) will have no appreciable - * impact on launch scaling - */ - OPAL_MODEX_RECV_VALUE(ret, OPAL_PMIX_HOSTNAME, &proc->super.proc_name, - (char**)&(proc->super.proc_hostname), OPAL_STRING); - if (OMPI_SUCCESS != ret) { - break; - } - } else { - /* just set the hostname to NULL for now - we'll fill it in - * as modex_recv's are called for procs we will talk to, thus - * avoiding retrieval of ALL modex info for this proc until - * required. Transports that delay calling modex_recv until - * first message will therefore scale better than those that - * call modex_recv on all procs during init. 
- */ - proc->super.proc_hostname = NULL; - } -#if OPAL_ENABLE_HETEROGENEOUS_SUPPORT - { - /* get the remote architecture */ - uint32_t* uiptr = &(proc->super.proc_arch); - OPAL_MODEX_RECV_VALUE(ret, OPAL_PMIX_ARCH, &proc->super.proc_name, - (void**)&uiptr, OPAL_UINT32); - if (OMPI_SUCCESS != ret) { - break; - } - /* if arch is different than mine, create a new convertor for this proc */ - if (proc->super.proc_arch != opal_local_arch) { - OBJ_RELEASE(proc->super.proc_convertor); - proc->super.proc_convertor = opal_convertor_create(proc->super.proc_arch, 0); - } - } -#else - /* must be same arch as my own */ - proc->super.proc_arch = opal_local_arch; -#endif } } @@ -454,7 +560,7 @@ int ompi_proc_pack(ompi_proc_t **proclist, int proclistsize, opal_buffer_t* buf) { - int i, rc; + int rc; OPAL_THREAD_LOCK(&ompi_proc_lock); @@ -470,7 +576,7 @@ ompi_proc_pack(ompi_proc_t **proclist, int proclistsize, * reduced. For now, just go ahead and pack the info so it * can be sent. */ - for (i=0; i < proclistsize; i++) { + for (int i = 0 ; i < proclistsize ; ++i) { rc = opal_dss.pack(buf, &(proclist[i]->super.proc_name), 1, OMPI_NAME); if(rc != OPAL_SUCCESS) { OMPI_ERROR_LOG(rc); @@ -503,9 +609,7 @@ ompi_proc_find_and_add(const ompi_process_name_t * name, bool* isnew) /* return the proc-struct which matches this jobid+process id */ mask = OMPI_RTE_CMP_JOBID | OMPI_RTE_CMP_VPID; OPAL_THREAD_LOCK(&ompi_proc_lock); - for(proc = (ompi_proc_t*)opal_list_get_first(&ompi_proc_list); - proc != (ompi_proc_t*)opal_list_get_end(&ompi_proc_list); - proc = (ompi_proc_t*)opal_list_get_next(proc)) { + OPAL_LIST_FOREACH(proc, &ompi_proc_list, ompi_proc_t) { if (OPAL_EQUAL == ompi_rte_compare_name_fields(mask, &proc->super.proc_name, name)) { rproc = proc; *isnew = false; @@ -538,7 +642,6 @@ ompi_proc_unpack(opal_buffer_t* buf, int proclistsize, ompi_proc_t ***proclist, int *newproclistsize, ompi_proc_t ***newproclist) { - int i; size_t newprocs_len = 0; ompi_proc_t **plist=NULL, **newprocs = NULL; @@ -558,7 +661,7 @@ ompi_proc_unpack(opal_buffer_t* buf, /* cycle through the array of provided procs and unpack * their info - as packed by ompi_proc_pack */ - for (i=0; i < proclistsize; i++){ + for (int i = 0 ; i < proclistsize ; ++i) { diff --git a/ompi/errhandler/errhandler.c b/ompi/errhandler/errhandler.c - *OMPI_CAST_RTE_NAME(&ompi_group_get_proc_ptr(comm->c_remote_group, i)->super.proc_name); + *OMPI_CAST_RTE_NAME(&ompi_group_get_proc_ptr(comm->c_remote_group, i, true)->super.proc_name); } } @@ -96,7 +99,7 @@ static void try_kill_peers(ompi_communicator_t *comm, for (i = 0; i < ompi_comm_remote_size(comm); ++i) { assert(count <= nprocs); procs[count++] = *OMPI_CAST_RTE_NAME(&ompi_group_get_proc_ptr(comm->c_remote_group, i, true)->super.proc_name); } if (nprocs > 0) { diff --git a/ompi/runtime/ompi_mpi_init.c b/ompi/runtime/ompi_mpi_init.c index 0b9af10dba..8c23e17b18 100644 --- a/ompi/runtime/ompi_mpi_init.c +++ b/ompi/runtime/ompi_mpi_init.c @@ -400,6 +400,7 @@ int ompi_mpi_init(int argc, char **argv, int requested, int *provided) opal_compare_proc = _process_name_compare; opal_convert_string_to_process_name = _convert_string_to_process_name; opal_convert_process_name_to_string = _convert_process_name_to_string; + opal_proc_for_name = ompi_proc_for_name; /* Register MCA variables */ if (OPAL_SUCCESS != (ret = ompi_register_mca_variables())) { diff --git a/ompi/runtime/ompi_mpi_params.c b/ompi/runtime/ompi_mpi_params.c index 021bf9b617..7a7305e150 100644 --- a/ompi/runtime/ompi_mpi_params.c +++ b/ompi/runtime/ompi_mpi_params.c @@ -64,6 +64,7 @@ int ompi_mpi_event_tick_rate = -1; char *ompi_mpi_show_mca_params_string = NULL; bool ompi_mpi_have_sparse_group_storage = !!(OMPI_GROUP_SPARSE); bool ompi_mpi_preconnect_mpi = false;
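+/* world-size threshold below which resources are pre-allocated for all peers;
+ * registered below as the "mpi_add_procs_cutoff" MCA variable */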
+uint32_t ompi_add_procs_cutoff = 1024; static bool show_default_mca_params = false; static bool show_file_mca_params = false; @@ -288,6 +289,16 @@ int ompi_mpi_register_params(void) ompi_rte_abort(1, NULL); } + ompi_add_procs_cutoff = 1024; + (void) mca_base_var_register ("ompi", "mpi", NULL, "add_procs_cutoff", + "Maximum world size for pre-allocating resources for all " + "remote processes. Increasing this limit may improve " + "communication performance at the cost of memory usage " + "(default: 1024)", MCA_BASE_VAR_TYPE_UNSIGNED_INT, NULL, + 0, 0, OPAL_INFO_LVL_3, MCA_BASE_VAR_SCOPE_LOCAL, + &ompi_add_procs_cutoff); + + return OMPI_SUCCESS; } diff --git a/ompi/runtime/params.h b/ompi/runtime/params.h index e5edda3825..495f0f36fa 100644 --- a/ompi/runtime/params.h +++ b/ompi/runtime/params.h @@ -1,3 +1,4 @@ +/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */ /* * Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana * University Research and Technology @@ -9,7 +10,7 @@ * University of Stuttgart. All rights reserved. * Copyright (c) 2004-2005 The Regents of the University of California. * All rights reserved. - * Copyright (c) 2007 Los Alamos National Security, LLC. All rights + * Copyright (c) 2007-2015 Los Alamos National Security, LLC. All rights * reserved. * Copyright (c) 2006-2009 Cisco Systems, Inc. All rights reserved. * Copyright (c) 2013 NVIDIA Corporation. All rights reserved. @@ -123,11 +124,16 @@ OMPI_DECLSPEC extern bool ompi_have_sparse_group_storage; */ OMPI_DECLSPEC extern bool ompi_use_sparse_group_storage; -/* +/** * Cutoff point for retrieving hostnames */ OMPI_DECLSPEC extern uint32_t ompi_direct_modex_cutoff; +/** + * Cutoff point for calling add_procs for all processes + */ +OMPI_DECLSPEC extern uint32_t ompi_add_procs_cutoff; + /** * Register MCA parameters used by the MPI layer. * diff --git a/opal/mca/btl/btl.h b/opal/mca/btl/btl.h index 885a6fc0f4..b32a3d3c88 100644 --- a/opal/mca/btl/btl.h +++ b/opal/mca/btl/btl.h @@ -605,12 +605,15 @@ typedef int (*mca_btl_base_module_finalize_fn_t)( * modex_recv() function. The BTL may utilize this information to * determine reachability of each peer process. * - * For each process that is reachable by the BTL, the bit corresponding to the index - * into the proc array (nprocs) should be set in the reachable bitmask. The BTL - * will return an array of pointers to a data structure defined - * by the BTL that is then returned to the BTL on subsequent calls to the BTL data - * transfer functions (e.g btl_send). This may be used by the BTL to cache any addressing - * or connection information (e.g. TCP socket, IB queue pair). + * The caller may pass a "reachable" bitmap pointer. If it is not + * NULL, for each process that is reachable by the BTL, the bit + * corresponding to the index into the proc array (nprocs) should be + * set in the reachable bitmask. The BTL will return an array of + * pointers to a data structure defined by the BTL that is then + * returned to the BTL on subsequent calls to the BTL data transfer + * functions (e.g btl_send). This may be used by the BTL to cache any + * addressing or connection information (e.g. TCP socket, IB queue + * pair). 
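+ * A NULL reachable pointer is used by the on-demand, single-process
+ * lookups added in this change (for example mca_btl_openib_get_ep()
+ * below); in that case the BTL should still fill in the peers array
+ * for any process it accepts.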
diff --git a/opal/mca/btl/btl.h b/opal/mca/btl/btl.h index 885a6fc0f4..b32a3d3c88 100644 --- a/opal/mca/btl/btl.h +++ b/opal/mca/btl/btl.h
@@ -605,12 +605,15 @@ typedef int (*mca_btl_base_module_finalize_fn_t)( * modex_recv() function. The BTL may utilize this information to * determine reachability of each peer process. * - * For each process that is reachable by the BTL, the bit corresponding to the index - * into the proc array (nprocs) should be set in the reachable bitmask. The BTL - * will return an array of pointers to a data structure defined - * by the BTL that is then returned to the BTL on subsequent calls to the BTL data - * transfer functions (e.g btl_send). This may be used by the BTL to cache any addressing - * or connection information (e.g. TCP socket, IB queue pair). + * The caller may pass a "reachable" bitmap pointer. If it is not + * NULL, for each process that is reachable by the BTL, the bit + * corresponding to the index into the proc array (nprocs) should be + * set in the reachable bitmask. The BTL will return an array of + * pointers to a data structure defined by the BTL that is then + * returned to the BTL on subsequent calls to the BTL data transfer + * functions (e.g btl_send). This may be used by the BTL to cache any + * addressing or connection information (e.g. TCP socket, IB queue + * pair). */ typedef int (*mca_btl_base_module_add_procs_fn_t)( struct mca_btl_base_module_t* btl,
diff --git a/opal/mca/btl/openib/btl_openib.c b/opal/mca/btl/openib/btl_openib.c index 0cff59f2ab..9f82110cef 100644 --- a/opal/mca/btl/openib/btl_openib.c +++ b/opal/mca/btl/openib/btl_openib.c
@@ -871,6 +871,7 @@ int mca_btl_openib_add_procs( for (i = 0, local_procs = 0 ; i < (int) nprocs; i++) { struct opal_proc_t* proc = procs[i]; mca_btl_openib_proc_t* ib_proc; + bool found_existing = false; int remote_matching_port; opal_output(-1, "add procs: adding proc %d", i);
@@ -898,6 +899,24 @@ int mca_btl_openib_add_procs( continue; } + OPAL_THREAD_LOCK(&ib_proc->proc_lock); + for (j = 0 ; j < (int) ib_proc->proc_endpoint_count ; ++j) { + endpoint = ib_proc->proc_endpoints[j]; + if (endpoint->endpoint_btl == openib_btl) { + found_existing = true; + break; + } + } + OPAL_THREAD_UNLOCK(&ib_proc->proc_lock); + + if (found_existing) { + if (reachable) { + opal_bitmap_set_bit(reachable, i); + } + peers[i] = endpoint; + continue; + } + /* check if the remote proc has any ports that: - on the same subnet as the local proc, and - on that subnet, has a CPC in common with the local proc
@@ -1048,6 +1067,37 @@ int mca_btl_openib_add_procs( return OPAL_SUCCESS; } +struct mca_btl_base_endpoint_t *mca_btl_openib_get_ep (struct mca_btl_base_module_t *btl, struct opal_proc_t *proc) +{ + mca_btl_openib_module_t *openib_btl = (mca_btl_openib_module_t *) btl; + mca_btl_base_endpoint_t *endpoint; + mca_btl_openib_proc_t *ib_proc; + + if (NULL == (ib_proc = mca_btl_openib_proc_create(proc))) { + /* if we don't have connection info for this process, it's + * okay because some other method might be able to reach it, + * so just mark it as unreachable by us */ + return NULL; + } + + OPAL_THREAD_LOCK(&ib_proc->proc_lock); + for (size_t j = 0 ; j < ib_proc->proc_endpoint_count ; ++j) { + endpoint = ib_proc->proc_endpoints[j]; + if (endpoint->endpoint_btl == openib_btl) { + OPAL_THREAD_UNLOCK(&ib_proc->proc_lock); + return endpoint; + } + } + OPAL_THREAD_UNLOCK(&ib_proc->proc_lock); + + BTL_VERBOSE(("creating new endpoint for remote process {.jobid = 0x%x, .vpid = 0x%x}", + proc->proc_name.jobid, proc->proc_name.vpid)); + + endpoint = NULL; + (void) mca_btl_openib_add_procs (btl, 1, &proc, &endpoint, NULL); + return endpoint; +} + /* * delete the proc as reachable from this btl module */
diff --git a/opal/mca/btl/openib/btl_openib.h b/opal/mca/btl/openib/btl_openib.h index 92506bc5d0..6e1c5ca50f 100644 --- a/opal/mca/btl/openib/btl_openib.h +++ b/opal/mca/btl/openib/btl_openib.h
@@ -874,6 +874,18 @@ int mca_btl_openib_post_srr(mca_btl_openib_module_t* openib_btl, const int qp); const char* btl_openib_get_transport_name(mca_btl_openib_transport_type_t transport_type); +/** + * Get an endpoint for a process + * + * @param btl (IN) BTL module + * @param proc (IN) opal process object + * + * This function will return an existing endpoint if one exists; otherwise it will allocate + * a new endpoint and return it. + */ +struct mca_btl_base_endpoint_t *mca_btl_openib_get_ep (struct mca_btl_base_module_t *btl, + struct opal_proc_t *proc); +
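mca_btl_openib_get_ep() is the find-or-create primitive this series leans on: the UDCM change further down resolves the sender from the name carried in the connect header and then asks the module for an endpoint. Roughly, with hdr standing in for the received udcm_msg_hdr_t and openib_btl for the module:

    /* sketch of the lookup path udcm_find_endpoint() now takes */
    struct opal_proc_t *remote = opal_proc_for_name (hdr->data.req.rem_name);
    mca_btl_base_endpoint_t *ep = NULL;
    if (NULL != remote) {
        /* returns the existing endpoint for this module, or creates one
         * through mca_btl_openib_add_procs() with a NULL reachable map */
        ep = mca_btl_openib_get_ep (&openib_btl->super, remote);
    }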
/** * Get a transport type of btl. */
diff --git a/opal/mca/btl/openib/btl_openib_mca.c b/opal/mca/btl/openib/btl_openib_mca.c index 8cc9384417..d366443549 100644 --- a/opal/mca/btl/openib/btl_openib_mca.c +++ b/opal/mca/btl/openib/btl_openib_mca.c
@@ -565,7 +565,8 @@ int btl_openib_register_mca_params(void) mca_btl_openib_module.super.btl_rdma_pipeline_frag_size = 1024 * 1024; mca_btl_openib_module.super.btl_min_rdma_pipeline_size = 256 * 1024; mca_btl_openib_module.super.btl_flags = MCA_BTL_FLAGS_RDMA | - MCA_BTL_FLAGS_NEED_ACK | MCA_BTL_FLAGS_NEED_CSUM | MCA_BTL_FLAGS_HETEROGENEOUS_RDMA; + MCA_BTL_FLAGS_NEED_ACK | MCA_BTL_FLAGS_NEED_CSUM | MCA_BTL_FLAGS_HETEROGENEOUS_RDMA | + MCA_BTL_FLAGS_SEND; #if BTL_OPENIB_FAILOVER_ENABLED mca_btl_openib_module.super.btl_flags |= MCA_BTL_FLAGS_FAILOVER_SUPPORT; #endif
diff --git a/opal/mca/btl/openib/connect/btl_openib_connect_udcm.c b/opal/mca/btl/openib/connect/btl_openib_connect_udcm.c index f438dfcdc8..6074473c05 100644 --- a/opal/mca/btl/openib/connect/btl_openib_connect_udcm.c +++ b/opal/mca/btl/openib/connect/btl_openib_connect_udcm.c
@@ -218,6 +218,7 @@ typedef struct udcm_msg_hdr { union { /* UDCM_MESSAGE_CONNECT */ struct msg_connect { + opal_process_name_t rem_name; int32_t rem_ep_index; uint8_t rem_port_num; } req;
@@ -1473,36 +1474,26 @@ static int udcm_rc_qp_create_all (mca_btl_base_endpoint_t *lcl_ep) /* JMS: optimization target -- can we send something in private data to find the proc directly instead of having to search through *all* procs? */ -static mca_btl_openib_endpoint_t *udcm_find_endpoint (opal_pointer_array_t *endpoints, +static mca_btl_openib_endpoint_t *udcm_find_endpoint (struct mca_btl_openib_module_t *btl, uint32_t qp_num, uint16_t lid, udcm_msg_hdr_t *msg_hdr) { - uint8_t port_num; - int i; + mca_btl_base_endpoint_t *endpoint; + struct opal_proc_t *opal_proc; - port_num = msg_hdr->data.req.rem_port_num; - - for (i = 0 ; i < opal_pointer_array_get_size (endpoints) ; ++i) { - mca_btl_openib_endpoint_t *endpoint; - modex_msg_t *msg; - - endpoint = (mca_btl_openib_endpoint_t *) - opal_pointer_array_get_item (endpoints, i); - if (NULL == endpoint) { - continue; - } - - msg = UDCM_ENDPOINT_REM_MODEX(endpoint); - - if (msg->mm_qp_num == qp_num && msg->mm_port_num == port_num && - msg->mm_lid == lid) - return endpoint; + opal_proc = opal_proc_for_name (msg_hdr->data.req.rem_name); + if (NULL == opal_proc) { + BTL_ERROR(("could not get proc associated with remote peer")); + return NULL; } - BTL_ERROR(("could not find endpoint with port: %d, lid: %d, msg_type: %d", - port_num, lid, msg_hdr->type)); + endpoint = mca_btl_openib_get_ep (&btl->super, opal_proc); + if (NULL == endpoint) { + BTL_ERROR(("could not find endpoint with port: %d, lid: %d, msg_type: %d", + msg_hdr->data.req.rem_port_num, lid, msg_hdr->type)); + } - return NULL; + return endpoint; } static int udcm_endpoint_init_data (mca_btl_base_endpoint_t *lcl_ep)
@@ -1678,6 +1669,7 @@ static int udcm_send_request (mca_btl_base_endpoint_t *lcl_ep, msg->data->hdr.data.req.rem_ep_index = htonl(lcl_ep->index); msg->data->hdr.data.req.rem_port_num = m->modex.mm_port_num; + msg->data->hdr.data.req.rem_name = OPAL_PROC_MY_NAME; for (i = 0 ; i < mca_btl_openib_component.num_qps ; ++i) { msg->data->qps[i].psn = htonl(lcl_ep->qps[i].qp->lcl_psn);
@@ -1981,8 +1973,7 @@ static int udcm_process_messages (struct ibv_cq *event_cq, udcm_module_t *m) lcl_ep = message->hdr.lcl_ep; if (NULL == lcl_ep) { - lcl_ep = udcm_find_endpoint (m->btl->device->endpoints, wc[i].src_qp, - wc[i].slid, &message->hdr); + lcl_ep =
udcm_find_endpoint (m->btl, wc[i].src_qp, wc[i].slid, &message->hdr); } if (NULL == lcl_ep ) {
@@ -2824,6 +2815,7 @@ static int udcm_xrc_send_request (mca_btl_base_endpoint_t *lcl_ep, mca_btl_base_ msg->data->hdr.data.req.rem_ep_index = htonl(lcl_ep->index); msg->data->hdr.data.req.rem_port_num = m->modex.mm_port_num; + msg->data->hdr.data.req.rem_name = OPAL_PROC_MY_NAME; if (UDCM_MESSAGE_XCONNECT == msg_type) { BTL_VERBOSE(("Sending XConnect with qp: %d, psn: %d", lcl_ep->qps[0].qp->lcl_qp->qp_num,
diff --git a/opal/mca/btl/portals4/btl_portals4_component.c b/opal/mca/btl/portals4/btl_portals4_component.c index 94b4dd3023..8e4f2864f1 100644 --- a/opal/mca/btl/portals4/btl_portals4_component.c +++ b/opal/mca/btl/portals4/btl_portals4_component.c
@@ -221,7 +221,8 @@ mca_btl_portals4_component_open(void) mca_btl_portals4_module.super.btl_min_rdma_pipeline_size = 0; mca_btl_portals4_module.super.btl_flags = MCA_BTL_FLAGS_RDMA | - MCA_BTL_FLAGS_RDMA_MATCHED; + MCA_BTL_FLAGS_RDMA_MATCHED | + MCA_BTL_FLAGS_SEND; mca_btl_portals4_module.super.btl_registration_handle_size = sizeof (mca_btl_base_registration_handle_t);
diff --git a/opal/mca/btl/self/btl_self_component.c b/opal/mca/btl/self/btl_self_component.c index 78e400ea6e..42ea125d44 100644 --- a/opal/mca/btl/self/btl_self_component.c +++ b/opal/mca/btl/self/btl_self_component.c
@@ -98,7 +98,7 @@ static int mca_btl_self_component_register(void) mca_btl_self.btl_rdma_pipeline_send_length = INT_MAX; mca_btl_self.btl_rdma_pipeline_frag_size = INT_MAX; mca_btl_self.btl_min_rdma_pipeline_size = 0; - mca_btl_self.btl_flags = MCA_BTL_FLAGS_PUT | MCA_BTL_FLAGS_SEND_INPLACE; + mca_btl_self.btl_flags = MCA_BTL_FLAGS_PUT | MCA_BTL_FLAGS_SEND_INPLACE | MCA_BTL_FLAGS_SEND; mca_btl_self.btl_bandwidth = 100; mca_btl_self.btl_latency = 0; mca_btl_base_param_register(&mca_btl_self_component.super.btl_version,
diff --git a/opal/mca/btl/tcp/btl_tcp.c b/opal/mca/btl/tcp/btl_tcp.c index c14d655f9b..87e5b0ef15 100644 --- a/opal/mca/btl/tcp/btl_tcp.c +++ b/opal/mca/btl/tcp/btl_tcp.c
@@ -10,7 +10,7 @@ * University of Stuttgart. All rights reserved. * Copyright (c) 2004-2005 The Regents of the University of California. * All rights reserved. - * Copyright (c) 2006-2014 Los Alamos National Security, LLC. All rights + * Copyright (c) 2006-2015 Los Alamos National Security, LLC. All rights * reserved. * * $COPYRIGHT$
@@ -72,6 +72,7 @@ int mca_btl_tcp_add_procs( struct mca_btl_base_module_t* btl, struct opal_proc_t* opal_proc = procs[i]; mca_btl_tcp_proc_t* tcp_proc; mca_btl_base_endpoint_t* tcp_endpoint; + bool existing_found = false; /* Do not create loopback TCP connections */ if( my_proc == opal_proc ) {
@@ -90,28 +91,43 @@ int mca_btl_tcp_add_procs( struct mca_btl_base_module_t* btl, OPAL_THREAD_LOCK(&tcp_proc->proc_lock); - /* The btl_proc datastructure is shared by all TCP BTL - * instances that are trying to reach this destination. - * Cache the peer instance on the btl_proc.
- */ - tcp_endpoint = OBJ_NEW(mca_btl_tcp_endpoint_t); - if(NULL == tcp_endpoint) { - OPAL_THREAD_UNLOCK(&tcp_proc->proc_lock); - return OPAL_ERR_OUT_OF_RESOURCE; + for (int j = 0 ; j < tcp_proc->proc_endpoint_count ; ++j) { + tcp_endpoint = tcp_proc->proc_endpoints[j]; + if (tcp_endpoint->endpoint_btl == tcp_btl) { + existing_found = true; + break; + } } - tcp_endpoint->endpoint_btl = tcp_btl; - rc = mca_btl_tcp_proc_insert(tcp_proc, tcp_endpoint); - if(rc != OPAL_SUCCESS) { - OPAL_THREAD_UNLOCK(&tcp_proc->proc_lock); - OBJ_RELEASE(tcp_endpoint); - continue; + if (!existing_found) { + /* The btl_proc datastructure is shared by all TCP BTL + * instances that are trying to reach this destination. + * Cache the peer instance on the btl_proc. + */ + tcp_endpoint = OBJ_NEW(mca_btl_tcp_endpoint_t); + if(NULL == tcp_endpoint) { + OPAL_THREAD_UNLOCK(&tcp_proc->proc_lock); + return OPAL_ERR_OUT_OF_RESOURCE; + } + + tcp_endpoint->endpoint_btl = tcp_btl; + rc = mca_btl_tcp_proc_insert(tcp_proc, tcp_endpoint); + if(rc != OPAL_SUCCESS) { + OPAL_THREAD_UNLOCK(&tcp_proc->proc_lock); + OBJ_RELEASE(tcp_endpoint); + continue; + } + + opal_list_append(&tcp_btl->tcp_endpoints, (opal_list_item_t*)tcp_endpoint); } - opal_bitmap_set_bit(reachable, i); OPAL_THREAD_UNLOCK(&tcp_proc->proc_lock); + + if (NULL != reachable) { + opal_bitmap_set_bit(reachable, i); + } + peers[i] = tcp_endpoint; - opal_list_append(&tcp_btl->tcp_endpoints, (opal_list_item_t*)tcp_endpoint); /* we increase the count of MPI users of the event library once per peer, so that we are used until we aren't
diff --git a/opal/mca/btl/tcp/btl_tcp_component.c b/opal/mca/btl/tcp/btl_tcp_component.c index 4332d2d74c..a43d6453d0 100644 --- a/opal/mca/btl/tcp/btl_tcp_component.c +++ b/opal/mca/btl/tcp/btl_tcp_component.c
@@ -269,7 +269,8 @@ static int mca_btl_tcp_component_register(void) MCA_BTL_FLAGS_SEND_INPLACE | MCA_BTL_FLAGS_NEED_CSUM | MCA_BTL_FLAGS_NEED_ACK | - MCA_BTL_FLAGS_HETEROGENEOUS_RDMA; + MCA_BTL_FLAGS_HETEROGENEOUS_RDMA | + MCA_BTL_FLAGS_SEND; mca_btl_tcp_module.super.btl_bandwidth = 100; mca_btl_tcp_module.super.btl_latency = 100;
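The btl_tcp_proc.c hunk that follows gives TCP the same on-demand behavior: looking up a peer that was never passed to add_procs() now bootstraps its proc and per-module endpoints instead of failing. From the caller's perspective (a sketch; peer_name would come from the connection handshake):

    mca_btl_tcp_proc_t *tcp_proc = mca_btl_tcp_proc_lookup (&peer_name);
    if (NULL == tcp_proc) {
        /* peer is neither known nor resolvable via opal_proc_for_name() */
        return OPAL_ERR_UNREACH;
    }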
diff --git a/opal/mca/btl/tcp/btl_tcp_proc.c b/opal/mca/btl/tcp/btl_tcp_proc.c index c86977dde3..c0d3399fb8 100644 --- a/opal/mca/btl/tcp/btl_tcp_proc.c +++ b/opal/mca/btl/tcp/btl_tcp_proc.c
@@ -14,7 +14,9 @@ * Copyright (c) 2013-2015 Intel, Inc. All rights reserved * Copyright (c) 2014-2015 Research Organization for Information Science * and Technology (RIST). All rights reserved. - * Copyright (c) 2015 Cisco Systems, Inc. All rights reserved. + * Copyright (c) 2015 Los Alamos National Security, LLC. All rights + * reserved. + * Copyright (c) 2015 Cisco Systems, Inc. All rights reserved. * $COPYRIGHT$ * * Additional copyrights may follow
@@ -738,6 +740,31 @@ mca_btl_tcp_proc_t* mca_btl_tcp_proc_lookup(const opal_process_name_t *name) opal_proc_table_get_value(&mca_btl_tcp_component.tcp_procs, *name, (void**)&proc); OPAL_THREAD_UNLOCK(&mca_btl_tcp_component.tcp_lock); + if (OPAL_UNLIKELY(NULL == proc)) { + mca_btl_base_endpoint_t *endpoint; + opal_proc_t *opal_proc; + int rc; + + BTL_VERBOSE(("adding tcp proc for unknown peer {.jobid = 0x%x, .vpid = 0x%x}", + name->jobid, name->vpid)); + + opal_proc = opal_proc_for_name (*name); + if (NULL == opal_proc) { + return NULL; + } + + /* try adding this proc to each btl */ + for (int i = 0 ; i < mca_btl_tcp_component.tcp_num_btls ; ++i) { + endpoint = NULL; + (void) mca_btl_tcp_add_procs (&mca_btl_tcp_component.tcp_btls[i]->super, 1, &opal_proc, + &endpoint, NULL); + if (NULL != endpoint && NULL == proc) { + /* get the proc and continue on (could probably just break here) */ + proc = endpoint->endpoint_proc; + } + } + } + return proc; }
diff --git a/opal/mca/btl/ugni/btl_ugni.h b/opal/mca/btl/ugni/btl_ugni.h index 0fad0465bb..e6d9634f58 100644 --- a/opal/mca/btl/ugni/btl_ugni.h +++ b/opal/mca/btl/ugni/btl_ugni.h
@@ -49,7 +49,7 @@ /* ompi and smsg endpoint attributes */ typedef struct mca_btl_ugni_endpoint_attr_t { - uint64_t proc_id; + opal_process_name_t proc_name; uint32_t index; gni_smsg_attr_t smsg_attr; gni_mem_handle_t rmt_irq_mem_hndl;
@@ -67,6 +67,7 @@ typedef struct mca_btl_ugni_module_t { opal_common_ugni_device_t *device; + opal_mutex_t endpoint_lock; size_t endpoint_count; opal_pointer_array_t endpoints; opal_hash_table_t id_to_endpoint;
@@ -229,6 +230,8 @@ mca_btl_ugni_del_procs (struct mca_btl_base_module_t *btl, struct opal_proc_t **procs, struct mca_btl_base_endpoint_t **peers); +struct mca_btl_base_endpoint_t *mca_btl_ugni_get_ep (struct mca_btl_base_module_t *module, opal_proc_t *proc); + /** * Initiate an asynchronous send.
*
diff --git a/opal/mca/btl/ugni/btl_ugni_add_procs.c b/opal/mca/btl/ugni/btl_ugni_add_procs.c index fbeff5b5f0..8d7f571e7c 100644 --- a/opal/mca/btl/ugni/btl_ugni_add_procs.c +++ b/opal/mca/btl/ugni/btl_ugni_add_procs.c
@@ -28,13 +28,11 @@ static void mca_btl_ugni_module_set_max_reg (mca_btl_ugni_module_t *ugni_module, int nlocal_procs); static int mca_btl_ugni_smsg_setup (int nprocs); -int mca_btl_ugni_add_procs(struct mca_btl_base_module_t* btl, - size_t nprocs, - struct opal_proc_t **procs, - struct mca_btl_base_endpoint_t **peers, - opal_bitmap_t *reachable) { +int mca_btl_ugni_add_procs (struct mca_btl_base_module_t* btl, size_t nprocs, + struct opal_proc_t **procs, + struct mca_btl_base_endpoint_t **peers, + opal_bitmap_t *reachable) { mca_btl_ugni_module_t *ugni_module = (mca_btl_ugni_module_t *) btl; - size_t i; int rc; void *mmap_start_addr;
@@ -59,36 +57,45 @@ int mca_btl_ugni_add_procs(struct mca_btl_base_module_t* btl, } } - for (i = 0 ; i < nprocs ; ++i) { + for (size_t i = 0 ; i < nprocs ; ++i) { struct opal_proc_t *opal_proc = procs[i]; uint64_t proc_id = mca_btl_ugni_proc_name_to_id(opal_proc->proc_name); - if (OPAL_PROC_ON_LOCAL_NODE(opal_proc->proc_flags)) { - ugni_module->nlocal_procs++; + /* check for an existing endpoint */ + OPAL_THREAD_LOCK(&ugni_module->endpoint_lock); + if (OPAL_SUCCESS != opal_hash_table_get_value_uint64 (&ugni_module->id_to_endpoint, proc_id, (void **) (peers + i))) { + if (OPAL_PROC_ON_LOCAL_NODE(opal_proc->proc_flags)) { + ugni_module->nlocal_procs++; - /* ugni is allowed on local processes to provide support for network - * atomic operations */ + /* ugni is allowed on local processes to provide support for network + * atomic operations */ + } + + /* Create and Init endpoints */ + rc = mca_btl_ugni_init_ep (ugni_module, peers + i, (mca_btl_ugni_module_t *) btl, opal_proc); + if (OPAL_UNLIKELY(OPAL_SUCCESS != rc)) { + OPAL_THREAD_UNLOCK(&ugni_module->endpoint_lock); + BTL_ERROR(("btl/ugni error initializing endpoint")); + return rc; + } + + /* go ahead and connect the local endpoint for RDMA/CQ write */ + if (opal_proc == opal_proc_local_get ()) { + ugni_module->local_ep = peers[i]; + } + + /* Add this endpoint to the pointer array. */ + BTL_VERBOSE(("initialized uGNI endpoint for proc id: 0x%" PRIx64 " ptr: %p", proc_id, (void *) peers[i])); + opal_hash_table_set_value_uint64 (&ugni_module->id_to_endpoint, proc_id, peers[i]); + + ++ugni_module->endpoint_count; } + OPAL_THREAD_UNLOCK(&ugni_module->endpoint_lock); - /* Create and Init endpoints */ - rc = mca_btl_ugni_init_ep (ugni_module, peers + i, (mca_btl_ugni_module_t *) btl, opal_proc); - if (OPAL_UNLIKELY(OPAL_SUCCESS != rc)) { - BTL_ERROR(("btl/ugni error initializing endpoint")); - return rc; + /* Set the reachable bit if necessary */ + if (reachable) { + rc = opal_bitmap_set_bit (reachable, i); } - - /* go ahead and connect the local endpoint for RDMA/CQ write */ - if (opal_proc == opal_proc_local_get ()) { - ugni_module->local_ep = peers[i]; - } - - /* Add this endpoint to the pointer array.
*/ - BTL_VERBOSE(("initialized uGNI endpoint for proc id: 0x%" PRIx64 " ptr: %p", proc_id, (void *) peers[i])); - opal_hash_table_set_value_uint64 (&ugni_module->id_to_endpoint, proc_id, peers[i]); - - /* Set the reachable bit */ - rc = opal_bitmap_set_bit (reachable, i); - ++ugni_module->endpoint_count; } mca_btl_ugni_module_set_max_reg (ugni_module, ugni_module->nlocal_procs);
@@ -224,6 +231,41 @@ int mca_btl_ugni_del_procs (struct mca_btl_base_module_t *btl, return OPAL_SUCCESS; } + +struct mca_btl_base_endpoint_t *mca_btl_ugni_get_ep (struct mca_btl_base_module_t *module, opal_proc_t *proc) +{ + mca_btl_ugni_module_t *ugni_module = (mca_btl_ugni_module_t *) module; + uint64_t proc_id = mca_btl_ugni_proc_name_to_id(proc->proc_name); + mca_btl_base_endpoint_t *ep = NULL; + int rc; + + OPAL_THREAD_LOCK(&ugni_module->endpoint_lock); + + do { + rc = opal_hash_table_get_value_uint64 (&ugni_module->id_to_endpoint, proc_id, (void **) &ep); + if (OPAL_SUCCESS == rc) { + break; + } + + /* Create and Init endpoints */ + rc = mca_btl_ugni_init_ep (ugni_module, &ep, ugni_module, proc); + if (OPAL_UNLIKELY(OPAL_SUCCESS != rc)) { + BTL_ERROR(("btl/ugni error initializing endpoint")); + ep = NULL; + break; + } + + /* Add this endpoint to the pointer array. */ + BTL_VERBOSE(("initialized uGNI endpoint for proc id: 0x%" PRIx64 " ptr: %p", proc_id, (void *) ep)); + opal_hash_table_set_value_uint64 (&ugni_module->id_to_endpoint, proc_id, ep); + } while (0); + + OPAL_THREAD_UNLOCK(&ugni_module->endpoint_lock); + + return ep; +} + + static int ugni_reg_rdma_mem (void *reg_data, void *base, size_t size, mca_mpool_base_registration_t *reg) {
diff --git a/opal/mca/btl/ugni/btl_ugni_component.c b/opal/mca/btl/ugni/btl_ugni_component.c index 5941e18417..1248f2f1c3 100644 --- a/opal/mca/btl/ugni/btl_ugni_component.c +++ b/opal/mca/btl/ugni/btl_ugni_component.c
@@ -386,8 +386,8 @@ mca_btl_ugni_component_init (int *num_btl_modules, static inline int mca_btl_ugni_progress_datagram (mca_btl_ugni_module_t *ugni_module) { + uint64_t datagram_id, data, proc_id; uint32_t remote_addr, remote_id; - uint64_t datagram_id, data; mca_btl_base_endpoint_t *ep; gni_post_state_t post_state; gni_ep_handle_t handle;
@@ -425,15 +425,24 @@ mca_btl_ugni_progress_datagram (mca_btl_ugni_module_t *ugni_module) /* if this is a wildcard endpoint lookup the remote peer by the proc id we received */ if (handle == ugni_module->wildcard_ep) { - BTL_VERBOSE(("received connection attempt on wildcard endpoint from proc id: %" PRIx64, ugni_module->wc_remote_attr.proc_id)); - rc = opal_hash_table_get_value_uint64 (&ugni_module->id_to_endpoint, - ugni_module->wc_remote_attr.proc_id, - (void *) &ep); + proc_id = mca_btl_ugni_proc_name_to_id (ugni_module->wc_remote_attr.proc_name); + + BTL_VERBOSE(("received connection attempt on wildcard endpoint from proc id: %" PRIx64, + proc_id)); + + OPAL_THREAD_LOCK(&ugni_module->endpoint_lock); + rc = opal_hash_table_get_value_uint64 (&ugni_module->id_to_endpoint, proc_id, (void **) &ep); + OPAL_THREAD_UNLOCK(&ugni_module->endpoint_lock); + /* check if the endpoint is known */ if (OPAL_UNLIKELY(OPAL_SUCCESS != rc || NULL == ep)) { - BTL_ERROR(("received connection attempt from an unknown peer.
rc: %d, ep: %p, id: 0x%" PRIx64, - rc, (void *) ep, ugni_module->wc_remote_attr.proc_id)); - return OPAL_ERR_NOT_FOUND; + struct opal_proc_t *remote_proc = opal_proc_for_name (ugni_module->wc_remote_attr.proc_name); + BTL_VERBOSE(("Got connection request from an unknown peer {jobid = 0x%x, vpid = 0x%x}", + ugni_module->wc_remote_attr.proc_name.jobid, ugni_module->wc_remote_attr.proc_name.vpid)); + ep = mca_btl_ugni_get_ep (&ugni_module->super, remote_proc); + if (OPAL_UNLIKELY(NULL == ep)) { + return rc; + } } } else { BTL_VERBOSE(("directed datagram complete for endpoint %p", (void *) ep));
diff --git a/opal/mca/btl/ugni/btl_ugni_module.c b/opal/mca/btl/ugni/btl_ugni_module.c index 4977659fc1..7f008c607f 100644 --- a/opal/mca/btl/ugni/btl_ugni_module.c +++ b/opal/mca/btl/ugni/btl_ugni_module.c
@@ -91,6 +91,7 @@ mca_btl_ugni_module_init (mca_btl_ugni_module_t *ugni_module, OBJ_CONSTRUCT(&ugni_module->pending_smsg_frags_bb, opal_pointer_array_t); OBJ_CONSTRUCT(&ugni_module->ep_wait_list_lock,opal_mutex_t); OBJ_CONSTRUCT(&ugni_module->ep_wait_list, opal_list_t); + OBJ_CONSTRUCT(&ugni_module->endpoint_lock, opal_mutex_t); OBJ_CONSTRUCT(&ugni_module->endpoints, opal_pointer_array_t); OBJ_CONSTRUCT(&ugni_module->id_to_endpoint, opal_hash_table_t); OBJ_CONSTRUCT(&ugni_module->smsg_mboxes, opal_free_list_t);
@@ -208,6 +209,7 @@ mca_btl_ugni_module_finalize (struct mca_btl_base_module_t *btl) OBJ_DESTRUCT(&ugni_module->smsg_mboxes); OBJ_DESTRUCT(&ugni_module->pending_smsg_frags_bb); OBJ_DESTRUCT(&ugni_module->id_to_endpoint); + OBJ_DESTRUCT(&ugni_module->endpoint_lock); OBJ_DESTRUCT(&ugni_module->endpoints); OBJ_DESTRUCT(&ugni_module->eager_get_pending);
diff --git a/opal/mca/btl/ugni/btl_ugni_smsg.c b/opal/mca/btl/ugni/btl_ugni_smsg.c index f4f255edfb..5d9ea1eef6 100644 --- a/opal/mca/btl/ugni/btl_ugni_smsg.c +++ b/opal/mca/btl/ugni/btl_ugni_smsg.c
@@ -27,7 +27,7 @@ static void mca_btl_ugni_smsg_mbox_construct (mca_btl_ugni_smsg_mbox_t *mbox) { mbox->attr.smsg_attr.msg_buffer = base_reg->base; mbox->attr.smsg_attr.buff_size = mca_btl_ugni_component.smsg_mbox_size; mbox->attr.smsg_attr.mem_hndl = ugni_reg->handle.gni_handle; - mbox->attr.proc_id = mca_btl_ugni_proc_name_to_id (OPAL_PROC_MY_NAME); + mbox->attr.proc_name = OPAL_PROC_MY_NAME; mbox->attr.rmt_irq_mem_hndl = mca_btl_ugni_component.modules[0].device->smsg_irq_mhndl; }
diff --git a/opal/mca/btl/usnic/btl_usnic_module.c b/opal/mca/btl/usnic/btl_usnic_module.c index 283d794614..33904eab34 100644 --- a/opal/mca/btl/usnic/btl_usnic_module.c +++ b/opal/mca/btl/usnic/btl_usnic_module.c
@@ -427,7 +427,7 @@ static int usnic_add_procs(struct mca_btl_base_module_t* base_module, /* Find all the endpoints with a complete set of USD destinations and mark them as reachable */ - for (size_t i = 0; i < nprocs; ++i) { + for (size_t i = 0; NULL != reachable && i < nprocs; ++i) { if (NULL != endpoints[i]) { bool happy = true; for (int channel = 0; channel < USNIC_NUM_CHANNELS; ++channel) {
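A side effect of this rework is that btl_flags assignments now spell out send capability: the hunks above for openib, portals4, self, and tcp add MCA_BTL_FLAGS_SEND explicitly, and the vader hunk below restructures its flag setup the same way, establishing the baseline flags first and OR-ing in RDMA only when a single-copy mechanism is available. The repeated pattern, sketched generically (have_single_copy is a hypothetical condition):

    /* pattern adopted across the BTL components in this patch */
    module->super.btl_flags = MCA_BTL_FLAGS_SEND_INPLACE | MCA_BTL_FLAGS_SEND;
    if (have_single_copy) {
        module->super.btl_flags |= MCA_BTL_FLAGS_RDMA;
    }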
diff --git a/opal/mca/btl/vader/btl_vader_component.c b/opal/mca/btl/vader/btl_vader_component.c index 79f12aa70b..2f46785ff4 100644 --- a/opal/mca/btl/vader/btl_vader_component.c +++ b/opal/mca/btl/vader/btl_vader_component.c
@@ -239,8 +239,10 @@ static int mca_btl_vader_component_register (void) mca_btl_vader.super.btl_rdma_pipeline_send_length = mca_btl_vader.super.btl_eager_limit; mca_btl_vader.super.btl_rdma_pipeline_frag_size = mca_btl_vader.super.btl_eager_limit; + mca_btl_vader.super.btl_flags = MCA_BTL_FLAGS_SEND_INPLACE | MCA_BTL_FLAGS_SEND; + if (MCA_BTL_VADER_NONE != mca_btl_vader_component.single_copy_mechanism) { - mca_btl_vader.super.btl_flags = MCA_BTL_FLAGS_RDMA | MCA_BTL_FLAGS_SEND_INPLACE; + mca_btl_vader.super.btl_flags |= MCA_BTL_FLAGS_RDMA; /* Single copy mechanisms should provide better bandwidth */ mca_btl_vader.super.btl_bandwidth = 40000; /* Mbs */
@@ -248,7 +250,6 @@ static int mca_btl_vader_component_register (void) mca_btl_vader.super.btl_get = (mca_btl_base_module_get_fn_t) mca_btl_vader_dummy_rdma; mca_btl_vader.super.btl_put = (mca_btl_base_module_get_fn_t) mca_btl_vader_dummy_rdma; } else { - mca_btl_vader.super.btl_flags = MCA_BTL_FLAGS_SEND_INPLACE; mca_btl_vader.super.btl_bandwidth = 10000; /* Mbs */ }
diff --git a/opal/util/proc.c b/opal/util/proc.c index fa9ac41c5d..5fba5fd6a3 100644 --- a/opal/util/proc.c +++ b/opal/util/proc.c
@@ -1,3 +1,4 @@ +/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */ /* * Copyright (c) 2013 The University of Tennessee and The University * of Tennessee Research Foundation. All rights
@@ -6,6 +7,8 @@ * Copyright (c) 2014-2015 Intel, Inc. All rights reserved. * Copyright (c) 2014 Research Organization for Information Science * and Technology (RIST). All rights reserved. + * Copyright (c) 2015 Los Alamos National Security, LLC. All rights + * reserved. * $COPYRIGHT$ * * Additional copyrights may follow
@@ -162,6 +165,11 @@ static int opal_convert_string_to_jobid_should_never_be_called(opal_jobid_t *job return OPAL_ERR_NOT_SUPPORTED; } +static struct opal_proc_t *opal_proc_for_name_should_never_be_called (opal_process_name_t name) +{ + return NULL; +} + char* (*opal_process_name_print)(const opal_process_name_t) = opal_process_name_print_should_never_be_called; char* (*opal_vpid_print)(const opal_vpid_t) = opal_vpid_print_should_never_be_called; char* (*opal_jobid_print)(const opal_jobid_t) = opal_jobid_print_should_never_be_called;
@@ -169,6 +177,7 @@ int (*opal_convert_string_to_process_name)(opal_process_name_t *name, const char int (*opal_convert_process_name_to_string)(char** name_string, const opal_process_name_t *name) = opal_convert_process_name_to_string_should_never_be_called; char* (*opal_convert_jobid_to_string)(opal_jobid_t jobid) = opal_convert_jobid_to_string_should_never_be_called; int (*opal_convert_string_to_jobid)(opal_jobid_t *jobid, const char *jobid_string) = opal_convert_string_to_jobid_should_never_be_called; +struct opal_proc_t *(*opal_proc_for_name) (const opal_process_name_t name) = opal_proc_for_name_should_never_be_called; char* opal_get_proc_hostname(const opal_proc_t *proc) {
diff --git a/opal/util/proc.h b/opal/util/proc.h index 9c642c932c..250430ba3c 100644 --- a/opal/util/proc.h +++ b/opal/util/proc.h
@@ -136,6 +136,13 @@ OMPI_DECLSPEC extern char* (*opal_jobid_print)(const opal_jobid_t); OMPI_DECLSPEC extern char* (*opal_convert_jobid_to_string)(opal_jobid_t jobid); OMPI_DECLSPEC extern int (*opal_convert_string_to_jobid)(opal_jobid_t *jobid, const char *jobid_string); +/** + * Lookup an opal_proc_t by name + * + * @param name (IN) name to lookup + */ +OPAL_DECLSPEC extern struct opal_proc_t *(*opal_proc_for_name) (const opal_process_name_t name); + #define OPAL_NAME_PRINT(OPAL_PN) opal_process_name_print(OPAL_PN) #define OPAL_JOBID_PRINT(OPAL_PN) opal_jobid_print(OPAL_PN) #define OPAL_VPID_PRINT(OPAL_PN) opal_vpid_print(OPAL_PN)
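Like the other opal_* conversion hooks above it, opal_proc_for_name is exposed as a function pointer because OPAL sits below OMPI in the layering and cannot reference ompi_proc_for_name() directly; the MPI layer installs the real implementation during ompi_mpi_init() (see the ompi_mpi_init.c hunk earlier in this patch). The default stub simply returns NULL, so a BTL that probes for an unknown peer before the hook is wired up just sees an unreachable process rather than a crash.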
diff --git a/oshmem/mca/scoll/mpi/scoll_mpi_module.c b/oshmem/mca/scoll/mpi/scoll_mpi_module.c index d50d842fa4..c933512b33 100644 --- a/oshmem/mca/scoll/mpi/scoll_mpi_module.c +++ b/oshmem/mca/scoll/mpi/scoll_mpi_module.c
@@ -113,6 +113,8 @@ mca_scoll_mpi_comm_query(oshmem_group_t *osh_group, int *priority) if (NULL == oshmem_group_all) { osh_group->ompi_comm = &(ompi_mpi_comm_world.comm); } else { + int my_rank = MPI_UNDEFINED; + err = ompi_comm_group(&(ompi_mpi_comm_world.comm), &parent_group); if (OPAL_UNLIKELY(OMPI_SUCCESS != err)) { return NULL;
@@ -132,6 +134,10 @@ mca_scoll_mpi_comm_query(oshmem_group_t *osh_group, int *priority) break; } } + /* NTH: keep track of my rank in the new group for the workaround below */ + if (ranks[i] == ompi_comm_rank (&ompi_mpi_comm_world.comm)) { + my_rank = i; + } } err = ompi_group_incl(parent_group, osh_group->proc_count, ranks, &new_group);
@@ -139,6 +145,15 @@ mca_scoll_mpi_comm_query(oshmem_group_t *osh_group, int *priority) free(ranks); return NULL; } + + /* NTH: XXX -- WORKAROUND -- The oshmem code overwrites ompi_proc_local_proc with its + * own proc but does not update the proc list in comm world or comm self. That breaks + * the code in ompi_group_incl that updates grp_my_rank, and it will break any + * application that mixes oshmem and mpi, so the real fix belongs in oshmem/proc, not + * here. For now, to work around a new jenkins failure, set my group rank explicitly + * so we do not crash when running ompi_comm_create_group. */ + new_group->grp_my_rank = my_rank; + err = ompi_comm_create_group(&(ompi_mpi_comm_world.comm), new_group, tag, &newcomm); if (OPAL_UNLIKELY(OMPI_SUCCESS != err)) { free(ranks);