Merge pull request #4965 from rhc54/topic/rank

Fix breakage in ranking system and silence OSC/RDMA warnings
2018-03-26 19:10:36 -05:00 · 2018-03-26 19:10:36 -05:00 · f92acd735b
--- a/3
+++ b/3
@ -19,7 +19,7 @@ Copyright (c) 2012      Oak Ridge National Labs.  All rights reserved.
 Copyright (c) 2012      Sandia National Laboratories.  All rights reserved.
 Copyright (c) 2012      University of Houston. All rights reserved.
 Copyright (c) 2013      NVIDIA Corporation.  All rights reserved.
-Copyright (c) 2013-2017 Intel, Inc. All rights reserved.
+Copyright (c) 2013-2018 Intel, Inc. All rights reserved.
 Copyright (c) 2018      Amazon.com, Inc. or its affiliates.  All Rights
                        reserved.
 $COPYRIGHT$
@ -71,6 +71,7 @@ Master (not on release branches yet)
 - Remove IB XRC support from the OpenIB BTL due to lack of support.
 - Remove support for big endian PowerPC.
 - Remove support for XL compilers older than v13.1
+- Fix rank-by algorithms to properly rank by object and span

 3.0.0 -- September, 2017
 ------------------------
--- a/ompi/mca/osc/rdma/osc_rdma.h
+++ b/ompi/mca/osc/rdma/osc_rdma.h
@ -12,7 +12,7 @@
 *                         reserved.
 * Copyright (c) 2010      Cisco Systems, Inc.  All rights reserved.
 * Copyright (c) 2012-2013 Sandia National Laboratories.  All rights reserved.
- * Copyright (c) 2016      Intel, Inc.  All rights reserved.
+ * Copyright (c) 2016-2018 Intel, Inc. All rights reserved.
 * $COPYRIGHT$
 *
 * Additional copyrights may follow
@ -568,7 +568,7 @@ static inline void ompi_osc_rdma_sync_rdma_complete (ompi_osc_rdma_sync_t *sync)

        OPAL_THREAD_SCOPED_LOCK(&sync->lock,
                                OPAL_LIST_FOREACH_SAFE(aggregation, next, &sync->aggregations, ompi_osc_rdma_aggregation_t) {
-                                    fprintf (stderr, "Flushing aggregation %p, peeer %p\n", aggregation, aggregation->peer);
+                                    fprintf (stderr, "Flushing aggregation %p, peer %p\n", (void*)aggregation, (void*)aggregation->peer);
                                    ompi_osc_rdma_peer_aggregate_flush (aggregation->peer);
                                });
    }
--- a/ompi/mca/osc/rdma/osc_rdma_accumulate.c
+++ b/ompi/mca/osc/rdma/osc_rdma_accumulate.c
@ -4,7 +4,7 @@
 *                         reserved.
 * Copyright (c) 2016-2017 Research Organization for Information Science
 *                         and Technology (RIST). All rights reserved.
- * Copyright (c) 2016      Intel, Inc.  All rights reserved.
+ * Copyright (c) 2016-2018 Intel, Inc. All rights reserved.
 * $COPYRIGHT$
 *
 * Additional copyrights may follow
@ -50,6 +50,7 @@ struct ompi_osc_rdma_event_t {

 typedef struct ompi_osc_rdma_event_t ompi_osc_rdma_event_t;

+#if 0
 static void *ompi_osc_rdma_event_put (int fd, int flags, void *context)
 {
    ompi_osc_rdma_event_t *event = (ompi_osc_rdma_event_t *) context;
@ -112,7 +113,7 @@ static int ompi_osc_rdma_event_queue (ompi_osc_rdma_module_t *module, struct mca

    return OMPI_SUCCESS;
 }
-
+#endif

 static int ompi_osc_rdma_gacc_local (const void *source_buffer, int source_count, ompi_datatype_t *source_datatype,
                                     void *result_buffer, int result_count, ompi_datatype_t *result_datatype,
@ -188,10 +189,7 @@ static inline int ompi_osc_rdma_gacc_contig (ompi_osc_rdma_sync_t *sync, const v
                                             ompi_datatype_t *target_datatype, ompi_op_t *op, ompi_osc_rdma_request_t *request)
 {
    ompi_osc_rdma_module_t *module = sync->module;
-    const size_t btl_alignment_mask = ALIGNMENT_MASK(module->selected_btl->btl_get_alignment);
    unsigned long len = target_count * target_datatype->super.size;
-    ompi_osc_rdma_frag_t *frag = NULL;
-    volatile bool complete = false;
    char *ptr = NULL;
    int ret;

@ -523,7 +521,7 @@ static int ompi_osc_rdma_fetch_and_op_atomic (ompi_osc_rdma_sync_t *sync, const
    ompi_osc_rdma_module_t *module = sync->module;
    int32_t atomic_flags = module->selected_btl->btl_atomic_flags;
    int ret, btl_op, flags;
-    int64_t origin, result;
+    int64_t origin;

    if ((8 != extent && !((MCA_BTL_ATOMIC_SUPPORTS_32BIT & atomic_flags) && 4 == extent)) ||
        (!(OMPI_DATATYPE_FLAG_DATA_INT & dt->super.flags) && !(MCA_BTL_ATOMIC_SUPPORTS_FLOAT & atomic_flags)) ||
@ -590,13 +588,13 @@ static int ompi_osc_rdma_fetch_and_op_cas (ompi_osc_rdma_sync_t *sync, const voi
        new_value = old_value;

        if (&ompi_mpi_op_replace.op == op) {
-            memcpy ((void *)((intptr_t) &new_value) + offset, origin_addr, extent);
+            memcpy ((void *)((intptr_t) &new_value + offset), origin_addr, extent);
        } else if (&ompi_mpi_op_no_op.op != op) {
-            ompi_op_reduce (op, (void *) origin_addr, (void *)((intptr_t) &new_value) + offset, 1, dt);
+            ompi_op_reduce (op, (void *) origin_addr, (void*)((intptr_t) &new_value + offset), 1, dt);
        }

        ret = ompi_osc_rdma_btl_cswap (module, peer->data_endpoint, address, target_handle,
-                                       old_value, new_value, 0, &new_value);
+                                       old_value, new_value, 0, (int64_t*)&new_value);
        if (OPAL_SUCCESS != ret || new_value == old_value) {
            break;
        }
@ -605,7 +603,7 @@ static int ompi_osc_rdma_fetch_and_op_cas (ompi_osc_rdma_sync_t *sync, const voi
    } while (1);

    if (result_addr) {
-        memcpy (result_addr, (void *)((intptr_t) &new_value) + offset, extent);
+        memcpy (result_addr, (void *)((intptr_t) &new_value + offset), extent);
    }

    if (OPAL_SUCCESS == ret) {
@ -696,11 +694,9 @@ static inline int cas_rdma (ompi_osc_rdma_sync_t *sync, const void *source_addr,
                            mca_btl_base_registration_handle_t *target_handle, bool lock_acquired)
 {
    ompi_osc_rdma_module_t *module = sync->module;
-    const size_t btl_alignment_mask = ALIGNMENT_MASK(module->selected_btl->btl_get_alignment);
-    unsigned long offset, aligned_len, len = datatype->super.size;
+    unsigned long len = datatype->super.size;
    mca_btl_base_registration_handle_t *local_handle = NULL;
    ompi_osc_rdma_frag_t *frag = NULL;
-    ompi_osc_rdma_request_t *request;
    volatile bool complete = false;
    /* drop the const. this code will not attempt to change the value */
    char *ptr = (char *) source_addr;
--- a/ompi/mca/osc/rdma/osc_rdma_active_target.c
+++ b/ompi/mca/osc/rdma/osc_rdma_active_target.c
@ -16,7 +16,7 @@
 * Copyright (c) 2017      The University of Tennessee and The University
 *                         of Tennessee Research Foundation.  All rights
 *                         reserved.
- * Copyright (c) 2017      Intel, Inc. All rights reserved.
+ * Copyright (c) 2017-2018 Intel, Inc. All rights reserved.
 * $COPYRIGHT$
 *
 * Additional copyrights may follow
@ -80,7 +80,7 @@ void ompi_osc_rdma_atomic_complete (mca_btl_base_module_t *btl, struct mca_btl_b
 {
    ompi_osc_rdma_pending_op_t *pending_op = (ompi_osc_rdma_pending_op_t *) context;

-    OSC_RDMA_VERBOSE(MCA_BASE_VERBOSE_INFO, "pending atomic %p complete with status %d", pending_op, status);
+    OSC_RDMA_VERBOSE(MCA_BASE_VERBOSE_INFO, "pending atomic %p complete with status %d", (void*)pending_op, status);

    if (pending_op->op_result) {
        memmove (pending_op->op_result, pending_op->op_buffer, pending_op->op_size);
@ -296,7 +296,6 @@ int ompi_osc_rdma_post_atomic (ompi_group_t *group, int assert, ompi_win_t *win)
 {
    ompi_osc_rdma_module_t *module = GET_MODULE(win);
    ompi_osc_rdma_peer_t **peers;
-    int my_rank = ompi_comm_rank (module->comm);
    ompi_osc_rdma_state_t *state = module->state;
    int ret = OMPI_SUCCESS;

--- a/ompi/mca/osc/rdma/osc_rdma_comm.c
+++ b/ompi/mca/osc/rdma/osc_rdma_comm.c
@ -2,7 +2,7 @@
 /*
 * Copyright (c) 2014-2018 Los Alamos National Security, LLC.  All rights
 *                         reserved.
- * Copyright (c) 2016      Intel, Inc.  All rights reserved.
+ * Copyright (c) 2016-2018 Intel, Inc. All rights reserved.
 * Copyright (c) 2017      Research Organization for Information Science
 *                         and Technology (RIST). All rights reserved.
 * Copyright (c) 2017      IBM Corporation. All rights reserved.
@ -492,6 +492,7 @@ static int ompi_osc_rdma_put_real (ompi_osc_rdma_sync_t *sync, ompi_osc_rdma_pee
    return ret;
 }

+#if 0
 static void ompi_osc_rdma_aggregate_append (ompi_osc_rdma_aggregation_t *aggregation, ompi_osc_rdma_request_t *request,
                                            void *source_buffer, size_t size)
 {
@ -550,13 +551,16 @@ static int ompi_osc_rdma_aggregate_alloc (ompi_osc_rdma_sync_t *sync, ompi_osc_r

    return OMPI_SUCCESS;
 }
+#endif

 int ompi_osc_rdma_put_contig (ompi_osc_rdma_sync_t *sync, ompi_osc_rdma_peer_t *peer, uint64_t target_address,
                              mca_btl_base_registration_handle_t *target_handle, void *source_buffer, size_t size,
                              ompi_osc_rdma_request_t *request)
 {
    ompi_osc_rdma_module_t *module = sync->module;
+#if 0
    ompi_osc_rdma_aggregation_t *aggregation = peer->aggregate;
+#endif
    mca_btl_base_registration_handle_t *local_handle = NULL;
    mca_btl_base_rdma_completion_fn_t cbfunc = NULL;
    ompi_osc_rdma_frag_t *frag = NULL;
--- a/ompi/mca/osc/rdma/osc_rdma_passive_target.c
+++ b/ompi/mca/osc/rdma/osc_rdma_passive_target.c
@ -12,6 +12,7 @@
 *                         reserved.
 * Copyright (c) 2010      IBM Corporation.  All rights reserved.
 * Copyright (c) 2012-2013 Sandia National Laboratories.  All rights reserved.
+ * Copyright (c) 2018      Intel, Inc. All rights reserved.
 * $COPYRIGHT$
 *
 * Additional copyrights may follow
@ -202,7 +203,7 @@ int ompi_osc_rdma_demand_lock_peer (ompi_osc_rdma_module_t *module, ompi_osc_rdm
    } while (0);
    );

-    return OMPI_SUCCESS;
+    return ret;
 }

 int ompi_osc_rdma_lock_atomic (int lock_type, int target, int assert, ompi_win_t *win)
--- a/orte/mca/plm/base/plm_base_launch_support.c
+++ b/orte/mca/plm/base/plm_base_launch_support.c
@ -190,9 +190,17 @@ void orte_plm_base_allocation_complete(int fd, short args, void *cbdata)

    ORTE_ACQUIRE_OBJECT(caddy);

-    /* move the state machine along */
-    caddy->jdata->state = ORTE_JOB_STATE_ALLOCATION_COMPLETE;
-    ORTE_ACTIVATE_JOB_STATE(caddy->jdata, ORTE_JOB_STATE_LAUNCH_DAEMONS);
+    /* if we don't want to launch, then we at least want
+     * to map so we can see where the procs would have
+     * gone - so skip to the mapping state */
+    if (orte_do_not_launch) {
+        caddy->jdata->state = ORTE_JOB_STATE_ALLOCATION_COMPLETE;
+        ORTE_ACTIVATE_JOB_STATE(caddy->jdata, ORTE_JOB_STATE_MAP);
+    } else {
+        /* move the state machine along */
+        caddy->jdata->state = ORTE_JOB_STATE_ALLOCATION_COMPLETE;
+        ORTE_ACTIVATE_JOB_STATE(caddy->jdata, ORTE_JOB_STATE_LAUNCH_DAEMONS);
+    }

    /* cleanup */
    OBJ_RELEASE(caddy);
--- a/orte/mca/ras/base/ras_base_node.c
+++ b/orte/mca/ras/base/ras_base_node.c
@ -11,7 +11,7 @@
 *                         All rights reserved.
 * Copyright (c) 2011-2017 Los Alamos National Security, LLC.  All rights
 *                         reserved.
- * Copyright (c) 2014-2017 Intel, Inc. All rights reserved.
+ * Copyright (c) 2014-2018 Intel, Inc.  All rights reserved.
 * Copyright (c) 2015      Research Organization for Information Science
 *                         and Technology (RIST). All rights reserved.
 * $COPYRIGHT$
@ -50,6 +50,8 @@ int orte_ras_base_node_insert(opal_list_t* nodes, orte_job_t *jdata)
    bool hnp_alone = true, skiphnp = false;
    orte_attribute_t *kv;
    char **alias=NULL, **nalias;
+    orte_proc_t *daemon;
+    orte_job_t *djob;

    /* get the number of nodes */
    num_nodes = (orte_std_cntr_t)opal_list_get_size(nodes);
@ -76,6 +78,9 @@ int orte_ras_base_node_insert(opal_list_t* nodes, orte_job_t *jdata)
        return rc;
    }

+    /* get the daemon job - it is only used below when we are not launching */
+    djob = orte_get_job_data_object(ORTE_PROC_MY_NAME->jobid);
+
    /* get the hnp node's info */
    hnp_node = (orte_node_t*)opal_pointer_array_get_item(orte_node_pool, 0);

@ -189,6 +194,21 @@ int orte_ras_base_node_insert(opal_list_t* nodes, orte_job_t *jdata)
                ORTE_ERROR_LOG(rc);
                return rc;
            }
+            if (orte_do_not_launch) {
+                /* create a daemon for this node since we won't be launching
+                 * and the mapper needs to see a daemon - this is used solely
+                 * for testing the mappers */
+                daemon = OBJ_NEW(orte_proc_t);
+                daemon->name.jobid = ORTE_PROC_MY_NAME->jobid;
+                daemon->name.vpid = node->index;
+                daemon->state = ORTE_PROC_STATE_RUNNING;
+                OBJ_RETAIN(node);
+                daemon->node = node;
+                opal_pointer_array_set_item(djob->procs, daemon->name.vpid, daemon);
+                djob->num_procs++;
+                OBJ_RETAIN(daemon);
+                node->daemon = daemon;
+            }
            /* update the total slots in the job */
            orte_ras_base.total_slots_alloc += node->slots;
            /* check if we have fqdn names in the allocation */
--- a/orte/mca/ras/simulator/ras_sim_module.c
+++ b/orte/mca/ras/simulator/ras_sim_module.c
@ -3,7 +3,7 @@
 * Copyright (c) 2012      Los Alamos National Security, LLC. All rights reserved
 * Copyright (c) 2015-2017 Research Organization for Information Science
 *                         and Technology (RIST). All rights reserved.
- * Copyright (c) 2015-2017 Intel, Inc.  All rights reserved.
+ * Copyright (c) 2015-2018 Intel, Inc.  All rights reserved.
 *
 * $COPYRIGHT$
 *
@ -23,6 +23,7 @@
 #include "opal/mca/hwloc/hwloc-internal.h"
 #include "opal/util/argv.h"

+#include "orte/mca/errmgr/errmgr.h"
 #include "orte/util/show_help.h"
 #include "orte/runtime/orte_globals.h"

@ -179,6 +180,10 @@ static int allocate(orte_job_t *jdata, opal_list_t *nodes)
            support = (struct hwloc_topology_support*)hwloc_topology_get_support(topo);
            support->cpubind->set_thisproc_cpubind = mca_ras_simulator_component.have_cpubind;
            support->membind->set_thisproc_membind = mca_ras_simulator_component.have_membind;
+            /* pass it through the filter so we create the summaries required by the mappers */
+            if (OPAL_SUCCESS != opal_hwloc_base_filter_cpus(topo)) {
+                ORTE_ERROR_LOG(ORTE_ERROR);
+            }
            /* add it to our array */
            t = OBJ_NEW(orte_topology_t);
            t->topo = topo;
--- a/orte/mca/rmaps/base/rmaps_base_binding.c
+++ b/orte/mca/rmaps/base/rmaps_base_binding.c
@ -12,7 +12,7 @@
 * Copyright (c) 2011-2014 Cisco Systems, Inc.  All rights reserved.
 * Copyright (c) 2011-2012 Los Alamos National Security, LLC.
 *                         All rights reserved.
- * Copyright (c) 2013-2017 Intel, Inc.  All rights reserved.
+ * Copyright (c) 2013-2018 Intel, Inc.  All rights reserved.
 * Copyright (c) 2015-2017 Research Organization for Information Science
 *                         and Technology (RIST). All rights reserved.
 * $COPYRIGHT$
@ -246,7 +246,7 @@ static int bind_downwards(orte_job_t *jdata,
                          hwloc_obj_type_t target,
                          unsigned cache_level)
 {
-    int j;
+    int j, rc;
    orte_job_map_t *map;
    orte_proc_t *proc;
    hwloc_obj_t trg_obj, nxt_obj;
@ -367,7 +367,10 @@ static int bind_downwards(orte_job_t *jdata,
                            ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                            ORTE_NAME_PRINT(&proc->name), node->name);
            } else {
-                opal_hwloc_base_cset2mapstr(tmp2, sizeof(tmp2), node->topology->topo, totalcpuset);
+                rc = opal_hwloc_base_cset2mapstr(tmp2, sizeof(tmp2), node->topology->topo, totalcpuset);
+                if (OPAL_SUCCESS != rc) {
+                    ORTE_ERROR_LOG(rc);
+                }
                opal_output(orte_rmaps_base_framework.framework_output,
                            "%s BOUND PROC %s[%s] TO %s: %s",
                            ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
@ -841,7 +844,8 @@ int orte_rmaps_base_compute_bindings(orte_job_t *jdata)
        if (NULL == (node = (orte_node_t*)opal_pointer_array_get_item(jdata->map->nodes, i))) {
            continue;
        }
-        if (!orte_no_vm && (int)ORTE_PROC_MY_NAME->vpid != node->index) {
+        if (!orte_no_vm && !orte_do_not_launch &&
+            (int)ORTE_PROC_MY_NAME->vpid != node->index) {
            continue;
        }
        if (!orte_do_not_launch) {
--- a/orte/mca/rmaps/base/rmaps_base_map_job.c
+++ b/orte/mca/rmaps/base/rmaps_base_map_job.c
@ -417,7 +417,33 @@ void orte_rmaps_base_map_job(int fd, short args, void *cbdata)
        }
    }

-    if (!orte_get_attribute(&jdata->attributes, ORTE_JOB_FULLY_DESCRIBED, NULL, OPAL_BOOL)) {
+    if (orte_do_not_launch) {
+        /* compute the ranks and add the proc objects
+         * to the jdata->procs array */
+        if (ORTE_SUCCESS != (rc = orte_rmaps_base_compute_vpids(jdata))) {
+            ORTE_ERROR_LOG(rc);
+            ORTE_ACTIVATE_JOB_STATE(jdata, ORTE_JOB_STATE_MAP_FAILED);
+            goto cleanup;
+        }
+        /* compute and save local ranks */
+        if (ORTE_SUCCESS != (rc = orte_rmaps_base_compute_local_ranks(jdata))) {
+            ORTE_ERROR_LOG(rc);
+            ORTE_ACTIVATE_JOB_STATE(jdata, ORTE_JOB_STATE_MAP_FAILED);
+            goto cleanup;
+        }
+        /* compute and save location assignments */
+        if (ORTE_SUCCESS != (rc = orte_rmaps_base_assign_locations(jdata))) {
+            ORTE_ERROR_LOG(rc);
+            ORTE_ACTIVATE_JOB_STATE(jdata, ORTE_JOB_STATE_MAP_FAILED);
+            goto cleanup;
+        }
+        /* compute and save bindings */
+        if (ORTE_SUCCESS != (rc = orte_rmaps_base_compute_bindings(jdata))) {
+            ORTE_ERROR_LOG(rc);
+            ORTE_ACTIVATE_JOB_STATE(jdata, ORTE_JOB_STATE_MAP_FAILED);
+            goto cleanup;
+        }
+    } else if (!orte_get_attribute(&jdata->attributes, ORTE_JOB_FULLY_DESCRIBED, NULL, OPAL_BOOL)) {
        /* compute and save location assignments */
        if (ORTE_SUCCESS != (rc = orte_rmaps_base_assign_locations(jdata))) {
            ORTE_ERROR_LOG(rc);
@ -454,6 +480,11 @@ void orte_rmaps_base_map_job(int fd, short args, void *cbdata)
        }
    }

+    if (orte_do_not_launch) {
+        /* display the devel map */
+        orte_rmaps_base_display_map(jdata);
+    }
+
    /* set the job state to the next position */
    ORTE_ACTIVATE_JOB_STATE(jdata, ORTE_JOB_STATE_MAP_COMPLETE);

--- a/orte/mca/rmaps/base/rmaps_base_ranking.c
+++ b/orte/mca/rmaps/base/rmaps_base_ranking.c
@ -10,7 +10,7 @@
 * Copyright (c) 2004-2005 The Regents of the University of California.
 *                         All rights reserved.
 * Copyright (c) 2011-2017 Cisco Systems, Inc.  All rights reserved
- * Copyright (c) 2014-2017 Intel, Inc.  All rights reserved.
+ * Copyright (c) 2014-2018 Intel, Inc.  All rights reserved.
 * Copyright (c) 2017      Research Organization for Information Science
 *                         and Technology (RIST). All rights reserved.
 * $COPYRIGHT$
@ -304,15 +304,15 @@ static int rank_by(orte_job_t *jdata,
 {
    orte_app_context_t *app;
    hwloc_obj_t obj;
-    int num_objs, i, j, m, n, rc;
+    int num_objs, i, j, m, n, rc, nn;
    orte_vpid_t num_ranked=0;
    orte_node_t *node;
    orte_proc_t *proc, *pptr;
-    orte_vpid_t vpid;
+    orte_vpid_t vpid, np;
    int cnt;
    opal_pointer_array_t objs;
-    bool all_done;
    hwloc_obj_t locale;
+    orte_app_idx_t napp;

    if (ORTE_RANKING_SPAN & ORTE_GET_RANKING_DIRECTIVE(jdata->map->ranking)) {
        return rank_span(jdata, target, cache_level);
@ -333,20 +333,21 @@ static int rank_by(orte_job_t *jdata,
     */

    vpid = 0;
-    for (n=0; n < jdata->apps->size; n++) {
+    for (n=0, napp=0; napp < jdata->num_apps && n < jdata->apps->size; n++) {
        if (NULL == (app = (orte_app_context_t*)opal_pointer_array_get_item(jdata->apps, n))) {
            continue;
        }
-
+        napp++;
        /* setup the pointer array */
        OBJ_CONSTRUCT(&objs, opal_pointer_array_t);
        opal_pointer_array_init(&objs, 2, INT_MAX, 2);

        cnt = 0;
-        for (m=0; m < jdata->map->nodes->size; m++) {
+        for (m=0, nn=0; nn < jdata->map->num_nodes && m < jdata->map->nodes->size; m++) {
            if (NULL == (node = (orte_node_t*)opal_pointer_array_get_item(jdata->map->nodes, m))) {
                continue;
            }
+            nn++;

            /* get the number of objects - only consider those we can actually use */
            num_objs = opal_hwloc_base_get_nbobjs_by_type(node->topology->topo, target,
@ -376,80 +377,83 @@ static int rank_by(orte_job_t *jdata,
             * Perhaps someday someone will come up with a more efficient
             * algorithm, but this works for now.
             */
-            all_done = false;
-            while (!all_done && cnt < app->num_procs) {
-                all_done = true;
-                /* cycle across the objects */
-                for (i=0; i < num_objs && cnt < app->num_procs && all_done; i++) {
-                    obj = (hwloc_obj_t)opal_pointer_array_get_item(&objs, i);
-                    /* find the next proc for this job and app_context */
-                    for (j=0; j < node->procs->size && cnt < app->num_procs; j++) {
-                        if (NULL == (proc = (orte_proc_t*)opal_pointer_array_get_item(node->procs, j))) {
-                            continue;
-                        }
-                        /* ignore procs from other jobs */
-                        if (proc->name.jobid != jdata->jobid) {
-                            opal_output_verbose(5, orte_rmaps_base_framework.framework_output,
-                                                "mca:rmaps:rank_by skipping proc %s - from another job, num_ranked %d",
-                                                ORTE_NAME_PRINT(&proc->name), num_ranked);
-                            continue;
-                        }
-                        /* ignore procs that are already ranked */
-                        if (ORTE_VPID_INVALID != proc->name.vpid) {
-                            opal_output_verbose(5, orte_rmaps_base_framework.framework_output,
-                                                "mca:rmaps:rank_by skipping proc %s - already ranked, num_ranked %d",
-                                                ORTE_NAME_PRINT(&proc->name), num_ranked);
-                            continue;
-                        }
-                        /* ignore procs from other apps */
-                        if (proc->app_idx != app->idx) {
-                            opal_output_verbose(5, orte_rmaps_base_framework.framework_output,
-                                                "mca:rmaps:rank_by skipping proc %s - from another app, num_ranked %d",
-                                                ORTE_NAME_PRINT(&proc->name), num_ranked);
-                            continue;
-                        }
-                         /* protect against bozo case */
-                        locale = NULL;
-                        if (!orte_get_attribute(&proc->attributes, ORTE_PROC_HWLOC_LOCALE, (void**)&locale, OPAL_PTR)) {
-                            ORTE_ERROR_LOG(ORTE_ERROR);
-                            return ORTE_ERROR;
-                        }
-                        /* ignore procs not on this object */
-                        if (NULL == locale ||
-                            !hwloc_bitmap_intersects(obj->cpuset, locale->cpuset)) {
-                            opal_output_verbose(5, orte_rmaps_base_framework.framework_output,
-                                                "mca:rmaps:rank_by: proc at position %d is not on object %d",
-                                                j, i);
-                            continue;
-                        }
-                        /* assign the vpid */
-                        proc->name.vpid = vpid++;
-                        if (0 == cnt) {
-                            app->first_rank = proc->name.vpid;
-                        }
-                        cnt++;
-                        opal_output_verbose(5, orte_rmaps_base_framework.framework_output,
-                                            "mca:rmaps:rank_by: assigned rank %s", ORTE_VPID_PRINT(proc->name.vpid));
-                        /* insert the proc into the jdata array */
-                        if (NULL != (pptr = (orte_proc_t*)opal_pointer_array_get_item(jdata->procs, proc->name.vpid))) {
-                            OBJ_RELEASE(pptr);
-                        }
-                        OBJ_RETAIN(proc);
-                        if (ORTE_SUCCESS != (rc = opal_pointer_array_set_item(jdata->procs, proc->name.vpid, proc))) {
-                            ORTE_ERROR_LOG(rc);
-                            OBJ_DESTRUCT(&objs);
-                            return rc;
-                        }
-                       /* flag that one was mapped */
-                        all_done = false;
-                        /* track where the highest vpid landed - this is our
-                         * new bookmark
-                         */
-                        jdata->bookmark = node;
-                        /* move to next object */
-                        break;
-                    }
+            i = 0;
+            while (cnt < app->num_procs) {
+                /* get the next object */
+                obj = (hwloc_obj_t)opal_pointer_array_get_item(&objs, i);
+                if (NULL == obj) {
+                    break;
                }
+                /* scan across the procs and find the one that is on this object */
+                np = 0;
+                for (j=0; np < node->num_procs && j < node->procs->size && cnt < app->num_procs; j++) {
+                    if (NULL == (proc = (orte_proc_t*)opal_pointer_array_get_item(node->procs, j))) {
+                        continue;
+                    }
+                    np++;
+                    /* ignore procs from other jobs */
+                    if (proc->name.jobid != jdata->jobid) {
+                        opal_output_verbose(5, orte_rmaps_base_framework.framework_output,
+                                            "mca:rmaps:rank_by skipping proc %s - from another job, num_ranked %d",
+                                            ORTE_NAME_PRINT(&proc->name), num_ranked);
+                        continue;
+                    }
+                    /* ignore procs that are already ranked */
+                    if (ORTE_VPID_INVALID != proc->name.vpid) {
+                        opal_output_verbose(5, orte_rmaps_base_framework.framework_output,
+                                            "mca:rmaps:rank_by skipping proc %s - already ranked, num_ranked %d",
+                                            ORTE_NAME_PRINT(&proc->name), num_ranked);
+                        continue;
+                    }
+                    /* ignore procs from other apps */
+                    if (proc->app_idx != app->idx) {
+                        opal_output_verbose(5, orte_rmaps_base_framework.framework_output,
+                                            "mca:rmaps:rank_by skipping proc %s - from another app, num_ranked %d",
+                                            ORTE_NAME_PRINT(&proc->name), num_ranked);
+                        continue;
+                    }
+                     /* protect against bozo case */
+                    locale = NULL;
+                    if (!orte_get_attribute(&proc->attributes, ORTE_PROC_HWLOC_LOCALE, (void**)&locale, OPAL_PTR)) {
+                        ORTE_ERROR_LOG(ORTE_ERROR);
+                        return ORTE_ERROR;
+                    }
+                    /* ignore procs not on this object */
+                    if (NULL == locale ||
+                        !hwloc_bitmap_intersects(obj->cpuset, locale->cpuset)) {
+                        opal_output_verbose(5, orte_rmaps_base_framework.framework_output,
+                                            "mca:rmaps:rank_by: proc at position %d is not on object %d",
+                                            j, i);
+                        continue;
+                    }
+                    /* assign the vpid */
+                    proc->name.vpid = vpid++;
+                    if (0 == cnt) {
+                        app->first_rank = proc->name.vpid;
+                    }
+                    cnt++;
+                    opal_output_verbose(5, orte_rmaps_base_framework.framework_output,
+                                        "mca:rmaps:rank_by: proc in position %d is on object %d assigned rank %s",
+                                        j, i, ORTE_VPID_PRINT(proc->name.vpid));
+                    /* insert the proc into the jdata array */
+                    if (NULL != (pptr = (orte_proc_t*)opal_pointer_array_get_item(jdata->procs, proc->name.vpid))) {
+                        OBJ_RELEASE(pptr);
+                    }
+                    OBJ_RETAIN(proc);
+                    if (ORTE_SUCCESS != (rc = opal_pointer_array_set_item(jdata->procs, proc->name.vpid, proc))) {
+                        ORTE_ERROR_LOG(rc);
+                        OBJ_DESTRUCT(&objs);
+                        return rc;
+                    }
+                    num_ranked++;
+                    /* track where the highest vpid landed - this is our
+                     * new bookmark
+                     */
+                    jdata->bookmark = node;
+                    /* move to next object */
+                    break;
+                }
+                i++;
            }
        }
        /* cleanup */
@ -473,6 +477,9 @@ int orte_rmaps_base_compute_vpids(orte_job_t *jdata)

    map = jdata->map;

+    opal_output_verbose(5, orte_rmaps_base_framework.framework_output,
+                        "RANKING POLICY: %s", orte_rmaps_base_print_ranking(map->ranking));
+
    /* start with the rank-by object options - if the object isn't
     * included in the topology, then we obviously cannot rank by it.
     * However, if this was the default ranking policy (as opposed to