From 869b2891c4e5d4a9c29a3ea270aab4b059525ae0 Mon Sep 17 00:00:00 2001 From: Ralph Castain Date: Wed, 17 Jun 2015 09:20:08 -0700 Subject: [PATCH] When doing comm-spawn, track the last object we bound to and ensure that we start the next job on the next object so we avoid overload situations when they aren't necessary --- orte/mca/plm/base/plm_base_receive.c | 2 ++ orte/mca/rmaps/round_robin/rmaps_rr_mappers.c | 5 +++++ orte/runtime/orte_globals.c | 1 + orte/runtime/orte_globals.h | 3 +++ 4 files changed, 11 insertions(+) diff --git a/orte/mca/plm/base/plm_base_receive.c b/orte/mca/plm/base/plm_base_receive.c index a990d711b5..22957dd09c 100644 --- a/orte/mca/plm/base/plm_base_receive.c +++ b/orte/mca/plm/base/plm_base_receive.c @@ -224,6 +224,8 @@ void orte_plm_base_recv(int status, orte_process_name_t* sender, jdata->bookmark = parent->bookmark; } } + /* provide the parent's last object */ + jdata->bkmark_obj = parent->bkmark_obj; /* launch it */ OPAL_OUTPUT_VERBOSE((5, orte_plm_base_framework.framework_output, diff --git a/orte/mca/rmaps/round_robin/rmaps_rr_mappers.c b/orte/mca/rmaps/round_robin/rmaps_rr_mappers.c index 66c6994fc8..629bad8152 100644 --- a/orte/mca/rmaps/round_robin/rmaps_rr_mappers.c +++ b/orte/mca/rmaps/round_robin/rmaps_rr_mappers.c @@ -500,6 +500,11 @@ int orte_rmaps_rr_byobj(orte_job_t *jdata, "mca:rmaps:rr: found %u %s objects on node %s", nobjs, hwloc_obj_type_string(target), node->name); + /* if this is a comm_spawn situation, start with the object + * where the parent left off and increment */ + if (ORTE_JOBID_INVALID != jdata->originator.jobid) { + start = (jdata->bkmark_obj + 1) % nobjs; + } /* compute the number of procs to go on this node */ nprocs = (node->slots - node->slots_inuse) / orte_rmaps_base.cpus_per_rank; opal_output_verbose(2, orte_rmaps_base_framework.framework_output, diff --git a/orte/runtime/orte_globals.c b/orte/runtime/orte_globals.c index bb62393b3c..b849235c03 100644 --- a/orte/runtime/orte_globals.c +++ b/orte/runtime/orte_globals.c @@ -651,6 +651,7 @@ static void orte_job_construct(orte_job_t* job) ORTE_GLOBAL_ARRAY_BLOCK_SIZE); job->map = NULL; job->bookmark = NULL; + job->bkmark_obj = 0; job->state = ORTE_JOB_STATE_UNDEF; job->num_mapped = 0; diff --git a/orte/runtime/orte_globals.h b/orte/runtime/orte_globals.h index 16fcce708f..e6e387a54c 100644 --- a/orte/runtime/orte_globals.h +++ b/orte/runtime/orte_globals.h @@ -333,6 +333,9 @@ typedef struct { * indicates the node where we stopped */ orte_node_t *bookmark; + /* if we are binding, bookmark the index of the + * last object we bound to */ + unsigned int bkmark_obj; /* state of the overall job */ orte_job_state_t state; /* number of procs mapped */