When doing comm-spawn, track the last object we bound to and start the next job on the following object, so that we avoid overloading objects when it isn't necessary.
Этот коммит содержится в:
родитель
8ab2b11f88
Коммит
869b2891c4
@ -224,6 +224,8 @@ void orte_plm_base_recv(int status, orte_process_name_t* sender,
|
|||||||
jdata->bookmark = parent->bookmark;
|
jdata->bookmark = parent->bookmark;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
/* provide the parent's last object */
|
||||||
|
jdata->bkmark_obj = parent->bkmark_obj;
|
||||||
|
|
||||||
/* launch it */
|
/* launch it */
|
||||||
OPAL_OUTPUT_VERBOSE((5, orte_plm_base_framework.framework_output,
|
OPAL_OUTPUT_VERBOSE((5, orte_plm_base_framework.framework_output,
|
||||||
|
@ -500,6 +500,11 @@ int orte_rmaps_rr_byobj(orte_job_t *jdata,
|
|||||||
"mca:rmaps:rr: found %u %s objects on node %s",
|
"mca:rmaps:rr: found %u %s objects on node %s",
|
||||||
nobjs, hwloc_obj_type_string(target), node->name);
|
nobjs, hwloc_obj_type_string(target), node->name);
|
||||||
|
|
||||||
|
/* if this is a comm_spawn situation, start with the object
|
||||||
|
* where the parent left off and increment */
|
||||||
|
if (ORTE_JOBID_INVALID != jdata->originator.jobid) {
|
||||||
|
start = (jdata->bkmark_obj + 1) % nobjs;
|
||||||
|
}
|
||||||
/* compute the number of procs to go on this node */
|
/* compute the number of procs to go on this node */
|
||||||
nprocs = (node->slots - node->slots_inuse) / orte_rmaps_base.cpus_per_rank;
|
nprocs = (node->slots - node->slots_inuse) / orte_rmaps_base.cpus_per_rank;
|
||||||
opal_output_verbose(2, orte_rmaps_base_framework.framework_output,
|
opal_output_verbose(2, orte_rmaps_base_framework.framework_output,
|
||||||
|
@ -651,6 +651,7 @@ static void orte_job_construct(orte_job_t* job)
|
|||||||
ORTE_GLOBAL_ARRAY_BLOCK_SIZE);
|
ORTE_GLOBAL_ARRAY_BLOCK_SIZE);
|
||||||
job->map = NULL;
|
job->map = NULL;
|
||||||
job->bookmark = NULL;
|
job->bookmark = NULL;
|
||||||
|
job->bkmark_obj = 0;
|
||||||
job->state = ORTE_JOB_STATE_UNDEF;
|
job->state = ORTE_JOB_STATE_UNDEF;
|
||||||
|
|
||||||
job->num_mapped = 0;
|
job->num_mapped = 0;
|
||||||
|
@ -333,6 +333,9 @@ typedef struct {
|
|||||||
* indicates the node where we stopped
|
* indicates the node where we stopped
|
||||||
*/
|
*/
|
||||||
orte_node_t *bookmark;
|
orte_node_t *bookmark;
|
||||||
|
/* if we are binding, bookmark the index of the
|
||||||
|
* last object we bound to */
|
||||||
|
unsigned int bkmark_obj;
|
||||||
/* state of the overall job */
|
/* state of the overall job */
|
||||||
orte_job_state_t state;
|
orte_job_state_t state;
|
||||||
/* number of procs mapped */
|
/* number of procs mapped */
|
||||||
|
Загрузка…
x
Ссылка в новой задаче
Block a user