Fix a breakage in the ranking system
While it may be faster to reverse the order of the assignment loops, it also results in the wrong answer. Signed-off-by: Ralph Castain <rhc@open-mpi.org>
Этот коммит содержится в:
родитель
c1c0c02f06
Коммит
322f6c5056
@ -190,9 +190,17 @@ void orte_plm_base_allocation_complete(int fd, short args, void *cbdata)
|
||||
|
||||
ORTE_ACQUIRE_OBJECT(caddy);
|
||||
|
||||
/* move the state machine along */
|
||||
caddy->jdata->state = ORTE_JOB_STATE_ALLOCATION_COMPLETE;
|
||||
ORTE_ACTIVATE_JOB_STATE(caddy->jdata, ORTE_JOB_STATE_LAUNCH_DAEMONS);
|
||||
/* if we don't want to launch, then we at least want
|
||||
* to map so we can see where the procs would have
|
||||
* gone - so skip to the mapping state */
|
||||
if (orte_do_not_launch) {
|
||||
caddy->jdata->state = ORTE_JOB_STATE_ALLOCATION_COMPLETE;
|
||||
ORTE_ACTIVATE_JOB_STATE(caddy->jdata, ORTE_JOB_STATE_MAP);
|
||||
} else {
|
||||
/* move the state machine along */
|
||||
caddy->jdata->state = ORTE_JOB_STATE_ALLOCATION_COMPLETE;
|
||||
ORTE_ACTIVATE_JOB_STATE(caddy->jdata, ORTE_JOB_STATE_LAUNCH_DAEMONS);
|
||||
}
|
||||
|
||||
/* cleanup */
|
||||
OBJ_RELEASE(caddy);
|
||||
|
@ -11,7 +11,7 @@
|
||||
* All rights reserved.
|
||||
* Copyright (c) 2011-2017 Los Alamos National Security, LLC. All rights
|
||||
* reserved.
|
||||
* Copyright (c) 2014-2017 Intel, Inc. All rights reserved.
|
||||
* Copyright (c) 2014-2018 Intel, Inc. All rights reserved.
|
||||
* Copyright (c) 2015 Research Organization for Information Science
|
||||
* and Technology (RIST). All rights reserved.
|
||||
* $COPYRIGHT$
|
||||
@ -50,6 +50,8 @@ int orte_ras_base_node_insert(opal_list_t* nodes, orte_job_t *jdata)
|
||||
bool hnp_alone = true, skiphnp = false;
|
||||
orte_attribute_t *kv;
|
||||
char **alias=NULL, **nalias;
|
||||
orte_proc_t *daemon;
|
||||
orte_job_t *djob;
|
||||
|
||||
/* get the number of nodes */
|
||||
num_nodes = (orte_std_cntr_t)opal_list_get_size(nodes);
|
||||
@ -76,6 +78,9 @@ int orte_ras_base_node_insert(opal_list_t* nodes, orte_job_t *jdata)
|
||||
return rc;
|
||||
}
|
||||
|
||||
/* if we are not launching, get the daemon job */
|
||||
djob = orte_get_job_data_object(ORTE_PROC_MY_NAME->jobid);
|
||||
|
||||
/* get the hnp node's info */
|
||||
hnp_node = (orte_node_t*)opal_pointer_array_get_item(orte_node_pool, 0);
|
||||
|
||||
@ -189,6 +194,21 @@ int orte_ras_base_node_insert(opal_list_t* nodes, orte_job_t *jdata)
|
||||
ORTE_ERROR_LOG(rc);
|
||||
return rc;
|
||||
}
|
||||
if (orte_do_not_launch) {
|
||||
/* create a daemon for this node since we won't be launching
|
||||
* and the mapper needs to see a daemon - this is used solely
|
||||
* for testing the mappers */
|
||||
daemon = OBJ_NEW(orte_proc_t);
|
||||
daemon->name.jobid = ORTE_PROC_MY_NAME->jobid;
|
||||
daemon->name.vpid = node->index;
|
||||
daemon->state = ORTE_PROC_STATE_RUNNING;
|
||||
OBJ_RETAIN(node);
|
||||
daemon->node = node;
|
||||
opal_pointer_array_set_item(djob->procs, daemon->name.vpid, daemon);
|
||||
djob->num_procs++;
|
||||
OBJ_RETAIN(daemon);
|
||||
node->daemon = daemon;
|
||||
}
|
||||
/* update the total slots in the job */
|
||||
orte_ras_base.total_slots_alloc += node->slots;
|
||||
/* check if we have fqdn names in the allocation */
|
||||
|
@ -3,7 +3,7 @@
|
||||
* Copyright (c) 2012 Los Alamos National Security, LLC. All rights reserved
|
||||
* Copyright (c) 2015-2017 Research Organization for Information Science
|
||||
* and Technology (RIST). All rights reserved.
|
||||
* Copyright (c) 2015-2017 Intel, Inc. All rights reserved.
|
||||
* Copyright (c) 2015-2018 Intel, Inc. All rights reserved.
|
||||
*
|
||||
* $COPYRIGHT$
|
||||
*
|
||||
@ -23,6 +23,7 @@
|
||||
#include "opal/mca/hwloc/hwloc-internal.h"
|
||||
#include "opal/util/argv.h"
|
||||
|
||||
#include "orte/mca/errmgr/errmgr.h"
|
||||
#include "orte/util/show_help.h"
|
||||
#include "orte/runtime/orte_globals.h"
|
||||
|
||||
@ -179,6 +180,10 @@ static int allocate(orte_job_t *jdata, opal_list_t *nodes)
|
||||
support = (struct hwloc_topology_support*)hwloc_topology_get_support(topo);
|
||||
support->cpubind->set_thisproc_cpubind = mca_ras_simulator_component.have_cpubind;
|
||||
support->membind->set_thisproc_membind = mca_ras_simulator_component.have_membind;
|
||||
/* pass it thru the filter so we create the summaries required by the mappers */
|
||||
if (OPAL_SUCCESS != opal_hwloc_base_filter_cpus(topo)) {
|
||||
ORTE_ERROR_LOG(ORTE_ERROR);
|
||||
}
|
||||
/* add it to our array */
|
||||
t = OBJ_NEW(orte_topology_t);
|
||||
t->topo = topo;
|
||||
|
@ -12,7 +12,7 @@
|
||||
* Copyright (c) 2011-2014 Cisco Systems, Inc. All rights reserved.
|
||||
* Copyright (c) 2011-2012 Los Alamos National Security, LLC.
|
||||
* All rights reserved.
|
||||
* Copyright (c) 2013-2017 Intel, Inc. All rights reserved.
|
||||
* Copyright (c) 2013-2018 Intel, Inc. All rights reserved.
|
||||
* Copyright (c) 2015-2017 Research Organization for Information Science
|
||||
* and Technology (RIST). All rights reserved.
|
||||
* $COPYRIGHT$
|
||||
@ -246,7 +246,7 @@ static int bind_downwards(orte_job_t *jdata,
|
||||
hwloc_obj_type_t target,
|
||||
unsigned cache_level)
|
||||
{
|
||||
int j;
|
||||
int j, rc;
|
||||
orte_job_map_t *map;
|
||||
orte_proc_t *proc;
|
||||
hwloc_obj_t trg_obj, nxt_obj;
|
||||
@ -367,7 +367,10 @@ static int bind_downwards(orte_job_t *jdata,
|
||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
|
||||
ORTE_NAME_PRINT(&proc->name), node->name);
|
||||
} else {
|
||||
opal_hwloc_base_cset2mapstr(tmp2, sizeof(tmp2), node->topology->topo, totalcpuset);
|
||||
rc = opal_hwloc_base_cset2mapstr(tmp2, sizeof(tmp2), node->topology->topo, totalcpuset);
|
||||
if (OPAL_SUCCESS != rc) {
|
||||
ORTE_ERROR_LOG(rc);
|
||||
}
|
||||
opal_output(orte_rmaps_base_framework.framework_output,
|
||||
"%s BOUND PROC %s[%s] TO %s: %s",
|
||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
|
||||
@ -841,7 +844,8 @@ int orte_rmaps_base_compute_bindings(orte_job_t *jdata)
|
||||
if (NULL == (node = (orte_node_t*)opal_pointer_array_get_item(jdata->map->nodes, i))) {
|
||||
continue;
|
||||
}
|
||||
if (!orte_no_vm && (int)ORTE_PROC_MY_NAME->vpid != node->index) {
|
||||
if (!orte_no_vm && !orte_do_not_launch &&
|
||||
(int)ORTE_PROC_MY_NAME->vpid != node->index) {
|
||||
continue;
|
||||
}
|
||||
if (!orte_do_not_launch) {
|
||||
|
@ -417,7 +417,33 @@ void orte_rmaps_base_map_job(int fd, short args, void *cbdata)
|
||||
}
|
||||
}
|
||||
|
||||
if (!orte_get_attribute(&jdata->attributes, ORTE_JOB_FULLY_DESCRIBED, NULL, OPAL_BOOL)) {
|
||||
if (orte_do_not_launch) {
|
||||
/* compute the ranks and add the proc objects
|
||||
* to the jdata->procs array */
|
||||
if (ORTE_SUCCESS != (rc = orte_rmaps_base_compute_vpids(jdata))) {
|
||||
ORTE_ERROR_LOG(rc);
|
||||
ORTE_ACTIVATE_JOB_STATE(jdata, ORTE_JOB_STATE_MAP_FAILED);
|
||||
goto cleanup;
|
||||
}
|
||||
/* compute and save local ranks */
|
||||
if (ORTE_SUCCESS != (rc = orte_rmaps_base_compute_local_ranks(jdata))) {
|
||||
ORTE_ERROR_LOG(rc);
|
||||
ORTE_ACTIVATE_JOB_STATE(jdata, ORTE_JOB_STATE_MAP_FAILED);
|
||||
goto cleanup;
|
||||
}
|
||||
/* compute and save location assignments */
|
||||
if (ORTE_SUCCESS != (rc = orte_rmaps_base_assign_locations(jdata))) {
|
||||
ORTE_ERROR_LOG(rc);
|
||||
ORTE_ACTIVATE_JOB_STATE(jdata, ORTE_JOB_STATE_MAP_FAILED);
|
||||
goto cleanup;
|
||||
}
|
||||
/* compute and save bindings */
|
||||
if (ORTE_SUCCESS != (rc = orte_rmaps_base_compute_bindings(jdata))) {
|
||||
ORTE_ERROR_LOG(rc);
|
||||
ORTE_ACTIVATE_JOB_STATE(jdata, ORTE_JOB_STATE_MAP_FAILED);
|
||||
goto cleanup;
|
||||
}
|
||||
} else if (!orte_get_attribute(&jdata->attributes, ORTE_JOB_FULLY_DESCRIBED, NULL, OPAL_BOOL)) {
|
||||
/* compute and save location assignments */
|
||||
if (ORTE_SUCCESS != (rc = orte_rmaps_base_assign_locations(jdata))) {
|
||||
ORTE_ERROR_LOG(rc);
|
||||
@ -454,6 +480,11 @@ void orte_rmaps_base_map_job(int fd, short args, void *cbdata)
|
||||
}
|
||||
}
|
||||
|
||||
if (orte_do_not_launch) {
|
||||
/* display the devel map */
|
||||
orte_rmaps_base_display_map(jdata);
|
||||
}
|
||||
|
||||
/* set the job state to the next position */
|
||||
ORTE_ACTIVATE_JOB_STATE(jdata, ORTE_JOB_STATE_MAP_COMPLETE);
|
||||
|
||||
|
@ -10,7 +10,7 @@
|
||||
* Copyright (c) 2004-2005 The Regents of the University of California.
|
||||
* All rights reserved.
|
||||
* Copyright (c) 2011-2017 Cisco Systems, Inc. All rights reserved
|
||||
* Copyright (c) 2014-2017 Intel, Inc. All rights reserved.
|
||||
* Copyright (c) 2014-2018 Intel, Inc. All rights reserved.
|
||||
* Copyright (c) 2017 Research Organization for Information Science
|
||||
* and Technology (RIST). All rights reserved.
|
||||
* $COPYRIGHT$
|
||||
@ -379,35 +379,34 @@ static int rank_by(orte_job_t *jdata,
|
||||
all_done = false;
|
||||
while (!all_done && cnt < app->num_procs) {
|
||||
all_done = true;
|
||||
/* cycle across the objects */
|
||||
for (i=0; i < num_objs && cnt < app->num_procs && all_done; i++) {
|
||||
obj = (hwloc_obj_t)opal_pointer_array_get_item(&objs, i);
|
||||
/* find the next proc for this job and app_context */
|
||||
for (j=0; j < node->procs->size && cnt < app->num_procs; j++) {
|
||||
if (NULL == (proc = (orte_proc_t*)opal_pointer_array_get_item(node->procs, j))) {
|
||||
continue;
|
||||
}
|
||||
/* ignore procs from other jobs */
|
||||
if (proc->name.jobid != jdata->jobid) {
|
||||
opal_output_verbose(5, orte_rmaps_base_framework.framework_output,
|
||||
"mca:rmaps:rank_by skipping proc %s - from another job, num_ranked %d",
|
||||
ORTE_NAME_PRINT(&proc->name), num_ranked);
|
||||
continue;
|
||||
}
|
||||
/* ignore procs that are already ranked */
|
||||
if (ORTE_VPID_INVALID != proc->name.vpid) {
|
||||
opal_output_verbose(5, orte_rmaps_base_framework.framework_output,
|
||||
"mca:rmaps:rank_by skipping proc %s - already ranked, num_ranked %d",
|
||||
ORTE_NAME_PRINT(&proc->name), num_ranked);
|
||||
continue;
|
||||
}
|
||||
/* ignore procs from other apps */
|
||||
if (proc->app_idx != app->idx) {
|
||||
opal_output_verbose(5, orte_rmaps_base_framework.framework_output,
|
||||
"mca:rmaps:rank_by skipping proc %s - from another app, num_ranked %d",
|
||||
ORTE_NAME_PRINT(&proc->name), num_ranked);
|
||||
continue;
|
||||
}
|
||||
for (j=0; j < node->procs->size && cnt < app->num_procs; j++) {
|
||||
if (NULL == (proc = (orte_proc_t*)opal_pointer_array_get_item(node->procs, j))) {
|
||||
continue;
|
||||
}
|
||||
/* ignore procs from other jobs */
|
||||
if (proc->name.jobid != jdata->jobid) {
|
||||
opal_output_verbose(5, orte_rmaps_base_framework.framework_output,
|
||||
"mca:rmaps:rank_by skipping proc %s - from another job, num_ranked %d",
|
||||
ORTE_NAME_PRINT(&proc->name), num_ranked);
|
||||
continue;
|
||||
}
|
||||
/* ignore procs that are already ranked */
|
||||
if (ORTE_VPID_INVALID != proc->name.vpid) {
|
||||
opal_output_verbose(5, orte_rmaps_base_framework.framework_output,
|
||||
"mca:rmaps:rank_by skipping proc %s - already ranked, num_ranked %d",
|
||||
ORTE_NAME_PRINT(&proc->name), num_ranked);
|
||||
continue;
|
||||
}
|
||||
/* ignore procs from other apps */
|
||||
if (proc->app_idx != app->idx) {
|
||||
opal_output_verbose(5, orte_rmaps_base_framework.framework_output,
|
||||
"mca:rmaps:rank_by skipping proc %s - from another app, num_ranked %d",
|
||||
ORTE_NAME_PRINT(&proc->name), num_ranked);
|
||||
continue;
|
||||
}
|
||||
/* cycle across the objects */
|
||||
for (i=0; i < num_objs && cnt < app->num_procs && all_done; i++) {
|
||||
obj = (hwloc_obj_t)opal_pointer_array_get_item(&objs, i);
|
||||
/* protect against bozo case */
|
||||
locale = NULL;
|
||||
if (!orte_get_attribute(&proc->attributes, ORTE_PROC_HWLOC_LOCALE, (void**)&locale, OPAL_PTR)) {
|
||||
@ -429,7 +428,8 @@ static int rank_by(orte_job_t *jdata,
|
||||
}
|
||||
cnt++;
|
||||
opal_output_verbose(5, orte_rmaps_base_framework.framework_output,
|
||||
"mca:rmaps:rank_by: assigned rank %s", ORTE_VPID_PRINT(proc->name.vpid));
|
||||
"mca:rmaps:rank_by: proc in position %d is on object %d assigned rank %s",
|
||||
j, i, ORTE_VPID_PRINT(proc->name.vpid));
|
||||
/* insert the proc into the jdata array */
|
||||
if (NULL != (pptr = (orte_proc_t*)opal_pointer_array_get_item(jdata->procs, proc->name.vpid))) {
|
||||
OBJ_RELEASE(pptr);
|
||||
@ -440,7 +440,8 @@ static int rank_by(orte_job_t *jdata,
|
||||
OBJ_DESTRUCT(&objs);
|
||||
return rc;
|
||||
}
|
||||
/* flag that one was mapped */
|
||||
num_ranked++;
|
||||
/* flag that one was mapped */
|
||||
all_done = false;
|
||||
/* track where the highest vpid landed - this is our
|
||||
* new bookmark
|
||||
|
Загрузка…
x
Ссылка в новой задаче
Block a user