Correctly compute usage for dynamic spawns when binding is invoked. Ensure we correctly account for existing process usage on each node when computing bindings during dynamic spawns.
cmr=v1.7.4:reviewer=hjelmn:subject=Correctly compute usage for dynamic spawns when binding is invoked
This commit was SVN r29649.
Этот коммит содержится в:
родитель
6e7ce03aa0
Коммит
e35ad23176
@ -12,6 +12,7 @@
|
|||||||
* Copyright (c) 2011 Cisco Systems, Inc. All rights reserved.
|
* Copyright (c) 2011 Cisco Systems, Inc. All rights reserved.
|
||||||
* Copyright (c) 2011-2012 Los Alamos National Security, LLC.
|
* Copyright (c) 2011-2012 Los Alamos National Security, LLC.
|
||||||
* All rights reserved.
|
* All rights reserved.
|
||||||
|
* Copyright (c) 2013 Intel, Inc. All rights reserved
|
||||||
* $COPYRIGHT$
|
* $COPYRIGHT$
|
||||||
*
|
*
|
||||||
* Additional copyrights may follow
|
* Additional copyrights may follow
|
||||||
@ -50,6 +51,35 @@
|
|||||||
|
|
||||||
static bool membind_warned=false;
|
static bool membind_warned=false;
|
||||||
|
|
||||||
|
/**
 * Rebuild the per-object binding counts recorded in a node's topology.
 *
 * Clears whatever usage is currently stored in node->topology, then walks
 * node->procs and, for every bound process that belongs to a different
 * job, increments num_bound on the hwloc object it is bound to.
 *
 * @param node   node whose topology usage is recomputed
 * @param jobid  job whose procs are skipped (presumably the job that the
 *               caller is about to bind - confirm against callers)
 */
static void reset_usage(orte_node_t *node, orte_jobid_t jobid)
{
    int j;
    orte_proc_t *proc;
    opal_hwloc_obj_data_t *data;

    /* start by clearing any existing info */
    opal_hwloc_base_clear_usage(node->topology);

    /* cycle through the procs on the node and record
     * their usage in the topology
     */
    for (j = 0; j < node->procs->size; j++) {
        proc = (orte_proc_t*)opal_pointer_array_get_item(node->procs, j);
        if (NULL == proc) {
            /* nothing stored at this index */
            continue;
        }
        /* ignore procs from this job */
        if (proc->name.jobid == jobid) {
            continue;
        }
        if (NULL == proc->bind_location) {
            /* this proc isn't bound - ignore it */
            continue;
        }
        data = (opal_hwloc_obj_data_t*)proc->bind_location->userdata;
        if (NULL == data) {
            /* hwloc leaves userdata NULL until someone attaches tracking
             * data, so there is no counter to update on this object.
             * Skip it instead of dereferencing a NULL pointer.
             * NOTE(review): allocating the tracking object here may be
             * preferable - confirm with the hwloc base maintainers.
             */
            continue;
        }
        data->num_bound++;
    }
}
|
||||||
|
|
||||||
static int bind_upwards(orte_job_t *jdata,
|
static int bind_upwards(orte_job_t *jdata,
|
||||||
hwloc_obj_type_t target,
|
hwloc_obj_type_t target,
|
||||||
unsigned cache_level)
|
unsigned cache_level)
|
||||||
@ -117,8 +147,13 @@ static int bind_upwards(orte_job_t *jdata,
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
/* clear the topology of any prior usage numbers */
|
if (!orte_hetero_nodes) {
|
||||||
opal_hwloc_base_clear_usage(node->topology);
|
/* if the nodes are homogeneous, we share topologies in order
|
||||||
|
* to save space, so we need to reset the usage info to reflect
|
||||||
|
* our own current state
|
||||||
|
*/
|
||||||
|
reset_usage(node, jdata->jobid);
|
||||||
|
}
|
||||||
|
|
||||||
/* cycle thru the procs */
|
/* cycle thru the procs */
|
||||||
for (j=0; j < node->procs->size; j++) {
|
for (j=0; j < node->procs->size; j++) {
|
||||||
@ -175,6 +210,8 @@ static int bind_upwards(orte_job_t *jdata,
|
|||||||
/* bind it here */
|
/* bind it here */
|
||||||
cpus = opal_hwloc_base_get_available_cpus(node->topology, obj);
|
cpus = opal_hwloc_base_get_available_cpus(node->topology, obj);
|
||||||
hwloc_bitmap_list_asprintf(&proc->cpu_bitmap, cpus);
|
hwloc_bitmap_list_asprintf(&proc->cpu_bitmap, cpus);
|
||||||
|
/* record the location */
|
||||||
|
proc->bind_location = obj;
|
||||||
opal_output_verbose(5, orte_rmaps_base_framework.framework_output,
|
opal_output_verbose(5, orte_rmaps_base_framework.framework_output,
|
||||||
"%s BOUND PROC %s TO %s[%s:%u] on node %s",
|
"%s BOUND PROC %s TO %s[%s:%u] on node %s",
|
||||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
|
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
|
||||||
@ -267,8 +304,13 @@ static int bind_downwards(orte_job_t *jdata,
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
/* clear the topology of any prior usage numbers */
|
if (!orte_hetero_nodes) {
|
||||||
opal_hwloc_base_clear_usage(node->topology);
|
/* if the nodes are homogeneous, we share topologies in order
|
||||||
|
* to save space, so we need to reset the usage info to reflect
|
||||||
|
* our own current state
|
||||||
|
*/
|
||||||
|
reset_usage(node, jdata->jobid);
|
||||||
|
}
|
||||||
|
|
||||||
/* cycle thru the procs */
|
/* cycle thru the procs */
|
||||||
for (j=0; j < node->procs->size; j++) {
|
for (j=0; j < node->procs->size; j++) {
|
||||||
@ -298,7 +340,9 @@ static int bind_downwards(orte_job_t *jdata,
|
|||||||
hwloc_bitmap_free(totalcpuset);
|
hwloc_bitmap_free(totalcpuset);
|
||||||
return ORTE_ERR_SILENT;
|
return ORTE_ERR_SILENT;
|
||||||
}
|
}
|
||||||
/* start with a clean slate */
|
/* record the location */
|
||||||
|
proc->bind_location = trg_obj;
|
||||||
|
/* start with a clean slate */
|
||||||
hwloc_bitmap_zero(totalcpuset);
|
hwloc_bitmap_zero(totalcpuset);
|
||||||
total_cpus = 0;
|
total_cpus = 0;
|
||||||
nxt_obj = trg_obj;
|
nxt_obj = trg_obj;
|
||||||
@ -333,6 +377,7 @@ static int bind_downwards(orte_job_t *jdata,
|
|||||||
/* bind the proc here */
|
/* bind the proc here */
|
||||||
cpus = opal_hwloc_base_get_available_cpus(node->topology, trg_obj);
|
cpus = opal_hwloc_base_get_available_cpus(node->topology, trg_obj);
|
||||||
hwloc_bitmap_or(totalcpuset, totalcpuset, cpus);
|
hwloc_bitmap_or(totalcpuset, totalcpuset, cpus);
|
||||||
|
/* track total #cpus */
|
||||||
total_cpus += ncpus;
|
total_cpus += ncpus;
|
||||||
/* move to the next location, in case we need it */
|
/* move to the next location, in case we need it */
|
||||||
nxt_obj = trg_obj->next_cousin;
|
nxt_obj = trg_obj->next_cousin;
|
||||||
@ -421,10 +466,15 @@ static int bind_in_place(orte_job_t *jdata,
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
/* clear the topology of any prior usage numbers */
|
if (!orte_hetero_nodes) {
|
||||||
opal_hwloc_base_clear_usage(node->topology);
|
/* if the nodes are homogeneous, we share topologies in order
|
||||||
|
* to save space, so we need to reset the usage info to reflect
|
||||||
|
* our own current state
|
||||||
|
*/
|
||||||
|
reset_usage(node, jdata->jobid);
|
||||||
|
}
|
||||||
|
|
||||||
/* cycle thru the procs */
|
/* cycle thru the procs */
|
||||||
for (j=0; j < node->procs->size; j++) {
|
for (j=0; j < node->procs->size; j++) {
|
||||||
if (NULL == (proc = (orte_proc_t*)opal_pointer_array_get_item(node->procs, j))) {
|
if (NULL == (proc = (orte_proc_t*)opal_pointer_array_get_item(node->procs, j))) {
|
||||||
continue;
|
continue;
|
||||||
@ -466,6 +516,8 @@ static int bind_in_place(orte_job_t *jdata,
|
|||||||
/* bind the proc here */
|
/* bind the proc here */
|
||||||
cpus = opal_hwloc_base_get_available_cpus(node->topology, proc->locale);
|
cpus = opal_hwloc_base_get_available_cpus(node->topology, proc->locale);
|
||||||
hwloc_bitmap_list_asprintf(&proc->cpu_bitmap, cpus);
|
hwloc_bitmap_list_asprintf(&proc->cpu_bitmap, cpus);
|
||||||
|
/* record the location */
|
||||||
|
proc->bind_location = proc->locale;
|
||||||
opal_output_verbose(5, orte_rmaps_base_framework.framework_output,
|
opal_output_verbose(5, orte_rmaps_base_framework.framework_output,
|
||||||
"%s BOUND PROC %s TO %s[%s:%u] on node %s",
|
"%s BOUND PROC %s TO %s[%s:%u] on node %s",
|
||||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
|
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
|
||||||
|
@ -13,6 +13,7 @@
|
|||||||
* Copyright (c) 2011 Cisco Systems, Inc. All rights reserved.
|
* Copyright (c) 2011 Cisco Systems, Inc. All rights reserved.
|
||||||
* Copyright (c) 2011-2013 Los Alamos National Security, LLC.
|
* Copyright (c) 2011-2013 Los Alamos National Security, LLC.
|
||||||
* All rights reserved.
|
* All rights reserved.
|
||||||
|
* Copyright (c) 2013 Intel, Inc. All rights reserved.
|
||||||
* $COPYRIGHT$
|
* $COPYRIGHT$
|
||||||
*
|
*
|
||||||
* Additional copyrights may follow
|
* Additional copyrights may follow
|
||||||
@ -491,17 +492,24 @@ int orte_dt_print_proc(char **output, char *prefix, orte_proc_t *src, opal_data_
|
|||||||
#if OPAL_HAVE_HWLOC
|
#if OPAL_HAVE_HWLOC
|
||||||
{
|
{
|
||||||
char *locale=NULL;
|
char *locale=NULL;
|
||||||
|
char *bind = NULL;
|
||||||
|
|
||||||
if (NULL != src->locale) {
|
if (NULL != src->locale) {
|
||||||
hwloc_bitmap_list_asprintf(&locale, src->locale->cpuset);
|
hwloc_bitmap_list_asprintf(&locale, src->locale->cpuset);
|
||||||
}
|
}
|
||||||
asprintf(&tmp2, "%s\n%s\tState: %s\tRestarts: %d\tApp_context: %ld\tLocale: %s\tBinding: %s", tmp, pfx2,
|
if (NULL != src->bind_location) {
|
||||||
|
hwloc_bitmap_list_asprintf(&bind, src->bind_location->cpuset);
|
||||||
|
}
|
||||||
|
asprintf(&tmp2, "%s\n%s\tState: %s\tRestarts: %d\tApp_context: %ld\tLocale: %s\tBind location: %s\tBinding: %s", tmp, pfx2,
|
||||||
orte_proc_state_to_str(src->state), src->restarts, (long)src->app_idx,
|
orte_proc_state_to_str(src->state), src->restarts, (long)src->app_idx,
|
||||||
(NULL == locale) ? "UNKNOWN" : locale,
|
(NULL == locale) ? "UNKNOWN" : locale, bind,
|
||||||
(NULL == src->cpu_bitmap) ? "NULL" : src->cpu_bitmap);
|
(NULL == src->cpu_bitmap) ? "NULL" : src->cpu_bitmap);
|
||||||
if (NULL != locale) {
|
if (NULL != locale) {
|
||||||
free(locale);
|
free(locale);
|
||||||
}
|
}
|
||||||
|
if (NULL != bind) {
|
||||||
|
free(bind);
|
||||||
|
}
|
||||||
}
|
}
|
||||||
#else
|
#else
|
||||||
asprintf(&tmp2, "%s\n%s\tState: %s\tRestarts: %d\tApp_context: %ld", tmp, pfx2,
|
asprintf(&tmp2, "%s\n%s\tState: %s\tRestarts: %d\tApp_context: %ld", tmp, pfx2,
|
||||||
|
@ -927,6 +927,7 @@ static void orte_proc_construct(orte_proc_t* proc)
|
|||||||
proc->app_idx = 0;
|
proc->app_idx = 0;
|
||||||
#if OPAL_HAVE_HWLOC
|
#if OPAL_HAVE_HWLOC
|
||||||
proc->locale = NULL;
|
proc->locale = NULL;
|
||||||
|
proc->bind_location = NULL;
|
||||||
proc->cpu_bitmap = NULL;
|
proc->cpu_bitmap = NULL;
|
||||||
#endif
|
#endif
|
||||||
proc->node = NULL;
|
proc->node = NULL;
|
||||||
|
@ -509,6 +509,8 @@ struct orte_proc_t {
|
|||||||
#if OPAL_HAVE_HWLOC
|
#if OPAL_HAVE_HWLOC
|
||||||
/* hwloc object to which this process was mapped */
|
/* hwloc object to which this process was mapped */
|
||||||
hwloc_obj_t locale;
|
hwloc_obj_t locale;
|
||||||
|
/* hwloc object to which this process is bound */
|
||||||
|
hwloc_obj_t bind_location;
|
||||||
/* string representation of cpu bindings */
|
/* string representation of cpu bindings */
|
||||||
char *cpu_bitmap;
|
char *cpu_bitmap;
|
||||||
#endif
|
#endif
|
||||||
|
Загрузка…
x
Ссылка в новой задаче
Block a user