1
1

If (and only if) the user requests it, set the default number of slots on any node to the number of objects of the specified type. This *only* takes effect in an unmanaged environment - i.e., if an external resource manager assigns us a number of slots, then that is what we use. However, if we are using a hostfile, then the user may or may not have given us a value for the number of slots on each node.

For those nodes (and *only* those nodes) where the user does *not* specify a slot count, we will set the number of slots according to the user's directive: either to an explicit number, or to the number of cores, numas, sockets, or hwthreads on the node. If no such directive was given, the slot count is set to 1.

Note that the default behavior remains unchanged: in the absence of any value for #slots, and in the absence of any directive to set #slots, we will set #slots=1.

This commit was SVN r27236.
Этот коммит содержится в:
Ralph Castain 2012-09-04 20:58:26 +00:00
родитель ee6c7702d2
Коммит bae5dab916
10 изменённых файлов: 114 добавлений и 37 удалений

Просмотреть файл

@ -107,9 +107,64 @@ void orte_plm_base_daemons_reported(int fd, short args, void *cbdata)
} }
} }
} }
/* if this is an unmanaged allocation, then set the default
* slots on each node as directed or using default
*/
if (!orte_managed_allocation) {
if (NULL != orte_set_slots) {
for (i=0; i < orte_node_pool->size; i++) {
if (NULL == (node = (orte_node_t*)opal_pointer_array_get_item(orte_node_pool, i))) {
continue;
}
if (!node->slots_given) {
if (0 == strncmp(orte_set_slots, "cores", strlen(orte_set_slots))) {
node->slots = opal_hwloc_base_get_nbobjs_by_type(node->topology,
HWLOC_OBJ_CORE, 0,
OPAL_HWLOC_LOGICAL);
} else if (0 == strncmp(orte_set_slots, "sockets", strlen(orte_set_slots))) {
if (0 == (node->slots = opal_hwloc_base_get_nbobjs_by_type(node->topology,
HWLOC_OBJ_SOCKET, 0,
OPAL_HWLOC_LOGICAL))) {
/* some systems don't report sockets - in this case,
* use numanodes
*/
node->slots = opal_hwloc_base_get_nbobjs_by_type(node->topology,
HWLOC_OBJ_NODE, 0,
OPAL_HWLOC_LOGICAL);
}
} else if (0 == strncmp(orte_set_slots, "numas", strlen(orte_set_slots))) {
node->slots = opal_hwloc_base_get_nbobjs_by_type(node->topology,
HWLOC_OBJ_NODE, 0,
OPAL_HWLOC_LOGICAL);
} else if (0 == strncmp(orte_set_slots, "hwthreads", strlen(orte_set_slots))) {
node->slots = opal_hwloc_base_get_nbobjs_by_type(node->topology,
HWLOC_OBJ_PU, 0,
OPAL_HWLOC_LOGICAL);
} else {
/* must be a number */
node->slots = strtol(orte_set_slots, NULL, 10);
}
}
}
} else {
/* set any non-specified slot counts to 1 */
for (i=0; i < orte_node_pool->size; i++) {
if (NULL == (node = (orte_node_t*)opal_pointer_array_get_item(orte_node_pool, i))) {
continue;
}
if (!node->slots_given) {
node->slots = 1;
}
}
}
}
} }
#endif #endif
if (orte_display_allocation) {
orte_ras_base_display_alloc();
}
/* progress the job */ /* progress the job */
caddy->jdata->state = ORTE_JOB_STATE_DAEMONS_REPORTED; caddy->jdata->state = ORTE_JOB_STATE_DAEMONS_REPORTED;
ORTE_ACTIVATE_JOB_STATE(caddy->jdata, ORTE_JOB_STATE_VM_READY); ORTE_ACTIVATE_JOB_STATE(caddy->jdata, ORTE_JOB_STATE_VM_READY);

Просмотреть файл

@ -9,7 +9,7 @@
* University of Stuttgart. All rights reserved. * University of Stuttgart. All rights reserved.
* Copyright (c) 2004-2005 The Regents of the University of California. * Copyright (c) 2004-2005 The Regents of the University of California.
* All rights reserved. * All rights reserved.
* Copyright (c) 2011 Los Alamos National Security, LLC. All rights * Copyright (c) 2011-2012 Los Alamos National Security, LLC. All rights
* reserved. * reserved.
* $COPYRIGHT$ * $COPYRIGHT$
* *
@ -45,7 +45,6 @@ typedef struct orte_ras_base_t {
int ras_output; int ras_output;
opal_list_t ras_opened; opal_list_t ras_opened;
bool allocation_read; bool allocation_read;
bool display_alloc;
orte_ras_base_module_t *active_module; orte_ras_base_module_t *active_module;
int total_slots_alloc; int total_slots_alloc;
} orte_ras_base_t; } orte_ras_base_t;
@ -59,6 +58,8 @@ ORTE_DECLSPEC int orte_ras_base_select(void);
ORTE_DECLSPEC int orte_ras_base_finalize(void); ORTE_DECLSPEC int orte_ras_base_finalize(void);
ORTE_DECLSPEC int orte_ras_base_close(void); ORTE_DECLSPEC int orte_ras_base_close(void);
ORTE_DECLSPEC void orte_ras_base_display_alloc(void);
ORTE_DECLSPEC void orte_ras_base_allocate(int fd, short args, void *cbdata); ORTE_DECLSPEC void orte_ras_base_allocate(int fd, short args, void *cbdata);
ORTE_DECLSPEC int orte_ras_base_add_hosts(orte_job_t *jdata); ORTE_DECLSPEC int orte_ras_base_add_hosts(orte_job_t *jdata);

Просмотреть файл

@ -48,8 +48,8 @@
#include "orte/mca/ras/base/ras_private.h" #include "orte/mca/ras/base/ras_private.h"
/* static function to display allocation */ /* function to display allocation */
static void display_alloc(void) void orte_ras_base_display_alloc(void)
{ {
char *tmp=NULL, *tmp2, *tmp3, *pfx=NULL; char *tmp=NULL, *tmp2, *tmp3, *pfx=NULL;
int i, istart; int i, istart;
@ -367,8 +367,8 @@ void orte_ras_base_allocate(int fd, short args, void *cbdata)
DISPLAY: DISPLAY:
/* shall we display the results? */ /* shall we display the results? */
if (4 < opal_output_get_verbosity(orte_ras_base.ras_output) || orte_ras_base.display_alloc) { if (4 < opal_output_get_verbosity(orte_ras_base.ras_output)) {
display_alloc(); orte_ras_base_display_alloc();
} }
next_state: next_state:
@ -470,8 +470,8 @@ int orte_ras_base_add_hosts(orte_job_t *jdata)
} }
/* shall we display the results? */ /* shall we display the results? */
if (0 < opal_output_get_verbosity(orte_ras_base.ras_output) || orte_ras_base.display_alloc) { if (0 < opal_output_get_verbosity(orte_ras_base.ras_output)) {
display_alloc(); orte_ras_base_display_alloc();
} }
return ORTE_SUCCESS; return ORTE_SUCCESS;

Просмотреть файл

@ -9,7 +9,7 @@
* University of Stuttgart. All rights reserved. * University of Stuttgart. All rights reserved.
* Copyright (c) 2004-2005 The Regents of the University of California. * Copyright (c) 2004-2005 The Regents of the University of California.
* All rights reserved. * All rights reserved.
* Copyright (c) 2011 Los Alamos National Security, LLC. All rights * Copyright (c) 2011-2012 Los Alamos National Security, LLC. All rights
* reserved. * reserved.
* $COPYRIGHT$ * $COPYRIGHT$
* *
@ -72,30 +72,11 @@ orte_ras_base_t orte_ras_base;
*/ */
int orte_ras_base_open(void) int orte_ras_base_open(void)
{ {
int value;
bool btmp;
/* set default flags */ /* set default flags */
orte_ras_base.active_module = NULL; orte_ras_base.active_module = NULL;
orte_ras_base.allocation_read = false; orte_ras_base.allocation_read = false;
orte_ras_base.total_slots_alloc = 0; orte_ras_base.total_slots_alloc = 0;
/* should we display the allocation after determining it? */
mca_base_param_reg_int_name("ras", "base_display_alloc",
"Whether to display the allocation after it is determined",
false, false, (int)false, &value);
orte_ras_base.display_alloc = OPAL_INT_TO_BOOL(value);
/* should we display a detailed (developer-quality) version of the allocation after determining it? */
mca_base_param_reg_int_name("ras", "base_display_devel_alloc",
"Whether to display a developer-detail allocation after it is determined",
false, false, (int)false, &value);
btmp = OPAL_INT_TO_BOOL(value);
if (btmp) {
orte_ras_base.display_alloc = true;
orte_devel_level_output = true;
}
/* Debugging / verbose output. Always have stream open, with /* Debugging / verbose output. Always have stream open, with
verbose set by the mca open system... */ verbose set by the mca open system... */
orte_ras_base.ras_output = opal_output_open(NULL); orte_ras_base.ras_output = opal_output_open(NULL);

Просмотреть файл

@ -95,6 +95,9 @@ char **orte_launch_environ;
bool orte_hnp_is_allocated = false; bool orte_hnp_is_allocated = false;
bool orte_allocation_required; bool orte_allocation_required;
bool orte_managed_allocation = false; bool orte_managed_allocation = false;
char *orte_set_slots = NULL;
bool orte_display_allocation;
bool orte_display_devel_allocation;
/* launch agents */ /* launch agents */
char *orte_launch_agent = NULL; char *orte_launch_agent = NULL;
@ -826,6 +829,7 @@ static void orte_node_construct(orte_node_t* node)
node->oversubscribed = false; node->oversubscribed = false;
node->state = ORTE_NODE_STATE_UNKNOWN; node->state = ORTE_NODE_STATE_UNKNOWN;
node->slots = 0; node->slots = 0;
node->slots_given = false;
node->slots_inuse = 0; node->slots_inuse = 0;
node->slots_alloc = 0; node->slots_alloc = 0;
node->slots_max = 0; node->slots_max = 0;

Просмотреть файл

@ -333,6 +333,10 @@ typedef struct {
that we have been allocated on this node and would be the that we have been allocated on this node and would be the
"ideal" number of processes for us to launch. */ "ideal" number of processes for us to launch. */
orte_std_cntr_t slots; orte_std_cntr_t slots;
/* a flag indicating that the number of slots was specified - used
* only in non-managed environments
*/
bool slots_given;
/** How many processes have already been launched, used by one or /** How many processes have already been launched, used by one or
more jobs on this node. */ more jobs on this node. */
orte_std_cntr_t slots_inuse; orte_std_cntr_t slots_inuse;
@ -610,6 +614,9 @@ ORTE_DECLSPEC extern char **orte_launch_environ;
ORTE_DECLSPEC extern bool orte_hnp_is_allocated; ORTE_DECLSPEC extern bool orte_hnp_is_allocated;
ORTE_DECLSPEC extern bool orte_allocation_required; ORTE_DECLSPEC extern bool orte_allocation_required;
ORTE_DECLSPEC extern bool orte_managed_allocation; ORTE_DECLSPEC extern bool orte_managed_allocation;
ORTE_DECLSPEC extern char *orte_set_slots;
ORTE_DECLSPEC extern bool orte_display_allocation;
ORTE_DECLSPEC extern bool orte_display_devel_allocation;
/* launch agents */ /* launch agents */
ORTE_DECLSPEC extern char *orte_launch_agent; ORTE_DECLSPEC extern char *orte_launch_agent;

Просмотреть файл

@ -44,7 +44,7 @@ static bool passed_thru = false;
int orte_register_params(void) int orte_register_params(void)
{ {
int value; int value, id;
char *strval, *strval1, *strval2; char *strval, *strval1, *strval2;
/* only go thru this once - mpirun calls it twice, which causes /* only go thru this once - mpirun calls it twice, which causes
@ -553,6 +553,29 @@ int orte_register_params(void)
orte_use_common_port = false; orte_use_common_port = false;
#endif #endif
mca_base_param_reg_string_name("orte", "set_default_slots",
"Set the number of slots on nodes that lack such info to the number of specified objects [a number, \"cores\", \"numas\", \"sockets\", or \"hwthreads\"]",
false, false, NULL, &orte_set_slots);
/* should we display the allocation after determining it? */
id = mca_base_param_reg_int_name("orte", "display_alloc",
"Whether to display the allocation after it is determined",
false, false, (int)false, NULL);
mca_base_param_reg_syn_name(id, "ras", "base_display_alloc", true);
mca_base_param_lookup_int(id, &value);
orte_display_allocation = OPAL_INT_TO_BOOL(value);
/* should we display a detailed (developer-quality) version of the allocation after determining it? */
id = mca_base_param_reg_int_name("orte", "display_devel_alloc",
"Whether to display a developer-detail allocation after it is determined",
false, false, 0, NULL);
mca_base_param_reg_syn_name(id, "ras", "base_display_devel_alloc", true);
mca_base_param_lookup_int(id, &value);
if (0 != value) {
orte_display_allocation = true;
orte_devel_level_output = true;
}
#endif /* ORTE_DISABLE_FULL_SUPPORT */ #endif /* ORTE_DISABLE_FULL_SUPPORT */
return ORTE_SUCCESS; return ORTE_SUCCESS;

Просмотреть файл

@ -410,10 +410,10 @@ static opal_cmd_line_init_t cmd_line_init[] = {
#endif #endif
/* Allocation options */ /* Allocation options */
{ "ras", "base", "display_alloc", '\0', "display-allocation", "display-allocation", 0, { "orte", "display", "alloc", '\0', "display-allocation", "display-allocation", 0,
NULL, OPAL_CMD_LINE_TYPE_BOOL, NULL, OPAL_CMD_LINE_TYPE_BOOL,
"Display the allocation being used by this job"}, "Display the allocation being used by this job"},
{ "ras", "base", "display_devel_alloc", '\0', "display-devel-allocation", "display-devel-allocation", 0, { "orte", "display", "devel_alloc", '\0', "display-devel-allocation", "display-devel-allocation", 0,
NULL, OPAL_CMD_LINE_TYPE_BOOL, NULL, OPAL_CMD_LINE_TYPE_BOOL,
"Display a detailed list (mostly intended for developers) of the allocation being used by this job"}, "Display a detailed list (mostly intended for developers) of the allocation being used by this job"},
#if OPAL_HAVE_HWLOC #if OPAL_HAVE_HWLOC

Просмотреть файл

@ -96,6 +96,8 @@ int orte_util_add_dash_host_nodes(opal_list_t *nodes,
(0 == strcmp(node->name, orte_process_info.nodename) && (0 == strcmp(node->name, orte_process_info.nodename) &&
(0 == strcmp(mapped_nodes[i], "localhost") || opal_ifislocal(mapped_nodes[i])))) { (0 == strcmp(mapped_nodes[i], "localhost") || opal_ifislocal(mapped_nodes[i])))) {
++node->slots; ++node->slots;
/* the dash-host option presumes definition of num_slots */
node->slots_given = true;
break; break;
} }
} }
@ -127,6 +129,8 @@ int orte_util_add_dash_host_nodes(opal_list_t *nodes,
node->slots_inuse = 0; node->slots_inuse = 0;
node->slots_max = 0; node->slots_max = 0;
node->slots = 1; node->slots = 1;
/* the dash-host option presumes definition of num_slots */
node->slots_given = true;
opal_list_append(nodes, &node->super); opal_list_append(nodes, &node->super);
} }
} }

Просмотреть файл

@ -391,12 +391,14 @@ static int hostfile_parse_line(int token, opal_list_t* updates, opal_list_t* exc
} }
done: done:
if (!got_count) { if (got_count) {
if (got_max) { node->slots_given = true;
node->slots = node->slots_max; } else if (got_max) {
} else { node->slots = node->slots_max;
++node->slots; node->slots_given = true;
} } else {
/* should be set by obj_new, but just to be clear */
node->slots_given = false;
} }
opal_list_append(updates, &node->super); opal_list_append(updates, &node->super);