1
1

If (and only if) the user requests it, set the default number of slots on a node to the number of objects of the specified type. This takes effect *only* in an unmanaged environment: if an external resource manager assigns us a number of slots, then that is what we use. If we are using a hostfile, however, the user may or may not have given us a value for the number of slots on each node.

For those nodes (and *only* those nodes) where the user does *not* specify a slot count, we will set the number of slots according to their direction: either to an explicit number, or to the number of cores, numas, sockets, or hwthreads on the node. If no such direction is given, the slot count is set to 1.

Note that the default behavior remains unchanged: in the absence of any value for #slots, and in the absence of any directive to set #slots, we will set #slots=1.

This commit was SVN r27236.
Этот коммит содержится в:
Ralph Castain 2012-09-04 20:58:26 +00:00
родитель ee6c7702d2
Коммит bae5dab916
10 изменённых файлов: 114 добавлений и 37 удалений

Просмотреть файл

@ -107,9 +107,64 @@ void orte_plm_base_daemons_reported(int fd, short args, void *cbdata)
}
}
}
/* if this is an unmanaged allocation, then set the default
* slots on each node as directed or using default
*/
if (!orte_managed_allocation) {
if (NULL != orte_set_slots) {
for (i=0; i < orte_node_pool->size; i++) {
if (NULL == (node = (orte_node_t*)opal_pointer_array_get_item(orte_node_pool, i))) {
continue;
}
if (!node->slots_given) {
if (0 == strncmp(orte_set_slots, "cores", strlen(orte_set_slots))) {
node->slots = opal_hwloc_base_get_nbobjs_by_type(node->topology,
HWLOC_OBJ_CORE, 0,
OPAL_HWLOC_LOGICAL);
} else if (0 == strncmp(orte_set_slots, "sockets", strlen(orte_set_slots))) {
if (0 == (node->slots = opal_hwloc_base_get_nbobjs_by_type(node->topology,
HWLOC_OBJ_SOCKET, 0,
OPAL_HWLOC_LOGICAL))) {
/* some systems don't report sockets - in this case,
* use numanodes
*/
node->slots = opal_hwloc_base_get_nbobjs_by_type(node->topology,
HWLOC_OBJ_NODE, 0,
OPAL_HWLOC_LOGICAL);
}
} else if (0 == strncmp(orte_set_slots, "numas", strlen(orte_set_slots))) {
node->slots = opal_hwloc_base_get_nbobjs_by_type(node->topology,
HWLOC_OBJ_NODE, 0,
OPAL_HWLOC_LOGICAL);
} else if (0 == strncmp(orte_set_slots, "hwthreads", strlen(orte_set_slots))) {
node->slots = opal_hwloc_base_get_nbobjs_by_type(node->topology,
HWLOC_OBJ_PU, 0,
OPAL_HWLOC_LOGICAL);
} else {
/* must be a number */
node->slots = strtol(orte_set_slots, NULL, 10);
}
}
}
} else {
/* set any non-specified slot counts to 1 */
for (i=0; i < orte_node_pool->size; i++) {
if (NULL == (node = (orte_node_t*)opal_pointer_array_get_item(orte_node_pool, i))) {
continue;
}
if (!node->slots_given) {
node->slots = 1;
}
}
}
}
}
#endif
if (orte_display_allocation) {
orte_ras_base_display_alloc();
}
/* progress the job */
caddy->jdata->state = ORTE_JOB_STATE_DAEMONS_REPORTED;
ORTE_ACTIVATE_JOB_STATE(caddy->jdata, ORTE_JOB_STATE_VM_READY);

Просмотреть файл

@ -9,7 +9,7 @@
* University of Stuttgart. All rights reserved.
* Copyright (c) 2004-2005 The Regents of the University of California.
* All rights reserved.
* Copyright (c) 2011 Los Alamos National Security, LLC. All rights
* Copyright (c) 2011-2012 Los Alamos National Security, LLC. All rights
* reserved.
* $COPYRIGHT$
*
@ -45,7 +45,6 @@ typedef struct orte_ras_base_t {
int ras_output;
opal_list_t ras_opened;
bool allocation_read;
bool display_alloc;
orte_ras_base_module_t *active_module;
int total_slots_alloc;
} orte_ras_base_t;
@ -59,6 +58,8 @@ ORTE_DECLSPEC int orte_ras_base_select(void);
ORTE_DECLSPEC int orte_ras_base_finalize(void);
ORTE_DECLSPEC int orte_ras_base_close(void);
ORTE_DECLSPEC void orte_ras_base_display_alloc(void);
ORTE_DECLSPEC void orte_ras_base_allocate(int fd, short args, void *cbdata);
ORTE_DECLSPEC int orte_ras_base_add_hosts(orte_job_t *jdata);

Просмотреть файл

@ -48,8 +48,8 @@
#include "orte/mca/ras/base/ras_private.h"
/* static function to display allocation */
static void display_alloc(void)
/* function to display allocation */
void orte_ras_base_display_alloc(void)
{
char *tmp=NULL, *tmp2, *tmp3, *pfx=NULL;
int i, istart;
@ -367,8 +367,8 @@ void orte_ras_base_allocate(int fd, short args, void *cbdata)
DISPLAY:
/* shall we display the results? */
if (4 < opal_output_get_verbosity(orte_ras_base.ras_output) || orte_ras_base.display_alloc) {
display_alloc();
if (4 < opal_output_get_verbosity(orte_ras_base.ras_output)) {
orte_ras_base_display_alloc();
}
next_state:
@ -470,8 +470,8 @@ int orte_ras_base_add_hosts(orte_job_t *jdata)
}
/* shall we display the results? */
if (0 < opal_output_get_verbosity(orte_ras_base.ras_output) || orte_ras_base.display_alloc) {
display_alloc();
if (0 < opal_output_get_verbosity(orte_ras_base.ras_output)) {
orte_ras_base_display_alloc();
}
return ORTE_SUCCESS;

Просмотреть файл

@ -9,7 +9,7 @@
* University of Stuttgart. All rights reserved.
* Copyright (c) 2004-2005 The Regents of the University of California.
* All rights reserved.
* Copyright (c) 2011 Los Alamos National Security, LLC. All rights
* Copyright (c) 2011-2012 Los Alamos National Security, LLC. All rights
* reserved.
* $COPYRIGHT$
*
@ -72,30 +72,11 @@ orte_ras_base_t orte_ras_base;
*/
int orte_ras_base_open(void)
{
int value;
bool btmp;
/* set default flags */
orte_ras_base.active_module = NULL;
orte_ras_base.allocation_read = false;
orte_ras_base.total_slots_alloc = 0;
/* should we display the allocation after determining it? */
mca_base_param_reg_int_name("ras", "base_display_alloc",
"Whether to display the allocation after it is determined",
false, false, (int)false, &value);
orte_ras_base.display_alloc = OPAL_INT_TO_BOOL(value);
/* should we display a detailed (developer-quality) version of the allocation after determining it? */
mca_base_param_reg_int_name("ras", "base_display_devel_alloc",
"Whether to display a developer-detail allocation after it is determined",
false, false, (int)false, &value);
btmp = OPAL_INT_TO_BOOL(value);
if (btmp) {
orte_ras_base.display_alloc = true;
orte_devel_level_output = true;
}
/* Debugging / verbose output. Always have stream open, with
verbose set by the mca open system... */
orte_ras_base.ras_output = opal_output_open(NULL);

Просмотреть файл

@ -95,6 +95,9 @@ char **orte_launch_environ;
bool orte_hnp_is_allocated = false;
bool orte_allocation_required;
bool orte_managed_allocation = false;
char *orte_set_slots = NULL;
bool orte_display_allocation;
bool orte_display_devel_allocation;
/* launch agents */
char *orte_launch_agent = NULL;
@ -826,6 +829,7 @@ static void orte_node_construct(orte_node_t* node)
node->oversubscribed = false;
node->state = ORTE_NODE_STATE_UNKNOWN;
node->slots = 0;
node->slots_given = false;
node->slots_inuse = 0;
node->slots_alloc = 0;
node->slots_max = 0;

Просмотреть файл

@ -333,6 +333,10 @@ typedef struct {
that we have been allocated on this node and would be the
"ideal" number of processes for us to launch. */
orte_std_cntr_t slots;
/* a flag indicating that the number of slots was specified - used
* only in non-managed environments
*/
bool slots_given;
/** How many processes have already been launched, used by one or
more jobs on this node. */
orte_std_cntr_t slots_inuse;
@ -610,6 +614,9 @@ ORTE_DECLSPEC extern char **orte_launch_environ;
ORTE_DECLSPEC extern bool orte_hnp_is_allocated;
ORTE_DECLSPEC extern bool orte_allocation_required;
ORTE_DECLSPEC extern bool orte_managed_allocation;
ORTE_DECLSPEC extern char *orte_set_slots;
ORTE_DECLSPEC extern bool orte_display_allocation;
ORTE_DECLSPEC extern bool orte_display_devel_allocation;
/* launch agents */
ORTE_DECLSPEC extern char *orte_launch_agent;

Просмотреть файл

@ -44,7 +44,7 @@ static bool passed_thru = false;
int orte_register_params(void)
{
int value;
int value, id;
char *strval, *strval1, *strval2;
/* only go thru this once - mpirun calls it twice, which causes
@ -553,6 +553,29 @@ int orte_register_params(void)
orte_use_common_port = false;
#endif
mca_base_param_reg_string_name("orte", "set_default_slots",
"Set the number of slots on nodes that lack such info to the number of specified objects [a number, \"cores\", \"numas\", \"sockets\", or \"hwthreads\"]",
false, false, NULL, &orte_set_slots);
/* should we display the allocation after determining it? */
id = mca_base_param_reg_int_name("orte", "display_alloc",
"Whether to display the allocation after it is determined",
false, false, (int)false, NULL);
mca_base_param_reg_syn_name(id, "ras", "base_display_alloc", true);
mca_base_param_lookup_int(id, &value);
orte_display_allocation = OPAL_INT_TO_BOOL(value);
/* should we display a detailed (developer-quality) version of the allocation after determining it? */
id = mca_base_param_reg_int_name("orte", "display_devel_alloc",
"Whether to display a developer-detail allocation after it is determined",
false, false, 0, NULL);
mca_base_param_reg_syn_name(id, "ras", "base_display_devel_alloc", true);
mca_base_param_lookup_int(id, &value);
if (0 != value) {
orte_display_allocation = true;
orte_devel_level_output = true;
}
#endif /* ORTE_DISABLE_FULL_SUPPORT */
return ORTE_SUCCESS;

Просмотреть файл

@ -410,10 +410,10 @@ static opal_cmd_line_init_t cmd_line_init[] = {
#endif
/* Allocation options */
{ "ras", "base", "display_alloc", '\0', "display-allocation", "display-allocation", 0,
{ "orte", "display", "alloc", '\0', "display-allocation", "display-allocation", 0,
NULL, OPAL_CMD_LINE_TYPE_BOOL,
"Display the allocation being used by this job"},
{ "ras", "base", "display_devel_alloc", '\0', "display-devel-allocation", "display-devel-allocation", 0,
{ "orte", "display", "devel_alloc", '\0', "display-devel-allocation", "display-devel-allocation", 0,
NULL, OPAL_CMD_LINE_TYPE_BOOL,
"Display a detailed list (mostly intended for developers) of the allocation being used by this job"},
#if OPAL_HAVE_HWLOC

Просмотреть файл

@ -96,6 +96,8 @@ int orte_util_add_dash_host_nodes(opal_list_t *nodes,
(0 == strcmp(node->name, orte_process_info.nodename) &&
(0 == strcmp(mapped_nodes[i], "localhost") || opal_ifislocal(mapped_nodes[i])))) {
++node->slots;
/* the dash-host option presumes definition of num_slots */
node->slots_given = true;
break;
}
}
@ -127,6 +129,8 @@ int orte_util_add_dash_host_nodes(opal_list_t *nodes,
node->slots_inuse = 0;
node->slots_max = 0;
node->slots = 1;
/* the dash-host option presumes definition of num_slots */
node->slots_given = true;
opal_list_append(nodes, &node->super);
}
}

Просмотреть файл

@ -391,12 +391,14 @@ static int hostfile_parse_line(int token, opal_list_t* updates, opal_list_t* exc
}
done:
if (!got_count) {
if (got_max) {
node->slots = node->slots_max;
} else {
++node->slots;
}
if (got_count) {
node->slots_given = true;
} else if (got_max) {
node->slots = node->slots_max;
node->slots_given = true;
} else {
/* should be set by obj_new, but just to be clear */
node->slots_given = false;
}
opal_list_append(updates, &node->super);