If (and only if) the user requests it, set the default number of slots on a node to the number of objects of the specified type. This takes effect *only* in an unmanaged environment - i.e., if an external resource manager assigns us a number of slots, then that is what we use. However, if we are using a hostfile, then the user may or may not have given us a value for the number of slots on each node.
For those nodes (and *only* those nodes) where the user does *not* specify a slot count, we will set the number of slots according to the user's directive: an explicit number, or the number of cores, numas, sockets, or hwthreads on the node. If no such directive is given, the slot count is set to 1. Note that the default behavior remains unchanged: in the absence of any value for #slots, and in the absence of any directive to set #slots, we will set #slots=1. This commit was SVN r27236.
Этот коммит содержится в:
родитель
ee6c7702d2
Коммит
bae5dab916
@ -107,9 +107,64 @@ void orte_plm_base_daemons_reported(int fd, short args, void *cbdata)
|
||||
}
|
||||
}
|
||||
}
|
||||
/* if this is an unmanaged allocation, then set the default
|
||||
* slots on each node as directed or using default
|
||||
*/
|
||||
if (!orte_managed_allocation) {
|
||||
if (NULL != orte_set_slots) {
|
||||
for (i=0; i < orte_node_pool->size; i++) {
|
||||
if (NULL == (node = (orte_node_t*)opal_pointer_array_get_item(orte_node_pool, i))) {
|
||||
continue;
|
||||
}
|
||||
if (!node->slots_given) {
|
||||
if (0 == strncmp(orte_set_slots, "cores", strlen(orte_set_slots))) {
|
||||
node->slots = opal_hwloc_base_get_nbobjs_by_type(node->topology,
|
||||
HWLOC_OBJ_CORE, 0,
|
||||
OPAL_HWLOC_LOGICAL);
|
||||
} else if (0 == strncmp(orte_set_slots, "sockets", strlen(orte_set_slots))) {
|
||||
if (0 == (node->slots = opal_hwloc_base_get_nbobjs_by_type(node->topology,
|
||||
HWLOC_OBJ_SOCKET, 0,
|
||||
OPAL_HWLOC_LOGICAL))) {
|
||||
/* some systems don't report sockets - in this case,
|
||||
* use numanodes
|
||||
*/
|
||||
node->slots = opal_hwloc_base_get_nbobjs_by_type(node->topology,
|
||||
HWLOC_OBJ_NODE, 0,
|
||||
OPAL_HWLOC_LOGICAL);
|
||||
}
|
||||
} else if (0 == strncmp(orte_set_slots, "numas", strlen(orte_set_slots))) {
|
||||
node->slots = opal_hwloc_base_get_nbobjs_by_type(node->topology,
|
||||
HWLOC_OBJ_NODE, 0,
|
||||
OPAL_HWLOC_LOGICAL);
|
||||
} else if (0 == strncmp(orte_set_slots, "hwthreads", strlen(orte_set_slots))) {
|
||||
node->slots = opal_hwloc_base_get_nbobjs_by_type(node->topology,
|
||||
HWLOC_OBJ_PU, 0,
|
||||
OPAL_HWLOC_LOGICAL);
|
||||
} else {
|
||||
/* must be a number */
|
||||
node->slots = strtol(orte_set_slots, NULL, 10);
|
||||
}
|
||||
}
|
||||
}
|
||||
} else {
|
||||
/* set any non-specified slot counts to 1 */
|
||||
for (i=0; i < orte_node_pool->size; i++) {
|
||||
if (NULL == (node = (orte_node_t*)opal_pointer_array_get_item(orte_node_pool, i))) {
|
||||
continue;
|
||||
}
|
||||
if (!node->slots_given) {
|
||||
node->slots = 1;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
#endif
|
||||
|
||||
if (orte_display_allocation) {
|
||||
orte_ras_base_display_alloc();
|
||||
}
|
||||
|
||||
/* progress the job */
|
||||
caddy->jdata->state = ORTE_JOB_STATE_DAEMONS_REPORTED;
|
||||
ORTE_ACTIVATE_JOB_STATE(caddy->jdata, ORTE_JOB_STATE_VM_READY);
|
||||
|
@ -9,7 +9,7 @@
|
||||
* University of Stuttgart. All rights reserved.
|
||||
* Copyright (c) 2004-2005 The Regents of the University of California.
|
||||
* All rights reserved.
|
||||
* Copyright (c) 2011 Los Alamos National Security, LLC. All rights
|
||||
* Copyright (c) 2011-2012 Los Alamos National Security, LLC. All rights
|
||||
* reserved.
|
||||
* $COPYRIGHT$
|
||||
*
|
||||
@ -45,7 +45,6 @@ typedef struct orte_ras_base_t {
|
||||
int ras_output;
|
||||
opal_list_t ras_opened;
|
||||
bool allocation_read;
|
||||
bool display_alloc;
|
||||
orte_ras_base_module_t *active_module;
|
||||
int total_slots_alloc;
|
||||
} orte_ras_base_t;
|
||||
@ -59,6 +58,8 @@ ORTE_DECLSPEC int orte_ras_base_select(void);
|
||||
ORTE_DECLSPEC int orte_ras_base_finalize(void);
|
||||
ORTE_DECLSPEC int orte_ras_base_close(void);
|
||||
|
||||
ORTE_DECLSPEC void orte_ras_base_display_alloc(void);
|
||||
|
||||
ORTE_DECLSPEC void orte_ras_base_allocate(int fd, short args, void *cbdata);
|
||||
|
||||
ORTE_DECLSPEC int orte_ras_base_add_hosts(orte_job_t *jdata);
|
||||
|
@ -48,8 +48,8 @@
|
||||
|
||||
#include "orte/mca/ras/base/ras_private.h"
|
||||
|
||||
/* static function to display allocation */
|
||||
static void display_alloc(void)
|
||||
/* function to display allocation */
|
||||
void orte_ras_base_display_alloc(void)
|
||||
{
|
||||
char *tmp=NULL, *tmp2, *tmp3, *pfx=NULL;
|
||||
int i, istart;
|
||||
@ -367,8 +367,8 @@ void orte_ras_base_allocate(int fd, short args, void *cbdata)
|
||||
|
||||
DISPLAY:
|
||||
/* shall we display the results? */
|
||||
if (4 < opal_output_get_verbosity(orte_ras_base.ras_output) || orte_ras_base.display_alloc) {
|
||||
display_alloc();
|
||||
if (4 < opal_output_get_verbosity(orte_ras_base.ras_output)) {
|
||||
orte_ras_base_display_alloc();
|
||||
}
|
||||
|
||||
next_state:
|
||||
@ -470,8 +470,8 @@ int orte_ras_base_add_hosts(orte_job_t *jdata)
|
||||
}
|
||||
|
||||
/* shall we display the results? */
|
||||
if (0 < opal_output_get_verbosity(orte_ras_base.ras_output) || orte_ras_base.display_alloc) {
|
||||
display_alloc();
|
||||
if (0 < opal_output_get_verbosity(orte_ras_base.ras_output)) {
|
||||
orte_ras_base_display_alloc();
|
||||
}
|
||||
|
||||
return ORTE_SUCCESS;
|
||||
|
@ -9,7 +9,7 @@
|
||||
* University of Stuttgart. All rights reserved.
|
||||
* Copyright (c) 2004-2005 The Regents of the University of California.
|
||||
* All rights reserved.
|
||||
* Copyright (c) 2011 Los Alamos National Security, LLC. All rights
|
||||
* Copyright (c) 2011-2012 Los Alamos National Security, LLC. All rights
|
||||
* reserved.
|
||||
* $COPYRIGHT$
|
||||
*
|
||||
@ -72,30 +72,11 @@ orte_ras_base_t orte_ras_base;
|
||||
*/
|
||||
int orte_ras_base_open(void)
|
||||
{
|
||||
int value;
|
||||
bool btmp;
|
||||
|
||||
/* set default flags */
|
||||
orte_ras_base.active_module = NULL;
|
||||
orte_ras_base.allocation_read = false;
|
||||
orte_ras_base.total_slots_alloc = 0;
|
||||
|
||||
/* should we display the allocation after determining it? */
|
||||
mca_base_param_reg_int_name("ras", "base_display_alloc",
|
||||
"Whether to display the allocation after it is determined",
|
||||
false, false, (int)false, &value);
|
||||
orte_ras_base.display_alloc = OPAL_INT_TO_BOOL(value);
|
||||
|
||||
/* should we display a detailed (developer-quality) version of the allocation after determining it? */
|
||||
mca_base_param_reg_int_name("ras", "base_display_devel_alloc",
|
||||
"Whether to display a developer-detail allocation after it is determined",
|
||||
false, false, (int)false, &value);
|
||||
btmp = OPAL_INT_TO_BOOL(value);
|
||||
if (btmp) {
|
||||
orte_ras_base.display_alloc = true;
|
||||
orte_devel_level_output = true;
|
||||
}
|
||||
|
||||
/* Debugging / verbose output. Always have stream open, with
|
||||
verbose set by the mca open system... */
|
||||
orte_ras_base.ras_output = opal_output_open(NULL);
|
||||
|
@ -95,6 +95,9 @@ char **orte_launch_environ;
|
||||
bool orte_hnp_is_allocated = false;
|
||||
bool orte_allocation_required;
|
||||
bool orte_managed_allocation = false;
|
||||
char *orte_set_slots = NULL;
|
||||
bool orte_display_allocation;
|
||||
bool orte_display_devel_allocation;
|
||||
|
||||
/* launch agents */
|
||||
char *orte_launch_agent = NULL;
|
||||
@ -826,6 +829,7 @@ static void orte_node_construct(orte_node_t* node)
|
||||
node->oversubscribed = false;
|
||||
node->state = ORTE_NODE_STATE_UNKNOWN;
|
||||
node->slots = 0;
|
||||
node->slots_given = false;
|
||||
node->slots_inuse = 0;
|
||||
node->slots_alloc = 0;
|
||||
node->slots_max = 0;
|
||||
|
@ -333,6 +333,10 @@ typedef struct {
|
||||
that we have been allocated on this node and would be the
|
||||
"ideal" number of processes for us to launch. */
|
||||
orte_std_cntr_t slots;
|
||||
/* a flag indicating that the number of slots was specified - used
|
||||
* only in non-managed environments
|
||||
*/
|
||||
bool slots_given;
|
||||
/** How many processes have already been launched, used by one or
|
||||
more jobs on this node. */
|
||||
orte_std_cntr_t slots_inuse;
|
||||
@ -610,6 +614,9 @@ ORTE_DECLSPEC extern char **orte_launch_environ;
|
||||
ORTE_DECLSPEC extern bool orte_hnp_is_allocated;
|
||||
ORTE_DECLSPEC extern bool orte_allocation_required;
|
||||
ORTE_DECLSPEC extern bool orte_managed_allocation;
|
||||
ORTE_DECLSPEC extern char *orte_set_slots;
|
||||
ORTE_DECLSPEC extern bool orte_display_allocation;
|
||||
ORTE_DECLSPEC extern bool orte_display_devel_allocation;
|
||||
|
||||
/* launch agents */
|
||||
ORTE_DECLSPEC extern char *orte_launch_agent;
|
||||
|
@ -44,7 +44,7 @@ static bool passed_thru = false;
|
||||
|
||||
int orte_register_params(void)
|
||||
{
|
||||
int value;
|
||||
int value, id;
|
||||
char *strval, *strval1, *strval2;
|
||||
|
||||
/* only go thru this once - mpirun calls it twice, which causes
|
||||
@ -553,6 +553,29 @@ int orte_register_params(void)
|
||||
orte_use_common_port = false;
|
||||
#endif
|
||||
|
||||
mca_base_param_reg_string_name("orte", "set_default_slots",
|
||||
"Set the number of slots on nodes that lack such info to the number of specified objects [a number, \"cores\", \"numas\", \"sockets\", or \"hwthreads\"]",
|
||||
false, false, NULL, &orte_set_slots);
|
||||
|
||||
/* should we display the allocation after determining it? */
|
||||
id = mca_base_param_reg_int_name("orte", "display_alloc",
|
||||
"Whether to display the allocation after it is determined",
|
||||
false, false, (int)false, NULL);
|
||||
mca_base_param_reg_syn_name(id, "ras", "base_display_alloc", true);
|
||||
mca_base_param_lookup_int(id, &value);
|
||||
orte_display_allocation = OPAL_INT_TO_BOOL(value);
|
||||
|
||||
/* should we display a detailed (developer-quality) version of the allocation after determining it? */
|
||||
id = mca_base_param_reg_int_name("orte", "display_devel_alloc",
|
||||
"Whether to display a developer-detail allocation after it is determined",
|
||||
false, false, 0, NULL);
|
||||
mca_base_param_reg_syn_name(id, "ras", "base_display_devel_alloc", true);
|
||||
mca_base_param_lookup_int(id, &value);
|
||||
if (0 != value) {
|
||||
orte_display_allocation = true;
|
||||
orte_devel_level_output = true;
|
||||
}
|
||||
|
||||
#endif /* ORTE_DISABLE_FULL_SUPPORT */
|
||||
|
||||
return ORTE_SUCCESS;
|
||||
|
@ -410,10 +410,10 @@ static opal_cmd_line_init_t cmd_line_init[] = {
|
||||
#endif
|
||||
|
||||
/* Allocation options */
|
||||
{ "ras", "base", "display_alloc", '\0', "display-allocation", "display-allocation", 0,
|
||||
{ "orte", "display", "alloc", '\0', "display-allocation", "display-allocation", 0,
|
||||
NULL, OPAL_CMD_LINE_TYPE_BOOL,
|
||||
"Display the allocation being used by this job"},
|
||||
{ "ras", "base", "display_devel_alloc", '\0', "display-devel-allocation", "display-devel-allocation", 0,
|
||||
{ "orte", "display", "devel_alloc", '\0', "display-devel-allocation", "display-devel-allocation", 0,
|
||||
NULL, OPAL_CMD_LINE_TYPE_BOOL,
|
||||
"Display a detailed list (mostly intended for developers) of the allocation being used by this job"},
|
||||
#if OPAL_HAVE_HWLOC
|
||||
|
@ -96,6 +96,8 @@ int orte_util_add_dash_host_nodes(opal_list_t *nodes,
|
||||
(0 == strcmp(node->name, orte_process_info.nodename) &&
|
||||
(0 == strcmp(mapped_nodes[i], "localhost") || opal_ifislocal(mapped_nodes[i])))) {
|
||||
++node->slots;
|
||||
/* the dash-host option presumes definition of num_slots */
|
||||
node->slots_given = true;
|
||||
break;
|
||||
}
|
||||
}
|
||||
@ -127,6 +129,8 @@ int orte_util_add_dash_host_nodes(opal_list_t *nodes,
|
||||
node->slots_inuse = 0;
|
||||
node->slots_max = 0;
|
||||
node->slots = 1;
|
||||
/* the dash-host option presumes definition of num_slots */
|
||||
node->slots_given = true;
|
||||
opal_list_append(nodes, &node->super);
|
||||
}
|
||||
}
|
||||
|
@ -391,12 +391,14 @@ static int hostfile_parse_line(int token, opal_list_t* updates, opal_list_t* exc
|
||||
}
|
||||
|
||||
done:
|
||||
if (!got_count) {
|
||||
if (got_max) {
|
||||
node->slots = node->slots_max;
|
||||
} else {
|
||||
++node->slots;
|
||||
}
|
||||
if (got_count) {
|
||||
node->slots_given = true;
|
||||
} else if (got_max) {
|
||||
node->slots = node->slots_max;
|
||||
node->slots_given = true;
|
||||
} else {
|
||||
/* should be set by obj_new, but just to be clear */
|
||||
node->slots_given = false;
|
||||
}
|
||||
opal_list_append(updates, &node->super);
|
||||
|
||||
|
Загрузка…
x
Ссылка в новой задаче
Block a user