1
1

Merge pull request #1353 from rhc54/topic/host

Per the discussion on the telecon, change the -host behavior yet again
Этот коммит содержится в:
rhc54 2016-04-04 10:30:36 -07:00
родитель 74293bc235 503e1274a9
Коммит a95de6e8ef
9 изменённых файлов: 108 добавлений и 89 удалений

Просмотреть файл

@ -10,7 +10,7 @@
* Copyright (c) 2004-2005 The Regents of the University of California.
* All rights reserved.
* Copyright (c) 2013 Los Alamos National Security, LLC. All rights reserved.
* Copyright (c) 2015 Intel, Inc. All rights reserved.
* Copyright (c) 2015-2016 Intel, Inc. All rights reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
@ -53,6 +53,7 @@ ORTE_DECLSPEC int orte_plm_base_select(void);
ORTE_DECLSPEC void orte_plm_base_app_report_launch(int fd, short event, void *data);
ORTE_DECLSPEC void orte_plm_base_receive_process_msg(int fd, short event, void *data);
ORTE_DECLSPEC void orte_plm_base_set_slots(orte_node_t *node);
ORTE_DECLSPEC void orte_plm_base_setup_job(int fd, short args, void *cbdata);
ORTE_DECLSPEC void orte_plm_base_setup_job_complete(int fd, short args, void *cbdata);
ORTE_DECLSPEC void orte_plm_base_complete_setup(int fd, short args, void *cbdata);

Просмотреть файл

@ -79,6 +79,36 @@
#include "orte/mca/plm/base/plm_private.h"
#include "orte/mca/plm/base/base.h"
/*
 * Assign node->slots according to the global orte_set_slots directive.
 *
 * The directive is compared against the keywords "cores", "sockets",
 * "numas" and "hwthreads", in that order. Each comparison is limited to
 * the length of the directive itself, so a leading abbreviation of a
 * keyword selects that keyword. Anything that matches no keyword is
 * parsed as a base-10 integer.
 *
 * NOTE(review): because the comparison length is strlen(orte_set_slots),
 * an empty directive compares equal to "cores" - confirm that is intended.
 * The caller is assumed to guarantee orte_set_slots is non-NULL.
 *
 * @param node  node whose slot count is set; its topology is used for
 *              the object counts
 */
void orte_plm_base_set_slots(orte_node_t *node)
{
    const char *directive = orte_set_slots;
    const size_t len = strlen(directive);

    if (0 == strncmp(directive, "cores", len)) {
        node->slots = opal_hwloc_base_get_nbobjs_by_type(node->topology,
                                                         HWLOC_OBJ_CORE, 0,
                                                         OPAL_HWLOC_LOGICAL);
        return;
    }

    if (0 == strncmp(directive, "sockets", len)) {
        node->slots = opal_hwloc_base_get_nbobjs_by_type(node->topology,
                                                         HWLOC_OBJ_SOCKET, 0,
                                                         OPAL_HWLOC_LOGICAL);
        if (0 == node->slots) {
            /* not every system reports sockets - fall back to
             * counting NUMA nodes when none were found */
            node->slots = opal_hwloc_base_get_nbobjs_by_type(node->topology,
                                                             HWLOC_OBJ_NODE, 0,
                                                             OPAL_HWLOC_LOGICAL);
        }
        return;
    }

    if (0 == strncmp(directive, "numas", len)) {
        node->slots = opal_hwloc_base_get_nbobjs_by_type(node->topology,
                                                         HWLOC_OBJ_NODE, 0,
                                                         OPAL_HWLOC_LOGICAL);
        return;
    }

    if (0 == strncmp(directive, "hwthreads", len)) {
        node->slots = opal_hwloc_base_get_nbobjs_by_type(node->topology,
                                                         HWLOC_OBJ_PU, 0,
                                                         OPAL_HWLOC_LOGICAL);
        return;
    }

    /* no keyword matched, so treat the directive as a number */
    node->slots = strtol(directive, NULL, 10);
}
void orte_plm_base_daemons_reported(int fd, short args, void *cbdata)
{
orte_state_caddy_t *caddy = (orte_state_caddy_t*)cbdata;
@ -148,33 +178,7 @@ void orte_plm_base_daemons_reported(int fd, short args, void *cbdata)
OPAL_OUTPUT_VERBOSE((5, orte_plm_base_framework.framework_output,
"%s plm:base:setting slots for node %s by %s",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), node->name, orte_set_slots));
if (0 == strncmp(orte_set_slots, "cores", strlen(orte_set_slots))) {
node->slots = opal_hwloc_base_get_nbobjs_by_type(node->topology,
HWLOC_OBJ_CORE, 0,
OPAL_HWLOC_LOGICAL);
} else if (0 == strncmp(orte_set_slots, "sockets", strlen(orte_set_slots))) {
if (0 == (node->slots = opal_hwloc_base_get_nbobjs_by_type(node->topology,
HWLOC_OBJ_SOCKET, 0,
OPAL_HWLOC_LOGICAL))) {
/* some systems don't report sockets - in this case,
* use numanodes
*/
node->slots = opal_hwloc_base_get_nbobjs_by_type(node->topology,
HWLOC_OBJ_NODE, 0,
OPAL_HWLOC_LOGICAL);
}
} else if (0 == strncmp(orte_set_slots, "numas", strlen(orte_set_slots))) {
node->slots = opal_hwloc_base_get_nbobjs_by_type(node->topology,
HWLOC_OBJ_NODE, 0,
OPAL_HWLOC_LOGICAL);
} else if (0 == strncmp(orte_set_slots, "hwthreads", strlen(orte_set_slots))) {
node->slots = opal_hwloc_base_get_nbobjs_by_type(node->topology,
HWLOC_OBJ_PU, 0,
OPAL_HWLOC_LOGICAL);
} else {
/* must be a number */
node->slots = strtol(orte_set_slots, NULL, 10);
}
orte_plm_base_set_slots(node);
}
}
}

Просмотреть файл

@ -11,7 +11,7 @@
* All rights reserved.
* Copyright (c) 2011-2012 Los Alamos National Security, LLC. All rights
* reserved.
* Copyright (c) 2014-2015 Intel, Inc. All rights reserved.
* Copyright (c) 2014-2016 Intel, Inc. All rights reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
@ -77,8 +77,8 @@ void orte_ras_base_display_alloc(void)
(NULL == alloc->name) ? "UNKNOWN" : alloc->name,
(int)alloc->slots, (int)alloc->slots_max, (int)alloc->slots_inuse);
} else {
asprintf(&tmp2, "\t%s: slots=%d max_slots=%d slots_inuse=%d state=%s\n",
(NULL == alloc->name) ? "UNKNOWN" : alloc->name,
asprintf(&tmp2, "\t%s: flags=0x%02x slots=%d max_slots=%d slots_inuse=%d state=%s\n",
(NULL == alloc->name) ? "UNKNOWN" : alloc->name, alloc->flags,
(int)alloc->slots, (int)alloc->slots_max, (int)alloc->slots_inuse,
orte_node_state_to_str(alloc->state));
}

Просмотреть файл

@ -11,7 +11,7 @@
* All rights reserved.
* Copyright (c) 2011-2012 Los Alamos National Security, LLC. All rights
* reserved.
* Copyright (c) 2014-2015 Intel, Inc. All rights reserved.
* Copyright (c) 2014-2016 Intel, Inc. All rights reserved.
* Copyright (c) 2015 Research Organization for Information Science
* and Technology (RIST). All rights reserved.
* $COPYRIGHT$

Просмотреть файл

@ -404,3 +404,9 @@ or provide more node locations in the file.
The request to map processes by distance could not be completed
because the device to map near was not specified. Please use the
rmaps_dist_device MCA parameter to set it.
#
[num-procs-not-specified]
Either the -host or -hostfile option was given, but the number
of processes to start was omitted. This combination is not supported.
Please specify the number of processes to run and try again.

Просмотреть файл

@ -12,7 +12,7 @@
* Copyright (c) 2011-2012 Cisco Systems, Inc. All rights reserved.
* Copyright (c) 2011-2012 Los Alamos National Security, LLC.
* All rights reserved.
* Copyright (c) 2014-2015 Intel, Inc. All rights reserved.
* Copyright (c) 2014-2016 Intel, Inc. All rights reserved.
* Copyright (c) 2016 Research Organization for Information Science
* and Technology (RIST). All rights reserved.
* $COPYRIGHT$
@ -50,8 +50,9 @@ void orte_rmaps_base_map_job(int fd, short args, void *cbdata)
{
orte_job_t *jdata;
orte_job_map_t *map;
orte_node_t *node;
int rc, i;
bool did_map;
bool did_map, given;
orte_rmaps_base_selected_module_t *mod;
orte_job_t *parent;
orte_state_caddy_t *caddy = (orte_state_caddy_t*)cbdata;
@ -71,6 +72,47 @@ void orte_rmaps_base_map_job(int fd, short args, void *cbdata)
"mca:rmaps: mapping job %s",
ORTE_JOBID_PRINT(jdata->jobid));
/* compute the number of procs and check validity */
nprocs = 0;
for (i=0; i < jdata->apps->size; i++) {
if (NULL != (app = (orte_app_context_t*)opal_pointer_array_get_item(jdata->apps, i))) {
if (0 == app->num_procs) {
opal_list_t nodes;
orte_std_cntr_t slots;
OBJ_CONSTRUCT(&nodes, opal_list_t);
orte_rmaps_base_get_target_nodes(&nodes, &slots, app, ORTE_MAPPING_BYNODE, true, true);
/* if we are in a managed allocation, then all is good - otherwise,
* we have to do a little more checking */
if (!orte_managed_allocation) {
/* if all the nodes have their slots given, then we are okay */
given = true;
OPAL_LIST_FOREACH(node, &nodes, orte_node_t) {
if (!ORTE_FLAG_TEST(node, ORTE_NODE_FLAG_SLOTS_GIVEN)) {
given = false;
break;
}
}
/* if -host or -hostfile was given, and the slots were not,
* then this is no longer allowed */
if (!given &&
(orte_get_attribute(&app->attributes, ORTE_APP_DASH_HOST, NULL, OPAL_STRING) ||
orte_get_attribute(&app->attributes, ORTE_APP_HOSTFILE, NULL, OPAL_STRING))) {
/* inform the user of the error */
orte_show_help("help-orte-rmaps-base.txt", "num-procs-not-specified", true);
ORTE_ACTIVATE_JOB_STATE(jdata, ORTE_JOB_STATE_MAP_FAILED);
OBJ_RELEASE(caddy);
OPAL_LIST_DESTRUCT(&nodes);
return;
}
}
OPAL_LIST_DESTRUCT(&nodes);
nprocs += slots;
} else {
nprocs += app->num_procs;
}
}
}
/* NOTE: CHECK FOR JDATA->MAP == NULL. IF IT IS, THEN USE
* THE VALUES THAT WERE READ BY THE LOCAL MCA PARAMS. THE
* PLM PROXY WILL SEND A JOB-OBJECT THAT WILL INCLUDE ANY
@ -91,22 +133,6 @@ void orte_rmaps_base_map_job(int fd, short args, void *cbdata)
OBJ_RELEASE(caddy);
return;
}
/* compute the number of procs */
nprocs = 0;
for (i=0; i < jdata->apps->size; i++) {
if (NULL != (app = (orte_app_context_t*)opal_pointer_array_get_item(jdata->apps, i))) {
if (0 == app->num_procs) {
opal_list_t nodes;
orte_std_cntr_t slots;
OBJ_CONSTRUCT(&nodes, opal_list_t);
orte_rmaps_base_get_target_nodes(&nodes, &slots, app, ORTE_MAPPING_BYNODE, true, true);
OPAL_LIST_DESTRUCT(&nodes);
nprocs += slots;
} else {
nprocs += app->num_procs;
}
}
}
opal_output_verbose(5, orte_rmaps_base_framework.framework_output,
"mca:rmaps: nprocs %s",
ORTE_VPID_PRINT(nprocs));
@ -142,12 +168,7 @@ void orte_rmaps_base_map_job(int fd, short args, void *cbdata)
}
/* check for oversubscribe directives */
if (!(ORTE_MAPPING_SUBSCRIBE_GIVEN & ORTE_GET_MAPPING_DIRECTIVE(orte_rmaps_base.mapping))) {
if (orte_managed_allocation) {
/* by default, we do not allow oversubscription in managed environments */
ORTE_SET_MAPPING_DIRECTIVE(map->mapping, ORTE_MAPPING_NO_OVERSUBSCRIBE);
} else {
ORTE_UNSET_MAPPING_DIRECTIVE(map->mapping, ORTE_MAPPING_NO_OVERSUBSCRIBE);
}
ORTE_SET_MAPPING_DIRECTIVE(map->mapping, ORTE_MAPPING_NO_OVERSUBSCRIBE);
} else {
/* pass along the directive */
if (ORTE_MAPPING_NO_OVERSUBSCRIBE & ORTE_GET_MAPPING_DIRECTIVE(orte_rmaps_base.mapping)) {
@ -179,13 +200,6 @@ void orte_rmaps_base_map_job(int fd, short args, void *cbdata)
if (!jdata->map->display_map) {
jdata->map->display_map = orte_rmaps_base.display_map;
}
/* compute the number of procs */
nprocs = 0;
for (i=0; i < jdata->apps->size; i++) {
if (NULL != (app = (orte_app_context_t*)opal_pointer_array_get_item(jdata->apps, i))) {
nprocs += app->num_procs;
}
}
/* set the default mapping policy IFF it wasn't provided */
if (!ORTE_MAPPING_POLICY_IS_SET(jdata->map->mapping)) {
/* default based on number of procs */
@ -215,12 +229,7 @@ void orte_rmaps_base_map_job(int fd, short args, void *cbdata)
}
/* check for oversubscribe directives */
if (!(ORTE_MAPPING_SUBSCRIBE_GIVEN & ORTE_GET_MAPPING_DIRECTIVE(orte_rmaps_base.mapping))) {
if (orte_managed_allocation) {
/* by default, we do not allow oversubscription in managed environments */
ORTE_SET_MAPPING_DIRECTIVE(jdata->map->mapping, ORTE_MAPPING_NO_OVERSUBSCRIBE);
} else {
ORTE_UNSET_MAPPING_DIRECTIVE(jdata->map->mapping, ORTE_MAPPING_NO_OVERSUBSCRIBE);
}
ORTE_SET_MAPPING_DIRECTIVE(jdata->map->mapping, ORTE_MAPPING_NO_OVERSUBSCRIBE);
} else {
/* pass along the directive */
if (ORTE_MAPPING_NO_OVERSUBSCRIBE & ORTE_GET_MAPPING_DIRECTIVE(orte_rmaps_base.mapping)) {

Просмотреть файл

@ -345,8 +345,8 @@ int orte_dt_print_node(char **output, char *prefix, orte_node_t *src, opal_data_
goto PRINT_PROCS;
}
asprintf(&tmp, "\n%sData for node: %s\tState: %0x",
pfx2, (NULL == src->name) ? "UNKNOWN" : src->name, src->state);
asprintf(&tmp, "\n%sData for node: %s\tState: %0x\tFlags: %02x",
pfx2, (NULL == src->name) ? "UNKNOWN" : src->name, src->state, src->flags);
/* does this node have any aliases? */
tmp3 = NULL;
if (orte_get_attribute(&src->attributes, ORTE_NODE_ALIAS, (void**)&tmp3, OPAL_STRING)) {

Просмотреть файл

@ -29,22 +29,22 @@ typedef uint8_t orte_app_context_flags_t;
/* APP_CONTEXT ATTRIBUTE KEYS */
#define ORTE_APP_HOSTFILE 1 // string - hostfile
#define ORTE_APP_ADD_HOSTFILE 2 // string - hostfile to be added
#define ORTE_APP_DASH_HOST 3 // string - hosts specified with -host option
#define ORTE_APP_ADD_HOST 4 // string - hosts to be added
#define ORTE_APP_USER_CWD 5 // bool - user specified cwd
#define ORTE_APP_SSNDIR_CWD 6 // bool - use session dir as cwd
#define ORTE_APP_PRELOAD_BIN 7 // bool - move binaries to remote nodes prior to exec
#define ORTE_APP_PRELOAD_FILES 8 // string - files to be moved to remote nodes prior to exec
#define ORTE_APP_SSTORE_LOAD 9 // string
#define ORTE_APP_RECOV_DEF 10 // bool - whether or not a recovery policy was defined
#define ORTE_APP_MAX_RESTARTS 11 // int32 - max number of times a process can be restarted
#define ORTE_APP_MIN_NODES 12 // int64 - min number of nodes required
#define ORTE_APP_MANDATORY 13 // bool - flag if nodes requested in -host are "mandatory" vs "optional"
#define ORTE_APP_MAX_PPN 14 // uint32 - maximum number of procs/node for this app
#define ORTE_APP_PREFIX_DIR 15 // string - prefix directory for this app, if override necessary
#define ORTE_APP_NO_CACHEDIR 16 // bool - flag that a cache dir is not to be specified for a Singularity container
#define ORTE_APP_HOSTFILE 1 // string - hostfile
#define ORTE_APP_ADD_HOSTFILE 2 // string - hostfile to be added
#define ORTE_APP_DASH_HOST 3 // string - hosts specified with -host option
#define ORTE_APP_ADD_HOST 4 // string - hosts to be added
#define ORTE_APP_USER_CWD 5 // bool - user specified cwd
#define ORTE_APP_SSNDIR_CWD 6 // bool - use session dir as cwd
#define ORTE_APP_PRELOAD_BIN 7 // bool - move binaries to remote nodes prior to exec
#define ORTE_APP_PRELOAD_FILES 8 // string - files to be moved to remote nodes prior to exec
#define ORTE_APP_SSTORE_LOAD 9 // string
#define ORTE_APP_RECOV_DEF 10 // bool - whether or not a recovery policy was defined
#define ORTE_APP_MAX_RESTARTS 11 // int32 - max number of times a process can be restarted
#define ORTE_APP_MIN_NODES 12 // int64 - min number of nodes required
#define ORTE_APP_MANDATORY 13 // bool - flag if nodes requested in -host are "mandatory" vs "optional"
#define ORTE_APP_MAX_PPN 14 // uint32 - maximum number of procs/node for this app
#define ORTE_APP_PREFIX_DIR 15 // string - prefix directory for this app, if override necessary
#define ORTE_APP_NO_CACHEDIR 16 // bool - flag that a cache dir is not to be specified for a Singularity container
#define ORTE_APP_MAX_KEY 100

Просмотреть файл

@ -10,7 +10,7 @@
* Copyright (c) 2004-2005 The Regents of the University of California.
* All rights reserved.
* Copyright (c) 2013 Cisco Systems, Inc. All rights reserved.
* Copyright (c) 2014-2015 Intel, Inc. All rights reserved.
* Copyright (c) 2014-2016 Intel, Inc. All rights reserved.
* Copyright (c) 2015 Research Organization for Information Science
* and Technology (RIST). All rights reserved.
* $COPYRIGHT$
@ -249,7 +249,6 @@ int orte_util_add_dash_host_nodes(opal_list_t *nodes,
}
} else {
node->slots = 1;
ORTE_FLAG_SET(node, ORTE_NODE_FLAG_SLOTS_GIVEN);
}
opal_list_append(&adds, &node->super);
}