Merge pull request #1353 from rhc54/topic/host
Per the discussion on the telecon, change the -host behavior yet again
Этот коммит содержится в:
Коммит
a95de6e8ef
@ -10,7 +10,7 @@
|
||||
* Copyright (c) 2004-2005 The Regents of the University of California.
|
||||
* All rights reserved.
|
||||
* Copyright (c) 2013 Los Alamos National Security, LLC. All rights reserved.
|
||||
* Copyright (c) 2015 Intel, Inc. All rights reserved.
|
||||
* Copyright (c) 2015-2016 Intel, Inc. All rights reserved.
|
||||
* $COPYRIGHT$
|
||||
*
|
||||
* Additional copyrights may follow
|
||||
@ -53,6 +53,7 @@ ORTE_DECLSPEC int orte_plm_base_select(void);
|
||||
ORTE_DECLSPEC void orte_plm_base_app_report_launch(int fd, short event, void *data);
|
||||
ORTE_DECLSPEC void orte_plm_base_receive_process_msg(int fd, short event, void *data);
|
||||
|
||||
ORTE_DECLSPEC void orte_plm_base_set_slots(orte_node_t *node);
|
||||
ORTE_DECLSPEC void orte_plm_base_setup_job(int fd, short args, void *cbdata);
|
||||
ORTE_DECLSPEC void orte_plm_base_setup_job_complete(int fd, short args, void *cbdata);
|
||||
ORTE_DECLSPEC void orte_plm_base_complete_setup(int fd, short args, void *cbdata);
|
||||
|
@ -79,6 +79,36 @@
|
||||
#include "orte/mca/plm/base/plm_private.h"
|
||||
#include "orte/mca/plm/base/base.h"
|
||||
|
||||
/*
 * Fill in node->slots according to the global orte_set_slots directive.
 *
 * The directive is compared against the keywords "cores", "sockets",
 * "numas" and "hwthreads", in that order, using only as many characters
 * as the directive itself contains. Any leading abbreviation of a keyword
 * therefore selects it, and an empty directive matches the first keyword
 * ("cores"), since a zero-length comparison always succeeds.
 *
 * For a keyword match, the slot count is whatever
 * opal_hwloc_base_get_nbobjs_by_type() reports for the corresponding
 * hwloc object type in node->topology. Anything that matches no keyword
 * is parsed as a base-10 integer; the result of strtol() is stored
 * without any validation.
 *
 * @param node  node whose slots field is overwritten
 */
void orte_plm_base_set_slots(orte_node_t *node)
{
    /* length of the user-supplied directive bounds every comparison */
    const size_t len = strlen(orte_set_slots);

    if (0 == strncmp(orte_set_slots, "cores", len)) {
        node->slots = opal_hwloc_base_get_nbobjs_by_type(node->topology,
                                                         HWLOC_OBJ_CORE, 0,
                                                         OPAL_HWLOC_LOGICAL);
        return;
    }

    if (0 == strncmp(orte_set_slots, "sockets", len)) {
        node->slots = opal_hwloc_base_get_nbobjs_by_type(node->topology,
                                                         HWLOC_OBJ_SOCKET, 0,
                                                         OPAL_HWLOC_LOGICAL);
        if (0 == node->slots) {
            /* no sockets were reported for this topology, so fall
             * back to counting NUMA nodes instead */
            node->slots = opal_hwloc_base_get_nbobjs_by_type(node->topology,
                                                             HWLOC_OBJ_NODE, 0,
                                                             OPAL_HWLOC_LOGICAL);
        }
        return;
    }

    if (0 == strncmp(orte_set_slots, "numas", len)) {
        node->slots = opal_hwloc_base_get_nbobjs_by_type(node->topology,
                                                         HWLOC_OBJ_NODE, 0,
                                                         OPAL_HWLOC_LOGICAL);
        return;
    }

    if (0 == strncmp(orte_set_slots, "hwthreads", len)) {
        node->slots = opal_hwloc_base_get_nbobjs_by_type(node->topology,
                                                         HWLOC_OBJ_PU, 0,
                                                         OPAL_HWLOC_LOGICAL);
        return;
    }

    /* not a recognized keyword, so it must be a number */
    node->slots = strtol(orte_set_slots, NULL, 10);
}
|
||||
|
||||
void orte_plm_base_daemons_reported(int fd, short args, void *cbdata)
|
||||
{
|
||||
orte_state_caddy_t *caddy = (orte_state_caddy_t*)cbdata;
|
||||
@ -148,33 +178,7 @@ void orte_plm_base_daemons_reported(int fd, short args, void *cbdata)
|
||||
OPAL_OUTPUT_VERBOSE((5, orte_plm_base_framework.framework_output,
|
||||
"%s plm:base:setting slots for node %s by %s",
|
||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), node->name, orte_set_slots));
|
||||
if (0 == strncmp(orte_set_slots, "cores", strlen(orte_set_slots))) {
|
||||
node->slots = opal_hwloc_base_get_nbobjs_by_type(node->topology,
|
||||
HWLOC_OBJ_CORE, 0,
|
||||
OPAL_HWLOC_LOGICAL);
|
||||
} else if (0 == strncmp(orte_set_slots, "sockets", strlen(orte_set_slots))) {
|
||||
if (0 == (node->slots = opal_hwloc_base_get_nbobjs_by_type(node->topology,
|
||||
HWLOC_OBJ_SOCKET, 0,
|
||||
OPAL_HWLOC_LOGICAL))) {
|
||||
/* some systems don't report sockets - in this case,
|
||||
* use numanodes
|
||||
*/
|
||||
node->slots = opal_hwloc_base_get_nbobjs_by_type(node->topology,
|
||||
HWLOC_OBJ_NODE, 0,
|
||||
OPAL_HWLOC_LOGICAL);
|
||||
}
|
||||
} else if (0 == strncmp(orte_set_slots, "numas", strlen(orte_set_slots))) {
|
||||
node->slots = opal_hwloc_base_get_nbobjs_by_type(node->topology,
|
||||
HWLOC_OBJ_NODE, 0,
|
||||
OPAL_HWLOC_LOGICAL);
|
||||
} else if (0 == strncmp(orte_set_slots, "hwthreads", strlen(orte_set_slots))) {
|
||||
node->slots = opal_hwloc_base_get_nbobjs_by_type(node->topology,
|
||||
HWLOC_OBJ_PU, 0,
|
||||
OPAL_HWLOC_LOGICAL);
|
||||
} else {
|
||||
/* must be a number */
|
||||
node->slots = strtol(orte_set_slots, NULL, 10);
|
||||
}
|
||||
orte_plm_base_set_slots(node);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
@ -11,7 +11,7 @@
|
||||
* All rights reserved.
|
||||
* Copyright (c) 2011-2012 Los Alamos National Security, LLC. All rights
|
||||
* reserved.
|
||||
* Copyright (c) 2014-2015 Intel, Inc. All rights reserved.
|
||||
* Copyright (c) 2014-2016 Intel, Inc. All rights reserved.
|
||||
* $COPYRIGHT$
|
||||
*
|
||||
* Additional copyrights may follow
|
||||
@ -77,8 +77,8 @@ void orte_ras_base_display_alloc(void)
|
||||
(NULL == alloc->name) ? "UNKNOWN" : alloc->name,
|
||||
(int)alloc->slots, (int)alloc->slots_max, (int)alloc->slots_inuse);
|
||||
} else {
|
||||
asprintf(&tmp2, "\t%s: slots=%d max_slots=%d slots_inuse=%d state=%s\n",
|
||||
(NULL == alloc->name) ? "UNKNOWN" : alloc->name,
|
||||
asprintf(&tmp2, "\t%s: flags=0x%02x slots=%d max_slots=%d slots_inuse=%d state=%s\n",
|
||||
(NULL == alloc->name) ? "UNKNOWN" : alloc->name, alloc->flags,
|
||||
(int)alloc->slots, (int)alloc->slots_max, (int)alloc->slots_inuse,
|
||||
orte_node_state_to_str(alloc->state));
|
||||
}
|
||||
|
@ -11,7 +11,7 @@
|
||||
* All rights reserved.
|
||||
* Copyright (c) 2011-2012 Los Alamos National Security, LLC. All rights
|
||||
* reserved.
|
||||
* Copyright (c) 2014-2015 Intel, Inc. All rights reserved.
|
||||
* Copyright (c) 2014-2016 Intel, Inc. All rights reserved.
|
||||
* Copyright (c) 2015 Research Organization for Information Science
|
||||
* and Technology (RIST). All rights reserved.
|
||||
* $COPYRIGHT$
|
||||
|
@ -404,3 +404,9 @@ or provide more node locations in the file.
|
||||
The request to map processes by distance could not be completed
|
||||
because the device to map near was not specified. Please use the
|
||||
rmaps_dist_device MCA parameter to set it.
|
||||
#
|
||||
[num-procs-not-specified]
|
||||
Either the -host or -hostfile options were given, but the number
|
||||
of processes to start was omitted. This combination is not supported.
|
||||
|
||||
Please specify the number of processes to run and try again.
|
||||
|
@ -12,7 +12,7 @@
|
||||
* Copyright (c) 2011-2012 Cisco Systems, Inc. All rights reserved.
|
||||
* Copyright (c) 2011-2012 Los Alamos National Security, LLC.
|
||||
* All rights reserved.
|
||||
* Copyright (c) 2014-2015 Intel, Inc. All rights reserved.
|
||||
* Copyright (c) 2014-2016 Intel, Inc. All rights reserved.
|
||||
* Copyright (c) 2016 Research Organization for Information Science
|
||||
* and Technology (RIST). All rights reserved.
|
||||
* $COPYRIGHT$
|
||||
@ -50,8 +50,9 @@ void orte_rmaps_base_map_job(int fd, short args, void *cbdata)
|
||||
{
|
||||
orte_job_t *jdata;
|
||||
orte_job_map_t *map;
|
||||
orte_node_t *node;
|
||||
int rc, i;
|
||||
bool did_map;
|
||||
bool did_map, given;
|
||||
orte_rmaps_base_selected_module_t *mod;
|
||||
orte_job_t *parent;
|
||||
orte_state_caddy_t *caddy = (orte_state_caddy_t*)cbdata;
|
||||
@ -71,6 +72,47 @@ void orte_rmaps_base_map_job(int fd, short args, void *cbdata)
|
||||
"mca:rmaps: mapping job %s",
|
||||
ORTE_JOBID_PRINT(jdata->jobid));
|
||||
|
||||
/* compute the number of procs and check validity */
|
||||
nprocs = 0;
|
||||
for (i=0; i < jdata->apps->size; i++) {
|
||||
if (NULL != (app = (orte_app_context_t*)opal_pointer_array_get_item(jdata->apps, i))) {
|
||||
if (0 == app->num_procs) {
|
||||
opal_list_t nodes;
|
||||
orte_std_cntr_t slots;
|
||||
OBJ_CONSTRUCT(&nodes, opal_list_t);
|
||||
orte_rmaps_base_get_target_nodes(&nodes, &slots, app, ORTE_MAPPING_BYNODE, true, true);
|
||||
/* if we are in a managed allocation, then all is good - otherwise,
|
||||
* we have to do a little more checking */
|
||||
if (!orte_managed_allocation) {
|
||||
/* if all the nodes have their slots given, then we are okay */
|
||||
given = true;
|
||||
OPAL_LIST_FOREACH(node, &nodes, orte_node_t) {
|
||||
if (!ORTE_FLAG_TEST(node, ORTE_NODE_FLAG_SLOTS_GIVEN)) {
|
||||
given = false;
|
||||
break;
|
||||
}
|
||||
}
|
||||
/* if -host or -hostfile was given, and the slots were not,
|
||||
* then this is no longer allowed */
|
||||
if (!given &&
|
||||
(orte_get_attribute(&app->attributes, ORTE_APP_DASH_HOST, NULL, OPAL_STRING) ||
|
||||
orte_get_attribute(&app->attributes, ORTE_APP_HOSTFILE, NULL, OPAL_STRING))) {
|
||||
/* inform the user of the error */
|
||||
orte_show_help("help-orte-rmaps-base.txt", "num-procs-not-specified", true);
|
||||
ORTE_ACTIVATE_JOB_STATE(jdata, ORTE_JOB_STATE_MAP_FAILED);
|
||||
OBJ_RELEASE(caddy);
|
||||
OPAL_LIST_DESTRUCT(&nodes);
|
||||
return;
|
||||
}
|
||||
}
|
||||
OPAL_LIST_DESTRUCT(&nodes);
|
||||
nprocs += slots;
|
||||
} else {
|
||||
nprocs += app->num_procs;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/* NOTE: CHECK FOR JDATA->MAP == NULL. IF IT IS, THEN USE
|
||||
* THE VALUES THAT WERE READ BY THE LOCAL MCA PARAMS. THE
|
||||
* PLM PROXY WILL SEND A JOB-OBJECT THAT WILL INCLUDE ANY
|
||||
@ -91,22 +133,6 @@ void orte_rmaps_base_map_job(int fd, short args, void *cbdata)
|
||||
OBJ_RELEASE(caddy);
|
||||
return;
|
||||
}
|
||||
/* compute the number of procs */
|
||||
nprocs = 0;
|
||||
for (i=0; i < jdata->apps->size; i++) {
|
||||
if (NULL != (app = (orte_app_context_t*)opal_pointer_array_get_item(jdata->apps, i))) {
|
||||
if (0 == app->num_procs) {
|
||||
opal_list_t nodes;
|
||||
orte_std_cntr_t slots;
|
||||
OBJ_CONSTRUCT(&nodes, opal_list_t);
|
||||
orte_rmaps_base_get_target_nodes(&nodes, &slots, app, ORTE_MAPPING_BYNODE, true, true);
|
||||
OPAL_LIST_DESTRUCT(&nodes);
|
||||
nprocs += slots;
|
||||
} else {
|
||||
nprocs += app->num_procs;
|
||||
}
|
||||
}
|
||||
}
|
||||
opal_output_verbose(5, orte_rmaps_base_framework.framework_output,
|
||||
"mca:rmaps: nprocs %s",
|
||||
ORTE_VPID_PRINT(nprocs));
|
||||
@ -142,12 +168,7 @@ void orte_rmaps_base_map_job(int fd, short args, void *cbdata)
|
||||
}
|
||||
/* check for oversubscribe directives */
|
||||
if (!(ORTE_MAPPING_SUBSCRIBE_GIVEN & ORTE_GET_MAPPING_DIRECTIVE(orte_rmaps_base.mapping))) {
|
||||
if (orte_managed_allocation) {
|
||||
/* by default, we do not allow oversubscription in managed environments */
|
||||
ORTE_SET_MAPPING_DIRECTIVE(map->mapping, ORTE_MAPPING_NO_OVERSUBSCRIBE);
|
||||
} else {
|
||||
ORTE_UNSET_MAPPING_DIRECTIVE(map->mapping, ORTE_MAPPING_NO_OVERSUBSCRIBE);
|
||||
}
|
||||
ORTE_SET_MAPPING_DIRECTIVE(map->mapping, ORTE_MAPPING_NO_OVERSUBSCRIBE);
|
||||
} else {
|
||||
/* pass along the directive */
|
||||
if (ORTE_MAPPING_NO_OVERSUBSCRIBE & ORTE_GET_MAPPING_DIRECTIVE(orte_rmaps_base.mapping)) {
|
||||
@ -179,13 +200,6 @@ void orte_rmaps_base_map_job(int fd, short args, void *cbdata)
|
||||
if (!jdata->map->display_map) {
|
||||
jdata->map->display_map = orte_rmaps_base.display_map;
|
||||
}
|
||||
/* compute the number of procs */
|
||||
nprocs = 0;
|
||||
for (i=0; i < jdata->apps->size; i++) {
|
||||
if (NULL != (app = (orte_app_context_t*)opal_pointer_array_get_item(jdata->apps, i))) {
|
||||
nprocs += app->num_procs;
|
||||
}
|
||||
}
|
||||
/* set the default mapping policy IFF it wasn't provided */
|
||||
if (!ORTE_MAPPING_POLICY_IS_SET(jdata->map->mapping)) {
|
||||
/* default based on number of procs */
|
||||
@ -215,12 +229,7 @@ void orte_rmaps_base_map_job(int fd, short args, void *cbdata)
|
||||
}
|
||||
/* check for oversubscribe directives */
|
||||
if (!(ORTE_MAPPING_SUBSCRIBE_GIVEN & ORTE_GET_MAPPING_DIRECTIVE(orte_rmaps_base.mapping))) {
|
||||
if (orte_managed_allocation) {
|
||||
/* by default, we do not allow oversubscription in managed environments */
|
||||
ORTE_SET_MAPPING_DIRECTIVE(jdata->map->mapping, ORTE_MAPPING_NO_OVERSUBSCRIBE);
|
||||
} else {
|
||||
ORTE_UNSET_MAPPING_DIRECTIVE(jdata->map->mapping, ORTE_MAPPING_NO_OVERSUBSCRIBE);
|
||||
}
|
||||
ORTE_SET_MAPPING_DIRECTIVE(jdata->map->mapping, ORTE_MAPPING_NO_OVERSUBSCRIBE);
|
||||
} else {
|
||||
/* pass along the directive */
|
||||
if (ORTE_MAPPING_NO_OVERSUBSCRIBE & ORTE_GET_MAPPING_DIRECTIVE(orte_rmaps_base.mapping)) {
|
||||
|
@ -345,8 +345,8 @@ int orte_dt_print_node(char **output, char *prefix, orte_node_t *src, opal_data_
|
||||
goto PRINT_PROCS;
|
||||
}
|
||||
|
||||
asprintf(&tmp, "\n%sData for node: %s\tState: %0x",
|
||||
pfx2, (NULL == src->name) ? "UNKNOWN" : src->name, src->state);
|
||||
asprintf(&tmp, "\n%sData for node: %s\tState: %0x\tFlags: %02x",
|
||||
pfx2, (NULL == src->name) ? "UNKNOWN" : src->name, src->state, src->flags);
|
||||
/* does this node have any aliases? */
|
||||
tmp3 = NULL;
|
||||
if (orte_get_attribute(&src->attributes, ORTE_NODE_ALIAS, (void**)&tmp3, OPAL_STRING)) {
|
||||
|
@ -29,22 +29,22 @@ typedef uint8_t orte_app_context_flags_t;
|
||||
|
||||
|
||||
/* APP_CONTEXT ATTRIBUTE KEYS */
|
||||
#define ORTE_APP_HOSTFILE 1 // string - hostfile
|
||||
#define ORTE_APP_ADD_HOSTFILE 2 // string - hostfile to be added
|
||||
#define ORTE_APP_DASH_HOST 3 // string - hosts specified with -host option
|
||||
#define ORTE_APP_ADD_HOST 4 // string - hosts to be added
|
||||
#define ORTE_APP_USER_CWD 5 // bool - user specified cwd
|
||||
#define ORTE_APP_SSNDIR_CWD 6 // bool - use session dir as cwd
|
||||
#define ORTE_APP_PRELOAD_BIN 7 // bool - move binaries to remote nodes prior to exec
|
||||
#define ORTE_APP_PRELOAD_FILES 8 // string - files to be moved to remote nodes prior to exec
|
||||
#define ORTE_APP_SSTORE_LOAD 9 // string
|
||||
#define ORTE_APP_RECOV_DEF 10 // bool - whether or not a recovery policy was defined
|
||||
#define ORTE_APP_MAX_RESTARTS 11 // int32 - max number of times a process can be restarted
|
||||
#define ORTE_APP_MIN_NODES 12 // int64 - min number of nodes required
|
||||
#define ORTE_APP_MANDATORY 13 // bool - flag if nodes requested in -host are "mandatory" vs "optional"
|
||||
#define ORTE_APP_MAX_PPN 14 // uint32 - maximum number of procs/node for this app
|
||||
#define ORTE_APP_PREFIX_DIR 15 // string - prefix directory for this app, if override necessary
|
||||
#define ORTE_APP_NO_CACHEDIR 16 // bool - flag that a cache dir is not to be specified for a Singularity container
|
||||
#define ORTE_APP_HOSTFILE 1 // string - hostfile
|
||||
#define ORTE_APP_ADD_HOSTFILE 2 // string - hostfile to be added
|
||||
#define ORTE_APP_DASH_HOST 3 // string - hosts specified with -host option
|
||||
#define ORTE_APP_ADD_HOST 4 // string - hosts to be added
|
||||
#define ORTE_APP_USER_CWD 5 // bool - user specified cwd
|
||||
#define ORTE_APP_SSNDIR_CWD 6 // bool - use session dir as cwd
|
||||
#define ORTE_APP_PRELOAD_BIN 7 // bool - move binaries to remote nodes prior to exec
|
||||
#define ORTE_APP_PRELOAD_FILES 8 // string - files to be moved to remote nodes prior to exec
|
||||
#define ORTE_APP_SSTORE_LOAD 9 // string
|
||||
#define ORTE_APP_RECOV_DEF 10 // bool - whether or not a recovery policy was defined
|
||||
#define ORTE_APP_MAX_RESTARTS 11 // int32 - max number of times a process can be restarted
|
||||
#define ORTE_APP_MIN_NODES 12 // int64 - min number of nodes required
|
||||
#define ORTE_APP_MANDATORY 13 // bool - flag if nodes requested in -host are "mandatory" vs "optional"
|
||||
#define ORTE_APP_MAX_PPN 14 // uint32 - maximum number of procs/node for this app
|
||||
#define ORTE_APP_PREFIX_DIR 15 // string - prefix directory for this app, if override necessary
|
||||
#define ORTE_APP_NO_CACHEDIR 16 // bool - flag that a cache dir is not to be specified for a Singularity container
|
||||
|
||||
#define ORTE_APP_MAX_KEY 100
|
||||
|
||||
|
@ -10,7 +10,7 @@
|
||||
* Copyright (c) 2004-2005 The Regents of the University of California.
|
||||
* All rights reserved.
|
||||
* Copyright (c) 2013 Cisco Systems, Inc. All rights reserved.
|
||||
* Copyright (c) 2014-2015 Intel, Inc. All rights reserved.
|
||||
* Copyright (c) 2014-2016 Intel, Inc. All rights reserved.
|
||||
* Copyright (c) 2015 Research Organization for Information Science
|
||||
* and Technology (RIST). All rights reserved.
|
||||
* $COPYRIGHT$
|
||||
@ -249,7 +249,6 @@ int orte_util_add_dash_host_nodes(opal_list_t *nodes,
|
||||
}
|
||||
} else {
|
||||
node->slots = 1;
|
||||
ORTE_FLAG_SET(node, ORTE_NODE_FLAG_SLOTS_GIVEN);
|
||||
}
|
||||
opal_list_append(&adds, &node->super);
|
||||
}
|
||||
|
Загрузка…
x
Ссылка в новой задаче
Block a user