1
1

Merge pull request #1353 from rhc54/topic/host

Per the discussion on the telecon, change the -host behavior yet again
Этот коммит содержится в:
rhc54 2016-04-04 10:30:36 -07:00
родитель 74293bc235 503e1274a9
Коммит a95de6e8ef
9 изменённых файлов: 108 добавлений и 89 удалений

Просмотреть файл

@ -10,7 +10,7 @@
* Copyright (c) 2004-2005 The Regents of the University of California.
* All rights reserved.
* Copyright (c) 2013 Los Alamos National Security, LLC. All rights reserved.
* Copyright (c) 2015 Intel, Inc. All rights reserved.
* Copyright (c) 2015-2016 Intel, Inc. All rights reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
@ -53,6 +53,7 @@ ORTE_DECLSPEC int orte_plm_base_select(void);
ORTE_DECLSPEC void orte_plm_base_app_report_launch(int fd, short event, void *data);
ORTE_DECLSPEC void orte_plm_base_receive_process_msg(int fd, short event, void *data);
ORTE_DECLSPEC void orte_plm_base_set_slots(orte_node_t *node);
ORTE_DECLSPEC void orte_plm_base_setup_job(int fd, short args, void *cbdata);
ORTE_DECLSPEC void orte_plm_base_setup_job_complete(int fd, short args, void *cbdata);
ORTE_DECLSPEC void orte_plm_base_complete_setup(int fd, short args, void *cbdata);

Просмотреть файл

@ -79,6 +79,36 @@
#include "orte/mca/plm/base/plm_private.h"
#include "orte/mca/plm/base/base.h"
void orte_plm_base_set_slots(orte_node_t *node)
{
if (0 == strncmp(orte_set_slots, "cores", strlen(orte_set_slots))) {
node->slots = opal_hwloc_base_get_nbobjs_by_type(node->topology,
HWLOC_OBJ_CORE, 0,
OPAL_HWLOC_LOGICAL);
} else if (0 == strncmp(orte_set_slots, "sockets", strlen(orte_set_slots))) {
if (0 == (node->slots = opal_hwloc_base_get_nbobjs_by_type(node->topology,
HWLOC_OBJ_SOCKET, 0,
OPAL_HWLOC_LOGICAL))) {
/* some systems don't report sockets - in this case,
* use numanodes */
node->slots = opal_hwloc_base_get_nbobjs_by_type(node->topology,
HWLOC_OBJ_NODE, 0,
OPAL_HWLOC_LOGICAL);
}
} else if (0 == strncmp(orte_set_slots, "numas", strlen(orte_set_slots))) {
node->slots = opal_hwloc_base_get_nbobjs_by_type(node->topology,
HWLOC_OBJ_NODE, 0,
OPAL_HWLOC_LOGICAL);
} else if (0 == strncmp(orte_set_slots, "hwthreads", strlen(orte_set_slots))) {
node->slots = opal_hwloc_base_get_nbobjs_by_type(node->topology,
HWLOC_OBJ_PU, 0,
OPAL_HWLOC_LOGICAL);
} else {
/* must be a number */
node->slots = strtol(orte_set_slots, NULL, 10);
}
}
void orte_plm_base_daemons_reported(int fd, short args, void *cbdata)
{
orte_state_caddy_t *caddy = (orte_state_caddy_t*)cbdata;
@ -148,33 +178,7 @@ void orte_plm_base_daemons_reported(int fd, short args, void *cbdata)
OPAL_OUTPUT_VERBOSE((5, orte_plm_base_framework.framework_output,
"%s plm:base:setting slots for node %s by %s",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), node->name, orte_set_slots));
if (0 == strncmp(orte_set_slots, "cores", strlen(orte_set_slots))) {
node->slots = opal_hwloc_base_get_nbobjs_by_type(node->topology,
HWLOC_OBJ_CORE, 0,
OPAL_HWLOC_LOGICAL);
} else if (0 == strncmp(orte_set_slots, "sockets", strlen(orte_set_slots))) {
if (0 == (node->slots = opal_hwloc_base_get_nbobjs_by_type(node->topology,
HWLOC_OBJ_SOCKET, 0,
OPAL_HWLOC_LOGICAL))) {
/* some systems don't report sockets - in this case,
* use numanodes
*/
node->slots = opal_hwloc_base_get_nbobjs_by_type(node->topology,
HWLOC_OBJ_NODE, 0,
OPAL_HWLOC_LOGICAL);
}
} else if (0 == strncmp(orte_set_slots, "numas", strlen(orte_set_slots))) {
node->slots = opal_hwloc_base_get_nbobjs_by_type(node->topology,
HWLOC_OBJ_NODE, 0,
OPAL_HWLOC_LOGICAL);
} else if (0 == strncmp(orte_set_slots, "hwthreads", strlen(orte_set_slots))) {
node->slots = opal_hwloc_base_get_nbobjs_by_type(node->topology,
HWLOC_OBJ_PU, 0,
OPAL_HWLOC_LOGICAL);
} else {
/* must be a number */
node->slots = strtol(orte_set_slots, NULL, 10);
}
orte_plm_base_set_slots(node);
}
}
}

Просмотреть файл

@ -11,7 +11,7 @@
* All rights reserved.
* Copyright (c) 2011-2012 Los Alamos National Security, LLC. All rights
* reserved.
* Copyright (c) 2014-2015 Intel, Inc. All rights reserved.
* Copyright (c) 2014-2016 Intel, Inc. All rights reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
@ -77,8 +77,8 @@ void orte_ras_base_display_alloc(void)
(NULL == alloc->name) ? "UNKNOWN" : alloc->name,
(int)alloc->slots, (int)alloc->slots_max, (int)alloc->slots_inuse);
} else {
asprintf(&tmp2, "\t%s: slots=%d max_slots=%d slots_inuse=%d state=%s\n",
(NULL == alloc->name) ? "UNKNOWN" : alloc->name,
asprintf(&tmp2, "\t%s: flags=0x%02x slots=%d max_slots=%d slots_inuse=%d state=%s\n",
(NULL == alloc->name) ? "UNKNOWN" : alloc->name, alloc->flags,
(int)alloc->slots, (int)alloc->slots_max, (int)alloc->slots_inuse,
orte_node_state_to_str(alloc->state));
}

Просмотреть файл

@ -11,7 +11,7 @@
* All rights reserved.
* Copyright (c) 2011-2012 Los Alamos National Security, LLC. All rights
* reserved.
* Copyright (c) 2014-2015 Intel, Inc. All rights reserved.
* Copyright (c) 2014-2016 Intel, Inc. All rights reserved.
* Copyright (c) 2015 Research Organization for Information Science
* and Technology (RIST). All rights reserved.
* $COPYRIGHT$

Просмотреть файл

@ -404,3 +404,9 @@ or provide more node locations in the file.
The request to map processes by distance could not be completed
because device to map near by was not specified. Please, use
rmaps_dist_device mca parameter to set it.
#
[num-procs-not-specified]
Either the -host or -hostfile options were given, but the number
of processes to start was omitted. This combination is not supported.
Please specify the number of processes to run and try again.

Просмотреть файл

@ -12,7 +12,7 @@
* Copyright (c) 2011-2012 Cisco Systems, Inc. All rights reserved.
* Copyright (c) 2011-2012 Los Alamos National Security, LLC.
* All rights reserved.
* Copyright (c) 2014-2015 Intel, Inc. All rights reserved.
* Copyright (c) 2014-2016 Intel, Inc. All rights reserved.
* Copyright (c) 2016 Research Organization for Information Science
* and Technology (RIST). All rights reserved.
* $COPYRIGHT$
@ -50,8 +50,9 @@ void orte_rmaps_base_map_job(int fd, short args, void *cbdata)
{
orte_job_t *jdata;
orte_job_map_t *map;
orte_node_t *node;
int rc, i;
bool did_map;
bool did_map, given;
orte_rmaps_base_selected_module_t *mod;
orte_job_t *parent;
orte_state_caddy_t *caddy = (orte_state_caddy_t*)cbdata;
@ -71,6 +72,47 @@ void orte_rmaps_base_map_job(int fd, short args, void *cbdata)
"mca:rmaps: mapping job %s",
ORTE_JOBID_PRINT(jdata->jobid));
/* compute the number of procs and check validity */
nprocs = 0;
for (i=0; i < jdata->apps->size; i++) {
if (NULL != (app = (orte_app_context_t*)opal_pointer_array_get_item(jdata->apps, i))) {
if (0 == app->num_procs) {
opal_list_t nodes;
orte_std_cntr_t slots;
OBJ_CONSTRUCT(&nodes, opal_list_t);
orte_rmaps_base_get_target_nodes(&nodes, &slots, app, ORTE_MAPPING_BYNODE, true, true);
/* if we are in a managed allocation, then all is good - otherwise,
* we have to do a little more checking */
if (!orte_managed_allocation) {
/* if all the nodes have their slots given, then we are okay */
given = true;
OPAL_LIST_FOREACH(node, &nodes, orte_node_t) {
if (!ORTE_FLAG_TEST(node, ORTE_NODE_FLAG_SLOTS_GIVEN)) {
given = false;
break;
}
}
/* if -host or -hostfile was given, and the slots were not,
* then this is no longer allowed */
if (!given &&
(orte_get_attribute(&app->attributes, ORTE_APP_DASH_HOST, NULL, OPAL_STRING) ||
orte_get_attribute(&app->attributes, ORTE_APP_HOSTFILE, NULL, OPAL_STRING))) {
/* inform the user of the error */
orte_show_help("help-orte-rmaps-base.txt", "num-procs-not-specified", true);
ORTE_ACTIVATE_JOB_STATE(jdata, ORTE_JOB_STATE_MAP_FAILED);
OBJ_RELEASE(caddy);
OPAL_LIST_DESTRUCT(&nodes);
return;
}
}
OPAL_LIST_DESTRUCT(&nodes);
nprocs += slots;
} else {
nprocs += app->num_procs;
}
}
}
/* NOTE: CHECK FOR JDATA->MAP == NULL. IF IT IS, THEN USE
* THE VALUES THAT WERE READ BY THE LOCAL MCA PARAMS. THE
* PLM PROXY WILL SEND A JOB-OBJECT THAT WILL INCLUDE ANY
@ -91,22 +133,6 @@ void orte_rmaps_base_map_job(int fd, short args, void *cbdata)
OBJ_RELEASE(caddy);
return;
}
/* compute the number of procs */
nprocs = 0;
for (i=0; i < jdata->apps->size; i++) {
if (NULL != (app = (orte_app_context_t*)opal_pointer_array_get_item(jdata->apps, i))) {
if (0 == app->num_procs) {
opal_list_t nodes;
orte_std_cntr_t slots;
OBJ_CONSTRUCT(&nodes, opal_list_t);
orte_rmaps_base_get_target_nodes(&nodes, &slots, app, ORTE_MAPPING_BYNODE, true, true);
OPAL_LIST_DESTRUCT(&nodes);
nprocs += slots;
} else {
nprocs += app->num_procs;
}
}
}
opal_output_verbose(5, orte_rmaps_base_framework.framework_output,
"mca:rmaps: nprocs %s",
ORTE_VPID_PRINT(nprocs));
@ -142,12 +168,7 @@ void orte_rmaps_base_map_job(int fd, short args, void *cbdata)
}
/* check for oversubscribe directives */
if (!(ORTE_MAPPING_SUBSCRIBE_GIVEN & ORTE_GET_MAPPING_DIRECTIVE(orte_rmaps_base.mapping))) {
if (orte_managed_allocation) {
/* by default, we do not allow oversubscription in managed environments */
ORTE_SET_MAPPING_DIRECTIVE(map->mapping, ORTE_MAPPING_NO_OVERSUBSCRIBE);
} else {
ORTE_UNSET_MAPPING_DIRECTIVE(map->mapping, ORTE_MAPPING_NO_OVERSUBSCRIBE);
}
} else {
/* pass along the directive */
if (ORTE_MAPPING_NO_OVERSUBSCRIBE & ORTE_GET_MAPPING_DIRECTIVE(orte_rmaps_base.mapping)) {
@ -179,13 +200,6 @@ void orte_rmaps_base_map_job(int fd, short args, void *cbdata)
if (!jdata->map->display_map) {
jdata->map->display_map = orte_rmaps_base.display_map;
}
/* compute the number of procs */
nprocs = 0;
for (i=0; i < jdata->apps->size; i++) {
if (NULL != (app = (orte_app_context_t*)opal_pointer_array_get_item(jdata->apps, i))) {
nprocs += app->num_procs;
}
}
/* set the default mapping policy IFF it wasn't provided */
if (!ORTE_MAPPING_POLICY_IS_SET(jdata->map->mapping)) {
/* default based on number of procs */
@ -215,12 +229,7 @@ void orte_rmaps_base_map_job(int fd, short args, void *cbdata)
}
/* check for oversubscribe directives */
if (!(ORTE_MAPPING_SUBSCRIBE_GIVEN & ORTE_GET_MAPPING_DIRECTIVE(orte_rmaps_base.mapping))) {
if (orte_managed_allocation) {
/* by default, we do not allow oversubscription in managed environments */
ORTE_SET_MAPPING_DIRECTIVE(jdata->map->mapping, ORTE_MAPPING_NO_OVERSUBSCRIBE);
} else {
ORTE_UNSET_MAPPING_DIRECTIVE(jdata->map->mapping, ORTE_MAPPING_NO_OVERSUBSCRIBE);
}
} else {
/* pass along the directive */
if (ORTE_MAPPING_NO_OVERSUBSCRIBE & ORTE_GET_MAPPING_DIRECTIVE(orte_rmaps_base.mapping)) {

Просмотреть файл

@ -345,8 +345,8 @@ int orte_dt_print_node(char **output, char *prefix, orte_node_t *src, opal_data_
goto PRINT_PROCS;
}
asprintf(&tmp, "\n%sData for node: %s\tState: %0x",
pfx2, (NULL == src->name) ? "UNKNOWN" : src->name, src->state);
asprintf(&tmp, "\n%sData for node: %s\tState: %0x\tFlags: %02x",
pfx2, (NULL == src->name) ? "UNKNOWN" : src->name, src->state, src->flags);
/* does this node have any aliases? */
tmp3 = NULL;
if (orte_get_attribute(&src->attributes, ORTE_NODE_ALIAS, (void**)&tmp3, OPAL_STRING)) {

Просмотреть файл

@ -10,7 +10,7 @@
* Copyright (c) 2004-2005 The Regents of the University of California.
* All rights reserved.
* Copyright (c) 2013 Cisco Systems, Inc. All rights reserved.
* Copyright (c) 2014-2015 Intel, Inc. All rights reserved.
* Copyright (c) 2014-2016 Intel, Inc. All rights reserved.
* Copyright (c) 2015 Research Organization for Information Science
* and Technology (RIST). All rights reserved.
* $COPYRIGHT$
@ -249,7 +249,6 @@ int orte_util_add_dash_host_nodes(opal_list_t *nodes,
}
} else {
node->slots = 1;
ORTE_FLAG_SET(node, ORTE_NODE_FLAG_SLOTS_GIVEN);
}
opal_list_append(&adds, &node->super);
}