1
1

Fix -H operations for multi-app case

Correctly aggregate slots across -H entries from each app. Take into
account any -H entry when computing nprocs when no value was given.

Signed-off-by: Ralph Castain <rhc@open-mpi.org>
Этот коммит содержится в:
Ralph Castain 2018-10-11 09:30:01 -07:00
родитель 5f767e1a54
Коммит f5a6b7f1e9
4 изменённых файлов: 46 добавлений и 35 удалений

Просмотреть файл

@ -49,7 +49,7 @@ void orte_rmaps_base_map_job(int fd, short args, void *cbdata)
orte_job_t *jdata;
orte_node_t *node;
int rc, i, ppx = 0;
bool did_map, given, pernode = false, persocket = false;
bool did_map, pernode = false, persocket = false;
orte_rmaps_base_selected_module_t *mod;
orte_job_t *parent;
orte_vpid_t nprocs;
@ -105,7 +105,6 @@ void orte_rmaps_base_map_job(int fd, short args, void *cbdata)
orte_std_cntr_t slots;
OBJ_CONSTRUCT(&nodes, opal_list_t);
orte_rmaps_base_get_target_nodes(&nodes, &slots, app, ORTE_MAPPING_BYNODE, true, true);
slots = 0;
if (pernode) {
slots = ppx * opal_list_get_size(&nodes);
} else if (persocket) {
@ -115,34 +114,6 @@ void orte_rmaps_base_map_job(int fd, short args, void *cbdata)
HWLOC_OBJ_SOCKET, 0,
OPAL_HWLOC_AVAILABLE);
}
} else {
/* if we are in a managed allocation, then all is good - otherwise,
* we have to do a little more checking */
if (!orte_managed_allocation) {
/* if all the nodes have their slots given, then we are okay */
given = true;
OPAL_LIST_FOREACH(node, &nodes, orte_node_t) {
if (!ORTE_FLAG_TEST(node, ORTE_NODE_FLAG_SLOTS_GIVEN)) {
given = false;
break;
}
}
/* if -host or -hostfile was given, and the slots were not,
* then this is no longer allowed */
if (!given &&
(orte_get_attribute(&app->attributes, ORTE_APP_DASH_HOST, NULL, OPAL_STRING) ||
orte_get_attribute(&app->attributes, ORTE_APP_HOSTFILE, NULL, OPAL_STRING))) {
/* inform the user of the error */
orte_show_help("help-orte-rmaps-base.txt", "num-procs-not-specified", true);
OPAL_LIST_DESTRUCT(&nodes);
OBJ_RELEASE(caddy);
ORTE_ACTIVATE_JOB_STATE(jdata, ORTE_JOB_STATE_MAP_FAILED);
return;
}
OPAL_LIST_FOREACH(node, &nodes, orte_node_t) {
slots += node->slots;
}
}
}
app->num_procs = slots;
OPAL_LIST_DESTRUCT(&nodes);

Просмотреть файл

@ -469,12 +469,19 @@ int orte_rmaps_base_get_target_nodes(opal_list_t *allocated_nodes, orte_std_cntr
continue;
}
if (node->slots > node->slots_inuse) {
orte_std_cntr_t s;
/* check for any -host allocations */
if (orte_get_attribute(&app->attributes, ORTE_APP_DASH_HOST, (void**)&hosts, OPAL_STRING)) {
s = orte_util_dash_host_compute_slots(node, hosts);
} else {
s = node->slots - node->slots_inuse;
}
/* add the available slots */
OPAL_OUTPUT_VERBOSE((5, orte_rmaps_base_framework.framework_output,
"%s node %s has %d slots available",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
node->name, node->slots - node->slots_inuse));
num_slots += node->slots - node->slots_inuse;
node->name, s));
num_slots += s;
continue;
}
if (!(ORTE_MAPPING_NO_OVERSUBSCRIBE & ORTE_GET_MAPPING_DIRECTIVE(policy))) {

Просмотреть файл

@ -10,7 +10,7 @@
* Copyright (c) 2004-2005 The Regents of the University of California.
* All rights reserved.
* Copyright (c) 2013 Cisco Systems, Inc. All rights reserved.
* Copyright (c) 2014-2017 Intel, Inc. All rights reserved.
* Copyright (c) 2014-2018 Intel, Inc. All rights reserved.
* Copyright (c) 2015 Research Organization for Information Science
* and Technology (RIST). All rights reserved.
* Copyright (c) 2016 IBM Corporation. All rights reserved.
@ -42,6 +42,36 @@
#include "dash_host.h"
/**
 * Compute how many slots a -host specification assigns to one node.
 *
 * Every entry in the list that names the node contributes to the total,
 * so a node listed several times accumulates the slots of all its entries.
 *
 * @param node   node to look up; its name is compared against each entry
 *               and its slots / slots_inuse fields supply the count for
 *               the "*" and "auto" forms
 * @param hosts  comma-separated list of entries, each either "name" or
 *               "name:count"
 *
 * @return the summed slot count for this node, or 0 when no entry names
 *         it or there is nothing to parse
 */
int orte_util_dash_host_compute_slots(orte_node_t *node, char *hosts)
{
    char **specs, *cptr;
    int slots = 0;
    int n;

    /* nothing to match against, so nothing was assigned */
    if (NULL == node || NULL == node->name || NULL == hosts) {
        return 0;
    }

    specs = opal_argv_split(hosts, ',');
    /* guard against the split producing no array (e.g. an empty list) */
    if (NULL == specs) {
        return 0;
    }

    for (n = 0; NULL != specs[n]; n++) {
        /* cut off the optional ":count" suffix BEFORE comparing, so the
         * host name is matched in full - a prefix comparison would let
         * "node1" pick up the slots given to "node10" */
        if (NULL != (cptr = strchr(specs[n], ':'))) {
            *cptr = '\0';
            ++cptr;
        }
        if (0 != strcmp(node->name, specs[n])) {
            continue;
        }

        if (NULL == cptr) {
            /* no count given: the entry is worth a single slot */
            ++slots;
        } else if ('*' == *cptr || 0 == strcmp(cptr, "auto")) {
            /* take whatever the node still has free */
            slots += node->slots - node->slots_inuse;
        } else {
            /* explicit numeric count */
            slots += (int)strtol(cptr, NULL, 10);
        }
    }

    opal_argv_free(specs);
    return slots;
}
/* we can only enter this routine if no other allocation
* was found, so we only need to know that finding any
* relative node syntax should generate an immediate error
@ -289,7 +319,7 @@ int orte_util_add_dash_host_nodes(opal_list_t *nodes,
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), node->name));
if (ORTE_FLAG_TEST(nd, ORTE_NODE_FLAG_SLOTS_GIVEN)) {
/* transfer across the number of slots */
node->slots = nd->slots;
node->slots += nd->slots;
ORTE_FLAG_SET(node, ORTE_NODE_FLAG_SLOTS_GIVEN);
}
break;

Просмотреть файл

@ -9,7 +9,7 @@
* University of Stuttgart. All rights reserved.
* Copyright (c) 2004-2005 The Regents of the University of California.
* All rights reserved.
* Copyright (c) 2014-2015 Intel, Inc. All rights reserved.
* Copyright (c) 2014-2018 Intel, Inc. All rights reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
@ -27,6 +27,7 @@
#include "opal/class/opal_list.h"
#include "orte/runtime/orte_globals.h"
BEGIN_C_DECLS
@ -41,6 +42,8 @@ ORTE_DECLSPEC int orte_util_filter_dash_host_nodes(opal_list_t *nodes,
ORTE_DECLSPEC int orte_util_get_ordered_dash_host_list(opal_list_t *nodes,
char *hosts);
/* Return the number of slots that the comma-separated -host list "hosts"
 * assigns to "node". Each entry naming the node adds to the total: the
 * count following ':' when one is given, the node's unused slots
 * (slots - slots_inuse) when the count is "*" or "auto", and 1 when the
 * entry carries no count. */
ORTE_DECLSPEC int orte_util_dash_host_compute_slots(orte_node_t *node, char *hosts);
END_C_DECLS
#endif