Fix -H operations for multi-app case
Correctly aggregate slots across the -H entries from each app. When no nprocs value is given, take any -H entries into account when computing it. Signed-off-by: Ralph Castain <rhc@open-mpi.org>
Этот коммит содержится в:
родитель
5f767e1a54
Коммит
f5a6b7f1e9
@ -49,7 +49,7 @@ void orte_rmaps_base_map_job(int fd, short args, void *cbdata)
|
||||
orte_job_t *jdata;
|
||||
orte_node_t *node;
|
||||
int rc, i, ppx = 0;
|
||||
bool did_map, given, pernode = false, persocket = false;
|
||||
bool did_map, pernode = false, persocket = false;
|
||||
orte_rmaps_base_selected_module_t *mod;
|
||||
orte_job_t *parent;
|
||||
orte_vpid_t nprocs;
|
||||
@ -105,7 +105,6 @@ void orte_rmaps_base_map_job(int fd, short args, void *cbdata)
|
||||
orte_std_cntr_t slots;
|
||||
OBJ_CONSTRUCT(&nodes, opal_list_t);
|
||||
orte_rmaps_base_get_target_nodes(&nodes, &slots, app, ORTE_MAPPING_BYNODE, true, true);
|
||||
slots = 0;
|
||||
if (pernode) {
|
||||
slots = ppx * opal_list_get_size(&nodes);
|
||||
} else if (persocket) {
|
||||
@ -115,34 +114,6 @@ void orte_rmaps_base_map_job(int fd, short args, void *cbdata)
|
||||
HWLOC_OBJ_SOCKET, 0,
|
||||
OPAL_HWLOC_AVAILABLE);
|
||||
}
|
||||
} else {
|
||||
/* if we are in a managed allocation, then all is good - otherwise,
|
||||
* we have to do a little more checking */
|
||||
if (!orte_managed_allocation) {
|
||||
/* if all the nodes have their slots given, then we are okay */
|
||||
given = true;
|
||||
OPAL_LIST_FOREACH(node, &nodes, orte_node_t) {
|
||||
if (!ORTE_FLAG_TEST(node, ORTE_NODE_FLAG_SLOTS_GIVEN)) {
|
||||
given = false;
|
||||
break;
|
||||
}
|
||||
}
|
||||
/* if -host or -hostfile was given, and the slots were not,
|
||||
* then this is no longer allowed */
|
||||
if (!given &&
|
||||
(orte_get_attribute(&app->attributes, ORTE_APP_DASH_HOST, NULL, OPAL_STRING) ||
|
||||
orte_get_attribute(&app->attributes, ORTE_APP_HOSTFILE, NULL, OPAL_STRING))) {
|
||||
/* inform the user of the error */
|
||||
orte_show_help("help-orte-rmaps-base.txt", "num-procs-not-specified", true);
|
||||
OPAL_LIST_DESTRUCT(&nodes);
|
||||
OBJ_RELEASE(caddy);
|
||||
ORTE_ACTIVATE_JOB_STATE(jdata, ORTE_JOB_STATE_MAP_FAILED);
|
||||
return;
|
||||
}
|
||||
OPAL_LIST_FOREACH(node, &nodes, orte_node_t) {
|
||||
slots += node->slots;
|
||||
}
|
||||
}
|
||||
}
|
||||
app->num_procs = slots;
|
||||
OPAL_LIST_DESTRUCT(&nodes);
|
||||
|
@ -469,12 +469,19 @@ int orte_rmaps_base_get_target_nodes(opal_list_t *allocated_nodes, orte_std_cntr
|
||||
continue;
|
||||
}
|
||||
if (node->slots > node->slots_inuse) {
|
||||
orte_std_cntr_t s;
|
||||
/* check for any -host allocations */
|
||||
if (orte_get_attribute(&app->attributes, ORTE_APP_DASH_HOST, (void**)&hosts, OPAL_STRING)) {
|
||||
s = orte_util_dash_host_compute_slots(node, hosts);
|
||||
} else {
|
||||
s = node->slots - node->slots_inuse;
|
||||
}
|
||||
/* add the available slots */
|
||||
OPAL_OUTPUT_VERBOSE((5, orte_rmaps_base_framework.framework_output,
|
||||
"%s node %s has %d slots available",
|
||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
|
||||
node->name, node->slots - node->slots_inuse));
|
||||
num_slots += node->slots - node->slots_inuse;
|
||||
node->name, s));
|
||||
num_slots += s;
|
||||
continue;
|
||||
}
|
||||
if (!(ORTE_MAPPING_NO_OVERSUBSCRIBE & ORTE_GET_MAPPING_DIRECTIVE(policy))) {
|
||||
|
@ -10,7 +10,7 @@
|
||||
* Copyright (c) 2004-2005 The Regents of the University of California.
|
||||
* All rights reserved.
|
||||
* Copyright (c) 2013 Cisco Systems, Inc. All rights reserved.
|
||||
* Copyright (c) 2014-2017 Intel, Inc. All rights reserved.
|
||||
* Copyright (c) 2014-2018 Intel, Inc. All rights reserved.
|
||||
* Copyright (c) 2015 Research Organization for Information Science
|
||||
* and Technology (RIST). All rights reserved.
|
||||
* Copyright (c) 2016 IBM Corporation. All rights reserved.
|
||||
@ -42,6 +42,36 @@
|
||||
|
||||
#include "dash_host.h"
|
||||
|
||||
/**
 * Compute how many slots a -host (-H) specification assigns to a node.
 *
 * @param node   Node to look up; its name is compared against every entry
 *               in the list. For ":*" / ":auto" entries the node's
 *               currently free slots (slots - slots_inuse) are used.
 * @param hosts  Comma-separated list of "name[:count]" entries, where
 *               count is a positive integer, "*" or "auto".
 *
 * @return Sum of the slots contributed by every entry naming this node:
 *         the explicit count when one is given, the node's free slots for
 *         "*" / "auto", and one slot for a bare host name. Returns 0 when
 *         the node does not appear in the list or when an argument is
 *         NULL.
 *
 * NOTE(review): the list is parsed through the array returned by
 * opal_argv_split(); this presumably is a private copy so that `hosts`
 * itself is left untouched - confirm against opal_argv_split().
 */
int orte_util_dash_host_compute_slots(orte_node_t *node, char *hosts)
{
    char **specs, *cptr;
    int slots = 0;
    int n;
    long count;

    /* nothing can match without a node name or a host list */
    if (NULL == node || NULL == node->name || NULL == hosts) {
        return 0;
    }

    specs = opal_argv_split(hosts, ',');
    if (NULL == specs) {
        return 0;
    }

    /* see if this node appears in the list */
    for (n = 0; NULL != specs[n]; n++) {
        /* separate the optional ":count" suffix BEFORE comparing names so
         * that the comparison sees the bare host name only */
        cptr = strchr(specs[n], ':');
        if (NULL != cptr) {
            *cptr = '\0';
            ++cptr;
        }

        /* require an exact match - a prefix comparison would let node
         * "node1" pick up the slots of a "node10" entry */
        if (0 != strcmp(node->name, specs[n])) {
            continue;
        }

        if (NULL == cptr) {
            /* bare host name: each occurrence counts as one slot */
            ++slots;
        } else if ('*' == *cptr || 0 == strcmp(cptr, "auto")) {
            /* use whatever is still free on the node */
            slots += node->slots - node->slots_inuse;
        } else {
            /* explicit count: a non-numeric or non-positive value
             * contributes nothing instead of reducing the total */
            count = strtol(cptr, NULL, 10);
            if (0 < count) {
                slots += (int)count;
            }
        }
    }

    opal_argv_free(specs);
    return slots;
}
|
||||
|
||||
/* we can only enter this routine if no other allocation
|
||||
* was found, so we only need to know that finding any
|
||||
* relative node syntax should generate an immediate error
|
||||
@ -289,7 +319,7 @@ int orte_util_add_dash_host_nodes(opal_list_t *nodes,
|
||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), node->name));
|
||||
if (ORTE_FLAG_TEST(nd, ORTE_NODE_FLAG_SLOTS_GIVEN)) {
|
||||
/* transfer across the number of slots */
|
||||
node->slots = nd->slots;
|
||||
node->slots += nd->slots;
|
||||
ORTE_FLAG_SET(node, ORTE_NODE_FLAG_SLOTS_GIVEN);
|
||||
}
|
||||
break;
|
||||
|
@ -9,7 +9,7 @@
|
||||
* University of Stuttgart. All rights reserved.
|
||||
* Copyright (c) 2004-2005 The Regents of the University of California.
|
||||
* All rights reserved.
|
||||
* Copyright (c) 2014-2015 Intel, Inc. All rights reserved.
|
||||
* Copyright (c) 2014-2018 Intel, Inc. All rights reserved.
|
||||
* $COPYRIGHT$
|
||||
*
|
||||
* Additional copyrights may follow
|
||||
@ -27,6 +27,7 @@
|
||||
|
||||
#include "opal/class/opal_list.h"
|
||||
|
||||
#include "orte/runtime/orte_globals.h"
|
||||
|
||||
BEGIN_C_DECLS
|
||||
|
||||
@ -41,6 +42,8 @@ ORTE_DECLSPEC int orte_util_filter_dash_host_nodes(opal_list_t *nodes,
|
||||
ORTE_DECLSPEC int orte_util_get_ordered_dash_host_list(opal_list_t *nodes,
|
||||
char *hosts);
|
||||
|
||||
/* Compute the number of slots that the comma-separated -host list
 * "hosts" assigns to "node". Every entry that refers to the node
 * contributes its explicit ":N" count, the node's free slots
 * (slots - slots_inuse) for ":*" or ":auto", or a single slot when no
 * count is given. Returns the accumulated total (0 if no entry refers
 * to the node). */
ORTE_DECLSPEC int orte_util_dash_host_compute_slots(orte_node_t *node, char *hosts);
|
||||
|
||||
END_C_DECLS
|
||||
|
||||
#endif
|
||||
|
Загрузка…
x
Ссылка в новой задаче
Block a user