/*
 * Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
 *                         University Research and Technology
 *                         Corporation.  All rights reserved.
 * Copyright (c) 2004-2008 The University of Tennessee and The University
 *                         of Tennessee Research Foundation.  All rights
 *                         reserved.
 * Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
 *                         University of Stuttgart.  All rights reserved.
 * Copyright (c) 2004-2005 The Regents of the University of California.
 *                         All rights reserved.
 * Copyright (c) 2011-2012 Los Alamos National Security, LLC.  All rights
 *                         reserved.
 * Copyright (c) 2014-2016 Intel, Inc.  All rights reserved.
 * Copyright (c) 2015      Research Organization for Information Science
 *                         and Technology (RIST). All rights reserved.
 * $COPYRIGHT$
 *
 * Additional copyrights may follow
 *
 * $HEADER$
 */
|
|
|
|
|
|
|
|
#include "orte_config.h"
|
2008-02-28 01:57:57 +00:00
|
|
|
#include "orte/constants.h"
|
2005-03-14 20:57:21 +00:00
|
|
|
|
2005-03-29 13:50:15 +00:00
|
|
|
#include <string.h>
|
|
|
|
|
2005-09-20 13:32:08 +00:00
|
|
|
#include "opal/util/argv.h"
|
2008-04-23 14:52:09 +00:00
|
|
|
#include "opal/util/if.h"
|
2005-05-25 16:23:13 +00:00
|
|
|
|
2006-02-12 01:33:29 +00:00
|
|
|
#include "orte/mca/errmgr/errmgr.h"
|
2008-02-28 01:57:57 +00:00
|
|
|
#include "orte/util/name_fns.h"
|
|
|
|
#include "orte/runtime/orte_globals.h"
|
2007-07-02 16:45:40 +00:00
|
|
|
|
2006-09-14 21:29:51 +00:00
|
|
|
#include "orte/mca/ras/base/ras_private.h"
|
2005-03-14 20:57:21 +00:00
|
|
|
|
2008-02-28 01:57:57 +00:00
|
|
|
/*
|
|
|
|
* Add the specified node definitions to the global data store
|
|
|
|
* NOTE: this removes all items from the list!
|
|
|
|
*/
|
|
|
|
int orte_ras_base_node_insert(opal_list_t* nodes, orte_job_t *jdata)
|
|
|
|
{
|
|
|
|
opal_list_item_t* item;
|
|
|
|
orte_std_cntr_t num_nodes;
|
2009-01-15 18:11:50 +00:00
|
|
|
int rc, i;
|
2016-05-29 18:56:18 -07:00
|
|
|
orte_node_t *node, *hnp_node, *nptr;
|
2011-12-01 14:24:43 +00:00
|
|
|
char *ptr;
|
2012-08-31 21:28:49 +00:00
|
|
|
bool hnp_alone = true;
|
2014-06-01 16:14:10 +00:00
|
|
|
orte_attribute_t *kv;
|
|
|
|
char **alias=NULL, **nalias;
|
2008-02-28 01:57:57 +00:00
|
|
|
|
|
|
|
/* get the number of nodes */
|
|
|
|
num_nodes = (orte_std_cntr_t)opal_list_get_size(nodes);
|
|
|
|
if (0 == num_nodes) {
|
|
|
|
return ORTE_SUCCESS; /* nothing to do */
|
|
|
|
}
|
2015-06-23 20:59:57 -07:00
|
|
|
|
2013-03-27 21:14:43 +00:00
|
|
|
OPAL_OUTPUT_VERBOSE((5, orte_ras_base_framework.framework_output,
|
2008-02-28 01:57:57 +00:00
|
|
|
"%s ras:base:node_insert inserting %ld nodes",
|
2009-03-05 21:50:47 +00:00
|
|
|
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
|
2008-02-28 01:57:57 +00:00
|
|
|
(long)num_nodes));
|
2015-06-23 20:59:57 -07:00
|
|
|
|
2016-05-29 18:56:18 -07:00
|
|
|
/* mark the job as being a large-cluster sim if that was requested */
|
|
|
|
if (1 < orte_ras_base.multiplier) {
|
|
|
|
orte_set_attribute(&jdata->attributes, ORTE_JOB_MULTI_DAEMON_SIM,
|
|
|
|
ORTE_ATTR_GLOBAL, NULL, OPAL_BOOL);
|
|
|
|
}
|
|
|
|
|
2008-02-28 01:57:57 +00:00
|
|
|
/* set the size of the global array - this helps minimize time
|
|
|
|
* spent doing realloc's
|
|
|
|
*/
|
2016-05-29 18:56:18 -07:00
|
|
|
if (ORTE_SUCCESS != (rc = opal_pointer_array_set_size(orte_node_pool, num_nodes * orte_ras_base.multiplier))) {
|
2008-02-28 01:57:57 +00:00
|
|
|
ORTE_ERROR_LOG(rc);
|
|
|
|
return rc;
|
|
|
|
}
|
2015-06-23 20:59:57 -07:00
|
|
|
|
2008-02-28 01:57:57 +00:00
|
|
|
/* get the hnp node's info */
|
2010-07-19 18:30:04 +00:00
|
|
|
hnp_node = (orte_node_t*)opal_pointer_array_get_item(orte_node_pool, 0);
|
2012-12-14 17:00:44 +00:00
|
|
|
|
2008-02-28 01:57:57 +00:00
|
|
|
/* cycle through the list */
|
|
|
|
while (NULL != (item = opal_list_remove_first(nodes))) {
|
|
|
|
node = (orte_node_t*)item;
|
2015-06-23 20:59:57 -07:00
|
|
|
|
2008-02-28 01:57:57 +00:00
|
|
|
/* the HNP had to already enter its node on the array - that entry is in the
|
|
|
|
* first position since it is the first one entered. We need to check to see
|
|
|
|
* if this node is the same as the HNP's node so we don't double-enter it
|
|
|
|
*/
|
2015-03-16 16:25:01 -07:00
|
|
|
if (NULL != hnp_node && orte_ifislocal(node->name)) {
|
2013-03-27 21:14:43 +00:00
|
|
|
OPAL_OUTPUT_VERBOSE((5, orte_ras_base_framework.framework_output,
|
2012-08-31 21:28:49 +00:00
|
|
|
"%s ras:base:node_insert updating HNP [%s] info to %ld slots",
|
2009-03-05 21:50:47 +00:00
|
|
|
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
|
2012-08-31 21:28:49 +00:00
|
|
|
node->name,
|
2008-02-28 01:57:57 +00:00
|
|
|
(long)node->slots));
|
2012-12-14 17:00:44 +00:00
|
|
|
|
2008-07-25 17:13:22 +00:00
|
|
|
/* flag that hnp has been allocated */
|
|
|
|
orte_hnp_is_allocated = true;
|
2012-12-14 17:00:44 +00:00
|
|
|
/* update the total slots in the job */
|
2015-04-30 20:33:43 -07:00
|
|
|
orte_ras_base.total_slots_alloc = node->slots;
|
2008-02-28 01:57:57 +00:00
|
|
|
/* copy the allocation data to that node's info */
|
2015-04-30 20:33:43 -07:00
|
|
|
hnp_node->slots = node->slots;
|
2008-02-28 01:57:57 +00:00
|
|
|
hnp_node->slots_max = node->slots_max;
|
2014-06-01 16:14:10 +00:00
|
|
|
/* copy across any attributes */
|
|
|
|
OPAL_LIST_FOREACH(kv, &node->attributes, orte_attribute_t) {
|
|
|
|
orte_set_attribute(&node->attributes, kv->key, ORTE_ATTR_LOCAL, &kv->data, kv->type);
|
|
|
|
}
|
|
|
|
if (orte_managed_allocation || ORTE_FLAG_TEST(node, ORTE_NODE_FLAG_SLOTS_GIVEN)) {
|
2012-09-07 04:08:17 +00:00
|
|
|
/* the slots are always treated as sacred
|
|
|
|
* in managed allocations
|
|
|
|
*/
|
2014-06-01 16:14:10 +00:00
|
|
|
ORTE_FLAG_SET(hnp_node, ORTE_NODE_FLAG_SLOTS_GIVEN);
|
2012-09-07 04:08:17 +00:00
|
|
|
} else {
|
2014-06-01 16:14:10 +00:00
|
|
|
ORTE_FLAG_UNSET(hnp_node, ORTE_NODE_FLAG_SLOTS_GIVEN);
|
2012-09-07 04:08:17 +00:00
|
|
|
}
|
2008-04-23 17:00:35 +00:00
|
|
|
/* use the local name for our node - don't trust what
|
2009-01-15 18:11:50 +00:00
|
|
|
* we got from an RM. If requested, store the resolved
|
2008-11-24 19:57:08 +00:00
|
|
|
* nodename info
|
2008-04-23 17:00:35 +00:00
|
|
|
*/
|
2009-01-15 18:11:50 +00:00
|
|
|
if (orte_show_resolved_nodenames) {
|
|
|
|
/* if the node name is different, store it as an alias */
|
|
|
|
if (0 != strcmp(node->name, hnp_node->name)) {
|
2014-06-01 16:14:10 +00:00
|
|
|
/* get any current list of aliases */
|
|
|
|
ptr = NULL;
|
|
|
|
orte_get_attribute(&hnp_node->attributes, ORTE_NODE_ALIAS, (void**)&ptr, OPAL_STRING);
|
|
|
|
if (NULL != ptr) {
|
|
|
|
alias = opal_argv_split(ptr, ',');
|
|
|
|
free(ptr);
|
|
|
|
}
|
2009-01-15 18:11:50 +00:00
|
|
|
/* add to list of aliases for this node - only add if unique */
|
2014-06-01 16:14:10 +00:00
|
|
|
opal_argv_append_unique_nosize(&alias, node->name, false);
|
2009-01-15 18:11:50 +00:00
|
|
|
}
|
2014-06-01 16:14:10 +00:00
|
|
|
if (orte_get_attribute(&node->attributes, ORTE_NODE_ALIAS, (void**)&ptr, OPAL_STRING)) {
|
|
|
|
nalias = opal_argv_split(ptr, ',');
|
2009-01-15 18:11:50 +00:00
|
|
|
/* now copy over any aliases that are unique */
|
2014-06-01 16:14:10 +00:00
|
|
|
for (i=0; NULL != nalias[i]; i++) {
|
|
|
|
opal_argv_append_unique_nosize(&alias, nalias[i], false);
|
2009-01-15 18:11:50 +00:00
|
|
|
}
|
2014-06-01 16:14:10 +00:00
|
|
|
opal_argv_free(nalias);
|
2008-11-24 19:57:08 +00:00
|
|
|
}
|
2014-06-01 16:14:10 +00:00
|
|
|
/* and store the result */
|
|
|
|
ptr = opal_argv_join(alias, ',');
|
|
|
|
opal_argv_free(alias);
|
|
|
|
orte_set_attribute(&hnp_node->attributes, ORTE_NODE_ALIAS, ORTE_ATTR_LOCAL, ptr, OPAL_STRING);
|
|
|
|
free(ptr);
|
2008-11-24 19:57:08 +00:00
|
|
|
}
|
2008-02-28 01:57:57 +00:00
|
|
|
/* don't keep duplicate copy */
|
|
|
|
OBJ_RELEASE(node);
|
2016-05-29 18:56:18 -07:00
|
|
|
/* create copies, if required */
|
|
|
|
for (i=1; i < orte_ras_base.multiplier; i++) {
|
|
|
|
opal_dss.copy((void**)&node, hnp_node, ORTE_NODE);
|
|
|
|
ORTE_FLAG_UNSET(node, ORTE_NODE_FLAG_DAEMON_LAUNCHED);
|
|
|
|
node->index = opal_pointer_array_add(orte_node_pool, node);
|
|
|
|
}
|
2008-02-28 01:57:57 +00:00
|
|
|
} else {
|
|
|
|
/* insert the object onto the orte_nodes global array */
|
2013-03-27 21:14:43 +00:00
|
|
|
OPAL_OUTPUT_VERBOSE((5, orte_ras_base_framework.framework_output,
|
2014-01-02 16:07:16 +00:00
|
|
|
"%s ras:base:node_insert node %s slots %d",
|
2009-03-05 21:50:47 +00:00
|
|
|
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
|
2014-01-02 16:07:16 +00:00
|
|
|
(NULL == node->name) ? "NULL" : node->name,
|
|
|
|
node->slots));
|
2012-09-07 04:08:17 +00:00
|
|
|
if (orte_managed_allocation) {
|
|
|
|
/* the slots are always treated as sacred
|
|
|
|
* in managed allocations
|
|
|
|
*/
|
2014-06-01 16:14:10 +00:00
|
|
|
ORTE_FLAG_SET(node, ORTE_NODE_FLAG_SLOTS_GIVEN);
|
2012-09-07 04:08:17 +00:00
|
|
|
}
|
2008-07-25 13:35:12 +00:00
|
|
|
/* insert it into the array */
|
2008-02-28 05:32:23 +00:00
|
|
|
node->index = opal_pointer_array_add(orte_node_pool, (void*)node);
|
|
|
|
if (ORTE_SUCCESS > (rc = node->index)) {
|
2008-02-28 01:57:57 +00:00
|
|
|
ORTE_ERROR_LOG(rc);
|
|
|
|
return rc;
|
|
|
|
}
|
|
|
|
/* update the total slots in the job */
|
2012-09-20 02:50:14 +00:00
|
|
|
orte_ras_base.total_slots_alloc += node->slots;
|
2011-12-01 14:24:43 +00:00
|
|
|
/* check if we have fqdn names in the allocation */
|
|
|
|
if (NULL != strchr(node->name, '.')) {
|
|
|
|
orte_have_fqdn_allocation = true;
|
|
|
|
}
|
2012-08-31 21:28:49 +00:00
|
|
|
/* indicate the HNP is not alone */
|
|
|
|
hnp_alone = false;
|
2016-05-29 18:56:18 -07:00
|
|
|
for (i=1; i < orte_ras_base.multiplier; i++) {
|
|
|
|
opal_dss.copy((void**)&nptr, node, ORTE_NODE);
|
|
|
|
nptr->index = opal_pointer_array_add(orte_node_pool, nptr);
|
|
|
|
}
|
|
|
|
}
|
2011-12-01 14:24:43 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
/* if we didn't find any fqdn names in the allocation, then
|
|
|
|
* ensure we don't have any domain info in the node record
|
|
|
|
* for the hnp
|
|
|
|
*/
|
2012-08-31 21:28:49 +00:00
|
|
|
if (!orte_have_fqdn_allocation && !hnp_alone) {
|
2011-12-01 14:24:43 +00:00
|
|
|
if (NULL != (ptr = strchr(hnp_node->name, '.'))) {
|
|
|
|
*ptr = '\0';
|
2008-02-28 01:57:57 +00:00
|
|
|
}
|
|
|
|
}
|
2015-06-23 20:59:57 -07:00
|
|
|
|
2008-02-28 01:57:57 +00:00
|
|
|
return ORTE_SUCCESS;
|
|
|
|
}
|