diff --git a/orte/mca/plm/base/plm_base_launch_support.c b/orte/mca/plm/base/plm_base_launch_support.c index a178f42e84..1a5d6e594c 100644 --- a/orte/mca/plm/base/plm_base_launch_support.c +++ b/orte/mca/plm/base/plm_base_launch_support.c @@ -1714,7 +1714,7 @@ int orte_plm_base_setup_virtual_machine(orte_job_t *jdata) OPAL_OUTPUT_VERBOSE((5, orte_plm_base_framework.framework_output, "%s using dash_host", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME))); - if (ORTE_SUCCESS != (rc = orte_util_add_dash_host_nodes(&tnodes, hosts))) { + if (ORTE_SUCCESS != (rc = orte_util_add_dash_host_nodes(&tnodes, hosts, false))) { /* false: not allocating, so relative (+e/+n) entries are expanded from the node pool */ ORTE_ERROR_LOG(rc); free(hosts); return rc; diff --git a/orte/mca/ras/base/ras_base_allocate.c b/orte/mca/ras/base/ras_base_allocate.c index 353d1236f5..60212a4227 100644 --- a/orte/mca/ras/base/ras_base_allocate.c +++ b/orte/mca/ras/base/ras_base_allocate.c @@ -312,7 +312,7 @@ void orte_ras_base_allocate(int fd, short args, void *cbdata) OPAL_OUTPUT_VERBOSE((5, orte_ras_base_framework.framework_output, "%s ras:base:allocate adding dash_hosts", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME))); - if (ORTE_SUCCESS != (rc = orte_util_add_dash_host_nodes(&nodes, hosts))) { + if (ORTE_SUCCESS != (rc = orte_util_add_dash_host_nodes(&nodes, hosts, true))) { /* true: allocating, so relative (+e/+n) entries are ignored */ free(hosts); OBJ_DESTRUCT(&nodes); ORTE_FORCED_TERMINATE(ORTE_ERROR_DEFAULT_EXIT_CODE); @@ -511,7 +511,7 @@ int orte_ras_base_add_hosts(orte_job_t *jdata) opal_output_verbose(5, orte_ras_base_framework.framework_output, "%s ras:base:add_hosts checking add-host %s", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), hosts); - if (ORTE_SUCCESS != (rc = orte_util_add_dash_host_nodes(&nodes, hosts))) { + if (ORTE_SUCCESS != (rc = orte_util_add_dash_host_nodes(&nodes, hosts, true))) { /* true: allocating, so relative (+e/+n) entries are ignored */ ORTE_ERROR_LOG(rc); OBJ_DESTRUCT(&nodes); free(hosts); diff --git a/orte/mca/rmaps/base/rmaps_base_support_fns.c b/orte/mca/rmaps/base/rmaps_base_support_fns.c index 6711643e35..5c41004541 100644 --- a/orte/mca/rmaps/base/rmaps_base_support_fns.c +++ 
b/orte/mca/rmaps/base/rmaps_base_support_fns.c @@ -171,7 +171,7 @@ int orte_rmaps_base_get_target_nodes(opal_list_t *allocated_nodes, orte_std_cntr OPAL_OUTPUT_VERBOSE((5, orte_rmaps_base_framework.framework_output, "%s using dash_host %s", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), hosts)); - if (ORTE_SUCCESS != (rc = orte_util_add_dash_host_nodes(&nodes, hosts))) { + if (ORTE_SUCCESS != (rc = orte_util_add_dash_host_nodes(&nodes, hosts, false))) { /* false: not allocating, so relative (+e/+n) entries are expanded from the node pool */ ORTE_ERROR_LOG(rc); free(hosts); return rc; diff --git a/orte/runtime/orte_globals.c b/orte/runtime/orte_globals.c index c4efb64987..e244cf64bf 100644 --- a/orte/runtime/orte_globals.c +++ b/orte/runtime/orte_globals.c @@ -155,6 +155,7 @@ bool orte_default_hostfile_given = false; char *orte_rankfile = NULL; int orte_num_allocated_nodes = 0; char *orte_node_regex = NULL; +char *orte_default_dash_host = NULL; /* storage for the orte_default_dash_host MCA param */ /* tool communication controls */ bool orte_report_events = false; diff --git a/orte/runtime/orte_globals.h b/orte/runtime/orte_globals.h index cda7d7db65..bfa4cd63af 100644 --- a/orte/runtime/orte_globals.h +++ b/orte/runtime/orte_globals.h @@ -540,6 +540,7 @@ ORTE_DECLSPEC extern bool orte_default_hostfile_given; ORTE_DECLSPEC extern char *orte_rankfile; ORTE_DECLSPEC extern int orte_num_allocated_nodes; ORTE_DECLSPEC extern char *orte_node_regex; +ORTE_DECLSPEC extern char *orte_default_dash_host; /* default -host value applied when no -host option is given */ /* PMI version control */ ORTE_DECLSPEC extern int orted_pmi_version; diff --git a/orte/runtime/orte_mca_params.c b/orte/runtime/orte_mca_params.c index accf21eebf..a469d9c361 100644 --- a/orte/runtime/orte_mca_params.c +++ b/orte/runtime/orte_mca_params.c @@ -362,6 +362,20 @@ int orte_register_params(void) orte_default_hostfile_given = true; } + /* default dash-host */ + orte_default_dash_host = NULL; + (void) mca_base_var_register ("orte", "orte", NULL, "default_dash_host", + "Default -host setting (\"none\" to ignore environmental or default MCA param setting)", + MCA_BASE_VAR_TYPE_STRING, NULL, 0, 0, + OPAL_INFO_LVL_9, 
MCA_BASE_VAR_SCOPE_READONLY, + &orte_default_dash_host); + /* honor the documented none keyword: drop the default so it is never used as a host name. NOTE(review): needs <string.h>/<stdlib.h>, presumably already included by this file - confirm */ + if (NULL != orte_default_dash_host && + 0 == strcmp(orte_default_dash_host, "none")) { + free(orte_default_dash_host); + orte_default_dash_host = NULL; + } + /* regex of nodes in system */ orte_node_regex = NULL; (void) mca_base_var_register ("orte", "orte", NULL, "node_regex", diff --git a/orte/tools/orte-submit/orte-submit.c b/orte/tools/orte-submit/orte-submit.c index 23656b08d5..35f3dc61af 100644 --- a/orte/tools/orte-submit/orte-submit.c +++ b/orte/tools/orte-submit/orte-submit.c @@ -1169,6 +1169,9 @@ static int create_app(int argc, char* argv[], orte_set_attribute(&app->attributes, ORTE_APP_DASH_HOST, ORTE_ATTR_GLOBAL, tval, OPAL_STRING); opal_argv_free(targ); free(tval); + } else if (NULL != orte_default_dash_host) { /* no -host given: fall back to the default, with the same GLOBAL scope as the explicit -host branch above */ + orte_set_attribute(&app->attributes, ORTE_APP_DASH_HOST, ORTE_ATTR_GLOBAL, + orte_default_dash_host, OPAL_STRING); } /* check for bozo error */ diff --git a/orte/tools/orterun/orterun.c b/orte/tools/orterun/orterun.c index 023cdea8ab..8c7acc5e7e 100644 --- a/orte/tools/orterun/orterun.c +++ b/orte/tools/orterun/orterun.c @@ -1586,6 +1586,9 @@ static int create_app(int argc, char* argv[], orte_set_attribute(&app->attributes, ORTE_APP_DASH_HOST, ORTE_ATTR_LOCAL, tval, OPAL_STRING); opal_argv_free(targ); free(tval); + } else if (NULL != orte_default_dash_host) { /* no -host given: fall back to the default, with the same LOCAL scope as the explicit -host branch above */ + orte_set_attribute(&app->attributes, ORTE_APP_DASH_HOST, ORTE_ATTR_LOCAL, + orte_default_dash_host, OPAL_STRING); } /* check for bozo error */ diff --git a/orte/util/dash_host/dash_host.c b/orte/util/dash_host/dash_host.c index f5d7a83683..edd71ef5d9 100644 --- a/orte/util/dash_host/dash_host.c +++ b/orte/util/dash_host/dash_host.c @@ -44,11 +44,11 @@ * relative node syntax should generate an immediate error */ int orte_util_add_dash_host_nodes(opal_list_t *nodes, - char *hosts) + char *hosts, bool allocating) { opal_list_item_t *item, *itm; orte_std_cntr_t i, j, k; - int rc; + int rc, nodeidx; char **host_argv=NULL; char **mapped_nodes = NULL, **mini_map, *ndname; orte_node_t *node, *nd; @@ -59,8 +59,8 @@ int orte_util_add_dash_host_nodes(opal_list_t *nodes, char *cptr; 
OPAL_OUTPUT_VERBOSE((1, orte_ras_base_framework.framework_output, - "%s dashhost: parsing args", - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME))); + "%s dashhost: parsing args %s", + ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), hosts)); OBJ_CONSTRUCT(&adds, opal_list_t); host_argv = opal_argv_split(hosts, ','); @@ -85,6 +85,7 @@ int orte_util_add_dash_host_nodes(opal_list_t *nodes, } } opal_argv_free(host_argv); + mini_map = NULL; /* Did we find anything? If not, then do nothing */ if (NULL == mapped_nodes) { @@ -92,23 +93,96 @@ int orte_util_add_dash_host_nodes(opal_list_t *nodes, goto cleanup; } + for (i = 0; NULL != mapped_nodes[i]; ++i) { + /* if the specified node contains a relative node syntax, + * and we are allocating, then ignore it + */ + if ('+' == mapped_nodes[i][0]) { + if (!allocating) { + if ('e' == mapped_nodes[i][1] || + 'E' == mapped_nodes[i][1]) { + /* request for empty nodes - do they want + * all of them? + */ + if (NULL != (cptr = strchr(mapped_nodes[i], ':'))) { + /* the colon indicates a specific # are requested */ + ++cptr; + j = strtoul(cptr, NULL, 10); + } else if ('\0' != mapped_nodes[0][2]) { + j = strtoul(&mapped_nodes[0][2], NULL, 10); + } else { + /* add them all */ + j = orte_node_pool->size; + } + for (k=0; 0 < j && k < orte_node_pool->size; k++) { + if (NULL != (node = (orte_node_t*)opal_pointer_array_get_item(orte_node_pool, k))) { + if (0 == node->num_procs) { + opal_argv_append_nosize(&mini_map, node->name); + --j; + } + } + } + } else if ('n' == mapped_nodes[i][1] || + 'N' == mapped_nodes[i][1]) { + /* they want a specific relative node #, so + * look it up on global pool + */ + nodeidx = strtol(&mapped_nodes[i][2], NULL, 10); + if (nodeidx < 0 || + nodeidx > (int)orte_node_pool->size) { + /* this is an error */ + orte_show_help("help-dash-host.txt", "dash-host:relative-node-out-of-bounds", + true, nodeidx, mapped_nodes[i]); + rc = ORTE_ERR_SILENT; + goto cleanup; + } + /* if the HNP is not allocated, then we need to + * adjust the index as the 
node pool is offset + * by one + */ + if (!orte_hnp_is_allocated) { + nodeidx++; + } + /* see if that location is filled: look up the (possibly shifted) index in orte_node_pool */ + + if (NULL == (node = (orte_node_t *) opal_pointer_array_get_item(orte_node_pool, nodeidx))) { + /* nothing is stored at that index - report it and bail out */ + orte_show_help("help-dash-host.txt", "dash-host:relative-node-not-found", + true, nodeidx, mapped_nodes[i]); + rc = ORTE_ERR_SILENT; + goto cleanup; + } + /* add the name of this node to the expanded list (mini_map) */ + opal_argv_append_nosize(&mini_map, node->name); + } else { + /* invalid relative node syntax: the leading + was not followed by e/E or n/N */ + orte_show_help("help-dash-host.txt", "dash-host:invalid-relative-node-syntax", + true, mapped_nodes[i]); + rc = ORTE_ERR_SILENT; + goto cleanup; + } + } + } else { + /* no leading +: an ordinary host entry, appended to mini_map unchanged */ + opal_argv_append_nosize(&mini_map, mapped_nodes[i]); + } + } + if (NULL == mini_map) { /* nothing was appended above, so there is nothing to add */ + rc = ORTE_SUCCESS; + goto cleanup; + } + /* go through the names found and add them to the host list. If they're not unique, then bump the slots count for each duplicate */ + for (i=0; NULL != mini_map[i]; i++) { + OPAL_OUTPUT_VERBOSE((1, orte_ras_base_framework.framework_output, + "%s dashhost: working node %s", + ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), mini_map[i])); - for (i = 0; NULL != mapped_nodes[i]; ++i) { - /* if the specified node contains a relative node syntax, - * this is an error - */ - if ('+' == mapped_nodes[i][0]) { - orte_show_help("help-dash-host.txt", "dash-host:relative-syntax", - true, mapped_nodes[i]); - rc = ORTE_ERR_SILENT; - goto cleanup; - } /* see if the node contains the number of slots */ slots_given = false; - if (NULL != (cptr = strchr(mapped_nodes[i], ':'))) { + if (NULL != (cptr = strchr(mini_map[i], ':'))) { *cptr = '\0'; ++cptr; if ('*' == *cptr) { @@ -119,15 +193,11 @@ int orte_util_add_dash_host_nodes(opal_list_t *nodes, slots_given = true; } - OPAL_OUTPUT_VERBOSE((1, orte_ras_base_framework.framework_output, - "%s dashhost: working node %s", - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), mapped_nodes[i])); - /* check for local name 
*/ - if (orte_ifislocal(mapped_nodes[i])) { + if (orte_ifislocal(mini_map[i])) { ndname = orte_process_info.nodename; } else { - ndname = mapped_nodes[i]; + ndname = mini_map[i]; } /* see if the node is already on the list */ @@ -177,6 +247,7 @@ int orte_util_add_dash_host_nodes(opal_list_t *nodes, opal_list_append(&adds, &node->super); } } + opal_argv_free(mini_map); /* every entry has been processed; release the expanded host list */ /* transfer across all unique nodes */ while (NULL != (item = opal_list_remove_first(&adds))) { diff --git a/orte/util/dash_host/dash_host.h b/orte/util/dash_host/dash_host.h index a11e36757e..abbdf505f9 100644 --- a/orte/util/dash_host/dash_host.h +++ b/orte/util/dash_host/dash_host.h @@ -9,7 +9,7 @@ * University of Stuttgart. All rights reserved. * Copyright (c) 2004-2005 The Regents of the University of California. * All rights reserved. - * Copyright (c) 2014 Intel, Inc. All rights reserved. + * Copyright (c) 2014-2015 Intel, Inc. All rights reserved. * $COPYRIGHT$ * * Additional copyrights may follow @@ -31,7 +31,8 @@ BEGIN_C_DECLS ORTE_DECLSPEC int orte_util_add_dash_host_nodes(opal_list_t *nodes, - char *hosts); + char *hosts, + bool allocating); /* true: relative (+e/+n) entries are ignored; false: they are expanded from orte_node_pool */ ORTE_DECLSPEC int orte_util_filter_dash_host_nodes(opal_list_t *nodes, char *hosts,