From d772e0fc3d2afa356b680f6e4e267f70b45503b9 Mon Sep 17 00:00:00 2001 From: Ralph Castain Date: Wed, 5 Sep 2012 18:42:09 +0000 Subject: [PATCH] Add an option to treat dash-host specifications as "requested, but not required". So-called "soft" location requests can allow an application to execute even if the ideal allocation isn't available. This commit was SVN r27242. --- orte/mca/ras/base/ras_base_allocate.c | 9 ++++- orte/mca/rmaps/base/rmaps_base_support_fns.c | 8 ++-- orte/mca/rmaps/staged/rmaps_staged.c | 42 +++++++++++++++++++- orte/mca/rmaps/staged/rmaps_staged.h | 1 - orte/runtime/orte_globals.c | 1 + orte/runtime/orte_globals.h | 1 + orte/runtime/orte_mca_params.c | 8 ++++ 7 files changed, 64 insertions(+), 6 deletions(-) diff --git a/orte/mca/ras/base/ras_base_allocate.c b/orte/mca/ras/base/ras_base_allocate.c index 498fed3ab6..647a86c4e9 100644 --- a/orte/mca/ras/base/ras_base_allocate.c +++ b/orte/mca/ras/base/ras_base_allocate.c @@ -233,7 +233,14 @@ void orte_ras_base_allocate(int fd, short args, void *cbdata) OBJ_RELEASE(caddy); return; } - } else if (NULL != app->dash_host) { + } else if (!orte_soft_locations && NULL != app->dash_host) { + /* if we are using soft locations, then any dash-host entries + * name desired nodes rather than required ones. 
We don't want + * to pick them up here as this would mean the request was + * always satisfied - instead, we want to allow the request + * to fail later on and use whatever nodes are actually + * available + */ OPAL_OUTPUT_VERBOSE((5, orte_ras_base.ras_output, "%s ras:base:allocate adding dash_hosts", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME))); diff --git a/orte/mca/rmaps/base/rmaps_base_support_fns.c b/orte/mca/rmaps/base/rmaps_base_support_fns.c index 18837095bb..b458f06ccc 100644 --- a/orte/mca/rmaps/base/rmaps_base_support_fns.c +++ b/orte/mca/rmaps/base/rmaps_base_support_fns.c @@ -147,8 +147,10 @@ int orte_rmaps_base_get_target_nodes(opal_list_t *allocated_nodes, orte_std_cntr */ if (!orte_managed_allocation) { OBJ_CONSTRUCT(&nodes, opal_list_t); - /* if the app provided a dash-host, then use those nodes */ - if (NULL != app->dash_host) { + /* if the app provided a dash-host, and we are not treating + * them as requested or "soft" locations, then use those nodes + */ + if (!orte_soft_locations && NULL != app->dash_host) { OPAL_OUTPUT_VERBOSE((5, orte_rmaps_base.rmaps_output, "%s using dash_host", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME))); @@ -441,7 +443,7 @@ int orte_rmaps_base_get_target_nodes(opal_list_t *allocated_nodes, orte_std_cntr OPAL_OUTPUT_VERBOSE((5, orte_rmaps_base.rmaps_output, "%s Filtering thru apps", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME))); - + if (ORTE_SUCCESS != (rc = orte_rmaps_base_filter_nodes(app, allocated_nodes, true)) && ORTE_ERR_TAKE_NEXT_OPTION != rc) { ORTE_ERROR_LOG(rc); diff --git a/orte/mca/rmaps/staged/rmaps_staged.c b/orte/mca/rmaps/staged/rmaps_staged.c index 6080967e06..c9232b6892 100644 --- a/orte/mca/rmaps/staged/rmaps_staged.c +++ b/orte/mca/rmaps/staged/rmaps_staged.c @@ -41,7 +41,7 @@ orte_rmaps_base_module_t orte_rmaps_staged_module = { static int staged_mapper(orte_job_t *jdata) { mca_base_component_t *c=&mca_rmaps_staged_component.base_version; - int i, j, rc; + int i, j, k, rc; orte_app_context_t *app; opal_list_t node_list; 
orte_std_cntr_t num_slots; @@ -49,6 +49,7 @@ static int staged_mapper(orte_job_t *jdata) orte_node_t *node; bool work_to_do = false, first_pass = false; opal_list_item_t *item; + char *cptr, **minimap; /* only use this mapper if it was specified */ if (NULL == jdata->map->req_mapper || @@ -101,8 +102,12 @@ static int staged_mapper(orte_job_t *jdata) * -hostfile or -host directives */ OBJ_CONSTRUCT(&node_list, opal_list_t); + /* get nodes based on a strict interpretation of the location hints */ if (ORTE_SUCCESS != (rc = orte_rmaps_base_get_target_nodes(&node_list, &num_slots, app, jdata->map->mapping, false, true))) { + /* we were unable to get any nodes that match those + * specified in the app + */ if (ORTE_ERR_RESOURCE_BUSY == rc) { /* if the return is "busy", then at least one of the * specified resources must exist, but no slots are @@ -127,6 +132,41 @@ static int staged_mapper(orte_job_t *jdata) return rc; } } + /* if we are using soft locations, search the list of nodes + * for those that match the requested locations and bubble those + * to the top so we use them first + */ + if (orte_soft_locations && NULL != app->dash_host) { + /* scan the dash hosts in reverse order as we want + * the first entry to be on top of the list + */ + opal_output_verbose(5, orte_rmaps_base.rmaps_output, + "%s mca:rmaps:staged: ordering nodes by desired location", + ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)); + for (j=opal_argv_count(app->dash_host)-1; 0 <= j; j--) { + minimap = opal_argv_split(app->dash_host[j], ','); + for (k=opal_argv_count(minimap)-1; 0 <= k; k--) { + cptr = minimap[k]; + for (item = opal_list_get_first(&node_list); + item != opal_list_get_end(&node_list); + item = opal_list_get_next(item)) { + node = (orte_node_t*)item; + if (0 == strcmp(node->name, cptr) || + (0 == strcmp("localhost", cptr) && + 0 == strcmp(node->name, orte_process_info.nodename))) { + opal_list_remove_item(&node_list, item); + opal_list_prepend(&node_list, item); + opal_output_verbose(10, 
orte_rmaps_base.rmaps_output, + "%s mca:rmaps:staged: placing node %s at top of list", + ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), + node->name); + break; + } + } + } + opal_argv_free(minimap); + } + } /* assign any unmapped procs to an available slot */ for (j=0; j < app->procs.size; j++) { diff --git a/orte/mca/rmaps/staged/rmaps_staged.h b/orte/mca/rmaps/staged/rmaps_staged.h index 9c81886b24..38ee8528cc 100644 --- a/orte/mca/rmaps/staged/rmaps_staged.h +++ b/orte/mca/rmaps/staged/rmaps_staged.h @@ -20,7 +20,6 @@ BEGIN_C_DECLS ORTE_MODULE_DECLSPEC extern orte_rmaps_base_component_t mca_rmaps_staged_component; extern orte_rmaps_base_module_t orte_rmaps_staged_module; - END_C_DECLS #endif diff --git a/orte/runtime/orte_globals.c b/orte/runtime/orte_globals.c index a45fdab928..d9da1e1a45 100644 --- a/orte/runtime/orte_globals.c +++ b/orte/runtime/orte_globals.c @@ -98,6 +98,7 @@ bool orte_managed_allocation = false; char *orte_set_slots = NULL; bool orte_display_allocation; bool orte_display_devel_allocation; +bool orte_soft_locations = false; /* launch agents */ char *orte_launch_agent = NULL; diff --git a/orte/runtime/orte_globals.h b/orte/runtime/orte_globals.h index 9aaac21b90..55684c0194 100644 --- a/orte/runtime/orte_globals.h +++ b/orte/runtime/orte_globals.h @@ -617,6 +617,7 @@ ORTE_DECLSPEC extern bool orte_managed_allocation; ORTE_DECLSPEC extern char *orte_set_slots; ORTE_DECLSPEC extern bool orte_display_allocation; ORTE_DECLSPEC extern bool orte_display_devel_allocation; +ORTE_DECLSPEC extern bool orte_soft_locations; /* launch agents */ ORTE_DECLSPEC extern char *orte_launch_agent; diff --git a/orte/runtime/orte_mca_params.c b/orte/runtime/orte_mca_params.c index aeb629ca19..515b20abe3 100644 --- a/orte/runtime/orte_mca_params.c +++ b/orte/runtime/orte_mca_params.c @@ -576,6 +576,14 @@ int orte_register_params(void) orte_devel_level_output = true; } + /* should we treat any -host directives as "soft" - i.e., desired + * but not required + */ + 
mca_base_param_reg_int_name("orte", "soft_locations", + "Treat -host directives as desired, but not required", + false, false, (int)false, &value); + orte_soft_locations = OPAL_INT_TO_BOOL(value); + #endif /* ORTE_DISABLE_FULL_SUPPORT */ return ORTE_SUCCESS;