From 5120e6aec320e3958fdee3e87d31ffeba17ff651 Mon Sep 17 00:00:00 2001
From: Ralph Castain
Date: Tue, 15 Feb 2011 23:24:31 +0000
Subject: [PATCH] Redefine the rmaps framework to allow multiple mapper modules to be active at the same time.

This allows users to map the primary job one way, and map any comm_spawn'd
job in a different way. Modules are given the opportunity to map a job in
priority order, with the round-robin mapper having the highest default
priority. The priority of each module can be defined using an MCA param.

When called, each mapper checks to see if it can map the job. If npernode
is provided, for example, then the loadbalance mapper accepts the assignment
and performs the operation - all mappers before it will "pass" as they can't
map npernode requests.

Also remove the stale and never-completed topo mapper.

This commit was SVN r24393.
---
 orte/include/orte/constants.h                 | 4 +-
 orte/mca/plm/base/plm_base_launch_support.c   | 3 +
 orte/mca/rmaps/base/base.h                    | 16 +-
 orte/mca/rmaps/base/help-orte-rmaps-base.txt  | 7 +
 orte/mca/rmaps/base/rmaps_base_close.c        | 8 +
 orte/mca/rmaps/base/rmaps_base_map_job.c      | 40 +-
 orte/mca/rmaps/base/rmaps_base_open.c         | 18 +-
 orte/mca/rmaps/base/rmaps_base_select.c       | 97 +-
 orte/mca/rmaps/base/rmaps_base_support_fns.c  | 49 +-
 orte/mca/rmaps/load_balance/rmaps_lb.c        | 24 +-
 .../rmaps/load_balance/rmaps_lb_component.c   | 29 +-
 orte/mca/rmaps/rank_file/rmaps_rank_file.c    | 19 +
 orte/mca/rmaps/rank_file/rmaps_rank_file.h    | 9 +-
 .../rank_file/rmaps_rank_file_component.c     | 36 +-
 orte/mca/rmaps/resilient/rmaps_resilient.c    | 1010 ++++++++++-------
 .../resilient/rmaps_resilient_component.c     | 16 +-
 orte/mca/rmaps/rmaps_types.h                  | 11 +
 orte/mca/rmaps/round_robin/rmaps_rr.c         | 28 +-
 .../rmaps/round_robin/rmaps_rr_component.c    | 9 +-
 orte/mca/rmaps/seq/rmaps_seq.c                | 22 +
 orte/mca/rmaps/seq/rmaps_seq_component.c      | 13 +-
 orte/mca/rmaps/topo/.windows                  | 12 -
 orte/mca/rmaps/topo/Makefile.am               | 48 -
 orte/mca/rmaps/topo/help-orte-rmaps-topo.txt  | 53 -
 orte/mca/rmaps/topo/rmaps_topo.c              | 546 ---------
 orte/mca/rmaps/topo/rmaps_topo.h              | 37 -
 orte/mca/rmaps/topo/rmaps_topo_component.c    | 85 --
 .../data_type_support/orte_dt_packing_fns.c   | 6 +
 .../data_type_support/orte_dt_print_fns.c     | 4 +-
 .../data_type_support/orte_dt_unpacking_fns.c | 8 +
 orte/runtime/orte_globals.c                   | 1 +
 orte/util/error_strings.c                     | 11 +-
 32 files changed, 936 insertions(+), 1343 deletions(-)
 delete mode 100644 orte/mca/rmaps/topo/.windows
 delete mode 100644 orte/mca/rmaps/topo/Makefile.am
 delete mode 100644 orte/mca/rmaps/topo/help-orte-rmaps-topo.txt
 delete mode 100644 orte/mca/rmaps/topo/rmaps_topo.c
 delete mode 100644 orte/mca/rmaps/topo/rmaps_topo.h
 delete mode 100644 orte/mca/rmaps/topo/rmaps_topo_component.c

diff --git a/orte/include/orte/constants.h b/orte/include/orte/constants.h
index 8994dae31d..9ad49df2e5 100644
--- a/orte/include/orte/constants.h
+++ b/orte/include/orte/constants.h
@@ -120,7 +120,9 @@ enum {
     ORTE_ERR_PROC_STALLED = (ORTE_ERR_BASE - 38),
     ORTE_ERR_NO_APP_SPECIFIED = (ORTE_ERR_BASE - 39),
     ORTE_ERR_NO_EXE_SPECIFIED = (ORTE_ERR_BASE - 40),
-    ORTE_ERR_COMM_DISABLED = (ORTE_ERR_BASE - 41)
+    ORTE_ERR_COMM_DISABLED = (ORTE_ERR_BASE - 41),
+    ORTE_ERR_FAILED_TO_MAP = (ORTE_ERR_BASE - 42),
+    ORTE_ERR_TAKE_NEXT_OPTION = (ORTE_ERR_BASE - 43)
 };

 #define ORTE_ERR_MAX (ORTE_ERR_BASE - 100)

diff --git a/orte/mca/plm/base/plm_base_launch_support.c b/orte/mca/plm/base/plm_base_launch_support.c
index 7244e72921..9a5a4545db 100644
--- a/orte/mca/plm/base/plm_base_launch_support.c
+++ 
b/orte/mca/plm/base/plm_base_launch_support.c @@ -93,6 +93,9 @@ int orte_plm_base_setup_job(orte_job_t *jdata) ljob = ORTE_LOCAL_JOBID(jdata->jobid); opal_pointer_array_set_item(orte_job_data, ljob, jdata); + /* set the job state */ + jdata->state = ORTE_JOB_STATE_INIT; + /* if job recovery is not defined, set it to default */ if (!jdata->recovery_defined) { /* set to system default */ diff --git a/orte/mca/rmaps/base/base.h b/orte/mca/rmaps/base/base.h index af26d1d877..dce543c88d 100644 --- a/orte/mca/rmaps/base/base.h +++ b/orte/mca/rmaps/base/base.h @@ -56,8 +56,10 @@ typedef struct { int rmaps_output; /** List of available components */ opal_list_t available_components; - /** selected module */ - orte_rmaps_base_module_t *active_module; + /* list of selected modules */ + opal_list_t selected_modules; + /* desired default mapper */ + int default_mapper; /** whether or not we allow oversubscription of nodes */ bool oversubscribe; /** number of ppn for n_per_node mode */ @@ -74,8 +76,6 @@ typedef struct { bool no_use_local; /* display the map after it is computed */ bool display_map; - /* balance load across nodes */ - bool loadbalance; /* slot list, if provided by user */ char *slot_list; } orte_rmaps_base_t; @@ -88,6 +88,14 @@ ORTE_DECLSPEC extern orte_rmaps_base_t orte_rmaps_base; /** * Select an rmaps component / module */ +typedef struct { + opal_list_item_t super; + int pri; + orte_rmaps_base_module_t *module; + mca_base_component_t *component; +} orte_rmaps_base_selected_module_t; +OBJ_CLASS_DECLARATION(orte_rmaps_base_selected_module_t); + ORTE_DECLSPEC int orte_rmaps_base_select(void); /** diff --git a/orte/mca/rmaps/base/help-orte-rmaps-base.txt b/orte/mca/rmaps/base/help-orte-rmaps-base.txt index 839998870a..70182e2d31 100644 --- a/orte/mca/rmaps/base/help-orte-rmaps-base.txt +++ b/orte/mca/rmaps/base/help-orte-rmaps-base.txt @@ -87,3 +87,10 @@ are cpus in a socket: #cpus/socket: %d Please correct one or both of these values and try again. +# +[failed-map] +Your job failed to map. Either no mapper was available, or none +of the available mappers was able to perform the requested +mapping operation. This can happen if you request a map type +(e.g., loadbalance) and the corresponding mapper was not built. 
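To make the new control flow concrete: the rmaps base now keeps the selected mapper modules in a priority-ordered list and offers each job to them in turn; a mapper that does not handle the request returns ORTE_ERR_TAKE_NEXT_OPTION, and the failed-map message above is shown only if every mapper declines. The following self-contained sketch models that cascade in plain C. It is not ORTE code - all names and constants in it are illustrative stand-ins for the types and return codes used in this patch.

#include <stdio.h>

/* simplified stand-ins for the ORTE return codes used by this patch */
#define MY_SUCCESS           0
#define MY_TAKE_NEXT_OPTION  1   /* mapper declined; try the next one */
#define MY_FAILED_TO_MAP     2   /* no mapper accepted the job */

/* a job request; only the fields needed for the illustration */
typedef struct {
    int npernode;     /* >0 means "N procs per node" was requested */
    int use_rankfile; /* nonzero means a rankfile was given */
} job_t;

/* each "mapper" gets a chance to map the job, in priority order */
typedef struct {
    const char *name;
    int priority;
    int (*map_job)(const job_t *job);
} mapper_t;

/* round-robin style mapper: passes on npernode/rankfile requests */
static int rr_map(const job_t *job)
{
    if (job->npernode > 0 || job->use_rankfile) {
        return MY_TAKE_NEXT_OPTION;
    }
    printf("round_robin mapped the job\n");
    return MY_SUCCESS;
}

/* loadbalance style mapper: only handles npernode-type requests */
static int lb_map(const job_t *job)
{
    if (job->npernode <= 0) {
        return MY_TAKE_NEXT_OPTION;
    }
    printf("loadbalance mapped the job (%d procs/node)\n", job->npernode);
    return MY_SUCCESS;
}

/* offer the job to each mapper in descending priority order */
static int map_job(const job_t *job, mapper_t *mappers, int n)
{
    for (int i = 0; i < n; i++) {
        int rc = mappers[i].map_job(job);
        if (MY_SUCCESS == rc) {
            return MY_SUCCESS;
        }
        if (MY_TAKE_NEXT_OPTION != rc) {
            return rc;  /* a true error - stop immediately */
        }
    }
    fprintf(stderr, "failed-map: no mapper accepted the job\n");
    return MY_FAILED_TO_MAP;
}

int main(void)
{
    /* already sorted by priority, as the real select logic arranges modules */
    mapper_t mappers[] = {
        { "round_robin", 100, rr_map },
        { "loadbalance",  80, lb_map },
    };
    job_t plain = { 0, 0 };
    job_t nper  = { 4, 0 };

    map_job(&plain, mappers, 2);  /* round_robin takes it */
    map_job(&nper,  mappers, 2);  /* round_robin passes, loadbalance takes it */
    return 0;
}
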
+ diff --git a/orte/mca/rmaps/base/rmaps_base_close.c b/orte/mca/rmaps/base/rmaps_base_close.c index 89b797aaff..56ebb73043 100644 --- a/orte/mca/rmaps/base/rmaps_base_close.c +++ b/orte/mca/rmaps/base/rmaps_base_close.c @@ -28,6 +28,14 @@ int orte_rmaps_base_close(void) { + opal_list_item_t *item; + + /* cleanup globals */ + while (NULL != (item = opal_list_remove_first(&orte_rmaps_base.selected_modules))) { + OBJ_RELEASE(item); + } + OBJ_DESTRUCT(&orte_rmaps_base.selected_modules); + mca_base_components_close(orte_rmaps_base.rmaps_output, &orte_rmaps_base.available_components, NULL); diff --git a/orte/mca/rmaps/base/rmaps_base_map_job.c b/orte/mca/rmaps/base/rmaps_base_map_job.c index 93ea2f844d..c1ac6b1de6 100644 --- a/orte/mca/rmaps/base/rmaps_base_map_job.c +++ b/orte/mca/rmaps/base/rmaps_base_map_job.c @@ -25,10 +25,11 @@ #include "opal/util/output.h" #include "opal/util/opal_sos.h" #include "opal/mca/base/base.h" - #include "opal/dss/dss.h" + #include "orte/mca/errmgr/errmgr.h" #include "orte/runtime/orte_globals.h" +#include "orte/util/show_help.h" #include "orte/mca/rmaps/base/base.h" #include "orte/mca/rmaps/base/rmaps_private.h" @@ -42,7 +43,10 @@ int orte_rmaps_base_map_job(orte_job_t *jdata) { orte_job_map_t *map; int rc; - + bool did_map; + opal_list_item_t *item; + orte_rmaps_base_selected_module_t *mod; + /* NOTE: NO PROXY COMPONENT REQUIRED - REMOTE PROCS ARE NOT * ALLOWED TO CALL RMAPS INDEPENDENTLY. ONLY THE PLM CAN * DO SO, AND ALL PLM COMMANDS ARE RELAYED TO HNP @@ -76,12 +80,16 @@ int orte_rmaps_base_map_job(orte_job_t *jdata) map->stride = orte_rmaps_base.stride; map->oversubscribe = orte_rmaps_base.oversubscribe; map->display_map = orte_rmaps_base.display_map; + map->mapper = orte_rmaps_base.default_mapper; /* assign the map object to this job */ jdata->map = map; } else { if (!jdata->map->display_map) { jdata->map->display_map = orte_rmaps_base.display_map; } + if (ORTE_RMAPS_UNDEF == jdata->map->mapper) { + jdata->map->mapper = orte_rmaps_base.default_mapper; + } } /* if the job is the daemon job, then we are just mapping daemons and @@ -93,10 +101,30 @@ int orte_rmaps_base_map_job(orte_job_t *jdata) return rc; } } else { - /* go ahead and map the job */ - if (ORTE_SUCCESS != (rc = orte_rmaps_base.active_module->map_job(jdata))) { - ORTE_ERROR_LOG(rc); - return rc; + /* cycle thru the available mappers until one agrees to map + * the job + */ + did_map = false; + for (item = opal_list_get_first(&orte_rmaps_base.selected_modules); + item != opal_list_get_end(&orte_rmaps_base.selected_modules); + item = opal_list_get_next(item)) { + mod = (orte_rmaps_base_selected_module_t*)item; + if (ORTE_SUCCESS == (rc = mod->module->map_job(jdata))) { + did_map = true; + break; + } + /* mappers return "next option" if they didn't attempt to + * map the job. anything else is a true error. 
+ */ + if (ORTE_ERR_TAKE_NEXT_OPTION != rc) { + ORTE_ERROR_LOG(rc); + return rc; + } + } + /* if we get here without doing the map, then that's an error */ + if (!did_map) { + orte_show_help("help-orte-rmaps-base.txt", "failed-map", true); + return ORTE_ERR_FAILED_TO_MAP; } } diff --git a/orte/mca/rmaps/base/rmaps_base_open.c b/orte/mca/rmaps/base/rmaps_base_open.c index 4e53d29af9..b47af0e7de 100644 --- a/orte/mca/rmaps/base/rmaps_base_open.c +++ b/orte/mca/rmaps/base/rmaps_base_open.c @@ -87,8 +87,9 @@ int orte_rmaps_base_open(void) bool btmp; /* init the globals */ - orte_rmaps_base.active_module = NULL; - + OBJ_CONSTRUCT(&orte_rmaps_base.selected_modules, opal_list_t); + orte_rmaps_base.default_mapper = ORTE_RMAPS_UNDEF; + /* Debugging / verbose output. Always have stream open, with verbose set by the mca open system... */ orte_rmaps_base.rmaps_output = opal_output_open(NULL); @@ -118,6 +119,7 @@ int orte_rmaps_base_open(void) false, false, (int)false, &value); if (value) { orte_rmaps_base.npernode = 1; + orte_rmaps_base.default_mapper = ORTE_RMAPS_LOADBALANCE; } /* #procs/node */ @@ -126,6 +128,7 @@ int orte_rmaps_base_open(void) false, false, -1, &value); if (0 < value) { orte_rmaps_base.npernode = value; + orte_rmaps_base.default_mapper = ORTE_RMAPS_LOADBALANCE; } /* #procs/board */ @@ -134,6 +137,7 @@ int orte_rmaps_base_open(void) false, false, -1, &orte_rmaps_base.nperboard); if (0 < orte_rmaps_base.nperboard) { ORTE_ADD_MAPPING_POLICY(ORTE_MAPPING_NPERXXX); + orte_rmaps_base.default_mapper = ORTE_RMAPS_LOADBALANCE; } /* #procs/socket */ @@ -144,13 +148,16 @@ int orte_rmaps_base_open(void) ORTE_ADD_MAPPING_POLICY(ORTE_MAPPING_NPERXXX); /* force bind to socket if not overridden by user */ ORTE_XSET_BINDING_POLICY(ORTE_BIND_TO_SOCKET); + orte_rmaps_base.default_mapper = ORTE_RMAPS_LOADBALANCE; } /* Do we want to loadbalance the job */ param = mca_base_param_reg_int_name("rmaps", "base_loadbalance", "Balance total number of procs across all allocated nodes", false, false, (int)false, &value); - orte_rmaps_base.loadbalance = OPAL_INT_TO_BOOL(value); + if (value) { + orte_rmaps_base.default_mapper = ORTE_RMAPS_LOADBALANCE; + } /* #cpus/rank to use */ param = mca_base_param_reg_int_name("rmaps", "base_cpus_per_proc", @@ -193,6 +200,7 @@ int orte_rmaps_base_open(void) if (NULL != orte_rmaps_base.slot_list || NULL != orte_rankfile) { ORTE_ADD_MAPPING_POLICY(ORTE_MAPPING_BYUSER); + orte_rmaps_base.default_mapper = ORTE_RMAPS_RF; } /* Should we schedule on the local node or not? */ @@ -243,4 +251,8 @@ int orte_rmaps_base_open(void) return ORTE_SUCCESS; } +OBJ_CLASS_INSTANCE(orte_rmaps_base_selected_module_t, + opal_list_item_t, + NULL, NULL); + #endif /* ORTE_DISABLE_FULL_SUPPORT */ diff --git a/orte/mca/rmaps/base/rmaps_base_select.c b/orte/mca/rmaps/base/rmaps_base_select.c index cf8d39f22e..9898351df1 100644 --- a/orte/mca/rmaps/base/rmaps_base_select.c +++ b/orte/mca/rmaps/base/rmaps_base_select.c @@ -26,29 +26,98 @@ #include "orte/mca/rmaps/base/base.h" +static bool selected = false; + /* * Function for selecting one component from all those that are * available. 
*/ int orte_rmaps_base_select(void) { - orte_rmaps_base_component_t *best_component = NULL; - orte_rmaps_base_module_t *best_module = NULL; + opal_list_item_t *item, *itm2; + mca_base_component_list_item_t *cli = NULL; + mca_base_component_t *component = NULL; + mca_base_module_t *module = NULL; + orte_rmaps_base_module_t *nmodule; + orte_rmaps_base_selected_module_t *newmodule, *mod; + int rc, priority; + bool inserted; - /* - * Select the best component - */ - if( OPAL_SUCCESS != mca_base_select("rmaps", orte_rmaps_base.rmaps_output, - &orte_rmaps_base.available_components, - (mca_base_module_t **) &best_module, - (mca_base_component_t **) &best_component) ) { - /* This will only happen if no component was selected */ - return ORTE_ERR_NOT_FOUND; + if (selected) { + /* ensure we don't do this twice */ + return ORTE_SUCCESS; + } + selected = true; + + /* Query all available components and ask if they have a module */ + for (item = opal_list_get_first(&orte_rmaps_base.available_components); + opal_list_get_end(&orte_rmaps_base.available_components) != item; + item = opal_list_get_next(item)) { + cli = (mca_base_component_list_item_t *) item; + component = (mca_base_component_t *) cli->cli_component; + + opal_output_verbose(5, orte_rmaps_base.rmaps_output, + "mca:rmaps:select: checking available component %s", component->mca_component_name); + + /* If there's no query function, skip it */ + if (NULL == component->mca_query_component) { + opal_output_verbose(5, orte_rmaps_base.rmaps_output, + "mca:rmaps:select: Skipping component [%s]. It does not implement a query function", + component->mca_component_name ); + continue; + } + + /* Query the component */ + opal_output_verbose(5, orte_rmaps_base.rmaps_output, + "mca:rmaps:select: Querying component [%s]", + component->mca_component_name); + rc = component->mca_query_component(&module, &priority); + + /* If no module was returned, then skip component */ + if (ORTE_SUCCESS != rc || NULL == module) { + opal_output_verbose(5, orte_rmaps_base.rmaps_output, + "mca:rmaps:select: Skipping component [%s]. 
Query failed to return a module", + component->mca_component_name ); + continue; + } + + /* If we got a module, keep it */ + nmodule = (orte_rmaps_base_module_t*) module; + /* add to the list of selected modules */ + newmodule = OBJ_NEW(orte_rmaps_base_selected_module_t); + newmodule->pri = priority; + newmodule->module = nmodule; + newmodule->component = component; + + /* maintain priority order */ + inserted = false; + for (itm2 = opal_list_get_first(&orte_rmaps_base.selected_modules); + itm2 != opal_list_get_end(&orte_rmaps_base.selected_modules); + itm2 = opal_list_get_next(itm2)) { + mod = (orte_rmaps_base_selected_module_t*)itm2; + if (priority > mod->pri) { + opal_list_insert_pos(&orte_rmaps_base.selected_modules, + itm2, &newmodule->super); + inserted = true; + break; + } + } + if (!inserted) { + /* must be lowest priority - add to end */ + opal_list_append(&orte_rmaps_base.selected_modules, &newmodule->super); + } } - /* Save the winner */ - /* No global component structure */ - orte_rmaps_base.active_module = best_module; + if (4 < opal_output_get_verbosity(orte_rmaps_base.rmaps_output)) { + opal_output(0, "%s: Final mapper priorities", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)); + /* show the prioritized list */ + for (itm2 = opal_list_get_first(&orte_rmaps_base.selected_modules); + itm2 != opal_list_get_end(&orte_rmaps_base.selected_modules); + itm2 = opal_list_get_next(itm2)) { + mod = (orte_rmaps_base_selected_module_t*)itm2; + opal_output(0, "\tMapper: %s Priority: %d", mod->component->mca_component_name, mod->pri); + } + } return ORTE_SUCCESS;; } diff --git a/orte/mca/rmaps/base/rmaps_base_support_fns.c b/orte/mca/rmaps/base/rmaps_base_support_fns.c index 5f04588777..be125bf143 100644 --- a/orte/mca/rmaps/base/rmaps_base_support_fns.c +++ b/orte/mca/rmaps/base/rmaps_base_support_fns.c @@ -60,12 +60,12 @@ int orte_rmaps_base_get_target_nodes(opal_list_t *allocated_nodes, orte_std_cntr /* if the hnp was allocated, include it unless flagged not to */ if (orte_hnp_is_allocated) { if (NULL != (node = (orte_node_t*)opal_pointer_array_get_item(orte_node_pool, 0))) { - if (ORTE_NODE_STATE_UP == node->state) { + if (ORTE_NODE_STATE_DO_NOT_USE == node->state) { + /* clear this for future use, but don't include it */ + node->state = ORTE_NODE_STATE_UP; + } else if (ORTE_NODE_STATE_NOT_INCLUDED != node->state) { OBJ_RETAIN(node); opal_list_append(allocated_nodes, &node->super); - } else if (ORTE_NODE_STATE_DO_NOT_USE == node->state) { - /* clear this for future use */ - node->state = ORTE_NODE_STATE_UP; } } } @@ -73,16 +73,19 @@ int orte_rmaps_base_get_target_nodes(opal_list_t *allocated_nodes, orte_std_cntr /* add everything in the node pool that can be used */ for (i=1; i < orte_node_pool->size; i++) { if (NULL != (node = (orte_node_t*)opal_pointer_array_get_item(orte_node_pool, i))) { - /* ignore nodes that are "down" */ - if (ORTE_NODE_STATE_DOWN == node->state) { - continue; - } /* ignore nodes that are marked as do-not-use for this mapping */ if (ORTE_NODE_STATE_DO_NOT_USE == node->state) { /* reset the state so it can be used another time */ node->state = ORTE_NODE_STATE_UP; continue; } + if (ORTE_NODE_STATE_DOWN == node->state) { + continue; + } + if (ORTE_NODE_STATE_NOT_INCLUDED == node->state) { + /* not to be used */ + continue; + } /* retain a copy for our use in case the item gets * destructed along the way */ @@ -247,36 +250,6 @@ int orte_rmaps_base_get_target_nodes(opal_list_t *allocated_nodes, orte_std_cntr return ORTE_SUCCESS; } - /* if we are mapping an application, check to 
see if we are to - * use a virtual machine - */ - if (policy & ORTE_MAPPING_USE_VM) { - /* remove all nodes that do NOT have an "alive" daemon on them */ - item = opal_list_get_first(allocated_nodes); - while (item != opal_list_get_end(allocated_nodes)) { - - /** save the next pointer in case we remove this node */ - next = opal_list_get_next(item); - - /** already have a daemon? */ - node = (orte_node_t*)item; - if (NULL == node->daemon || - ORTE_PROC_STATE_RUNNING != node->daemon->state) { - opal_list_remove_item(allocated_nodes, item); - OBJ_RELEASE(item); /* "un-retain" it */ - } - - /** go on to next item */ - item = next; - } - /** check that anything is left! */ - if (0 == opal_list_get_size(allocated_nodes)) { - orte_show_help("help-orte-rmaps-base.txt", - "orte-rmaps-base:nolocal-no-available-resources", true); - return ORTE_ERR_SILENT; - } - } - /* remove all nodes that are already at max usage, and * compute the total number of allocated slots while * we do so diff --git a/orte/mca/rmaps/load_balance/rmaps_lb.c b/orte/mca/rmaps/load_balance/rmaps_lb.c index f899054513..32ae331c6b 100644 --- a/orte/mca/rmaps/load_balance/rmaps_lb.c +++ b/orte/mca/rmaps/load_balance/rmaps_lb.c @@ -55,6 +55,27 @@ static int switchyard(orte_job_t *jdata) { int rc; + /* only handle initial launch of loadbalanced + * or NPERxxx jobs - allow restarting of failed apps + */ + if (ORTE_JOB_STATE_INIT != jdata->state) { + opal_output_verbose(5, orte_rmaps_base.rmaps_output, + "mca:rmaps:lb: not job %s not in initial state - loadbalance cannot map", + ORTE_JOBID_PRINT(jdata->jobid)); + return ORTE_ERR_TAKE_NEXT_OPTION; + } + if (0 < jdata->map->mapper && ORTE_RMAPS_LOADBALANCE != jdata->map->mapper) { + /* a mapper has been specified, and it isn't me */ + opal_output_verbose(5, orte_rmaps_base.rmaps_output, + "mca:rmaps:lb: job %s not using loadbalance mapper", + ORTE_JOBID_PRINT(jdata->jobid)); + return ORTE_ERR_TAKE_NEXT_OPTION; + } + opal_output_verbose(5, orte_rmaps_base.rmaps_output, + "mca:rmaps:loadbalance: mapping job %s", + ORTE_JOBID_PRINT(jdata->jobid)); + + if (0 < orte_rmaps_base.npernode) { rc = npernode(jdata); } else if (0 < orte_rmaps_base.nperboard) { @@ -78,10 +99,9 @@ static int switchyard(orte_job_t *jdata) /* define the daemons that we will use for this job */ if (ORTE_SUCCESS != (rc = orte_rmaps_base_define_daemons(jdata->map))) { ORTE_ERROR_LOG(rc); - return rc; } - return ORTE_SUCCESS; + return rc; } diff --git a/orte/mca/rmaps/load_balance/rmaps_lb_component.c b/orte/mca/rmaps/load_balance/rmaps_lb_component.c index abdc3007df..dd1c3a5de9 100644 --- a/orte/mca/rmaps/load_balance/rmaps_lb_component.c +++ b/orte/mca/rmaps/load_balance/rmaps_lb_component.c @@ -33,6 +33,7 @@ static int orte_rmaps_lb_open(void); static int orte_rmaps_lb_close(void); static int orte_rmaps_lb_query(mca_base_module_t **module, int *priority); +static int my_priority; orte_rmaps_base_component_t mca_rmaps_load_balance_component = { { @@ -58,30 +59,22 @@ orte_rmaps_base_component_t mca_rmaps_load_balance_component = { */ static int orte_rmaps_lb_open(void) { + mca_base_component_t *c = &mca_rmaps_load_balance_component.base_version; + + mca_base_param_reg_int(c, "priority", + "Priority of the loadbalance rmaps component", + false, false, 80, + &my_priority); return ORTE_SUCCESS; } static int orte_rmaps_lb_query(mca_base_module_t **module, int *priority) { - /* the RMAPS framework is -only- opened on HNP's, - * so no need to check for that here - */ - - /* if load balancing, or any nperxxx, was requested, 
then we must be selected */ - if (orte_rmaps_base.loadbalance || - 0 < orte_rmaps_base.npernode || - 0 < orte_rmaps_base.nperboard || - 0 < orte_rmaps_base.npersocket) { - *priority = 1000; /* must be selected */ - *module = (mca_base_module_t *)&orte_rmaps_load_balance_module; - return ORTE_SUCCESS; - } - - /* otherwise, ignore us */ - *priority = 0; - *module = NULL; - return ORTE_ERROR; + /* after rr */ + *priority = my_priority; + *module = (mca_base_module_t *)&orte_rmaps_load_balance_module; + return ORTE_SUCCESS; } /** diff --git a/orte/mca/rmaps/rank_file/rmaps_rank_file.c b/orte/mca/rmaps/rank_file/rmaps_rank_file.c index 9c7ca40ad9..7ce0e6d017 100644 --- a/orte/mca/rmaps/rank_file/rmaps_rank_file.c +++ b/orte/mca/rmaps/rank_file/rmaps_rank_file.c @@ -294,6 +294,25 @@ static int orte_rmaps_rf_map(orte_job_t *jdata) int rc; orte_proc_t *proc; + /* only handle initial launch of rf job */ + if (ORTE_JOB_STATE_INIT != jdata->state) { + opal_output_verbose(5, orte_rmaps_base.rmaps_output, + "mca:rmaps:rf: not job %s not in initial state - rank_file cannot map", + ORTE_JOBID_PRINT(jdata->jobid)); + return ORTE_ERR_TAKE_NEXT_OPTION; + } + if (0 < jdata->map->mapper && ORTE_RMAPS_RF != jdata->map->mapper) { + /* a mapper has been specified, and it isn't me */ + opal_output_verbose(5, orte_rmaps_base.rmaps_output, + "mca:rmaps:rf: job %s not using rank_file mapper", + ORTE_JOBID_PRINT(jdata->jobid)); + return ORTE_ERR_TAKE_NEXT_OPTION; + } + opal_output_verbose(5, orte_rmaps_base.rmaps_output, + "mca:rmaps:rank_file: mapping job %s", + ORTE_JOBID_PRINT(jdata->jobid)); + + /* convenience def */ map = jdata->map; diff --git a/orte/mca/rmaps/rank_file/rmaps_rank_file.h b/orte/mca/rmaps/rank_file/rmaps_rank_file.h index 5481801196..eda69e4691 100644 --- a/orte/mca/rmaps/rank_file/rmaps_rank_file.h +++ b/orte/mca/rmaps/rank_file/rmaps_rank_file.h @@ -39,14 +39,7 @@ BEGIN_C_DECLS /** * RMGR Component */ -struct orte_rmaps_rank_file_component_t { - orte_rmaps_base_component_t super; - int debug; - int priority; -}; -typedef struct orte_rmaps_rank_file_component_t orte_rmaps_rank_file_component_t; - -ORTE_MODULE_DECLSPEC extern orte_rmaps_rank_file_component_t mca_rmaps_rank_file_component; +ORTE_MODULE_DECLSPEC extern orte_rmaps_base_component_t mca_rmaps_rank_file_component; extern orte_rmaps_base_module_t orte_rmaps_rank_file_module; diff --git a/orte/mca/rmaps/rank_file/rmaps_rank_file_component.c b/orte/mca/rmaps/rank_file/rmaps_rank_file_component.c index e9d6519f0b..4adde92b6f 100644 --- a/orte/mca/rmaps/rank_file/rmaps_rank_file_component.c +++ b/orte/mca/rmaps/rank_file/rmaps_rank_file_component.c @@ -43,12 +43,13 @@ static int orte_rmaps_rank_file_open(void); static int orte_rmaps_rank_file_close(void); static int orte_rmaps_rank_file_query(mca_base_module_t **module, int *priority); -orte_rmaps_rank_file_component_t mca_rmaps_rank_file_component = { - { - /* First, the mca_base_component_t struct containing meta - information about the component itself */ +static int my_priority; - { +orte_rmaps_base_component_t mca_rmaps_rank_file_component = { + /* First, the mca_base_component_t struct containing meta + information about the component itself */ + + { ORTE_RMAPS_BASE_VERSION_2_0_0, "rank_file", /* MCA component name */ @@ -58,11 +59,10 @@ orte_rmaps_rank_file_component_t mca_rmaps_rank_file_component = { orte_rmaps_rank_file_open, /* component open */ orte_rmaps_rank_file_close, /* component close */ orte_rmaps_rank_file_query /* component query */ - }, - { - /* The component is 
checkpoint ready */ - MCA_BASE_METADATA_PARAM_CHECKPOINT - } + }, + { + /* The component is checkpoint ready */ + MCA_BASE_METADATA_PARAM_CHECKPOINT } }; @@ -72,11 +72,17 @@ orte_rmaps_rank_file_component_t mca_rmaps_rank_file_component = { */ static int orte_rmaps_rank_file_open(void) { - mca_rmaps_rank_file_component.priority = 0; + mca_base_component_t *c = &mca_rmaps_rank_file_component.base_version; + + mca_base_param_reg_int(c, "priority", + "Priority of the rank_file rmaps component", + false, false, 0, + &my_priority); if (NULL != orte_rankfile || NULL != orte_rmaps_base.slot_list) { - mca_rmaps_rank_file_component.priority = 100; + /* make us first */ + my_priority = 1000; } return ORTE_SUCCESS; @@ -84,11 +90,7 @@ static int orte_rmaps_rank_file_open(void) static int orte_rmaps_rank_file_query(mca_base_module_t **module, int *priority) { - /* the RMAPS framework is -only- opened on HNP's, - * so no need to check for that here - */ - - *priority = mca_rmaps_rank_file_component.priority; + *priority = my_priority; *module = (mca_base_module_t *)&orte_rmaps_rank_file_module; return ORTE_SUCCESS; } diff --git a/orte/mca/rmaps/resilient/rmaps_resilient.c b/orte/mca/rmaps/resilient/rmaps_resilient.c index b967f71532..f6cee9c3a7 100644 --- a/orte/mca/rmaps/resilient/rmaps_resilient.c +++ b/orte/mca/rmaps/resilient/rmaps_resilient.c @@ -26,7 +26,6 @@ #include "opal/mca/base/mca_base_param.h" #include "opal/util/argv.h" -#include "opal/util/opal_sos.h" #include "opal/class/opal_pointer_array.h" #include "orte/util/show_help.h" @@ -40,37 +39,486 @@ /* * Local variable */ -static opal_list_item_t *cur_node_item = NULL; - static char *orte_getline(FILE *fp); +static bool have_ftgrps=false; -/* default round-robin mapper */ -static int rr_map_default(orte_job_t *jdata, orte_app_context_t *app, - opal_list_t *node_list, orte_vpid_t num_procs) +static int construct_ftgrps(void); +static int get_ftgrp_target(orte_proc_t *proc, + orte_rmaps_res_ftgrp_t **target, + orte_node_t **nd); +static int get_new_node(orte_proc_t *proc, + orte_app_context_t *app, + orte_job_map_t *map, + orte_node_t **ndret); +static int map_to_ftgrps(orte_job_t *jdata); + +/* + * Loadbalance the cluster + */ +static int orte_rmaps_resilient_map(orte_job_t *jdata) { - int rc; - - /* if a bookmark exists from some prior mapping, set us to start there */ - cur_node_item = orte_rmaps_base_get_starting_point(node_list, jdata); - - /* now perform the mapping */ - if (ORTE_MAPPING_BYNODE & jdata->map->policy) { - if (ORTE_SUCCESS != (rc = orte_rmaps_base_map_bynode(jdata, app, node_list, - num_procs, cur_node_item))) { - ORTE_ERROR_LOG(rc); - return rc; + orte_app_context_t *app; + int i; + int rc = ORTE_SUCCESS; + orte_node_t *nd=NULL, *oldnode, *node; + orte_rmaps_res_ftgrp_t *target = NULL; + orte_proc_t *proc; + orte_vpid_t totprocs; + opal_list_t node_list; + orte_std_cntr_t num_slots; + opal_list_item_t *item; + + if (0 < jdata->map->mapper && ORTE_RMAPS_RESILIENT != jdata->map->mapper) { + opal_output_verbose(5, orte_rmaps_base.rmaps_output, + "mca:rmaps:resilient: cannot map job %s - other mapper specified", + ORTE_JOBID_PRINT(jdata->jobid)); + return ORTE_ERR_TAKE_NEXT_OPTION; + } + if (ORTE_JOB_STATE_INIT == jdata->state && + NULL == mca_rmaps_resilient_component.fault_group_file) { + opal_output_verbose(5, orte_rmaps_base.rmaps_output, + "mca:rmaps:resilient: cannot perform initial map of job %s", + ORTE_JOBID_PRINT(jdata->jobid)); + return ORTE_ERR_TAKE_NEXT_OPTION; + } + + opal_output_verbose(5, 
orte_rmaps_base.rmaps_output, + "mca:rmaps:resilient: mapping job %s", + ORTE_JOBID_PRINT(jdata->jobid)); + + + /* have we already constructed the fault group list? */ + if (!have_ftgrps) { + construct_ftgrps(); + } + + if (ORTE_JOB_STATE_INIT == jdata->state) { + /* this is an initial map - let the fault group mapper + * handle it + */ + return map_to_ftgrps(jdata); + } + + /* + * NOTE: if a proc is being ADDED to an existing job, then its + * node field will be NULL. + */ + OPAL_OUTPUT_VERBOSE((1, orte_rmaps_base.rmaps_output, + "%s rmaps:resilient: remapping job %s", + ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), + ORTE_JOBID_PRINT(jdata->jobid))); + + /* cycle through all the procs in this job to find the one(s) that failed */ + for (i=0; i < jdata->procs->size; i++) { + /* get the proc object */ + if (NULL == (proc = (orte_proc_t*)opal_pointer_array_get_item(jdata->procs, i))) { + continue; } - } else { - if (ORTE_SUCCESS != (rc = orte_rmaps_base_map_byslot(jdata, app, node_list, - num_procs, cur_node_item))) { + /* is this proc to be restarted? */ + if (proc->state != ORTE_PROC_STATE_RESTART) { + continue; + } + /* save the current node */ + oldnode = proc->node; + /* point to the app */ + app = (orte_app_context_t*)opal_pointer_array_get_item(jdata->apps, proc->app_idx); + if( NULL == app ) { + ORTE_ERROR_LOG(ORTE_ERR_FAILED_TO_MAP); + rc = ORTE_ERR_FAILED_TO_MAP; + goto error; + } + + if (NULL == oldnode) { + /* this proc was not previously running - likely it is being added + * to the job. So place it on the node with the fewest procs to + * balance the load + */ + OBJ_CONSTRUCT(&node_list, opal_list_t); + if (ORTE_SUCCESS != (rc = orte_rmaps_base_get_target_nodes(&node_list, + &num_slots, + app, + jdata->map->policy))) { + ORTE_ERROR_LOG(rc); + goto error; + } + if (0 == opal_list_get_size(&node_list)) { + ORTE_ERROR_LOG(ORTE_ERROR); + rc = ORTE_ERROR; + goto error; + } + totprocs = 1000000; + nd = NULL; + while (NULL != (item = opal_list_remove_first(&node_list))) { + node = (orte_node_t*)item; + if (node->num_procs < totprocs) { + nd = node; + totprocs = node->num_procs; + } + OBJ_RELEASE(item); /* maintain accounting */ + } + OBJ_DESTRUCT(&node_list); + /* we already checked to ensure there was at least one node, + * so we couldn't have come out of the loop with nd=NULL + */ + OPAL_OUTPUT_VERBOSE((1, orte_rmaps_base.rmaps_output, + "%s rmaps:resilient: Placing new process on node %s daemon %s (no ftgrp)", + ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), + nd->name, ORTE_NAME_PRINT((&nd->daemon->name)))); + } else { + + OPAL_OUTPUT_VERBOSE((1, orte_rmaps_base.rmaps_output, + "%s rmaps:resilient: proc %s from node %s is to be restarted", + ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), + ORTE_NAME_PRINT(&proc->name), + (NULL == proc->node) ? 
"NULL" : proc->node->name)); + + /* if we have fault groups, use them */ + if (have_ftgrps) { + if (ORTE_SUCCESS != (rc = get_ftgrp_target(proc, &target, &nd))) { + ORTE_ERROR_LOG(rc); + goto error; + } + OPAL_OUTPUT_VERBOSE((1, orte_rmaps_base.rmaps_output, + "%s rmaps:resilient: placing proc %s into fault group %d node %s", + ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), + ORTE_NAME_PRINT(&proc->name), target->ftgrp, nd->name)); + } else { + if (ORTE_SUCCESS != (rc = get_new_node(proc, app, jdata->map, &nd))) { + ORTE_ERROR_LOG(rc); + return rc; + } + } + } + /* + * Put the process on the found node (add it if not already in the map) + */ + if (ORTE_SUCCESS != (rc = orte_rmaps_base_claim_slot(jdata, + nd, + jdata->map->cpus_per_rank, + proc->app_idx, + NULL, + jdata->map->oversubscribe, + false, + &proc))) { + /** if the code is ORTE_ERR_NODE_FULLY_USED, then we know this + * really isn't an error + */ + if (ORTE_ERR_NODE_FULLY_USED != rc) { + ORTE_ERROR_LOG(rc); + goto error; + } + } + + /* flag the proc state as non-launched so we'll know to launch it */ + proc->state = ORTE_PROC_STATE_INIT; + + /* update the node and local ranks so static ports can + * be properly selected if active + */ + orte_rmaps_base_update_local_ranks(jdata, oldnode, nd, proc); + } + if (!(ORTE_MAPPING_USE_VM & jdata->map->policy)) { + /* define the daemons that we will use for this job */ + if (ORTE_SUCCESS != (rc = orte_rmaps_base_define_daemons(jdata->map))) { ORTE_ERROR_LOG(rc); return rc; } } + + error: + return rc; +} + +orte_rmaps_base_module_t orte_rmaps_resilient_module = { + orte_rmaps_resilient_map +}; + +static char *orte_getline(FILE *fp) +{ + char *ret, *buff; + char input[1024]; + ret = fgets(input, 1024, fp); + if (NULL != ret) { + input[strlen(input)-1] = '\0'; /* remove newline */ + buff = strdup(input); + return buff; + } + + return NULL; +} + + +static int construct_ftgrps(void) +{ + orte_rmaps_res_ftgrp_t *ftgrp; + orte_node_t *node; + FILE *fp; + char *ftinput; + int grp; + char **nodes; + bool found; + int i, k; + + /* flag that we did this */ + have_ftgrps = true; + + if (NULL == mca_rmaps_resilient_component.fault_group_file) { + /* nothing to build */ + return ORTE_SUCCESS; + } + + /* construct it */ + OPAL_OUTPUT_VERBOSE((1, orte_rmaps_base.rmaps_output, + "%s rmaps:resilient: constructing fault groups", + ORTE_NAME_PRINT(ORTE_PROC_MY_NAME))); + fp = fopen(mca_rmaps_resilient_component.fault_group_file, "r"); + if (NULL == fp) { /* not found */ + orte_show_help("help-orte-rmaps-resilient.txt", "orte-rmaps-resilient:file-not-found", + true, mca_rmaps_resilient_component.fault_group_file); + return ORTE_ERR_FAILED_TO_MAP; + } + + /* build list of fault groups */ + grp = 0; + while (NULL != (ftinput = orte_getline(fp))) { + ftgrp = OBJ_NEW(orte_rmaps_res_ftgrp_t); + ftgrp->ftgrp = grp++; + nodes = opal_argv_split(ftinput, ','); + /* find the referenced nodes */ + for (k=0; k < opal_argv_count(nodes); k++) { + found = false; + for (i=0; i < orte_node_pool->size && !found; i++) { + if (NULL == (node = opal_pointer_array_get_item(orte_node_pool, i))) { + continue; + } + if (0 == strcmp(node->name, nodes[k])) { + OBJ_RETAIN(node); + OPAL_OUTPUT_VERBOSE((1, orte_rmaps_base.rmaps_output, + "%s rmaps:resilient: adding node %s to fault group %d", + ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), + node->name, ftgrp->ftgrp)); + opal_pointer_array_add(&ftgrp->nodes, node); + found = true; + break; + } + } + } + opal_list_append(&mca_rmaps_resilient_component.fault_grps, &ftgrp->super); + opal_argv_free(nodes); + 
free(ftinput); + } + fclose(fp); + return ORTE_SUCCESS; } +static int get_ftgrp_target(orte_proc_t *proc, + orte_rmaps_res_ftgrp_t **tgt, + orte_node_t **ndret) +{ + opal_list_item_t *item; + int k, totnodes; + orte_node_t *node, *nd; + orte_rmaps_res_ftgrp_t *target, *ftgrp; + float avgload, minload; + orte_vpid_t totprocs, lowprocs; + + /* set defaults */ + *tgt = NULL; + *ndret = NULL; + + /* flag all the fault groups that + * include this node so we don't reuse them + */ + minload = 1000000.0; + target = NULL; + for (item = opal_list_get_first(&mca_rmaps_resilient_component.fault_grps); + item != opal_list_get_end(&mca_rmaps_resilient_component.fault_grps); + item = opal_list_get_next(item)) { + ftgrp = (orte_rmaps_res_ftgrp_t*)item; + /* see if the node is in this fault group */ + ftgrp->included = true; + ftgrp->used = false; + for (k=0; k < ftgrp->nodes.size; k++) { + if (NULL == (node = (orte_node_t*)opal_pointer_array_get_item(&ftgrp->nodes, k))) { + continue; + } + if (NULL != proc->node && 0 == strcmp(node->name, proc->node->name)) { + /* yes - mark it to not be included */ + OPAL_OUTPUT_VERBOSE((1, orte_rmaps_base.rmaps_output, + "%s rmaps:resilient: node %s is in fault group %d, which will be excluded", + ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), + proc->node->name, ftgrp->ftgrp)); + ftgrp->included = false; + break; + } + } + /* if this ftgrp is not included, then skip it */ + if (!ftgrp->included) { + continue; + } + /* compute the load average on this fault group */ + totprocs = 0; + totnodes = 0; + for (k=0; k < ftgrp->nodes.size; k++) { + if (NULL == (node = (orte_node_t*)opal_pointer_array_get_item(&ftgrp->nodes, k))) { + continue; + } + totnodes++; + totprocs += node->num_procs; + } + avgload = (float)totprocs / (float)totnodes; + /* now find the lightest loaded of the included fault groups */ + if (avgload < minload) { + minload = avgload; + target = ftgrp; + OPAL_OUTPUT_VERBOSE((2, orte_rmaps_base.rmaps_output, + "%s rmaps:resilient: found new min load ftgrp %d", + ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), + ftgrp->ftgrp)); + } + } + + if (NULL == target) { + /* nothing found */ + return ORTE_ERR_NOT_FOUND; + } + + /* if we did find a target, re-map the proc to the lightest loaded + * node in that group + */ + lowprocs = 1000000; + nd = NULL; + for (k=0; k < target->nodes.size; k++) { + if (NULL == (node = (orte_node_t*)opal_pointer_array_get_item(&target->nodes, k))) { + continue; + } + if (node->num_procs < lowprocs) { + lowprocs = node->num_procs; + nd = node; + } + } + + /* return the results */ + *tgt = target; + *ndret = nd; + + return ORTE_SUCCESS; +} + +static int get_new_node(orte_proc_t *proc, + orte_app_context_t *app, + orte_job_map_t *map, + orte_node_t **ndret) +{ + orte_node_t *nd, *oldnode, *node; + int rc; + orte_vpid_t totprocs; + opal_list_t node_list; + opal_list_item_t *item; + orte_std_cntr_t num_slots; + + /* if no ftgrps are available, then just map it on the lightest loaded + * node known to the system, avoiding the current node if possible and + * taking into account any limitations specified by user in hostfile + * and -host options + */ + *ndret = NULL; + nd = NULL; + oldnode = proc->node; + + /* + * Get a list of all nodes + */ + OBJ_CONSTRUCT(&node_list, opal_list_t); + if (ORTE_SUCCESS != (rc = orte_rmaps_base_get_target_nodes(&node_list, + &num_slots, + app, + map->policy))) { + ORTE_ERROR_LOG(rc); + goto error; + } + if (0 == opal_list_get_size(&node_list)) { + ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE); + rc = ORTE_ERR_OUT_OF_RESOURCE; + goto 
error; + } + + /* + * Cycle thru the list to find the current node and remove it + * + */ + nd = NULL; + for (item = opal_list_get_first(&node_list); + item != opal_list_get_end(&node_list); + item = opal_list_get_next(item)) { + node = (orte_node_t*)item; + OPAL_OUTPUT_VERBOSE((7, orte_rmaps_base.rmaps_output, + "%s CHECKING NODE %s[%s] AGAINST NODE %s[%s]", + ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), + node->name, + (NULL == node->daemon) ? "?" : ORTE_VPID_PRINT(node->daemon->name.vpid), + oldnode->name, + (NULL == oldnode->daemon) ? "?" : ORTE_VPID_PRINT(oldnode->daemon->name.vpid))); + if (node == oldnode) { + /* remove it from the list */ + opal_list_remove_item(&node_list, item); + /* maintain acctg */ + OBJ_RELEASE(item); + break; + } + } + + /* if the list is empty, then go ahead and install it back on + * its original node as this is better than not restarting + * at all + */ + if (0 == opal_list_get_size(&node_list)) { + nd = oldnode; + OPAL_OUTPUT_VERBOSE((1, orte_rmaps_base.rmaps_output, + "%s rmaps:resilient: Placing process back on original node %s daemon %s (no ftgrp)", + ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), + nd->name, ORTE_NAME_PRINT((&nd->daemon->name)))); + } else { + /* put it on the lightest loaded node on the list */ + totprocs = 1000000; + nd = NULL; + for (item = opal_list_get_first(&node_list); + item != opal_list_get_end(&node_list); + item = opal_list_get_next(item)) { + node = (orte_node_t*)item; + if (node->num_procs < totprocs) { + nd = node; + totprocs = node->num_procs; + } + } + if (NULL == nd) { + OPAL_OUTPUT_VERBOSE((1, orte_rmaps_base.rmaps_output, + "%s ERROR - NO NODES AVAILABLE", + ORTE_NAME_PRINT(ORTE_PROC_MY_NAME))); + /* this can only happen if no nodes are available - quietly return */ + ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE); + rc = ORTE_ERR_OUT_OF_RESOURCE; + goto error; + } + OPAL_OUTPUT_VERBOSE((1, orte_rmaps_base.rmaps_output, + "%s rmaps:resilient: Placing process on node %s daemon %s (no ftgrp)", + ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), + (nd == oldnode) ? "OLDNODE" : nd->name, + ORTE_NAME_PRINT((&nd->daemon->name)))); + } + + error: + while (NULL != (item = opal_list_remove_first(&node_list))) { + OBJ_RELEASE(item); + } + OBJ_DESTRUCT(&node_list); + + *ndret = nd; + return rc; +} + static void flag_nodes(opal_list_t *node_list) { opal_list_item_t *item, *nitem; @@ -105,309 +553,21 @@ static void flag_nodes(opal_list_t *node_list) } } - -/* - * Loadbalance the cluster - */ -static int orte_rmaps_resilient_map(orte_job_t *jdata) +static int map_to_ftgrps(orte_job_t *jdata) { orte_job_map_t *map; orte_app_context_t *app; int i, j, k, totnodes; opal_list_t node_list; - opal_list_item_t *item; + opal_list_item_t *item, *next, *curitem; orte_std_cntr_t num_slots; int rc = ORTE_SUCCESS; float avgload, minload; - orte_node_t *node, *nd=NULL, *oldnode; + orte_node_t *node, *nd=NULL; orte_rmaps_res_ftgrp_t *ftgrp, *target = NULL; - orte_vpid_t totprocs, lowprocs, num_assigned; - FILE *fp; - char *ftinput; - int grp; - char **nodes; - bool found; + orte_vpid_t totprocs, num_assigned; orte_proc_t *proc; - - /* have we already constructed the fault group list? 
*/ - if (0 == opal_list_get_size(&mca_rmaps_resilient_component.fault_grps) && - NULL != mca_rmaps_resilient_component.fault_group_file) { - /* construct it */ - OPAL_OUTPUT_VERBOSE((1, orte_rmaps_base.rmaps_output, - "%s rmaps:resilient: constructing fault groups", - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME))); - fp = fopen(mca_rmaps_resilient_component.fault_group_file, "r"); - if (NULL == fp) { /* not found */ - orte_show_help("help-orte-rmaps-resilient.txt", "orte-rmaps-resilient:file-not-found", - true, mca_rmaps_resilient_component.fault_group_file); - return ORTE_ERR_SILENT; - } - /* build list of fault groups */ - grp = 0; - while (NULL != (ftinput = orte_getline(fp))) { - ftgrp = OBJ_NEW(orte_rmaps_res_ftgrp_t); - ftgrp->ftgrp = grp++; - nodes = opal_argv_split(ftinput, ','); - /* find the referenced nodes */ - for (k=0; k < opal_argv_count(nodes); k++) { - found = false; - for (i=0; i < orte_node_pool->size && !found; i++) { - if (NULL == (node = opal_pointer_array_get_item(orte_node_pool, i))) { - continue; - } - if (0 == strcmp(node->name, nodes[k])) { - OBJ_RETAIN(node); - OPAL_OUTPUT_VERBOSE((1, orte_rmaps_base.rmaps_output, - "%s rmaps:resilient: adding node %s to fault group %d", - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), - node->name, ftgrp->ftgrp)); - opal_pointer_array_add(&ftgrp->nodes, node); - found = true; - break; - } - } - } - opal_list_append(&mca_rmaps_resilient_component.fault_grps, &ftgrp->super); - opal_argv_free(nodes); - free(ftinput); - } - fclose(fp); - } - - /* the map will never be NULL as we initialize it before getting here, - * so check to see if the job state is RESTART - * - * NOTE: if a proc is being ADDED to an existing job, then its - * node field will be NULL. - */ - if (ORTE_JOB_STATE_RESTART == jdata->state) { - OPAL_OUTPUT_VERBOSE((1, orte_rmaps_base.rmaps_output, - "%s rmaps:resilient: remapping job %s", - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), - ORTE_JOBID_PRINT(jdata->jobid))); - /* cycle through all the procs in this job to find the one(s) that failed */ - for (i=0; i < jdata->procs->size; i++) { - /* get the proc object */ - if (NULL == (proc = (orte_proc_t*)opal_pointer_array_get_item(jdata->procs, i))) { - continue; - } - /* is this proc to be restarted? */ - if (proc->state != ORTE_PROC_STATE_RESTART) { - continue; - } - /* save the current node */ - oldnode = proc->node; - /* point to the app */ - app = (orte_app_context_t*)opal_pointer_array_get_item(jdata->apps, proc->app_idx); - if( NULL == app ) { - ORTE_ERROR_LOG(ORTE_ERROR); - rc = ORTE_ERROR; - goto error; - } - OPAL_OUTPUT_VERBOSE((1, orte_rmaps_base.rmaps_output, - "%s rmaps:resilient: proc %s from node %s is to be restarted", - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), - ORTE_NAME_PRINT(&proc->name), - (NULL == proc->node) ? 
"NULL" : proc->node->name)); - /* if we have fault groups, flag all the fault groups that - * include this node so we don't reuse them - */ - target = NULL; - minload = 1000000.0; - for (item = opal_list_get_first(&mca_rmaps_resilient_component.fault_grps); - item != opal_list_get_end(&mca_rmaps_resilient_component.fault_grps); - item = opal_list_get_next(item)) { - ftgrp = (orte_rmaps_res_ftgrp_t*)item; - /* see if the node is in this fault group */ - ftgrp->included = true; - ftgrp->used = false; - for (k=0; k < ftgrp->nodes.size; k++) { - if (NULL == (node = (orte_node_t*)opal_pointer_array_get_item(&ftgrp->nodes, k))) { - continue; - } - if (NULL != proc->node && 0 == strcmp(node->name, proc->node->name)) { - /* yes - mark it to not be included */ - OPAL_OUTPUT_VERBOSE((1, orte_rmaps_base.rmaps_output, - "%s rmaps:resilient: node %s is in fault group %d, which will be excluded", - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), - proc->node->name, ftgrp->ftgrp)); - ftgrp->included = false; - break; - } - } - /* if this ftgrp is not included, then skip it */ - if (!ftgrp->included) { - continue; - } - /* compute the load average on this fault group */ - totprocs = 0; - totnodes = 0; - for (k=0; k < ftgrp->nodes.size; k++) { - if (NULL == (node = (orte_node_t*)opal_pointer_array_get_item(&ftgrp->nodes, k))) { - continue; - } - totnodes++; - totprocs += node->num_procs; - } - avgload = (float)totprocs / (float)totnodes; - /* now find the lightest loaded of the included fault groups */ - if (avgload < minload) { - minload = avgload; - target = ftgrp; - OPAL_OUTPUT_VERBOSE((2, orte_rmaps_base.rmaps_output, - "%s rmaps:resilient: found new min load ftgrp %d", - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), - ftgrp->ftgrp)); - } - } - /* if no ftgrps are available, then just map it on the lightest loaded - * node known to the system, avoiding the current node if possible and - * taking into account any limitations specified by user in hostfile - * and -host options - */ - if (NULL == target) { - nd = NULL; - - /* - * Get a list of all nodes - */ - OBJ_CONSTRUCT(&node_list, opal_list_t); - map = jdata->map; - if (ORTE_SUCCESS != (rc = orte_rmaps_base_get_target_nodes(&node_list, - &num_slots, - app, - map->policy))) { - ORTE_ERROR_LOG(rc); - goto error; - } - - /* Ask the ErrMgr components if they have a suggestion for this process */ - orte_errmgr.suggest_map_targets(proc, proc->node, &node_list); - - nd = (orte_node_t*)opal_list_get_first(&node_list); - if( NULL == nd ) { - ORTE_ERROR_LOG(ORTE_ERROR); - rc = ORTE_ERROR; - goto error; - } - - /* - * Look though the list for the least loaded machine. 
- */ - nd = oldnode; /* Put it back where it was if nothing else is found */ - totprocs = 1000000; - found = false; - /* find the lightest loaded node while deconstructing the list */ - while (NULL != (item = opal_list_remove_first(&node_list))) { - node = (orte_node_t*)item; - if( !found ) { - if( ((int)node->num_procs) < orte_rmaps_base.npernode ) { - nd = node; - totprocs = 0; - found = true; - } - else if( node->num_procs < totprocs) { - nd = node; - totprocs = node->num_procs; - } - } - OBJ_RELEASE(item); - } - OBJ_DESTRUCT(&node_list); - - OPAL_OUTPUT_VERBOSE((1, orte_rmaps_base.rmaps_output, - "%s rmaps:resilient: Placing process on node %s (no ftgrp)", - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), - nd->name)); - - /* - * Put the process on the found node (add it if not already in the map) - */ - if (ORTE_SUCCESS != (rc = orte_rmaps_base_claim_slot(jdata, - nd, - jdata->map->cpus_per_rank, - proc->app_idx, - NULL, - jdata->map->oversubscribe, - false, - &proc))) { - /** if the code is ORTE_ERR_NODE_FULLY_USED, then we know this - * really isn't an error - */ - if (ORTE_ERR_NODE_FULLY_USED != OPAL_SOS_GET_ERROR_CODE(rc)) { - ORTE_ERROR_LOG(rc); - goto error; - } - } - - /* flag the proc state as non-launched so we'll know to launch it */ - proc->state = ORTE_PROC_STATE_INIT; - - /* update the node and local ranks so static ports can - * be properly selected if active - */ - orte_rmaps_base_update_local_ranks(jdata, oldnode, nd, proc); - - continue; - } - /* if we did find a target, re-map the proc to the lightest loaded - * node in that group - */ - lowprocs = 1000000; - nd = NULL; - for (k=0; k < target->nodes.size; k++) { - if (NULL == (node = (orte_node_t*)opal_pointer_array_get_item(&target->nodes, k))) { - continue; - } - if (node->num_procs < lowprocs) { - lowprocs = node->num_procs; - nd = node; - } - } - OPAL_OUTPUT_VERBOSE((1, orte_rmaps_base.rmaps_output, - "%s rmaps:resilient: placing proc %s into fault group %d node %s", - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), - ORTE_NAME_PRINT(&proc->name), target->ftgrp, nd->name)); - if (NULL != proc->node) { - OBJ_RELEASE(proc->node); /* required to maintain bookkeeping */ - } - /* put proc on the found node */ - if (ORTE_SUCCESS != (rc = orte_rmaps_base_claim_slot(jdata, nd, jdata->map->cpus_per_rank, proc->app_idx, - NULL, jdata->map->oversubscribe, false, &proc))) { - /** if the code is ORTE_ERR_NODE_FULLY_USED, then we know this - * really isn't an error - */ - if (ORTE_ERR_NODE_FULLY_USED != OPAL_SOS_GET_ERROR_CODE(rc)) { - ORTE_ERROR_LOG(rc); - goto error; - } - } - /* flag the proc state as non-launched so we'll know to launch it */ - proc->state = ORTE_PROC_STATE_INIT; - /* update the node and local ranks so static ports can - * be properly selected if active - */ - orte_rmaps_base_update_local_ranks(jdata, oldnode, nd, proc); - } - /* define the daemons that we will use for this job */ - if (ORTE_SUCCESS != (rc = orte_rmaps_base_define_daemons(jdata->map))) { - ORTE_ERROR_LOG(rc); - return rc; - } - return ORTE_SUCCESS; - } - - - /* CREATE INITIAL MAP FOR A JOB */ - /* we map each app_context separately when creating an initial job map. For - * each app_context, we get the list of available nodes as this can be - * app_context specific based on hostfile and -host options. 
We then organize - * that list into fault groups based on the fault group definitions, if - * provided, and then divide the specified number of copies across them in - * a load-balanced way - */ - OPAL_OUTPUT_VERBOSE((1, orte_rmaps_base.rmaps_output, "%s rmaps:resilient: creating initial map for job %s", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), @@ -438,76 +598,87 @@ static int orte_rmaps_resilient_map(orte_job_t *jdata) */ OBJ_CONSTRUCT(&node_list, opal_list_t); if (ORTE_SUCCESS != (rc = orte_rmaps_base_get_target_nodes(&node_list, &num_slots, app, - map->policy))) { + map->policy))) { ORTE_ERROR_LOG(rc); - goto error; + return rc; } - /* were we given a fault group definition? */ - if (0 < opal_list_get_size(&mca_rmaps_resilient_component.fault_grps)) { - OPAL_OUTPUT_VERBOSE((1, orte_rmaps_base.rmaps_output, - "%s rmaps:resilient: using fault groups", - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME))); - /* flag the fault groups included by these nodes */ - flag_nodes(&node_list); - /* map each copy to a different fault group - if more copies are - * specified than fault groups, then overlap in a round-robin fashion + /* remove all nodes that are not "up" or do not have a running daemon on them */ + item = opal_list_get_first(&node_list); + while (item != opal_list_get_end(&node_list)) { + next = opal_list_get_next(item); + node = (orte_node_t*)item; + if (ORTE_NODE_STATE_UP != node->state || + NULL == node->daemon || + ORTE_PROC_STATE_RUNNING != node->daemon->state) { + opal_list_remove_item(&node_list, item); + OBJ_RELEASE(item); + } + item = next; + } + curitem = opal_list_get_first(&node_list); + + /* flag the fault groups included by these nodes */ + flag_nodes(&node_list); + /* map each copy to a different fault group - if more copies are + * specified than fault groups, then overlap in a round-robin fashion + */ + for (j=0; j < app->num_procs; j++) { + /* find unused included fault group with lowest average load - if none + * found, then break */ - for (j=0; j < app->num_procs; j++) { - /* find unused included fault group with lowest average load - if none - * found, then break + target = NULL; + minload = 1000000000.0; + for (item = opal_list_get_first(&mca_rmaps_resilient_component.fault_grps); + item != opal_list_get_end(&mca_rmaps_resilient_component.fault_grps); + item = opal_list_get_next(item)) { + ftgrp = (orte_rmaps_res_ftgrp_t*)item; + OPAL_OUTPUT_VERBOSE((2, orte_rmaps_base.rmaps_output, + "%s rmaps:resilient: fault group %d used: %s included %s", + ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), + ftgrp->ftgrp, + ftgrp->used ? "YES" : "NO", + ftgrp->included ? "YES" : "NO" )); + /* if this ftgrp has already been used or is not included, then + * skip it */ - target = NULL; - minload = 1000000000.0; - for (item = opal_list_get_first(&mca_rmaps_resilient_component.fault_grps); - item != opal_list_get_end(&mca_rmaps_resilient_component.fault_grps); - item = opal_list_get_next(item)) { - ftgrp = (orte_rmaps_res_ftgrp_t*)item; - OPAL_OUTPUT_VERBOSE((2, orte_rmaps_base.rmaps_output, - "%s rmaps:resilient: fault group %d used: %s included %s", - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), - ftgrp->ftgrp, - ftgrp->used ? "YES" : "NO", - ftgrp->included ? 
"YES" : "NO" )); - /* if this ftgrp has already been used or is not included, then - * skip it - */ - if (ftgrp->used || !ftgrp->included) { + if (ftgrp->used || !ftgrp->included) { + continue; + } + /* compute the load average on this fault group */ + totprocs = 0; + totnodes = 0; + for (k=0; k < ftgrp->nodes.size; k++) { + if (NULL == (node = (orte_node_t*)opal_pointer_array_get_item(&ftgrp->nodes, k))) { continue; } - /* compute the load average on this fault group */ - totprocs = 0; - totnodes = 0; - for (k=0; k < ftgrp->nodes.size; k++) { - if (NULL == (node = (orte_node_t*)opal_pointer_array_get_item(&ftgrp->nodes, k))) { - continue; - } - totnodes++; - totprocs += node->num_procs; - } - avgload = (float)totprocs / (float)totnodes; - if (avgload < minload) { - minload = avgload; - target = ftgrp; - OPAL_OUTPUT_VERBOSE((2, orte_rmaps_base.rmaps_output, - "%s rmaps:resilient: found new min load ftgrp %d", - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), - ftgrp->ftgrp)); - } + totnodes++; + totprocs += node->num_procs; } - /* if we have more procs than fault groups, then we simply - * map the remaining procs on available nodes in a round-robin - * fashion - it doesn't matter where they go as they will not - * be contributing to fault tolerance by definition - */ - if (NULL == target) { + avgload = (float)totprocs / (float)totnodes; + if (avgload < minload) { + minload = avgload; + target = ftgrp; OPAL_OUTPUT_VERBOSE((2, orte_rmaps_base.rmaps_output, - "%s rmaps:resilient: no available fault group - mapping rr", - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME))); - if (ORTE_SUCCESS != (rc = rr_map_default(jdata, app, &node_list, app->num_procs-num_assigned))) { - goto error; - } - goto cleanup; + "%s rmaps:resilient: found new min load ftgrp %d", + ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), + ftgrp->ftgrp)); } + } + /* if we have more procs than fault groups, then we simply + * map the remaining procs on available nodes in a round-robin + * fashion - it doesn't matter where they go as they will not + * be contributing to fault tolerance by definition + */ + if (NULL == target) { + OPAL_OUTPUT_VERBOSE((2, orte_rmaps_base.rmaps_output, + "%s rmaps:resilient: more procs than fault groups - mapping excess rr", + ORTE_NAME_PRINT(ORTE_PROC_MY_NAME))); + nd = (orte_node_t*)curitem; + curitem = opal_list_get_next(curitem); + if (curitem == opal_list_get_end(&node_list)) { + curitem = opal_list_get_first(&node_list); + } + } else { /* pick node with lowest load from within that group */ totprocs = 1000000; for (k=0; k < target->nodes.size; k++) { @@ -519,40 +690,35 @@ static int orte_rmaps_resilient_map(orte_job_t *jdata) nd = node; } } - OPAL_OUTPUT_VERBOSE((1, orte_rmaps_base.rmaps_output, - "%s rmaps:resilient: placing proc into fault group %d node %s", - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), - target->ftgrp, nd->name)); - /* put proc on that node */ - if (ORTE_SUCCESS != (rc = orte_rmaps_base_claim_slot(jdata, nd, jdata->map->cpus_per_rank, app->idx, - &node_list, jdata->map->oversubscribe, false, NULL))) { - /** if the code is ORTE_ERR_NODE_FULLY_USED, then we know this - * really isn't an error - */ - if (ORTE_ERR_NODE_FULLY_USED != OPAL_SOS_GET_ERROR_CODE(rc)) { - ORTE_ERROR_LOG(rc); - goto error; - } + } + OPAL_OUTPUT_VERBOSE((1, orte_rmaps_base.rmaps_output, + "%s rmaps:resilient: placing proc into fault group %d node %s", + ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), + (NULL == target) ? 
-1 : target->ftgrp, nd->name)); + /* put proc on that node */ + proc=NULL; + if (ORTE_SUCCESS != (rc = orte_rmaps_base_claim_slot(jdata, nd, jdata->map->cpus_per_rank, app->idx, + &node_list, jdata->map->oversubscribe, false, &proc))) { + /** if the code is ORTE_ERR_NODE_FULLY_USED, then we know this + * really isn't an error + */ + if (ORTE_ERR_NODE_FULLY_USED != rc) { + ORTE_ERROR_LOG(rc); + return rc; } - /* track number of procs mapped */ - num_assigned++; + } + /* flag the proc as ready for launch */ + proc->state = ORTE_PROC_STATE_INIT; + + /* track number of procs mapped */ + num_assigned++; - /* flag this fault group as used */ + /* flag this fault group as used */ + if (NULL != target) { target->used = true; } - } else { - /* if we don't have a fault group definition, then just map the - * procs in a round-robin manner - */ - OPAL_OUTPUT_VERBOSE((1, orte_rmaps_base.rmaps_output, - "%s rmaps:resilient: no fault groups provided - mapping rr", - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME))); - if (ORTE_SUCCESS != (rc = rr_map_default(jdata, app, &node_list, app->num_procs))) { - goto error; - } } - - cleanup: + /* track number of procs */ jdata->num_procs += app->num_procs; @@ -573,46 +739,20 @@ static int orte_rmaps_resilient_map(orte_job_t *jdata) } OBJ_DESTRUCT(&node_list); } - + /* compute and save local ranks */ if (ORTE_SUCCESS != (rc = orte_rmaps_base_compute_local_ranks(jdata))) { ORTE_ERROR_LOG(rc); return rc; } - /* define the daemons that we will use for this job */ - if (ORTE_SUCCESS != (rc = orte_rmaps_base_define_daemons(jdata->map))) { - ORTE_ERROR_LOG(rc); - return rc; + if (!(ORTE_MAPPING_USE_VM & jdata->map->policy)) { + /* define the daemons that we will use for this job */ + if (ORTE_SUCCESS != (rc = orte_rmaps_base_define_daemons(jdata->map))) { + ORTE_ERROR_LOG(rc); + return rc; + } } return ORTE_SUCCESS; - -error: - while (NULL != (item = opal_list_remove_first(&node_list))) { - OBJ_RELEASE(item); - } - OBJ_DESTRUCT(&node_list); - - return rc; } - -orte_rmaps_base_module_t orte_rmaps_resilient_module = { - orte_rmaps_resilient_map -}; - -static char *orte_getline(FILE *fp) -{ - char *ret, *buff; - char input[1024]; - - ret = fgets(input, 1024, fp); - if (NULL != ret) { - input[strlen(input)-1] = '\0'; /* remove newline */ - buff = strdup(input); - return buff; - } - - return NULL; -} - diff --git a/orte/mca/rmaps/resilient/rmaps_resilient_component.c b/orte/mca/rmaps/resilient/rmaps_resilient_component.c index 9fda19940a..67b03b644a 100644 --- a/orte/mca/rmaps/resilient/rmaps_resilient_component.c +++ b/orte/mca/rmaps/resilient/rmaps_resilient_component.c @@ -37,6 +37,8 @@ static int orte_rmaps_resilient_open(void); static int orte_rmaps_resilient_close(void); static int orte_rmaps_resilient_query(mca_base_module_t **module, int *priority); +static int my_priority; + orte_rmaps_res_component_t mca_rmaps_resilient_component = { { { @@ -72,21 +74,25 @@ static int orte_rmaps_resilient_open(void) mca_base_param_reg_string(c, "fault_grp_file", "Filename that contains a description of fault groups for this system", false, false, NULL, &mca_rmaps_resilient_component.fault_group_file); - + + mca_base_param_reg_int(c, "priority", + "Priority of the resilient rmaps component", + false, false, 40, + &my_priority); return ORTE_SUCCESS; } static int orte_rmaps_resilient_query(mca_base_module_t **module, int *priority) -{ - *priority = 0; /* select only if specified */ +{ + *priority = my_priority; *module = (mca_base_module_t *)&orte_rmaps_resilient_module; - /* if a fault group 
file was provided, we definitely want to be selected */ + /* if a fault group file was provided, we should be first */ if (NULL != mca_rmaps_resilient_component.fault_group_file) { *priority = 1000; } - + return ORTE_SUCCESS; } diff --git a/orte/mca/rmaps/rmaps_types.h b/orte/mca/rmaps/rmaps_types.h index 25fb3cb87c..f5bfed739f 100644 --- a/orte/mca/rmaps/rmaps_types.h +++ b/orte/mca/rmaps/rmaps_types.h @@ -33,6 +33,16 @@ BEGIN_C_DECLS +/* enumerate selectable mappers */ +enum { + ORTE_RMAPS_UNDEF, + ORTE_RMAPS_RR, + ORTE_RMAPS_LOADBALANCE, + ORTE_RMAPS_SEQ, + ORTE_RMAPS_RF, + ORTE_RMAPS_RESILIENT +}; + /* * Structure that represents the mapping of a job to an * allocated set of resources. @@ -40,6 +50,7 @@ BEGIN_C_DECLS struct orte_job_map_t { opal_object_t super; /* user-specified mapping params */ + int32_t mapper; orte_mapping_policy_t policy; int npernode; int nperboard; diff --git a/orte/mca/rmaps/round_robin/rmaps_rr.c b/orte/mca/rmaps/round_robin/rmaps_rr.c index 942b005828..4cbe20ae93 100644 --- a/orte/mca/rmaps/round_robin/rmaps_rr.c +++ b/orte/mca/rmaps/round_robin/rmaps_rr.c @@ -34,6 +34,7 @@ #include "orte/util/show_help.h" #include "orte/mca/errmgr/errmgr.h" +#include "orte/util/error_strings.h" #include "orte/mca/rmaps/base/rmaps_private.h" #include "orte/mca/rmaps/base/base.h" @@ -53,6 +54,29 @@ static int orte_rmaps_rr_map(orte_job_t *jdata) int rc; opal_list_item_t *cur_node_item; + /* this mapper can only handle initial launch + * when rr mapping is desired - allow + * restarting of failed apps + */ + if (ORTE_JOB_STATE_INIT != jdata->state) { + opal_output_verbose(5, orte_rmaps_base.rmaps_output, + "mca:rmaps:rr: not job %s in state %s - rr cannot map", + ORTE_JOBID_PRINT(jdata->jobid), + orte_job_state_to_str(jdata->state)); + return ORTE_ERR_TAKE_NEXT_OPTION; + } + if (0 < jdata->map->mapper && ORTE_RMAPS_RR != jdata->map->mapper) { + /* a mapper has been specified, and it isn't me */ + opal_output_verbose(5, orte_rmaps_base.rmaps_output, + "mca:rmaps:rr: job %s not using rr mapper", + ORTE_JOBID_PRINT(jdata->jobid)); + return ORTE_ERR_TAKE_NEXT_OPTION; + } + + opal_output_verbose(5, orte_rmaps_base.rmaps_output, + "mca:rmaps:rr: mapping job %s", + ORTE_JOBID_PRINT(jdata->jobid)); + /* start at the beginning... 
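
The round-robin hunk above opens with the admission check that every mapper in this patch now performs before doing any work: decline the job unless it is in its initial state and either no particular mapper was requested or this mapper is the requested one. The same check reappears in the seq mapper below. Boiled down, and with my_mapper_id standing in for the component's own value from the mapper enum in rmaps_types.h (e.g. ORTE_RMAPS_RR), the gate is the sketch below.

/* Sketch of the shared admission check; my_mapper_id is the caller's
 * own value from the mapper enum in rmaps_types.h. */
static int mapper_accepts_job(orte_job_t *jdata, int32_t my_mapper_id)
{
    /* only initial launches are handled - restarts are left to others */
    if (ORTE_JOB_STATE_INIT != jdata->state) {
        return ORTE_ERR_TAKE_NEXT_OPTION;
    }
    /* a specific mapper was requested and it is not this one */
    if (0 < jdata->map->mapper && my_mapper_id != jdata->map->mapper) {
        return ORTE_ERR_TAKE_NEXT_OPTION;
    }
    return ORTE_SUCCESS;   /* go ahead and map the job */
}

Returning ORTE_ERR_TAKE_NEXT_OPTION rather than a hard error is what lets the framework move on to the next active mapper instead of aborting the launch.
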
*/ jdata->num_procs = 0; @@ -88,7 +112,7 @@ static int orte_rmaps_rr_map(orte_job_t *jdata) /* if a bookmark exists from some prior mapping, set us to start there */ cur_node_item = orte_rmaps_base_get_starting_point(&node_list, jdata); - if (0 == app->num_procs) { + if (0 == app->num_procs) { /* set the num_procs to equal the number of slots on these mapped nodes */ app->num_procs = num_slots; } @@ -138,7 +162,7 @@ static int orte_rmaps_rr_map(orte_job_t *jdata) return ORTE_SUCCESS; -error: + error: while(NULL != (item = opal_list_remove_first(&node_list))) { OBJ_RELEASE(item); } diff --git a/orte/mca/rmaps/round_robin/rmaps_rr_component.c b/orte/mca/rmaps/round_robin/rmaps_rr_component.c index 2698ae55fb..05d3cc94a2 100644 --- a/orte/mca/rmaps/round_robin/rmaps_rr_component.c +++ b/orte/mca/rmaps/round_robin/rmaps_rr_component.c @@ -33,6 +33,7 @@ static int orte_rmaps_round_robin_open(void); static int orte_rmaps_round_robin_close(void); static int orte_rmaps_round_robin_query(mca_base_module_t **module, int *priority); +static int my_priority; orte_rmaps_base_component_t mca_rmaps_round_robin_component = { { @@ -58,6 +59,12 @@ orte_rmaps_base_component_t mca_rmaps_round_robin_component = { */ static int orte_rmaps_round_robin_open(void) { + mca_base_component_t *c = &mca_rmaps_round_robin_component.base_version; + + mca_base_param_reg_int(c, "priority", + "Priority of the rr rmaps component", + false, false, 100, + &my_priority); return ORTE_SUCCESS; } @@ -68,7 +75,7 @@ static int orte_rmaps_round_robin_query(mca_base_module_t **module, int *priorit * so no need to check for that here */ - *priority = 70; /* this is the default mapper */ + *priority = my_priority; *module = (mca_base_module_t *)&orte_rmaps_round_robin_module; return ORTE_SUCCESS; } diff --git a/orte/mca/rmaps/seq/rmaps_seq.c b/orte/mca/rmaps/seq/rmaps_seq.c index e9bb8a11cb..7631d1ea2b 100644 --- a/orte/mca/rmaps/seq/rmaps_seq.c +++ b/orte/mca/rmaps/seq/rmaps_seq.c @@ -76,6 +76,28 @@ static int orte_rmaps_seq_map(orte_job_t *jdata) ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), ORTE_JOBID_PRINT(jdata->jobid))); + /* this mapper can only handle initial launch + * when seq mapping is desired - allow + * restarting of failed apps + */ + if (ORTE_JOB_STATE_INIT != jdata->state) { + opal_output_verbose(5, orte_rmaps_base.rmaps_output, + "mca:rmaps:seq: not job %s not in initial state - seq cannot map", + ORTE_JOBID_PRINT(jdata->jobid)); + return ORTE_ERR_TAKE_NEXT_OPTION; + } + if (0 < jdata->map->mapper && ORTE_RMAPS_SEQ != jdata->map->mapper) { + /* a mapper has been specified, and it isn't me */ + opal_output_verbose(5, orte_rmaps_base.rmaps_output, + "mca:rmaps:seq: job %s not using sequential mapper", + ORTE_JOBID_PRINT(jdata->jobid)); + return ORTE_ERR_TAKE_NEXT_OPTION; + } + opal_output_verbose(5, orte_rmaps_base.rmaps_output, + "mca:rmaps:seq: mapping job %s", + ORTE_JOBID_PRINT(jdata->jobid)); + + /* conveniece def */ map = jdata->map; diff --git a/orte/mca/rmaps/seq/rmaps_seq_component.c b/orte/mca/rmaps/seq/rmaps_seq_component.c index 675be77133..baec6b39aa 100644 --- a/orte/mca/rmaps/seq/rmaps_seq_component.c +++ b/orte/mca/rmaps/seq/rmaps_seq_component.c @@ -33,6 +33,7 @@ static int orte_rmaps_seq_open(void); static int orte_rmaps_seq_close(void); static int orte_rmaps_seq_query(mca_base_module_t **module, int *priority); +static int my_priority; orte_rmaps_base_component_t mca_rmaps_seq_component = { { @@ -58,17 +59,19 @@ orte_rmaps_base_component_t mca_rmaps_seq_component = { */ static int orte_rmaps_seq_open(void) { + 
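
With the hard-coded priorities gone, every rmaps component follows the same open/query pattern: register an integer "priority" MCA parameter at open time (100 for rr above, 60 for seq below, 40 for resilient, which raises itself to 1000 when a fault-group file is given) and hand that value back from query. Stripped of component specifics, the pattern looks like the sketch below; "foo" and the default of 50 are placeholders only.

static int my_priority;

static int orte_rmaps_foo_open(void)
{
    mca_base_component_t *c = &mca_rmaps_foo_component.base_version;

    /* lets the user reorder mappers via the rmaps_foo_priority param */
    mca_base_param_reg_int(c, "priority",
                           "Priority of the foo rmaps component",
                           false, false, 50, &my_priority);
    return ORTE_SUCCESS;
}

static int orte_rmaps_foo_query(mca_base_module_t **module, int *priority)
{
    *priority = my_priority;
    *module = (mca_base_module_t *)&orte_rmaps_foo_module;
    return ORTE_SUCCESS;
}
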
mca_base_component_t *c = &mca_rmaps_seq_component.base_version; + + mca_base_param_reg_int(c, "priority", + "Priority of the seq rmaps component", + false, false, 60, + &my_priority); return ORTE_SUCCESS; } static int orte_rmaps_seq_query(mca_base_module_t **module, int *priority) { - /* the RMAPS framework is -only- opened on HNP's, - * so no need to check for that here - */ - - *priority = 0; /* only select if specified */ + *priority = my_priority; *module = (mca_base_module_t *)&orte_rmaps_seq_module; return ORTE_SUCCESS; } diff --git a/orte/mca/rmaps/topo/.windows b/orte/mca/rmaps/topo/.windows deleted file mode 100644 index aa7d7bbbe5..0000000000 --- a/orte/mca/rmaps/topo/.windows +++ /dev/null @@ -1,12 +0,0 @@ -# -# Copyright (c) 2008-2010 High Performance Computing Center Stuttgart, -# University of Stuttgart. All rights reserved. -# $COPYRIGHT$ -# -# Additional copyrights may follow -# -# $HEADER$ -# - -# Specific to this module -mca_link_libraries=libopen-rte diff --git a/orte/mca/rmaps/topo/Makefile.am b/orte/mca/rmaps/topo/Makefile.am deleted file mode 100644 index 2f33b20cdb..0000000000 --- a/orte/mca/rmaps/topo/Makefile.am +++ /dev/null @@ -1,48 +0,0 @@ -# -# Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana -# University Research and Technology -# Corporation. All rights reserved. -# Copyright (c) 2004-2005 The University of Tennessee and The University -# of Tennessee Research Foundation. All rights -# reserved. -# Copyright (c) 2004-2009 High Performance Computing Center Stuttgart, -# University of Stuttgart. All rights reserved. -# Copyright (c) 2004-2005 The Regents of the University of California. -# All rights reserved. -# Copyright (c) 2010 Cisco Systems, Inc. All rights reserved. -# $COPYRIGHT$ -# -# Additional copyrights may follow -# -# $HEADER$ -# - -EXTRA_DIST = .windows - -dist_pkgdata_DATA = help-orte-rmaps-topo.txt - -sources = \ - rmaps_topo.c \ - rmaps_topo.h \ - rmaps_topo_component.c - -# Make the output library in this directory, and name it either -# mca__.la (for DSO builds) or libmca__.la -# (for static builds). - -if MCA_BUILD_orte_rmaps_topo_DSO -component_noinst = -component_install = mca_rmaps_topo.la -else -component_noinst = libmca_rmaps_topo.la -component_install = -endif - -mcacomponentdir = $(pkglibdir) -mcacomponent_LTLIBRARIES = $(component_install) -mca_rmaps_topo_la_SOURCES = $(sources) -mca_rmaps_topo_la_LDFLAGS = -module -avoid-version - -noinst_LTLIBRARIES = $(component_noinst) -libmca_rmaps_topo_la_SOURCES =$(sources) -libmca_rmaps_topo_la_LDFLAGS = -module -avoid-version diff --git a/orte/mca/rmaps/topo/help-orte-rmaps-topo.txt b/orte/mca/rmaps/topo/help-orte-rmaps-topo.txt deleted file mode 100644 index dd660389ac..0000000000 --- a/orte/mca/rmaps/topo/help-orte-rmaps-topo.txt +++ /dev/null @@ -1,53 +0,0 @@ -# -*- text -*- -# -# Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana -# University Research and Technology -# Corporation. All rights reserved. -# Copyright (c) 2004-2005 The University of Tennessee and The University -# of Tennessee Research Foundation. All rights -# reserved. -# Copyright (c) 2004-2005 High Performance Computing Center Stuttgart, -# University of Stuttgart. All rights reserved. -# Copyright (c) 2004-2005 The Regents of the University of California. -# All rights reserved. -# $COPYRIGHT$ -# -# Additional copyrights may follow -# -# $HEADER$ -# -# This is the US/English general help file for Open RTE's orterun. 
-# -[orte-rmaps-topo:alloc-error] -There are not enough slots available in the system to satisfy the %d slots -that were requested by the application: - %s - -Either request fewer slots for your application, or make more slots available -for use. -[orte-rmaps-topo:multi-apps-and-zero-np] -RMAPS found multiple applications to be launched, with -at least one that failed to specify the number of processes to execute. -When specifying multiple applications, you must specify how many processes -of each to launch via the -np argument. - -[orte-rmaps-topo:per-node-and-too-many-procs] -There are not enough nodes in your allocation to satisfy your request to launch -%d processes on a per-node basis - only %d nodes were available. - -Either request fewer processes, or obtain a larger allocation. -[orte-rmaps-topo:n-per-node-and-too-many-procs] -There are not enough nodes in your allocation to satisfy your request to launch -%d processes on a %d per-node basis - only %d nodes with a total of %d slots were available. - -Either request fewer processes, or obtain a larger allocation. -[orte-rmaps-topo:n-per-node-and-not-enough-slots] -There are not enough slots on the nodes in your allocation to satisfy your request to launch on a %d process-per-node basis - only %d slots/node were available. - -Either request fewer processes/node, or obtain a larger allocation. - -[orte-rmaps-topo:no-np-and-user-map] -You have specified a rank-to-node/slot mapping, but failed to provide -the number of processes to be executed. For some reason, this information -could not be obtained from the mapping you provided, so we cannot continue -with executing the specified application. diff --git a/orte/mca/rmaps/topo/rmaps_topo.c b/orte/mca/rmaps/topo/rmaps_topo.c deleted file mode 100644 index aa468538c5..0000000000 --- a/orte/mca/rmaps/topo/rmaps_topo.c +++ /dev/null @@ -1,546 +0,0 @@ -/* - * Copyright (c) 2004-2007 The Trustees of Indiana University and Indiana - * University Research and Technology - * Corporation. All rights reserved. - * Copyright (c) 2004-2006 The University of Tennessee and The University - * of Tennessee Research Foundation. All rights - * reserved. - * Copyright (c) 2004-2005 High Performance Computing Center Stuttgart, - * University of Stuttgart. All rights reserved. - * Copyright (c) 2004-2005 The Regents of the University of California. - * All rights reserved. - * Copyright (c) 2006 Cisco Systems, Inc. All rights reserved. - * $COPYRIGHT$ - * - * Additional copyrights may follow - * - * $HEADER$ - */ - -#include "orte_config.h" -#include "orte/constants.h" -#include "orte/types.h" - -#include -#ifdef HAVE_UNISTD_H -#include -#endif /* HAVE_UNISTD_H */ -#ifdef HAVE_STRING_H -#include -#endif /* HAVE_STRING_H */ - -#include "opal/mca/base/mca_base_param.h" -#include "opal/util/trace.h" -#include "opal/util/opal_sos.h" -#include "opal/mca/carto/base/base.h" - -#include "orte/util/show_help.h" -#include "orte/mca/errmgr/errmgr.h" - -#include "orte/mca/rmaps/base/rmaps_private.h" -#include "orte/mca/rmaps/base/base.h" -#include "rmaps_topo.h" - -static int topo_map(orte_job_t *jdata); - -orte_rmaps_base_module_t orte_rmaps_topo_module = { - topo_map -}; - -/* - * Local variable - */ -static opal_list_item_t *cur_node_item = NULL; -static int ppn = 0; - -/* - * Create a default mapping for the application, scheduling round - * robin by node. 
- */ -static int map_app_by_node( - orte_app_context_t* app, - orte_job_t* jdata, - orte_vpid_t vpid_start, - opal_list_t* nodes) -{ - int rc = ORTE_SUCCESS; - opal_list_item_t *next; - orte_node_t *node; - orte_std_cntr_t num_alloc=0; - - OPAL_TRACE(2); - - /* This loop continues until all procs have been mapped or we run - out of resources. We determine that we have "run out of - resources" when all nodes have slots_max processes mapped to them, - thus there are no free slots for a process to be mapped, or we have - hit the soft limit on all nodes and are in a "no oversubscribe" state. - If we still have processes that haven't been mapped yet, then it's an - "out of resources" error. - - In this scenario, we rely on the claim_slot function to handle the - oversubscribed case. The claim_slot function will leave a node on the - list until it either reaches slots_max OR reaches the - soft limit and the "no_oversubscribe" flag has been set - at which point, - the node will be removed to prevent any more processes from being mapped to - it. Since we are taking one slot from each node as we cycle through, the - list, oversubscription is automatically taken care of via this logic. - */ - - while (num_alloc < app->num_procs) { - - /** see if any nodes remain unused and available. We need to do this check - * each time since we may remove nodes from the list (as they become fully - * used) as we cycle through the loop */ - if(0 >= opal_list_get_size(nodes) ) { - /* No more nodes to allocate :( */ - orte_show_help("help-orte-rmaps-topo.txt", "orte-rmaps-topo:alloc-error", - true, app->num_procs, app->app); - return ORTE_ERR_SILENT; - } - - /* Save the next node we can use before claiming slots, since - * we may need to prune the nodes list removing overused nodes. - * Wrap around to beginning if we are at the end of the list */ - if (opal_list_get_end(nodes) == opal_list_get_next(cur_node_item)) { - next = opal_list_get_first(nodes); - } - else { - next = opal_list_get_next(cur_node_item); - } - - /* Allocate a slot on this node */ - node = (orte_node_t*) cur_node_item; - if (ORTE_SUCCESS != (rc = orte_rmaps_base_claim_slot(jdata, node, 1, app->idx, - nodes, jdata->map->oversubscribe, true, NULL))) { - /** if the code is ORTE_ERR_NODE_FULLY_USED, then we know this - * really isn't an error - we just need to break from the loop - * since the node is fully used up. For now, just don't report - * an error - */ - if (ORTE_ERR_NODE_FULLY_USED != OPAL_SOS_GET_ERROR_CODE(rc)) { - ORTE_ERROR_LOG(rc); - return rc; - } - } - - ++num_alloc; - - cur_node_item = next; - } - - return ORTE_SUCCESS; -} - - -/* - * Create a default mapping for the application, scheduling one round - * robin by slot. - */ -static int map_app_by_slot( - orte_app_context_t* app, - orte_job_t* jdata, - orte_vpid_t vpid_start, - opal_list_t* nodes) -{ - int rc = ORTE_SUCCESS; - orte_std_cntr_t i, num_slots_to_take; - orte_node_t *node; - opal_list_item_t *next; - orte_std_cntr_t num_alloc=0; - - OPAL_TRACE(2); - - /* This loop continues until all procs have been mapped or we run - out of resources. We determine that we have "run out of - resources" when either all nodes have slots_max processes mapped to them, - (thus there are no free slots for a process to be mapped), OR all nodes - have reached their soft limit and the user directed us to "no oversubscribe". - If we still have processes that haven't been mapped yet, then it's an - "out of resources" error. 
*/ - - while ( num_alloc < app->num_procs) { - - /** see if any nodes remain unused and available. We need to do this check - * each time since we may remove nodes from the list (as they become fully - * used) as we cycle through the loop */ - if(0 >= opal_list_get_size(nodes) ) { - /* Everything is at max usage! :( */ - orte_show_help("help-orte-rmaps-topo.txt", "orte-rmaps-topo:alloc-error", - true, app->num_procs, app->app); - return ORTE_ERR_SILENT; - } - - /* Save the next node we can use before claiming slots, since - * we may need to prune the nodes list removing overused nodes. - * Wrap around to beginning if we are at the end of the list */ - if (opal_list_get_end(nodes) == opal_list_get_next(cur_node_item)) { - next = opal_list_get_first(nodes); - } - else { - next = opal_list_get_next(cur_node_item); - } - - /** declare a shorter name for convenience in the code below */ - node = (orte_node_t*) cur_node_item; - - /* If we have available slots on this node, claim all of them - * If node_slots == 0, assume 1 slot for that node. - * JJH - is this assumption fully justified? - * - * If we are now oversubscribing the nodes, then we still take: - * (a) if the node has not been used yet, we take a full node_slots - * (b) if some of the slots are in-use, then we take the number of - * remaining slots before hitting the soft limit (node_slots) - * (c) if we are at or above the soft limit, we take a full node_slots - * - * Note: if node_slots is zero, then we always just take 1 slot - * - * We continue this process until either everything is done, - * or all nodes have hit their hard limit. This algorithm ensures we - * fully utilize each node before oversubscribing, and preserves the ratio - * of processes between the nodes thereafter (e.g., if one node has twice as - * many processes as another before oversubscribing, it will continue - * to do so after oversubscribing). - */ - if (0 == node->slots_inuse || - node->slots_inuse >= node->slots) { - num_slots_to_take = (node->slots == 0) ? 1 : node->slots; - } else { - num_slots_to_take = node->slots - node->slots_inuse; - } - - /* check if we are in npernode mode - if so, then set the num_slots_to_take - * to the num_per_node - */ - if (0 < jdata->map->npernode) { - num_slots_to_take = jdata->map->npernode; - } - - for( i = 0; i < num_slots_to_take; ++i) { - if (ORTE_SUCCESS != (rc = orte_rmaps_base_claim_slot(jdata, node, 1, app->idx, - nodes, jdata->map->oversubscribe, true, NULL))) { - /** if the code is ORTE_ERR_NODE_FULLY_USED, then we know this - * really isn't an error - we just need to break from the loop - * since the node is fully used up. 
For now, just don't report - * an error - */ - if (ORTE_ERR_NODE_FULLY_USED != OPAL_SOS_GET_ERROR_CODE(rc)) { - ORTE_ERROR_LOG(rc); - return rc; - } - } - - /* Update the number of procs allocated */ - ++num_alloc; - - /** if all the procs have been mapped, we return */ - if (num_alloc == app->num_procs) { - return ORTE_SUCCESS; - } - - /* if we have fully used up this node - * OR we are at our ppn and loadbalancing, then break from the loop - */ - if (ORTE_ERR_NODE_FULLY_USED == OPAL_SOS_GET_ERROR_CODE(rc) || - (orte_rmaps_base.loadbalance && (int)node->num_procs >= ppn)) { - break; - } - } - - /* we move on to the next node in all cases EXCEPT if we came - * out of the loop without having taken a full bite AND the - * node is NOT max'd out - * - */ - if (i < (num_slots_to_take-1) && - ORTE_ERR_NODE_FULLY_USED != OPAL_SOS_GET_ERROR_CODE(rc) && - (orte_rmaps_base.loadbalance && (int)node->num_procs < ppn)) { - continue; - } - cur_node_item = next; - } - - return ORTE_SUCCESS; -} - - -/* - * Create a topo-aware mapping for the job. - */ -static int topo_map(orte_job_t *jdata) -{ - orte_job_map_t *map; - orte_app_context_t *app; - int i; - opal_list_t node_list; - opal_list_item_t *item; - orte_node_t *node, *nd1; - orte_vpid_t vpid_start; - orte_std_cntr_t num_nodes, num_slots; - int rc; - orte_std_cntr_t slots_per_node; - opal_carto_graph_t *graph; - opal_carto_base_node_t *crnode; - opal_value_array_t distance; - - OPAL_TRACE(1); - - /* conveniece def */ - map = jdata->map; - - /* start at the beginning... */ - vpid_start = 0; - jdata->num_procs = 0; - - /* get the graph of nodes */ - if (ORTE_SUCCESS != (rc = opal_carto_base_get_host_graph(&graph, "SLOT"))) { - ORTE_ERROR_LOG(rc); - return rc; - } - - /* cycle through the app_contexts, mapping them sequentially */ - for(i=0; i < jdata->apps->size; i++) { - if (NULL == (app = (orte_app_context_t*)opal_pointer_array_get_item(jdata->apps, i))) { - continue; - } - - /* if the number of processes wasn't specified, then we know there can be only - * one app_context allowed in the launch, and that we are to launch it across - * all available slots. 
We'll double-check the single app_context rule first - */ - if (0 == app->num_procs && 1 < jdata->num_apps) { - orte_show_help("help-orte-rmaps-rr.txt", "orte-rmaps-rr:multi-apps-and-zero-np", - true, jdata->num_apps, NULL); - rc = ORTE_ERR_SILENT; - goto error; - } - - /* for each app_context, we have to get the list of nodes that it can - * use since that can now be modified with a hostfile and/or -host - * option - */ - OBJ_CONSTRUCT(&node_list, opal_list_t); - if(ORTE_SUCCESS != (rc = orte_rmaps_base_get_target_nodes(&node_list, &num_slots, app, - map->policy))) { - ORTE_ERROR_LOG(rc); - goto error; - } - num_nodes = (orte_std_cntr_t)opal_list_get_size(&node_list); - - /* if a bookmark exists from some prior mapping, set us to start there */ - if (NULL != jdata->bookmark) { - cur_node_item = NULL; - /* find this node on the list */ - for (item = opal_list_get_first(&node_list); - item != opal_list_get_end(&node_list); - item = opal_list_get_next(item)) { - node = (orte_node_t*)item; - - if (node->index == jdata->bookmark->index) { - cur_node_item = item; - break; - } - } - /* see if we found it - if not, just start at the beginning */ - if (NULL == cur_node_item) { - cur_node_item = opal_list_get_first(&node_list); - } - } else { - /* if no bookmark, then just start at the beginning of the list */ - cur_node_item = opal_list_get_first(&node_list); - } - - /* order this list by network nearness - i.e., the next item in the - * list should be the node that is closest [in a network sense] to - * the prior item in the list - * - * RHC: start the list with the bookmark nodeas this is where - * we would start mapping - */ - node = (orte_node_t*)cur_node_item; - if (NULL == (crnode = opal_carto_base_find_node(graph, node->name))) { - ORTE_ERROR_LOG(ORTE_ERR_NOT_FOUND); - rc = ORTE_ERR_NOT_FOUND; - goto error; - } - OBJ_CONSTRUCT(&distance, opal_value_array_t); - if (ORTE_SUCCESS != (rc = opal_carto_base_get_nodes_distance(graph, crnode, - "SLOT", &distance))) { - ORTE_ERROR_LOG(rc); - goto error; - } - /* cycle through the nodes in the distance array - these - * should be in order based on distance - */ -#if 0 - /* RHC: need to create a working list of nodes that is ordered - * according to distance. The get_nodes_distance function returns - * this, but it covers -all- nodes, so we have to filter that - * against the allocated node list to create the new - * working_node_list - */ - for (i=0; i < distance.size; i++) { - if - } - for (item = opal_list_get_first(&node_list); - item != opal_list_get_end(&node_list); - item = opal_list_get_next(item)) { - node = (orte_node_t*)item; - - if (NULL == (crnode = opal_carto.find_node(graph, node->name))) { - ORTE_ERROR_LOG(ORTE_ERR_NOT_FOUND); - rc = ORTE_ERR_NOT_FOUND; - goto error; - } - - /* look this node up in the distance array */ -#endif - - /* is this node oversubscribed? */ - node = (orte_node_t*)cur_node_item; - if (node->slots_inuse > node->slots) { - /* work down the list - is there another node that - * would not be oversubscribed? - */ - if (cur_node_item != opal_list_get_end(&node_list)) { - item = opal_list_get_next(cur_node_item); - } else { - item = opal_list_get_first(&node_list); - } - while (item != cur_node_item) { - nd1 = (orte_node_t*)item; - if (nd1->slots_inuse < nd1->slots) { - /* this node is not oversubscribed! use it! 
*/ - cur_node_item = item; - goto proceed; - } - if (item == opal_list_get_end(&node_list)) { - item = opal_list_get_first(&node_list); - } else { - item= opal_list_get_next(item); - } - } - /* if we get here, then we cycled all the way around the - * list without finding a better answer - just use what - * we have - */ - } - - proceed: - if (map->npernode == 1) { - /* there are three use-cases that we need to deal with: - * (a) if -np was not provided, then we just use the number of nodes - * (b) if -np was provided AND #procs > #nodes, then error out - * (c) if -np was provided AND #procs <= #nodes, then launch - * the specified #procs one/node. In this case, we just - * leave app->num_procs alone - */ - if (0 == app->num_procs) { - app->num_procs = num_nodes; - } else if (app->num_procs > num_nodes) { - orte_show_help("help-orte-rmaps-rr.txt", "orte-rmaps-rr:per-node-and-too-many-procs", - true, app->num_procs, num_nodes, NULL); - rc = ORTE_ERR_SILENT; - goto error; - } - } else if (map->npernode > 1) { - /* first, let's check to see if there are enough slots/node to - * meet the request - error out if not - */ - slots_per_node = num_slots / num_nodes; - if (map->npernode > slots_per_node) { - orte_show_help("help-orte-rmaps-rr.txt", "orte-rmaps-rr:n-per-node-and-not-enough-slots", - true, map->npernode, slots_per_node, NULL); - rc = ORTE_ERR_SILENT; - goto error; - } - /* there are three use-cases that we need to deal with: - * (a) if -np was not provided, then we just use the n/node * #nodes - * (b) if -np was provided AND #procs > (n/node * #nodes), then error out - * (c) if -np was provided AND #procs <= (n/node * #nodes), then launch - * the specified #procs n/node. In this case, we just - * leave app->num_procs alone - */ - if (0 == app->num_procs) { - /* set the num_procs to equal the specified num/node * the number of nodes */ - app->num_procs = map->npernode * num_nodes; - } else if (app->num_procs > (map->npernode * num_nodes)) { - orte_show_help("help-orte-rmaps-rr.txt", "orte-rmaps-rr:n-per-node-and-too-many-procs", - true, app->num_procs, map->npernode, num_nodes, num_slots, NULL); - rc = ORTE_ERR_SILENT; - goto error; - } - } else if (0 == app->num_procs) { - /** set the num_procs to equal the number of slots on these mapped nodes - if - user has specified "-bynode", then set it to the number of nodes - */ - if (map->policy & ORTE_MAPPING_BYNODE) { - app->num_procs = num_nodes; - } else if (map->policy & ORTE_MAPPING_BYSLOT) { - app->num_procs = num_slots; - } else { - /* we can't handle this - it should have been set when we got - * the map info. 
If it wasn't, then we can only error out - */ - orte_show_help("help-orte-rmaps-rr.txt", "orte-rmaps-rr:no-np-and-user-map", - true, app->num_procs, map->npernode, num_nodes, num_slots, NULL); - rc = ORTE_ERR_SILENT; - goto error; - } - } - - /** track the total number of processes we mapped */ - jdata->num_procs += app->num_procs; - - /* Make assignments */ - if (map->policy == ORTE_MAPPING_BYNODE) { - rc = map_app_by_node(app, jdata, vpid_start, &node_list); - } else { - rc = map_app_by_slot(app, jdata, vpid_start, &node_list); - } - - /* update the starting vpid for the next app_context */ - vpid_start += app->num_procs; - - if (ORTE_SUCCESS != rc) { - ORTE_ERROR_LOG(rc); - goto error; - } - - /* save the bookmark */ - jdata->bookmark = (orte_node_t*)cur_node_item; - - /* cleanup the node list - it can differ from one app_context - * to another, so we have to get it every time - */ - while(NULL != (item = opal_list_remove_first(&node_list))) { - OBJ_RELEASE(item); - } - OBJ_DESTRUCT(&node_list); - } - - /* compute and save convenience values */ - if (ORTE_SUCCESS != (rc = orte_rmaps_base_compute_local_ranks(jdata))) { - ORTE_ERROR_LOG(rc); - return rc; - } - - /* define the daemons that we will use for this job */ - if (ORTE_SUCCESS != (rc = orte_rmaps_base_define_daemons(map))) { - ORTE_ERROR_LOG(rc); - return rc; - } - - return ORTE_SUCCESS; - -error: - while(NULL != (item = opal_list_remove_first(&node_list))) { - OBJ_RELEASE(item); - } - OBJ_DESTRUCT(&node_list); - - return rc; -} - - diff --git a/orte/mca/rmaps/topo/rmaps_topo.h b/orte/mca/rmaps/topo/rmaps_topo.h deleted file mode 100644 index 5440bf15ed..0000000000 --- a/orte/mca/rmaps/topo/rmaps_topo.h +++ /dev/null @@ -1,37 +0,0 @@ -/* - * Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana - * University Research and Technology - * Corporation. All rights reserved. - * Copyright (c) 2004-2006 The University of Tennessee and The University - * of Tennessee Research Foundation. All rights - * reserved. - * Copyright (c) 2004-2005 High Performance Computing Center Stuttgart, - * University of Stuttgart. All rights reserved. - * Copyright (c) 2004-2005 The Regents of the University of California. - * All rights reserved. - * $COPYRIGHT$ - * - * Additional copyrights may follow - * - * $HEADER$ - */ -/** - * @file - * - * Resource Mapping - */ -#ifndef ORTE_RMAPS_TOPO_H -#define ORTE_RMAPS_TOPO_H - -#include "orte_config.h" -#include "orte/mca/rmaps/rmaps.h" - -BEGIN_C_DECLS - -ORTE_MODULE_DECLSPEC extern orte_rmaps_base_component_t mca_rmaps_topo_component; -extern orte_rmaps_base_module_t orte_rmaps_topo_module; - - -END_C_DECLS - -#endif diff --git a/orte/mca/rmaps/topo/rmaps_topo_component.c b/orte/mca/rmaps/topo/rmaps_topo_component.c deleted file mode 100644 index 0e26c5043a..0000000000 --- a/orte/mca/rmaps/topo/rmaps_topo_component.c +++ /dev/null @@ -1,85 +0,0 @@ -/* - * Copyright (c) 2004-2008 The Trustees of Indiana University and Indiana - * University Research and Technology - * Corporation. All rights reserved. - * Copyright (c) 2004-2005 The University of Tennessee and The University - * of Tennessee Research Foundation. All rights - * reserved. - * Copyright (c) 2004-2005 High Performance Computing Center Stuttgart, - * University of Stuttgart. All rights reserved. - * Copyright (c) 2004-2005 The Regents of the University of California. - * All rights reserved. 
- * $COPYRIGHT$ - * - * Additional copyrights may follow - * - * $HEADER$ - */ - -#include "orte_config.h" -#include "orte/constants.h" - -#include "opal/mca/base/base.h" -#include "opal/mca/base/mca_base_param.h" - -#include "orte/mca/rmaps/base/rmaps_private.h" -#include "rmaps_topo.h" - -/* - * Local functions - */ - -static int orte_rmaps_topo_open(void); -static int orte_rmaps_topo_close(void); -static int orte_rmaps_topo_query(mca_base_module_t **module, int *priority); - - -orte_rmaps_base_component_t mca_rmaps_topo_component = { - { - ORTE_RMAPS_BASE_VERSION_2_0_0, - - "topo", /* MCA component name */ - ORTE_MAJOR_VERSION, /* MCA component major version */ - ORTE_MINOR_VERSION, /* MCA component minor version */ - ORTE_RELEASE_VERSION, /* MCA component release version */ - orte_rmaps_topo_open, /* component open */ - orte_rmaps_topo_close, /* component close */ - orte_rmaps_topo_query /* component query */ - }, - { - /* The component is checkpoint ready */ - MCA_BASE_METADATA_PARAM_CHECKPOINT - } -}; - - -/** - * component open/close/init function - */ -static int orte_rmaps_topo_open(void) -{ - return ORTE_SUCCESS; -} - - -static int orte_rmaps_topo_query(mca_base_module_t **module, int *priority) -{ - /* the RMAPS framework is -only- opened on HNP's, - * so no need to check for that here - */ - - *priority = 0; /* only select if specified */ - *module = (mca_base_module_t *)&orte_rmaps_topo_module; - return ORTE_SUCCESS; -} - -/** - * Close all subsystems. - */ - -static int orte_rmaps_topo_close(void) -{ - return ORTE_SUCCESS; -} - - diff --git a/orte/runtime/data_type_support/orte_dt_packing_fns.c b/orte/runtime/data_type_support/orte_dt_packing_fns.c index 96734d4e68..ff37d6e0f0 100644 --- a/orte/runtime/data_type_support/orte_dt_packing_fns.c +++ b/orte/runtime/data_type_support/orte_dt_packing_fns.c @@ -860,6 +860,12 @@ int orte_dt_pack_map(opal_buffer_t *buffer, const void *src, maps = (orte_job_map_t**) src; for (i=0; i < num_vals; i++) { + /* pack the mapper used to generate it */ + if (ORTE_SUCCESS != (rc = opal_dss_pack_buffer(buffer, &(maps[i]->mapper), 1, OPAL_INT32))) { + ORTE_ERROR_LOG(rc); + return rc; + } + /* pack the policy used to generate it */ if (ORTE_SUCCESS != (rc = opal_dss_pack_buffer(buffer, &(maps[i]->policy), 1, ORTE_MAPPING_POLICY))) { ORTE_ERROR_LOG(rc); diff --git a/orte/runtime/data_type_support/orte_dt_print_fns.c b/orte/runtime/data_type_support/orte_dt_print_fns.c index 46a852a7f0..a5ead949b1 100644 --- a/orte/runtime/data_type_support/orte_dt_print_fns.c +++ b/orte/runtime/data_type_support/orte_dt_print_fns.c @@ -650,8 +650,8 @@ int orte_dt_print_map(char **output, char *prefix, orte_job_map_t *src, opal_dat asprintf(&pfx, "%s\t", pfx2); if (orte_devel_level_output) { - asprintf(&tmp, "\n%sMap generated by mapping policy: %04x\n%s\tNpernode: %ld\tOversubscribe allowed: %s\tCPU Lists: %s", - pfx2, src->policy, pfx2, (long)src->npernode, + asprintf(&tmp, "\n%sMap generated by mapper: %d\tMapping policy: %04x\n%s\tNpernode: %ld\tOversubscribe allowed: %s\tCPU Lists: %s", + pfx2, src->mapper, src->policy, pfx2, (long)src->npernode, (src->oversubscribe) ? "TRUE" : "FALSE", (src->cpu_lists) ? 
"TRUE" : "FALSE"); diff --git a/orte/runtime/data_type_support/orte_dt_unpacking_fns.c b/orte/runtime/data_type_support/orte_dt_unpacking_fns.c index 8202b05ff2..6291cac0c7 100644 --- a/orte/runtime/data_type_support/orte_dt_unpacking_fns.c +++ b/orte/runtime/data_type_support/orte_dt_unpacking_fns.c @@ -938,6 +938,14 @@ int orte_dt_unpack_map(opal_buffer_t *buffer, void *dest, return ORTE_ERR_OUT_OF_RESOURCE; } + /* unpack the mapper */ + n = 1; + if (ORTE_SUCCESS != (rc = opal_dss_unpack_buffer(buffer, + &(maps[i]->mapper), &n, OPAL_INT32))) { + ORTE_ERROR_LOG(rc); + return rc; + } + /* unpack the policy */ n = 1; if (ORTE_SUCCESS != (rc = opal_dss_unpack_buffer(buffer, diff --git a/orte/runtime/orte_globals.c b/orte/runtime/orte_globals.c index 326545441b..d188e38e6d 100644 --- a/orte/runtime/orte_globals.c +++ b/orte/runtime/orte_globals.c @@ -1040,6 +1040,7 @@ OBJ_CLASS_INSTANCE(orte_jmap_t, static void orte_job_map_construct(orte_job_map_t* map) { + map->mapper = ORTE_RMAPS_UNDEF; map->policy = 0; map->npernode = 0; map->nperboard = 0; diff --git a/orte/util/error_strings.c b/orte/util/error_strings.c index e9aec4d626..43925a8fa4 100644 --- a/orte/util/error_strings.c +++ b/orte/util/error_strings.c @@ -154,7 +154,16 @@ int orte_err2str(int errnum, const char **errmsg) case ORTE_ERR_COMM_DISABLED: retval = "Communications have been disabled"; break; - + case ORTE_ERR_FAILED_TO_MAP: + retval = "Unable to map job"; + break; + case ORTE_ERR_TAKE_NEXT_OPTION: + if (orte_report_silent_errors) { + retval = "Next option"; + } else { + retval = NULL; + } + break; default: if (orte_report_silent_errors) { retval = "Unknown error";