From 90f5e3fad85382e91664d5b829a959782295ddad Mon Sep 17 00:00:00 2001 From: Ralph Castain Date: Tue, 2 Jan 2007 16:14:44 +0000 Subject: [PATCH] Fix a buglet in the singleton startup procedure. For purposes of minimizing the xcast message, we "strip" the descriptive info on all subscription messages. This means, though, that we have to store the process name and other info so it can be retrieved in the body of the subscription data (as opposed to in the description). This wasn't being done for singletons because they don't call the RMAPS to "map" themselves. This has now been corrected. The singleton startup will dutifully call the mapper framework so that the proper data storage locations get initialized. Unfortunately, we then had to instruct the RMAPS not to allocate a vpid range for this job - otherwise, it would make a mistake and think there were two processes in it. Hence, a change was required to RMAPS to tell it "map this job, but don't allocate a vpid range for it". This change will need to migrate across to 1.2 after it "soaks" the appropriate time. This commit was SVN r12952. --- orte/mca/rmaps/base/rmaps_base_map_job.c | 49 ++++++++++++++++-------- orte/mca/rmaps/rmaps_types.h | 1 + orte/mca/rmaps/round_robin/rmaps_rr.c | 20 +++++++--- orte/mca/rmaps/round_robin/rmaps_rr.h | 1 + orte/runtime/orte_init_stage1.c | 23 ++++++++++- 5 files changed, 71 insertions(+), 23 deletions(-) diff --git a/orte/mca/rmaps/base/rmaps_base_map_job.c b/orte/mca/rmaps/base/rmaps_base_map_job.c index 69453c6a48..efbb43e52b 100644 --- a/orte/mca/rmaps/base/rmaps_base_map_job.c +++ b/orte/mca/rmaps/base/rmaps_base_map_job.c @@ -44,17 +44,27 @@ static orte_rmaps_base_module_t *select_any(void); * Function for selecting one component from all those that are * available. */ -int orte_rmaps_base_map_job(orte_jobid_t job, opal_list_t *attributes) +int orte_rmaps_base_map_job(orte_jobid_t job, opal_list_t *incoming_attributes) { orte_rmaps_base_module_t *module=NULL; orte_attribute_t *attr; char *desired_mapper; opal_list_t working_attrs; + opal_list_t *attributes; opal_list_item_t *item; orte_jobid_t *jptr, parent_job=ORTE_JOBID_INVALID; orte_job_map_t *map; orte_std_cntr_t scntr; int rc; + bool using_local_attr=false; + + /* check for NULL attributes - we need this list locally */ + if (NULL == incoming_attributes) { + attributes = OBJ_NEW(opal_list_t); + using_local_attr = true; + } else { + attributes = incoming_attributes; + } /* if we are not on the head node, use the proxy component */ if (!orte_process_info.seed) { @@ -74,7 +84,7 @@ int orte_rmaps_base_map_job(orte_jobid_t job, opal_list_t *attributes) */ if (ORTE_SUCCESS != (rc = orte_dss.get((void**)&jptr, attr->value, ORTE_JOBID))) { ORTE_ERROR_LOG(rc); - return rc; + goto CLEANUP; } parent_job = *jptr; /* lookup that job's mapping policy */ @@ -82,7 +92,7 @@ int orte_rmaps_base_map_job(orte_jobid_t job, opal_list_t *attributes) if (ORTE_SUCCESS != (rc = orte_rmaps_base_get_mapping_plan(parent_job, &working_attrs))) { ORTE_ERROR_LOG(rc); OBJ_DESTRUCT(&working_attrs); - return rc; + goto CLEANUP; } /* go through the parent policy and "fill" anything that was missing in the * list of attributes provided. We specifically don't overwrite anything provided @@ -93,7 +103,7 @@ int orte_rmaps_base_map_job(orte_jobid_t job, opal_list_t *attributes) ORTE_RMGR_ATTR_NO_OVERRIDE))) { ORTE_ERROR_LOG(rc); OBJ_DESTRUCT(&working_attrs); - return rc; + goto CLEANUP; } /* clean up */ while (NULL != (item = opal_list_remove_first(&working_attrs))) { @@ -109,14 +119,14 @@ int orte_rmaps_base_map_job(orte_jobid_t job, opal_list_t *attributes) ORTE_STRING, "bynode", ORTE_RMGR_ATTR_NO_OVERRIDE))) { ORTE_ERROR_LOG(rc); - return rc; + goto CLEANUP; } } else { if (ORTE_SUCCESS != (rc = orte_rmgr.add_attribute(attributes, ORTE_RMAPS_MAP_POLICY, ORTE_STRING, "byslot", ORTE_RMGR_ATTR_NO_OVERRIDE))) { ORTE_ERROR_LOG(rc); - return rc; + goto CLEANUP; } } @@ -129,7 +139,7 @@ int orte_rmaps_base_map_job(orte_jobid_t job, opal_list_t *attributes) ORTE_UNDEF, NULL, ORTE_RMGR_ATTR_NO_OVERRIDE))) { ORTE_ERROR_LOG(rc); - return rc; + goto CLEANUP; } } @@ -142,7 +152,7 @@ int orte_rmaps_base_map_job(orte_jobid_t job, opal_list_t *attributes) ORTE_STD_CNTR, &scntr, ORTE_RMGR_ATTR_NO_OVERRIDE))) { ORTE_ERROR_LOG(rc); - return rc; + goto CLEANUP; } } @@ -155,7 +165,7 @@ int orte_rmaps_base_map_job(orte_jobid_t job, opal_list_t *attributes) ORTE_UNDEF, NULL, ORTE_RMGR_ATTR_NO_OVERRIDE))) { ORTE_ERROR_LOG(rc); - return rc; + goto CLEANUP; } } @@ -168,7 +178,7 @@ int orte_rmaps_base_map_job(orte_jobid_t job, opal_list_t *attributes) ORTE_UNDEF, NULL, ORTE_RMGR_ATTR_NO_OVERRIDE))) { ORTE_ERROR_LOG(rc); - return rc; + goto CLEANUP; } } @@ -177,7 +187,7 @@ int orte_rmaps_base_map_job(orte_jobid_t job, opal_list_t *attributes) /* they did - extract its name */ if (ORTE_SUCCESS != (rc = orte_dss.get((void**)&desired_mapper, attr->value, ORTE_STRING))) { ORTE_ERROR_LOG(rc); - return rc; + goto CLEANUP; } module = select_preferred(desired_mapper); } else { @@ -189,13 +199,14 @@ int orte_rmaps_base_map_job(orte_jobid_t job, opal_list_t *attributes) ORTE_ERROR_LOG(ORTE_ERR_NOT_FOUND); opal_output(orte_rmaps_base.rmaps_output, "orte:rmaps:base:map: could not find desired mapper component %s", desired_mapper); - return ORTE_ERR_NOT_FOUND; + rc = ORTE_ERR_NOT_FOUND; + goto CLEANUP; } /* go ahead and map the job */ if (ORTE_SUCCESS != (rc = module->map_job(job, attributes))) { ORTE_ERROR_LOG(rc); - return rc; + goto CLEANUP; } /* store the mapping plan in case we need it later. We need to do this AFTER @@ -206,7 +217,7 @@ int orte_rmaps_base_map_job(orte_jobid_t job, opal_list_t *attributes) */ if (ORTE_SUCCESS != (rc = orte_rmaps_base_store_mapping_plan(job, attributes))) { ORTE_ERROR_LOG(rc); - return rc; + goto CLEANUP; } /* if we were using a parent policy, then we need to update that job's info @@ -216,7 +227,7 @@ int orte_rmaps_base_map_job(orte_jobid_t job, opal_list_t *attributes) if (ORTE_JOBID_INVALID != parent_job) { if (ORTE_SUCCESS != (rc = orte_rmaps_base_update_mapping_state(parent_job, attributes))) { ORTE_ERROR_LOG(rc); - return rc; + goto CLEANUP; } } @@ -228,7 +239,13 @@ int orte_rmaps_base_map_job(orte_jobid_t job, opal_list_t *attributes) } - return ORTE_SUCCESS; +CLEANUP: + /* if we setup our own local attribute list, then get rid of it */ + if (using_local_attr) { + OBJ_RELEASE(attributes); + } + + return rc; } diff --git a/orte/mca/rmaps/rmaps_types.h b/orte/mca/rmaps/rmaps_types.h index 6325bfa998..102cc29599 100644 --- a/orte/mca/rmaps/rmaps_types.h +++ b/orte/mca/rmaps/rmaps_types.h @@ -43,6 +43,7 @@ extern "C" { #define ORTE_RMAPS_USE_PARENT_PLAN "orte-map-use-parent-plan" #define ORTE_RMAPS_BOOKMARK "orte-map-bookmark" #define ORTE_RMAPS_DISPLAY_AFTER_MAP "orte-map-display" +#define ORTE_RMAPS_NO_ALLOC_RANGE "orte-map-no-alloc-range" /**** JOB_MAP OBJECTS ***/ /* diff --git a/orte/mca/rmaps/round_robin/rmaps_rr.c b/orte/mca/rmaps/round_robin/rmaps_rr.c index 7fccd4fda6..bc923952da 100644 --- a/orte/mca/rmaps/round_robin/rmaps_rr.c +++ b/orte/mca/rmaps/round_robin/rmaps_rr.c @@ -316,6 +316,12 @@ static int orte_rmaps_rr_process_attrs(opal_list_t *attributes) mca_rmaps_round_robin_component.oversubscribe = false; } + mca_rmaps_round_robin_component.no_allocate_range = false; + if (NULL != (attr = orte_rmgr.find_attribute(attributes, ORTE_RMAPS_NO_ALLOC_RANGE))) { + /* was provided - set boolean accordingly */ + mca_rmaps_round_robin_component.no_allocate_range = true; + } + return ORTE_SUCCESS; } /* @@ -501,11 +507,15 @@ static int orte_rmaps_rr_map(orte_jobid_t jobid, opal_list_t *attributes) modify_app_context = true; } - /* allocate a vpid range for this app within the job */ - if(ORTE_SUCCESS != (rc = orte_ns.reserve_range(jobid, app->num_procs, &vpid_start))) { - ORTE_ERROR_LOG(rc); - OBJ_DESTRUCT(&master_node_list); - return rc; + /* allocate a vpid range for this app within the job, unless told not to do so */ + if (mca_rmaps_round_robin_component.no_allocate_range) { + vpid_start = 0; + } else { + if(ORTE_SUCCESS != (rc = orte_ns.reserve_range(jobid, app->num_procs, &vpid_start))) { + ORTE_ERROR_LOG(rc); + OBJ_DESTRUCT(&master_node_list); + return rc; + } } /** save the initial starting vpid for later */ diff --git a/orte/mca/rmaps/round_robin/rmaps_rr.h b/orte/mca/rmaps/round_robin/rmaps_rr.h index d6b164003b..5ae6b73fbb 100644 --- a/orte/mca/rmaps/round_robin/rmaps_rr.h +++ b/orte/mca/rmaps/round_robin/rmaps_rr.h @@ -41,6 +41,7 @@ struct orte_rmaps_round_robin_component_t { bool n_per_node; bool no_use_local; bool oversubscribe; + bool no_allocate_range; }; typedef struct orte_rmaps_round_robin_component_t orte_rmaps_round_robin_component_t; diff --git a/orte/runtime/orte_init_stage1.c b/orte/runtime/orte_init_stage1.c index 331c588c19..b71956fd63 100644 --- a/orte/runtime/orte_init_stage1.c +++ b/orte/runtime/orte_init_stage1.c @@ -435,6 +435,7 @@ int orte_init_stage1(bool infrastructure) orte_rds_cell_attr_t *new_attr; orte_ras_node_t *ras_item; opal_list_t attrs; + opal_list_item_t *item; OBJ_CONSTRUCT(&single_host, opal_list_t); OBJ_CONSTRUCT(&rds_single_host, opal_list_t); @@ -532,14 +533,32 @@ int orte_init_stage1(bool infrastructure) you'll end up with the localhost *and* all the other nodes in your allocation on the node segment -- which is probably fine */ - OBJ_CONSTRUCT(&attrs, opal_list_t); - if (ORTE_SUCCESS != (ret = orte_ras.allocate_job(my_jobid, &attrs))) { + if (ORTE_SUCCESS != (ret = orte_ras.allocate_job(my_jobid, NULL))) { ORTE_ERROR_LOG(ret); error = "allocate for a singleton"; goto error; } + + /* even though the map in this case is trivial, we still + * need to call the RMAPS framework so the proper data + * structures get set into the registry + */ + OBJ_CONSTRUCT(&attrs, opal_list_t); + if (ORTE_SUCCESS != (ret = orte_rmgr.add_attribute(&attrs, ORTE_RMAPS_NO_ALLOC_RANGE, + ORTE_UNDEF, NULL, ORTE_RMGR_ATTR_OVERRIDE))) { + ORTE_ERROR_LOG(ret); + error = "could not create attribute for map"; + goto error; + } + if (ORTE_SUCCESS != (ret = orte_rmaps.map_job(my_jobid, &attrs))) { + ORTE_ERROR_LOG(ret); + error = "map for a singleton"; + goto error; + } + while (NULL != (item = opal_list_remove_first(&attrs))) OBJ_RELEASE(item); OBJ_DESTRUCT(&attrs); + /* cleanup data structs */ OBJ_DESTRUCT(&single_host); OBJ_DESTRUCT(&rds_single_host); }