Fix a buglet in the singleton startup procedure. To minimize the size of the xcast message, we "strip" the descriptive info from all subscription messages. This means, though, that we have to store the process name and other info so it can be retrieved from the body of the subscription data (as opposed to from the description). This wasn't being done for singletons because they don't call the RMAPS to "map" themselves.
This has now been corrected. The singleton startup now dutifully calls the mapper framework so that the proper data storage locations get initialized. Unfortunately, we then had to instruct the RMAPS not to allocate a vpid range for this job - otherwise, it would mistakenly think there were two processes in the job. Hence, a change was required to RMAPS to tell it "map this job, but don't allocate a vpid range for it". This change will need to migrate across to 1.2 after it "soaks" for the appropriate time. This commit was SVN r12952.
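In condensed form, the fix reduces to the sketch below: build an attribute list carrying ORTE_RMAPS_NO_ALLOC_RANGE, call the mapper, then release the list. This is assembled from the orte_init_stage1() diff that follows; the map_singleton_job wrapper name is illustrative only, and the usual ORTE-internal headers (orte_rmgr, orte_rmaps, the OBJ_* macros) are assumed to be included, as they are in orte_init_stage1.c.

    static int map_singleton_job(orte_jobid_t my_jobid)
    {
        opal_list_t attrs;
        opal_list_item_t *item;
        int ret;

        OBJ_CONSTRUCT(&attrs, opal_list_t);

        /* tell RMAPS: "map this job, but don't allocate a vpid range for it" */
        if (ORTE_SUCCESS != (ret = orte_rmgr.add_attribute(&attrs, ORTE_RMAPS_NO_ALLOC_RANGE,
                                                           ORTE_UNDEF, NULL,
                                                           ORTE_RMGR_ATTR_OVERRIDE))) {
            ORTE_ERROR_LOG(ret);
            goto cleanup;
        }

        /* the map itself is trivial, but calling the framework initializes
         * the proper data storage locations in the registry */
        if (ORTE_SUCCESS != (ret = orte_rmaps.map_job(my_jobid, &attrs))) {
            ORTE_ERROR_LOG(ret);
            goto cleanup;
        }

    cleanup:
        /* drain and release the attribute list */
        while (NULL != (item = opal_list_remove_first(&attrs))) OBJ_RELEASE(item);
        OBJ_DESTRUCT(&attrs);
        return ret;
    }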
Parent: 3cde822d98
Commit: 90f5e3fad8
Changed paths: orte/mca/rmaps, orte/runtime
@@ -44,17 +44,27 @@ static orte_rmaps_base_module_t *select_any(void);
  * Function for selecting one component from all those that are
  * available.
  */
-int orte_rmaps_base_map_job(orte_jobid_t job, opal_list_t *attributes)
+int orte_rmaps_base_map_job(orte_jobid_t job, opal_list_t *incoming_attributes)
 {
     orte_rmaps_base_module_t *module=NULL;
     orte_attribute_t *attr;
     char *desired_mapper;
     opal_list_t working_attrs;
+    opal_list_t *attributes;
     opal_list_item_t *item;
     orte_jobid_t *jptr, parent_job=ORTE_JOBID_INVALID;
     orte_job_map_t *map;
     orte_std_cntr_t scntr;
     int rc;
+    bool using_local_attr=false;
+
+    /* check for NULL attributes - we need this list locally */
+    if (NULL == incoming_attributes) {
+        attributes = OBJ_NEW(opal_list_t);
+        using_local_attr = true;
+    } else {
+        attributes = incoming_attributes;
+    }
 
     /* if we are not on the head node, use the proxy component */
     if (!orte_process_info.seed) {
@@ -74,7 +84,7 @@ int orte_rmaps_base_map_job(orte_jobid_t job, opal_list_t *attributes)
          */
         if (ORTE_SUCCESS != (rc = orte_dss.get((void**)&jptr, attr->value, ORTE_JOBID))) {
             ORTE_ERROR_LOG(rc);
-            return rc;
+            goto CLEANUP;
         }
         parent_job = *jptr;
         /* lookup that job's mapping policy */
@@ -82,7 +92,7 @@ int orte_rmaps_base_map_job(orte_jobid_t job, opal_list_t *attributes)
         if (ORTE_SUCCESS != (rc = orte_rmaps_base_get_mapping_plan(parent_job, &working_attrs))) {
             ORTE_ERROR_LOG(rc);
             OBJ_DESTRUCT(&working_attrs);
-            return rc;
+            goto CLEANUP;
         }
         /* go through the parent policy and "fill" anything that was missing in the
          * list of attributes provided. We specifically don't overwrite anything provided
@@ -93,7 +103,7 @@ int orte_rmaps_base_map_job(orte_jobid_t job, opal_list_t *attributes)
                                                            ORTE_RMGR_ATTR_NO_OVERRIDE))) {
                 ORTE_ERROR_LOG(rc);
                 OBJ_DESTRUCT(&working_attrs);
-                return rc;
+                goto CLEANUP;
             }
         /* clean up */
         while (NULL != (item = opal_list_remove_first(&working_attrs))) {
@@ -109,14 +119,14 @@ int orte_rmaps_base_map_job(orte_jobid_t job, opal_list_t *attributes)
                                                               ORTE_STRING, "bynode",
                                                               ORTE_RMGR_ATTR_NO_OVERRIDE))) {
                 ORTE_ERROR_LOG(rc);
-                return rc;
+                goto CLEANUP;
             }
         } else {
             if (ORTE_SUCCESS != (rc = orte_rmgr.add_attribute(attributes, ORTE_RMAPS_MAP_POLICY,
                                                               ORTE_STRING, "byslot",
                                                               ORTE_RMGR_ATTR_NO_OVERRIDE))) {
                 ORTE_ERROR_LOG(rc);
-                return rc;
+                goto CLEANUP;
             }
         }
 
@@ -129,7 +139,7 @@ int orte_rmaps_base_map_job(orte_jobid_t job, opal_list_t *attributes)
                                                           ORTE_UNDEF, NULL,
                                                           ORTE_RMGR_ATTR_NO_OVERRIDE))) {
             ORTE_ERROR_LOG(rc);
-            return rc;
+            goto CLEANUP;
         }
     }
 
@@ -142,7 +152,7 @@ int orte_rmaps_base_map_job(orte_jobid_t job, opal_list_t *attributes)
                                                           ORTE_STD_CNTR, &scntr,
                                                           ORTE_RMGR_ATTR_NO_OVERRIDE))) {
             ORTE_ERROR_LOG(rc);
-            return rc;
+            goto CLEANUP;
         }
     }
 
@@ -155,7 +165,7 @@ int orte_rmaps_base_map_job(orte_jobid_t job, opal_list_t *attributes)
                                                           ORTE_UNDEF, NULL,
                                                           ORTE_RMGR_ATTR_NO_OVERRIDE))) {
             ORTE_ERROR_LOG(rc);
-            return rc;
+            goto CLEANUP;
         }
     }
 
@@ -168,7 +178,7 @@ int orte_rmaps_base_map_job(orte_jobid_t job, opal_list_t *attributes)
                                                           ORTE_UNDEF, NULL,
                                                           ORTE_RMGR_ATTR_NO_OVERRIDE))) {
             ORTE_ERROR_LOG(rc);
-            return rc;
+            goto CLEANUP;
         }
     }
 
@@ -177,7 +187,7 @@ int orte_rmaps_base_map_job(orte_jobid_t job, opal_list_t *attributes)
         /* they did - extract its name */
         if (ORTE_SUCCESS != (rc = orte_dss.get((void**)&desired_mapper, attr->value, ORTE_STRING))) {
             ORTE_ERROR_LOG(rc);
-            return rc;
+            goto CLEANUP;
         }
         module = select_preferred(desired_mapper);
     } else {
@@ -189,13 +199,14 @@ int orte_rmaps_base_map_job(orte_jobid_t job, opal_list_t *attributes)
         ORTE_ERROR_LOG(ORTE_ERR_NOT_FOUND);
         opal_output(orte_rmaps_base.rmaps_output,
                     "orte:rmaps:base:map: could not find desired mapper component %s", desired_mapper);
-        return ORTE_ERR_NOT_FOUND;
+        rc = ORTE_ERR_NOT_FOUND;
+        goto CLEANUP;
     }
 
     /* go ahead and map the job */
     if (ORTE_SUCCESS != (rc = module->map_job(job, attributes))) {
         ORTE_ERROR_LOG(rc);
-        return rc;
+        goto CLEANUP;
     }
 
     /* store the mapping plan in case we need it later. We need to do this AFTER
@@ -206,7 +217,7 @@ int orte_rmaps_base_map_job(orte_jobid_t job, opal_list_t *attributes)
      */
     if (ORTE_SUCCESS != (rc = orte_rmaps_base_store_mapping_plan(job, attributes))) {
         ORTE_ERROR_LOG(rc);
-        return rc;
+        goto CLEANUP;
     }
 
     /* if we were using a parent policy, then we need to update that job's info
@@ -216,7 +227,7 @@ int orte_rmaps_base_map_job(orte_jobid_t job, opal_list_t *attributes)
     if (ORTE_JOBID_INVALID != parent_job) {
         if (ORTE_SUCCESS != (rc = orte_rmaps_base_update_mapping_state(parent_job, attributes))) {
             ORTE_ERROR_LOG(rc);
-            return rc;
+            goto CLEANUP;
         }
     }
 
@@ -228,7 +239,13 @@ int orte_rmaps_base_map_job(orte_jobid_t job, opal_list_t *attributes)
     }
 
 
     return ORTE_SUCCESS;
 
+CLEANUP:
+    /* if we setup our own local attribute list, then get rid of it */
+    if (using_local_attr) {
+        OBJ_RELEASE(attributes);
+    }
+
+    return rc;
 }
@@ -43,6 +43,7 @@ extern "C" {
 #define ORTE_RMAPS_USE_PARENT_PLAN "orte-map-use-parent-plan"
 #define ORTE_RMAPS_BOOKMARK "orte-map-bookmark"
 #define ORTE_RMAPS_DISPLAY_AFTER_MAP "orte-map-display"
+#define ORTE_RMAPS_NO_ALLOC_RANGE "orte-map-no-alloc-range"
 
 /**** JOB_MAP OBJECTS ***/
 /*
@@ -316,6 +316,12 @@ static int orte_rmaps_rr_process_attrs(opal_list_t *attributes)
         mca_rmaps_round_robin_component.oversubscribe = false;
     }
 
+    mca_rmaps_round_robin_component.no_allocate_range = false;
+    if (NULL != (attr = orte_rmgr.find_attribute(attributes, ORTE_RMAPS_NO_ALLOC_RANGE))) {
+        /* was provided - set boolean accordingly */
+        mca_rmaps_round_robin_component.no_allocate_range = true;
+    }
+
     return ORTE_SUCCESS;
 }
 /*
@@ -501,11 +507,15 @@ static int orte_rmaps_rr_map(orte_jobid_t jobid, opal_list_t *attributes)
         modify_app_context = true;
     }
 
-    /* allocate a vpid range for this app within the job */
-    if(ORTE_SUCCESS != (rc = orte_ns.reserve_range(jobid, app->num_procs, &vpid_start))) {
-        ORTE_ERROR_LOG(rc);
-        OBJ_DESTRUCT(&master_node_list);
-        return rc;
+    /* allocate a vpid range for this app within the job, unless told not to do so */
+    if (mca_rmaps_round_robin_component.no_allocate_range) {
+        vpid_start = 0;
+    } else {
+        if(ORTE_SUCCESS != (rc = orte_ns.reserve_range(jobid, app->num_procs, &vpid_start))) {
+            ORTE_ERROR_LOG(rc);
+            OBJ_DESTRUCT(&master_node_list);
+            return rc;
+        }
     }
 
     /** save the initial starting vpid for later */
@@ -41,6 +41,7 @@ struct orte_rmaps_round_robin_component_t {
     bool n_per_node;
     bool no_use_local;
     bool oversubscribe;
+    bool no_allocate_range;
 };
 typedef struct orte_rmaps_round_robin_component_t orte_rmaps_round_robin_component_t;
 
@@ -435,6 +435,7 @@ int orte_init_stage1(bool infrastructure)
     orte_rds_cell_attr_t *new_attr;
     orte_ras_node_t *ras_item;
     opal_list_t attrs;
+    opal_list_item_t *item;
 
     OBJ_CONSTRUCT(&single_host, opal_list_t);
     OBJ_CONSTRUCT(&rds_single_host, opal_list_t);
@@ -532,14 +533,32 @@ int orte_init_stage1(bool infrastructure)
            you'll end up with the localhost *and* all the other
           nodes in your allocation on the node segment -- which
           is probably fine */
-        OBJ_CONSTRUCT(&attrs, opal_list_t);
-        if (ORTE_SUCCESS != (ret = orte_ras.allocate_job(my_jobid, &attrs))) {
+        if (ORTE_SUCCESS != (ret = orte_ras.allocate_job(my_jobid, NULL))) {
             ORTE_ERROR_LOG(ret);
             error = "allocate for a singleton";
             goto error;
         }
 
+        /* even though the map in this case is trivial, we still
+         * need to call the RMAPS framework so the proper data
+         * structures get set into the registry
+         */
+        OBJ_CONSTRUCT(&attrs, opal_list_t);
+        if (ORTE_SUCCESS != (ret = orte_rmgr.add_attribute(&attrs, ORTE_RMAPS_NO_ALLOC_RANGE,
+                                                           ORTE_UNDEF, NULL, ORTE_RMGR_ATTR_OVERRIDE))) {
+            ORTE_ERROR_LOG(ret);
+            error = "could not create attribute for map";
+            goto error;
+        }
+        if (ORTE_SUCCESS != (ret = orte_rmaps.map_job(my_jobid, &attrs))) {
+            ORTE_ERROR_LOG(ret);
+            error = "map for a singleton";
+            goto error;
+        }
+        while (NULL != (item = opal_list_remove_first(&attrs))) OBJ_RELEASE(item);
+        OBJ_DESTRUCT(&attrs);
+
         /* cleanup data structs */
         OBJ_DESTRUCT(&single_host);
         OBJ_DESTRUCT(&rds_single_host);
     }