Fixed possible segfault when using RANKFILE but not all ranks are assigned
Fixed allocation of all ranks when using RANKFILE but not all ranks are assigned. Abort a little earlier if RANKFILE is used but np wasn't specified. Cleaned up mca_rmaps_rank_file_component.debug. This commit was SVN r19004.
Этот коммит содержится в:
родитель
0646cd2491
Коммит
b4d54dda57
@ -968,14 +968,16 @@ int orte_odls_base_default_launch_local(orte_jobid_t job,
|
||||
opal_setenv("OMPI_COMM_WORLD_LOCAL_RANK", value, true, &app->env);
|
||||
free(value);
|
||||
|
||||
param = mca_base_param_environ_variable("opal", NULL, "paffinity_base_slot_list");
|
||||
if ( NULL != child->slot_list ) {
|
||||
param = mca_base_param_environ_variable("opal", NULL, "paffinity_base_slot_list");
|
||||
asprintf(&value, "%s", child->slot_list);
|
||||
opal_setenv(param, value, true, &app->env);
|
||||
free(param);
|
||||
free(value);
|
||||
} else {
|
||||
opal_unsetenv(param, &app->env);
|
||||
}
|
||||
|
||||
free(param);
|
||||
|
||||
/* must unlock prior to fork to keep things clean in the
|
||||
* event library
|
||||
*/
|
||||
|
@ -110,10 +110,6 @@ static int map_app_by_user_map(
|
||||
}
|
||||
} while ( strcmp(node->name, rankmap[num_alloc + vpid_start].node_name));
|
||||
node->slot_list = strdup(rankmap[num_alloc+vpid_start].slot_list);
|
||||
if (mca_rmaps_rank_file_component.debug) {
|
||||
opal_output(0, "rank_file RMAPS component: [%s:%d]->slot_list=%s\n",
|
||||
rankmap[num_alloc + vpid_start].node_name,rankmap[num_alloc+vpid_start].rank, node->slot_list);
|
||||
}
|
||||
if (ORTE_SUCCESS != (rc = orte_rmaps_base_claim_slot(jdata, node, rankmap[num_alloc+vpid_start].rank, app->idx,
|
||||
nodes, jdata->map->oversubscribe, true))) {
|
||||
/** if the code is ORTE_ERR_NODE_FULLY_USED, then we know this
|
||||
@ -377,7 +373,7 @@ static int orte_rmaps_rf_map(orte_job_t *jdata)
|
||||
vpid_start = 0;
|
||||
|
||||
/* cycle through the app_contexts, mapping them sequentially */
|
||||
for(i=0; i < jdata->num_apps; i++) {
|
||||
for(i=0; i < jdata->num_apps; i++) {
|
||||
app = apps[i];
|
||||
|
||||
/* if the number of processes wasn't specified, then we know there can be only
|
||||
@ -439,16 +435,7 @@ static int orte_rmaps_rf_map(orte_job_t *jdata)
|
||||
}
|
||||
|
||||
if (map->pernode && map->npernode == 1) {
|
||||
/* there are three use-cases that we need to deal with:
|
||||
* (a) if -np was not provided, then we just use the number of nodes
|
||||
* (b) if -np was provided AND #procs > #nodes, then error out
|
||||
* (c) if -np was provided AND #procs <= #nodes, then launch
|
||||
* the specified #procs one/node. In this case, we just
|
||||
* leave app->num_procs alone
|
||||
*/
|
||||
if (0 == app->num_procs) {
|
||||
app->num_procs = num_nodes;
|
||||
} else if (app->num_procs > num_nodes) {
|
||||
if (app->num_procs > num_nodes) {
|
||||
orte_show_help("help-rmaps_rank_file.txt", "orte-rmaps-rf:per-node-and-too-many-procs",
|
||||
true, app->num_procs, num_nodes, NULL);
|
||||
rc = ORTE_ERR_SILENT;
|
||||
@ -465,30 +452,12 @@ static int orte_rmaps_rf_map(orte_job_t *jdata)
|
||||
rc = ORTE_ERR_SILENT;
|
||||
goto error;
|
||||
}
|
||||
/* there are three use-cases that we need to deal with:
|
||||
* (a) if -np was not provided, then we just use the n/node * #nodes
|
||||
* (b) if -np was provided AND #procs > (n/node * #nodes), then error out
|
||||
* (c) if -np was provided AND #procs <= (n/node * #nodes), then launch
|
||||
* the specified #procs n/node. In this case, we just
|
||||
* leave app->num_procs alone
|
||||
*/
|
||||
if (0 == app->num_procs) {
|
||||
/* set the num_procs to equal the specified num/node * the number of nodes */
|
||||
app->num_procs = map->npernode * num_nodes;
|
||||
} else if (app->num_procs > (map->npernode * num_nodes)) {
|
||||
if (app->num_procs > (map->npernode * num_nodes)) {
|
||||
orte_show_help("help-rmaps_rank_file.txt", "orte-rmaps-rf:n-per-node-and-too-many-procs",
|
||||
true, app->num_procs, map->npernode, num_nodes, num_slots, NULL);
|
||||
rc = ORTE_ERR_SILENT;
|
||||
goto error;
|
||||
}
|
||||
} else if (0 == app->num_procs) {
|
||||
/* we can't handle this - it should have been set when we got
|
||||
* the map info. If it wasn't, then we can only error out
|
||||
*/
|
||||
orte_show_help("help-rmaps_rank_file.txt", "orte-rmaps-rf:no-np-and-user-map",
|
||||
true, app->num_procs, map->npernode, num_nodes, num_slots, NULL);
|
||||
rc = ORTE_ERR_SILENT;
|
||||
goto error;
|
||||
}
|
||||
/** track the total number of processes we mapped */
|
||||
jdata->num_procs += app->num_procs;
|
||||
@ -589,6 +558,11 @@ static int orte_rmaps_rank_file_parse(const char *rankfile, int np)
|
||||
goto unlock;
|
||||
}
|
||||
|
||||
if ( 0 == np ) {
|
||||
orte_show_help("help-rmaps_rank_file.txt", "orte-rmaps-rf:no-np-and-user-map", true, NULL);
|
||||
return ORTE_ERR_BAD_PARAM;
|
||||
}
|
||||
|
||||
while (!orte_rmaps_rank_file_done) {
|
||||
token = orte_rmaps_rank_file_lex();
|
||||
switch (token) {
|
||||
|
@ -75,6 +75,7 @@ orte_rmaps_rank_file_component_t mca_rmaps_rank_file_component = {
|
||||
static int orte_rmaps_rank_file_open(void)
|
||||
{
|
||||
mca_rmaps_rank_file_component.priority = 0;
|
||||
int index = 0;
|
||||
|
||||
mca_base_param_reg_string(&mca_rmaps_rank_file_component.super.base_version,
|
||||
"path",
|
||||
@ -83,6 +84,15 @@ static int orte_rmaps_rank_file_open(void)
|
||||
if (NULL != orte_rmaps_rank_file_path) {
|
||||
mca_rmaps_rank_file_component.priority = 100;
|
||||
}
|
||||
|
||||
index = mca_base_param_find("opal", NULL, "paffinity_base_slot_list");
|
||||
if (index >= 0) {
|
||||
if (OPAL_SUCCESS == mca_base_param_lookup_string(index, &orte_mca_rmaps_rank_file_slot_list)) {
|
||||
if (NULL != orte_mca_rmaps_rank_file_slot_list) {
|
||||
mca_rmaps_rank_file_component.priority = 100;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
return ORTE_SUCCESS;
|
||||
}
|
||||
|
Загрузка…
Ссылка в новой задаче
Block a user