1
1

Separately track requested and last-used mapper so we don't lose that info

This commit was SVN r24502.
Этот коммит содержится в:
Ralph Castain 2011-03-09 18:51:36 +00:00
родитель 21d441aec5
Коммит 3b4421d8e3
13 изменённых файлов: 93 добавлений и 35 удалений

Просмотреть файл

@ -107,6 +107,8 @@ ORTE_DECLSPEC int orte_rmaps_base_get_vpid_range(orte_jobid_t jobid,
ORTE_DECLSPEC int orte_rmaps_base_set_vpid_range(orte_jobid_t jobid,
orte_vpid_t start, orte_vpid_t range);
ORTE_DECLSPEC char* orte_rmaps_base_print_mapper(orte_rmaps_mapper_type_t mapper);
/**
* Close down the rmaps framework
*/

Просмотреть файл

@ -79,15 +79,18 @@ int orte_rmaps_base_map_job(orte_job_t *jdata)
map->stride = orte_rmaps_base.stride;
map->oversubscribe = orte_rmaps_base.oversubscribe;
map->display_map = orte_rmaps_base.display_map;
map->mapper = orte_rmaps_base.default_mapper;
map->req_mapper = orte_rmaps_base.default_mapper;
/* assign the map object to this job */
jdata->map = map;
} else {
if (!jdata->map->display_map) {
jdata->map->display_map = orte_rmaps_base.display_map;
}
if (ORTE_RMAPS_UNDEF == jdata->map->mapper) {
jdata->map->mapper = orte_rmaps_base.default_mapper;
if (ORTE_RMAPS_UNDEF == jdata->map->req_mapper) {
jdata->map->req_mapper = orte_rmaps_base.default_mapper;
}
if (0 == jdata->map->policy) {
jdata->map->policy = orte_default_mapping_policy;
}
}

Просмотреть файл

@ -890,3 +890,23 @@ int orte_rmaps_base_setup_virtual_machine(orte_job_t *jdata)
return ORTE_SUCCESS;
}
/*
 * Translate a mapper type into a printable name.
 *
 * Returns a pointer to a string literal naming the given mapper type,
 * or "UNKNOWN" if the value is not one of the mapper types listed
 * below.  The returned string is a literal: callers must neither
 * modify nor free it.
 */
char* orte_rmaps_base_print_mapper(orte_rmaps_mapper_type_t mapper)
{
    /* mapper type -> printable label */
    static const struct {
        orte_rmaps_mapper_type_t type;
        char *label;
    } known[] = {
        { ORTE_RMAPS_UNDEF,       "UNDEF" },
        { ORTE_RMAPS_RR,          "ROUND_ROBIN" },
        { ORTE_RMAPS_LOADBALANCE, "LOADBALANCE" },
        { ORTE_RMAPS_SEQ,         "SEQUENTIAL" },
        { ORTE_RMAPS_RF,          "RANK_FILE" },
        { ORTE_RMAPS_RESILIENT,   "RESILIENT" }
    };
    unsigned int idx;

    for (idx = 0; idx < sizeof(known) / sizeof(known[0]); idx++) {
        if (known[idx].type == mapper) {
            return known[idx].label;
        }
    }

    /* not a mapper type we recognize */
    return "UNKNOWN";
}

Просмотреть файл

@ -60,11 +60,12 @@ static int switchyard(orte_job_t *jdata)
*/
if (ORTE_JOB_STATE_INIT != jdata->state) {
opal_output_verbose(5, orte_rmaps_base.rmaps_output,
"mca:rmaps:lb: not job %s not in initial state - loadbalance cannot map",
"mca:rmaps:lb: job %s not in initial state - loadbalance cannot map",
ORTE_JOBID_PRINT(jdata->jobid));
return ORTE_ERR_TAKE_NEXT_OPTION;
}
if (0 < jdata->map->mapper && ORTE_RMAPS_LOADBALANCE != jdata->map->mapper) {
if (ORTE_RMAPS_UNDEF != jdata->map->req_mapper &&
ORTE_RMAPS_LOADBALANCE != jdata->map->req_mapper) {
/* a mapper has been specified, and it isn't me */
opal_output_verbose(5, orte_rmaps_base.rmaps_output,
"mca:rmaps:lb: job %s not using loadbalance mapper",
@ -77,7 +78,7 @@ static int switchyard(orte_job_t *jdata)
ORTE_JOBID_PRINT(jdata->jobid));
/* flag that I did the mapping */
jdata->map->mapper = ORTE_RMAPS_LOADBALANCE;
jdata->map->last_mapper = ORTE_RMAPS_LOADBALANCE;
if (0 < orte_rmaps_base.npernode) {
rc = npernode(jdata);

Просмотреть файл

@ -297,11 +297,12 @@ static int orte_rmaps_rf_map(orte_job_t *jdata)
/* only handle initial launch of rf job */
if (ORTE_JOB_STATE_INIT != jdata->state) {
opal_output_verbose(5, orte_rmaps_base.rmaps_output,
"mca:rmaps:rf: not job %s not in initial state - rank_file cannot map",
"mca:rmaps:rf: job %s not in initial state - rank_file cannot map",
ORTE_JOBID_PRINT(jdata->jobid));
return ORTE_ERR_TAKE_NEXT_OPTION;
}
if (0 < jdata->map->mapper && ORTE_RMAPS_RF != jdata->map->mapper) {
if (ORTE_RMAPS_UNDEF != jdata->map->req_mapper &&
ORTE_RMAPS_RF != jdata->map->req_mapper) {
/* a mapper has been specified, and it isn't me */
opal_output_verbose(5, orte_rmaps_base.rmaps_output,
"mca:rmaps:rf: job %s not using rank_file mapper",
@ -314,7 +315,7 @@ static int orte_rmaps_rf_map(orte_job_t *jdata)
ORTE_JOBID_PRINT(jdata->jobid));
/* flag that I did the mapping */
jdata->map->mapper = ORTE_RMAPS_RF;
jdata->map->last_mapper = ORTE_RMAPS_RF;
/* convenience def */
map = jdata->map;

Просмотреть файл

@ -41,7 +41,7 @@
* Local variable
*/
static char *orte_getline(FILE *fp);
static bool have_ftgrps=false;
static bool have_ftgrps=false, made_ftgrps=false;
static int construct_ftgrps(void);
static int get_ftgrp_target(orte_proc_t *proc,
@ -69,16 +69,25 @@ static int orte_rmaps_resilient_map(orte_job_t *jdata)
orte_std_cntr_t num_slots;
opal_list_item_t *item;
if (0 < jdata->map->mapper && ORTE_RMAPS_RESILIENT != jdata->map->mapper) {
if (ORTE_JOB_STATE_INIT == jdata->state) {
if (ORTE_RMAPS_UNDEF != jdata->map->req_mapper &&
ORTE_RMAPS_RESILIENT != jdata->map->req_mapper) {
/* a mapper has been specified, and it isn't me */
opal_output_verbose(5, orte_rmaps_base.rmaps_output,
"mca:rmaps:resilient: job %s not using loadbalance mapper",
ORTE_JOBID_PRINT(jdata->jobid));
return ORTE_ERR_TAKE_NEXT_OPTION;
}
if (NULL == mca_rmaps_resilient_component.fault_group_file) {
opal_output_verbose(5, orte_rmaps_base.rmaps_output,
"mca:rmaps:resilient: cannot perform initial map of job %s - no fault groups",
ORTE_JOBID_PRINT(jdata->jobid));
return ORTE_ERR_TAKE_NEXT_OPTION;
}
} else if (ORTE_JOB_STATE_RESTART != jdata->state &&
ORTE_JOB_STATE_PROCS_MIGRATING != jdata->state) {
opal_output_verbose(5, orte_rmaps_base.rmaps_output,
"mca:rmaps:resilient: cannot map job %s - other mapper specified",
ORTE_JOBID_PRINT(jdata->jobid));
return ORTE_ERR_TAKE_NEXT_OPTION;
}
if (ORTE_JOB_STATE_INIT == jdata->state &&
NULL == mca_rmaps_resilient_component.fault_group_file) {
opal_output_verbose(5, orte_rmaps_base.rmaps_output,
"mca:rmaps:resilient: cannot perform initial map of job %s",
"mca:rmaps:resilient: cannot map job %s - not in restart or migrating",
ORTE_JOBID_PRINT(jdata->jobid));
return ORTE_ERR_TAKE_NEXT_OPTION;
}
@ -88,10 +97,10 @@ static int orte_rmaps_resilient_map(orte_job_t *jdata)
ORTE_JOBID_PRINT(jdata->jobid));
/* flag that I did the mapping */
jdata->map->mapper = ORTE_RMAPS_RESILIENT;
jdata->map->last_mapper = ORTE_RMAPS_RESILIENT;
/* have we already constructed the fault group list? */
if (!have_ftgrps) {
if (!made_ftgrps) {
construct_ftgrps();
}
@ -288,7 +297,7 @@ static int construct_ftgrps(void)
int i, k;
/* flag that we did this */
have_ftgrps = true;
made_ftgrps = true;
if (NULL == mca_rmaps_resilient_component.fault_group_file) {
/* nothing to build */
@ -337,6 +346,8 @@ static int construct_ftgrps(void)
}
fclose(fp);
/* flag that we have fault grps */
have_ftgrps = true;
return ORTE_SUCCESS;
}

Просмотреть файл

@ -51,7 +51,8 @@ typedef enum orte_rmaps_mapper_type_t orte_rmaps_mapper_type_t;
struct orte_job_map_t {
opal_object_t super;
/* user-specified mapping params */
orte_rmaps_mapper_type_t mapper;
orte_rmaps_mapper_type_t req_mapper; /* requested mapper */
orte_rmaps_mapper_type_t last_mapper; /* last mapper used */
orte_mapping_policy_t policy;
int npernode;
int nperboard;

Просмотреть файл

@ -60,12 +60,13 @@ static int orte_rmaps_rr_map(orte_job_t *jdata)
*/
if (ORTE_JOB_STATE_INIT != jdata->state) {
opal_output_verbose(5, orte_rmaps_base.rmaps_output,
"mca:rmaps:rr: not job %s in state %s - rr cannot map",
"mca:rmaps:rr: job %s in state %s - rr cannot map",
ORTE_JOBID_PRINT(jdata->jobid),
orte_job_state_to_str(jdata->state));
return ORTE_ERR_TAKE_NEXT_OPTION;
}
if (0 < jdata->map->mapper && ORTE_RMAPS_RR != jdata->map->mapper) {
if (ORTE_RMAPS_UNDEF != jdata->map->req_mapper &&
ORTE_RMAPS_RR != jdata->map->req_mapper) {
/* a mapper has been specified, and it isn't me */
opal_output_verbose(5, orte_rmaps_base.rmaps_output,
"mca:rmaps:rr: job %s not using rr mapper",
@ -78,7 +79,7 @@ static int orte_rmaps_rr_map(orte_job_t *jdata)
ORTE_JOBID_PRINT(jdata->jobid));
/* flag that I did the mapping */
jdata->map->mapper = ORTE_RMAPS_RR;
jdata->map->last_mapper = ORTE_RMAPS_RR;
/* start at the beginning... */
jdata->num_procs = 0;

Просмотреть файл

@ -82,11 +82,12 @@ static int orte_rmaps_seq_map(orte_job_t *jdata)
*/
if (ORTE_JOB_STATE_INIT != jdata->state) {
opal_output_verbose(5, orte_rmaps_base.rmaps_output,
"mca:rmaps:seq: not job %s not in initial state - seq cannot map",
"mca:rmaps:seq: job %s not in initial state - seq cannot map",
ORTE_JOBID_PRINT(jdata->jobid));
return ORTE_ERR_TAKE_NEXT_OPTION;
}
if (0 < jdata->map->mapper && ORTE_RMAPS_SEQ != jdata->map->mapper) {
if (ORTE_RMAPS_UNDEF != jdata->map->req_mapper &&
ORTE_RMAPS_SEQ != jdata->map->req_mapper) {
/* a mapper has been specified, and it isn't me */
opal_output_verbose(5, orte_rmaps_base.rmaps_output,
"mca:rmaps:seq: job %s not using sequential mapper",
@ -99,7 +100,7 @@ static int orte_rmaps_seq_map(orte_job_t *jdata)
ORTE_JOBID_PRINT(jdata->jobid));
/* flag that I did the mapping */
jdata->map->mapper = ORTE_RMAPS_SEQ;
jdata->map->last_mapper = ORTE_RMAPS_SEQ;
/* convenience def */
map = jdata->map;

Просмотреть файл

@ -871,8 +871,14 @@ int orte_dt_pack_map(opal_buffer_t *buffer, const void *src,
maps = (orte_job_map_t**) src;
for (i=0; i < num_vals; i++) {
/* pack the requested mapper */
if (ORTE_SUCCESS != (rc = opal_dss_pack_buffer(buffer, &(maps[i]->req_mapper), 1, OPAL_INT32))) {
ORTE_ERROR_LOG(rc);
return rc;
}
/* pack the mapper used to generate it */
if (ORTE_SUCCESS != (rc = opal_dss_pack_buffer(buffer, &(maps[i]->mapper), 1, OPAL_INT32))) {
if (ORTE_SUCCESS != (rc = opal_dss_pack_buffer(buffer, &(maps[i]->last_mapper), 1, OPAL_INT32))) {
ORTE_ERROR_LOG(rc);
return rc;
}

Просмотреть файл

@ -27,6 +27,7 @@
#include "opal/mca/sysinfo/sysinfo.h"
#include "orte/mca/errmgr/errmgr.h"
#include "orte/mca/rmaps/base/base.h"
#include "opal/dss/dss.h"
#include "orte/util/name_fns.h"
#include "orte/util/error_strings.h"
@ -652,8 +653,9 @@ int orte_dt_print_map(char **output, char *prefix, orte_job_map_t *src, opal_dat
asprintf(&pfx, "%s\t", pfx2);
if (orte_devel_level_output) {
asprintf(&tmp, "\n%sMap generated by mapper: %d\tMapping policy: %04x\n%s\tNpernode: %ld\tOversubscribe allowed: %s\tCPU Lists: %s",
pfx2, src->mapper, src->policy, pfx2, (long)src->npernode,
asprintf(&tmp, "\n%sMapper requested: %d\tLast mapper: %d\tMapping policy: %04x\n%s\tNpernode: %ld\tOversubscribe allowed: %s\tCPU Lists: %s",
pfx2, orte_rmaps_base_print_mapper(src->req_mapper),
orte_rmaps_base_print_mapper(src->last_mapper), src->policy, pfx2, (long)src->npernode,
(src->oversubscribe) ? "TRUE" : "FALSE",
(src->cpu_lists) ? "TRUE" : "FALSE");

Просмотреть файл

@ -952,10 +952,18 @@ int orte_dt_unpack_map(opal_buffer_t *buffer, void *dest,
return ORTE_ERR_OUT_OF_RESOURCE;
}
/* unpack the mapper */
/* unpack the requested mapper */
n = 1;
if (ORTE_SUCCESS != (rc = opal_dss_unpack_buffer(buffer,
&(maps[i]->mapper), &n, OPAL_INT32))) {
&(maps[i]->req_mapper), &n, OPAL_INT32))) {
ORTE_ERROR_LOG(rc);
return rc;
}
/* unpack the mapper used */
n = 1;
if (ORTE_SUCCESS != (rc = opal_dss_unpack_buffer(buffer,
&(maps[i]->last_mapper), &n, OPAL_INT32))) {
ORTE_ERROR_LOG(rc);
return rc;
}

Просмотреть файл

@ -1044,7 +1044,8 @@ OBJ_CLASS_INSTANCE(orte_jmap_t,
static void orte_job_map_construct(orte_job_map_t* map)
{
map->mapper = ORTE_RMAPS_UNDEF;
map->req_mapper = ORTE_RMAPS_UNDEF;
map->last_mapper = ORTE_RMAPS_UNDEF;
map->policy = 0;
map->npernode = 0;
map->nperboard = 0;