1
1

Separately track requested and last-used mapper so we don't lose that info

This commit was SVN r24502.
Этот коммит содержится в:
Ralph Castain 2011-03-09 18:51:36 +00:00
родитель 21d441aec5
Коммит 3b4421d8e3
13 изменённых файлов: 93 добавлений и 35 удалений

Просмотреть файл

@ -107,6 +107,8 @@ ORTE_DECLSPEC int orte_rmaps_base_get_vpid_range(orte_jobid_t jobid,
ORTE_DECLSPEC int orte_rmaps_base_set_vpid_range(orte_jobid_t jobid, ORTE_DECLSPEC int orte_rmaps_base_set_vpid_range(orte_jobid_t jobid,
orte_vpid_t start, orte_vpid_t range); orte_vpid_t start, orte_vpid_t range);
ORTE_DECLSPEC char* orte_rmaps_base_print_mapper(orte_rmaps_mapper_type_t mapper);
/** /**
* Close down the rmaps framework * Close down the rmaps framework
*/ */

Просмотреть файл

@ -79,15 +79,18 @@ int orte_rmaps_base_map_job(orte_job_t *jdata)
map->stride = orte_rmaps_base.stride; map->stride = orte_rmaps_base.stride;
map->oversubscribe = orte_rmaps_base.oversubscribe; map->oversubscribe = orte_rmaps_base.oversubscribe;
map->display_map = orte_rmaps_base.display_map; map->display_map = orte_rmaps_base.display_map;
map->mapper = orte_rmaps_base.default_mapper; map->req_mapper = orte_rmaps_base.default_mapper;
/* assign the map object to this job */ /* assign the map object to this job */
jdata->map = map; jdata->map = map;
} else { } else {
if (!jdata->map->display_map) { if (!jdata->map->display_map) {
jdata->map->display_map = orte_rmaps_base.display_map; jdata->map->display_map = orte_rmaps_base.display_map;
} }
if (ORTE_RMAPS_UNDEF == jdata->map->mapper) { if (ORTE_RMAPS_UNDEF == jdata->map->req_mapper) {
jdata->map->mapper = orte_rmaps_base.default_mapper; jdata->map->req_mapper = orte_rmaps_base.default_mapper;
}
if (0 == jdata->map->policy) {
jdata->map->policy = orte_default_mapping_policy;
} }
} }

Просмотреть файл

@ -890,3 +890,23 @@ int orte_rmaps_base_setup_virtual_machine(orte_job_t *jdata)
return ORTE_SUCCESS; return ORTE_SUCCESS;
} }
/*
 * Translate a mapper type into a short, human-readable label.
 *
 * @param mapper  mapper type to describe
 * @return        pointer to a string literal naming the mapper, or
 *                "UNKNOWN" when the value matches none of the types
 *                handled here.  The result is a literal: callers must
 *                neither modify nor free it.
 */
char* orte_rmaps_base_print_mapper(orte_rmaps_mapper_type_t mapper)
{
    /* fall-through answer for any value not recognized below */
    char *label = "UNKNOWN";

    if (ORTE_RMAPS_UNDEF == mapper) {
        label = "UNDEF";
    } else if (ORTE_RMAPS_RR == mapper) {
        label = "ROUND_ROBIN";
    } else if (ORTE_RMAPS_LOADBALANCE == mapper) {
        label = "LOADBALANCE";
    } else if (ORTE_RMAPS_SEQ == mapper) {
        label = "SEQUENTIAL";
    } else if (ORTE_RMAPS_RF == mapper) {
        label = "RANK_FILE";
    } else if (ORTE_RMAPS_RESILIENT == mapper) {
        label = "RESILIENT";
    }

    return label;
}

Просмотреть файл

@ -60,11 +60,12 @@ static int switchyard(orte_job_t *jdata)
*/ */
if (ORTE_JOB_STATE_INIT != jdata->state) { if (ORTE_JOB_STATE_INIT != jdata->state) {
opal_output_verbose(5, orte_rmaps_base.rmaps_output, opal_output_verbose(5, orte_rmaps_base.rmaps_output,
"mca:rmaps:lb: not job %s not in initial state - loadbalance cannot map", "mca:rmaps:lb: job %s not in initial state - loadbalance cannot map",
ORTE_JOBID_PRINT(jdata->jobid)); ORTE_JOBID_PRINT(jdata->jobid));
return ORTE_ERR_TAKE_NEXT_OPTION; return ORTE_ERR_TAKE_NEXT_OPTION;
} }
if (0 < jdata->map->mapper && ORTE_RMAPS_LOADBALANCE != jdata->map->mapper) { if (ORTE_RMAPS_UNDEF != jdata->map->req_mapper &&
ORTE_RMAPS_LOADBALANCE != jdata->map->req_mapper) {
/* a mapper has been specified, and it isn't me */ /* a mapper has been specified, and it isn't me */
opal_output_verbose(5, orte_rmaps_base.rmaps_output, opal_output_verbose(5, orte_rmaps_base.rmaps_output,
"mca:rmaps:lb: job %s not using loadbalance mapper", "mca:rmaps:lb: job %s not using loadbalance mapper",
@ -77,7 +78,7 @@ static int switchyard(orte_job_t *jdata)
ORTE_JOBID_PRINT(jdata->jobid)); ORTE_JOBID_PRINT(jdata->jobid));
/* flag that I did the mapping */ /* flag that I did the mapping */
jdata->map->mapper = ORTE_RMAPS_LOADBALANCE; jdata->map->last_mapper = ORTE_RMAPS_LOADBALANCE;
if (0 < orte_rmaps_base.npernode) { if (0 < orte_rmaps_base.npernode) {
rc = npernode(jdata); rc = npernode(jdata);

Просмотреть файл

@ -297,11 +297,12 @@ static int orte_rmaps_rf_map(orte_job_t *jdata)
/* only handle initial launch of rf job */ /* only handle initial launch of rf job */
if (ORTE_JOB_STATE_INIT != jdata->state) { if (ORTE_JOB_STATE_INIT != jdata->state) {
opal_output_verbose(5, orte_rmaps_base.rmaps_output, opal_output_verbose(5, orte_rmaps_base.rmaps_output,
"mca:rmaps:rf: not job %s not in initial state - rank_file cannot map", "mca:rmaps:rf: job %s not in initial state - rank_file cannot map",
ORTE_JOBID_PRINT(jdata->jobid)); ORTE_JOBID_PRINT(jdata->jobid));
return ORTE_ERR_TAKE_NEXT_OPTION; return ORTE_ERR_TAKE_NEXT_OPTION;
} }
if (0 < jdata->map->mapper && ORTE_RMAPS_RF != jdata->map->mapper) { if (ORTE_RMAPS_UNDEF != jdata->map->req_mapper &&
ORTE_RMAPS_RF != jdata->map->req_mapper) {
/* a mapper has been specified, and it isn't me */ /* a mapper has been specified, and it isn't me */
opal_output_verbose(5, orte_rmaps_base.rmaps_output, opal_output_verbose(5, orte_rmaps_base.rmaps_output,
"mca:rmaps:rf: job %s not using rank_file mapper", "mca:rmaps:rf: job %s not using rank_file mapper",
@ -314,7 +315,7 @@ static int orte_rmaps_rf_map(orte_job_t *jdata)
ORTE_JOBID_PRINT(jdata->jobid)); ORTE_JOBID_PRINT(jdata->jobid));
/* flag that I did the mapping */ /* flag that I did the mapping */
jdata->map->mapper = ORTE_RMAPS_RF; jdata->map->last_mapper = ORTE_RMAPS_RF;
/* convenience def */ /* convenience def */
map = jdata->map; map = jdata->map;

Просмотреть файл

@ -41,7 +41,7 @@
* Local variable * Local variable
*/ */
static char *orte_getline(FILE *fp); static char *orte_getline(FILE *fp);
static bool have_ftgrps=false; static bool have_ftgrps=false, made_ftgrps=false;
static int construct_ftgrps(void); static int construct_ftgrps(void);
static int get_ftgrp_target(orte_proc_t *proc, static int get_ftgrp_target(orte_proc_t *proc,
@ -69,16 +69,25 @@ static int orte_rmaps_resilient_map(orte_job_t *jdata)
orte_std_cntr_t num_slots; orte_std_cntr_t num_slots;
opal_list_item_t *item; opal_list_item_t *item;
if (0 < jdata->map->mapper && ORTE_RMAPS_RESILIENT != jdata->map->mapper) { if (ORTE_JOB_STATE_INIT == jdata->state) {
if (ORTE_RMAPS_UNDEF != jdata->map->req_mapper &&
ORTE_RMAPS_RESILIENT != jdata->map->req_mapper) {
/* a mapper has been specified, and it isn't me */
opal_output_verbose(5, orte_rmaps_base.rmaps_output,
"mca:rmaps:resilient: job %s not using loadbalance mapper",
ORTE_JOBID_PRINT(jdata->jobid));
return ORTE_ERR_TAKE_NEXT_OPTION;
}
if (NULL == mca_rmaps_resilient_component.fault_group_file) {
opal_output_verbose(5, orte_rmaps_base.rmaps_output,
"mca:rmaps:resilient: cannot perform initial map of job %s - no fault groups",
ORTE_JOBID_PRINT(jdata->jobid));
return ORTE_ERR_TAKE_NEXT_OPTION;
}
} else if (ORTE_JOB_STATE_RESTART != jdata->state &&
ORTE_JOB_STATE_PROCS_MIGRATING != jdata->state) {
opal_output_verbose(5, orte_rmaps_base.rmaps_output, opal_output_verbose(5, orte_rmaps_base.rmaps_output,
"mca:rmaps:resilient: cannot map job %s - other mapper specified", "mca:rmaps:resilient: cannot map job %s - not in restart or migrating",
ORTE_JOBID_PRINT(jdata->jobid));
return ORTE_ERR_TAKE_NEXT_OPTION;
}
if (ORTE_JOB_STATE_INIT == jdata->state &&
NULL == mca_rmaps_resilient_component.fault_group_file) {
opal_output_verbose(5, orte_rmaps_base.rmaps_output,
"mca:rmaps:resilient: cannot perform initial map of job %s",
ORTE_JOBID_PRINT(jdata->jobid)); ORTE_JOBID_PRINT(jdata->jobid));
return ORTE_ERR_TAKE_NEXT_OPTION; return ORTE_ERR_TAKE_NEXT_OPTION;
} }
@ -88,10 +97,10 @@ static int orte_rmaps_resilient_map(orte_job_t *jdata)
ORTE_JOBID_PRINT(jdata->jobid)); ORTE_JOBID_PRINT(jdata->jobid));
/* flag that I did the mapping */ /* flag that I did the mapping */
jdata->map->mapper = ORTE_RMAPS_RESILIENT; jdata->map->last_mapper = ORTE_RMAPS_RESILIENT;
/* have we already constructed the fault group list? */ /* have we already constructed the fault group list? */
if (!have_ftgrps) { if (!made_ftgrps) {
construct_ftgrps(); construct_ftgrps();
} }
@ -288,7 +297,7 @@ static int construct_ftgrps(void)
int i, k; int i, k;
/* flag that we did this */ /* flag that we did this */
have_ftgrps = true; made_ftgrps = true;
if (NULL == mca_rmaps_resilient_component.fault_group_file) { if (NULL == mca_rmaps_resilient_component.fault_group_file) {
/* nothing to build */ /* nothing to build */
@ -337,6 +346,8 @@ static int construct_ftgrps(void)
} }
fclose(fp); fclose(fp);
/* flag that we have fault grps */
have_ftgrps = true;
return ORTE_SUCCESS; return ORTE_SUCCESS;
} }

Просмотреть файл

@ -51,7 +51,8 @@ typedef enum orte_rmaps_mapper_type_t orte_rmaps_mapper_type_t;
struct orte_job_map_t { struct orte_job_map_t {
opal_object_t super; opal_object_t super;
/* user-specified mapping params */ /* user-specified mapping params */
orte_rmaps_mapper_type_t mapper; orte_rmaps_mapper_type_t req_mapper; /* requested mapper */
orte_rmaps_mapper_type_t last_mapper; /* last mapper used */
orte_mapping_policy_t policy; orte_mapping_policy_t policy;
int npernode; int npernode;
int nperboard; int nperboard;

Просмотреть файл

@ -60,12 +60,13 @@ static int orte_rmaps_rr_map(orte_job_t *jdata)
*/ */
if (ORTE_JOB_STATE_INIT != jdata->state) { if (ORTE_JOB_STATE_INIT != jdata->state) {
opal_output_verbose(5, orte_rmaps_base.rmaps_output, opal_output_verbose(5, orte_rmaps_base.rmaps_output,
"mca:rmaps:rr: not job %s in state %s - rr cannot map", "mca:rmaps:rr: job %s in state %s - rr cannot map",
ORTE_JOBID_PRINT(jdata->jobid), ORTE_JOBID_PRINT(jdata->jobid),
orte_job_state_to_str(jdata->state)); orte_job_state_to_str(jdata->state));
return ORTE_ERR_TAKE_NEXT_OPTION; return ORTE_ERR_TAKE_NEXT_OPTION;
} }
if (0 < jdata->map->mapper && ORTE_RMAPS_RR != jdata->map->mapper) { if (ORTE_RMAPS_UNDEF != jdata->map->req_mapper &&
ORTE_RMAPS_RR != jdata->map->req_mapper) {
/* a mapper has been specified, and it isn't me */ /* a mapper has been specified, and it isn't me */
opal_output_verbose(5, orte_rmaps_base.rmaps_output, opal_output_verbose(5, orte_rmaps_base.rmaps_output,
"mca:rmaps:rr: job %s not using rr mapper", "mca:rmaps:rr: job %s not using rr mapper",
@ -78,7 +79,7 @@ static int orte_rmaps_rr_map(orte_job_t *jdata)
ORTE_JOBID_PRINT(jdata->jobid)); ORTE_JOBID_PRINT(jdata->jobid));
/* flag that I did the mapping */ /* flag that I did the mapping */
jdata->map->mapper = ORTE_RMAPS_RR; jdata->map->last_mapper = ORTE_RMAPS_RR;
/* start at the beginning... */ /* start at the beginning... */
jdata->num_procs = 0; jdata->num_procs = 0;

Просмотреть файл

@ -82,11 +82,12 @@ static int orte_rmaps_seq_map(orte_job_t *jdata)
*/ */
if (ORTE_JOB_STATE_INIT != jdata->state) { if (ORTE_JOB_STATE_INIT != jdata->state) {
opal_output_verbose(5, orte_rmaps_base.rmaps_output, opal_output_verbose(5, orte_rmaps_base.rmaps_output,
"mca:rmaps:seq: not job %s not in initial state - seq cannot map", "mca:rmaps:seq: job %s not in initial state - seq cannot map",
ORTE_JOBID_PRINT(jdata->jobid)); ORTE_JOBID_PRINT(jdata->jobid));
return ORTE_ERR_TAKE_NEXT_OPTION; return ORTE_ERR_TAKE_NEXT_OPTION;
} }
if (0 < jdata->map->mapper && ORTE_RMAPS_SEQ != jdata->map->mapper) { if (ORTE_RMAPS_UNDEF != jdata->map->req_mapper &&
ORTE_RMAPS_SEQ != jdata->map->req_mapper) {
/* a mapper has been specified, and it isn't me */ /* a mapper has been specified, and it isn't me */
opal_output_verbose(5, orte_rmaps_base.rmaps_output, opal_output_verbose(5, orte_rmaps_base.rmaps_output,
"mca:rmaps:seq: job %s not using sequential mapper", "mca:rmaps:seq: job %s not using sequential mapper",
@ -99,7 +100,7 @@ static int orte_rmaps_seq_map(orte_job_t *jdata)
ORTE_JOBID_PRINT(jdata->jobid)); ORTE_JOBID_PRINT(jdata->jobid));
/* flag that I did the mapping */ /* flag that I did the mapping */
jdata->map->mapper = ORTE_RMAPS_SEQ; jdata->map->last_mapper = ORTE_RMAPS_SEQ;
/* conveniece def */ /* conveniece def */
map = jdata->map; map = jdata->map;

Просмотреть файл

@ -871,8 +871,14 @@ int orte_dt_pack_map(opal_buffer_t *buffer, const void *src,
maps = (orte_job_map_t**) src; maps = (orte_job_map_t**) src;
for (i=0; i < num_vals; i++) { for (i=0; i < num_vals; i++) {
/* pack the requested mapper */
if (ORTE_SUCCESS != (rc = opal_dss_pack_buffer(buffer, &(maps[i]->req_mapper), 1, OPAL_INT32))) {
ORTE_ERROR_LOG(rc);
return rc;
}
/* pack the mapper used to generate it */ /* pack the mapper used to generate it */
if (ORTE_SUCCESS != (rc = opal_dss_pack_buffer(buffer, &(maps[i]->mapper), 1, OPAL_INT32))) { if (ORTE_SUCCESS != (rc = opal_dss_pack_buffer(buffer, &(maps[i]->last_mapper), 1, OPAL_INT32))) {
ORTE_ERROR_LOG(rc); ORTE_ERROR_LOG(rc);
return rc; return rc;
} }

Просмотреть файл

@ -27,6 +27,7 @@
#include "opal/mca/sysinfo/sysinfo.h" #include "opal/mca/sysinfo/sysinfo.h"
#include "orte/mca/errmgr/errmgr.h" #include "orte/mca/errmgr/errmgr.h"
#include "orte/mca/rmaps/base/base.h"
#include "opal/dss/dss.h" #include "opal/dss/dss.h"
#include "orte/util/name_fns.h" #include "orte/util/name_fns.h"
#include "orte/util/error_strings.h" #include "orte/util/error_strings.h"
@ -652,8 +653,9 @@ int orte_dt_print_map(char **output, char *prefix, orte_job_map_t *src, opal_dat
asprintf(&pfx, "%s\t", pfx2); asprintf(&pfx, "%s\t", pfx2);
if (orte_devel_level_output) { if (orte_devel_level_output) {
asprintf(&tmp, "\n%sMap generated by mapper: %d\tMapping policy: %04x\n%s\tNpernode: %ld\tOversubscribe allowed: %s\tCPU Lists: %s", asprintf(&tmp, "\n%sMapper requested: %d\tLast mapper: %d\tMapping policy: %04x\n%s\tNpernode: %ld\tOversubscribe allowed: %s\tCPU Lists: %s",
pfx2, src->mapper, src->policy, pfx2, (long)src->npernode, pfx2, orte_rmaps_base_print_mapper(src->req_mapper),
orte_rmaps_base_print_mapper(src->last_mapper), src->policy, pfx2, (long)src->npernode,
(src->oversubscribe) ? "TRUE" : "FALSE", (src->oversubscribe) ? "TRUE" : "FALSE",
(src->cpu_lists) ? "TRUE" : "FALSE"); (src->cpu_lists) ? "TRUE" : "FALSE");

Просмотреть файл

@ -952,10 +952,18 @@ int orte_dt_unpack_map(opal_buffer_t *buffer, void *dest,
return ORTE_ERR_OUT_OF_RESOURCE; return ORTE_ERR_OUT_OF_RESOURCE;
} }
/* unpack the mapper */ /* unpack the requested mapper */
n = 1; n = 1;
if (ORTE_SUCCESS != (rc = opal_dss_unpack_buffer(buffer, if (ORTE_SUCCESS != (rc = opal_dss_unpack_buffer(buffer,
&(maps[i]->mapper), &n, OPAL_INT32))) { &(maps[i]->req_mapper), &n, OPAL_INT32))) {
ORTE_ERROR_LOG(rc);
return rc;
}
/* unpack the mapper used */
n = 1;
if (ORTE_SUCCESS != (rc = opal_dss_unpack_buffer(buffer,
&(maps[i]->last_mapper), &n, OPAL_INT32))) {
ORTE_ERROR_LOG(rc); ORTE_ERROR_LOG(rc);
return rc; return rc;
} }

Просмотреть файл

@ -1044,7 +1044,8 @@ OBJ_CLASS_INSTANCE(orte_jmap_t,
static void orte_job_map_construct(orte_job_map_t* map) static void orte_job_map_construct(orte_job_map_t* map)
{ {
map->mapper = ORTE_RMAPS_UNDEF; map->req_mapper = ORTE_RMAPS_UNDEF;
map->last_mapper = ORTE_RMAPS_UNDEF;
map->policy = 0; map->policy = 0;
map->npernode = 0; map->npernode = 0;
map->nperboard = 0; map->nperboard = 0;