
Final cleanup of cpus-per-proc for 1.7.4 - provide better checking for cpus-per-proc and mismatched mapping/binding directives, and provide error messages telling the user what to do to get it right.

cmr=v1.7.4:reviewer=jsquyres

This commit was SVN r30438.
This commit is contained in:
Ralph Castain 2014-01-27 22:40:51 +00:00
parent 791a3a5ec6
commit 941bfd4604
3 changed files with 91 additions and 6 deletions

View File

@@ -231,3 +231,31 @@ been deprecated and replaced as follows:
The deprecated forms *will* disappear in a future version of Open MPI.
Please update to the new syntax.
#
[mismatch-binding]
A request for multiple cpus-per-proc was given, but a conflicting binding
policy was specified:
  #cpus-per-proc: %d
  type of cpus: %s
  binding policy given: %s
The correct binding policy for the given type of cpu is:
  correct binding policy: %s
This is the binding policy we would apply by default for this
situation, so no binding need be specified. Please correct the
situation and try again.
#
[mapping-too-low]
A request for multiple cpus-per-proc was given, but a directive
was also given to map to an object level that is unlikely to
have multiple cpus underneath it:
  #cpus-per-proc: %d
  map-by: %s
Please specify a mapping level that is no lower than socket, or
else let us define a default mapping that will allow multiple
cpus-per-proc.
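
For illustration, here are hedged examples of the kind of command lines these new messages are aimed at. The mpirun option names below are assumed from the 1.7-series syntax and are not part of this commit:

  # assumed to trigger [mapping-too-low]: mapping by core leaves no room for 2 cpus per proc
  mpirun -np 4 --cpus-per-proc 2 --map-by core ./a.out

  # assumed to trigger [mismatch-binding]: 2 cpus per proc but an explicit bind-to socket
  mpirun -np 4 --cpus-per-proc 2 --bind-to socket ./a.out

  # assumed to pass the new checks: map no lower than socket and bind to cores
  mpirun -np 4 --cpus-per-proc 2 --map-by socket --bind-to core ./a.out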

View File

@@ -355,6 +355,43 @@ static int orte_rmaps_base_open(mca_base_open_flag_t flags)
        ORTE_SET_RANKING_DIRECTIVE(orte_rmaps_base.ranking, ORTE_RANKING_GIVEN);
    }
    if (1 < orte_rmaps_base.cpus_per_rank) {
        /* check to see if we were told to map at too low a level */
        if ((ORTE_MAPPING_GIVEN & ORTE_GET_MAPPING_DIRECTIVE(orte_rmaps_base.mapping)) &&
            ORTE_GET_MAPPING_POLICY(orte_rmaps_base.mapping) > ORTE_MAPPING_BYSOCKET) {
            orte_show_help("help-orte-rmaps-base.txt", "mapping-too-low", true,
                           orte_rmaps_base.cpus_per_rank,
                           orte_rmaps_base_print_mapping(orte_rmaps_base.mapping));
            return ORTE_ERR_SILENT;
        }
        /* if we were asked for multiple cpus/proc, then we have to
         * bind to those cpus - any other binding policy is an
         * error
         */
        if (OPAL_BIND_TO_NONE == OPAL_GET_BINDING_POLICY(opal_hwloc_binding_policy)) {
            if (opal_hwloc_use_hwthreads_as_cpus) {
                OPAL_SET_BINDING_POLICY(opal_hwloc_binding_policy, OPAL_BIND_TO_HWTHREAD);
            } else {
                OPAL_SET_BINDING_POLICY(opal_hwloc_binding_policy, OPAL_BIND_TO_CORE);
            }
        } else {
            if (opal_hwloc_use_hwthreads_as_cpus &&
                (OPAL_BIND_TO_HWTHREAD != OPAL_GET_BINDING_POLICY(opal_hwloc_binding_policy))) {
                orte_show_help("help-orte-rmaps-base.txt", "mismatch-binding", true,
                               orte_rmaps_base.cpus_per_rank, "use-hwthreads-as-cpus",
                               opal_hwloc_base_print_binding(opal_hwloc_binding_policy),
                               "bind-to hwthread");
                return ORTE_ERR_SILENT;
            } else if (OPAL_BIND_TO_CORE != OPAL_GET_BINDING_POLICY(opal_hwloc_binding_policy)) {
                orte_show_help("help-orte-rmaps-base.txt", "mismatch-binding", true,
                               orte_rmaps_base.cpus_per_rank, "cores as cpus",
                               opal_hwloc_base_print_binding(opal_hwloc_binding_policy),
                               "bind-to core");
                return ORTE_ERR_SILENT;
            }
        }
    }
    /* Should we schedule on the local node or not? */
    if (rmaps_base_no_schedule_local) {
        orte_rmaps_base.mapping |= ORTE_MAPPING_NO_USE_LOCAL;
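
The checks above rely on the ORTE/OPAL convention of packing a policy value together with "directive was given" flags into a single integer, which is what the GET_*_POLICY / SET_*_POLICY / *_GIVEN macros manipulate. Below is a minimal, self-contained sketch of that pattern; every name and bit value in it is hypothetical, chosen only to illustrate the idea, and is not the actual opal_hwloc_binding_policy or orte_rmaps_base.mapping layout:

#include <stdio.h>

/* Hypothetical, simplified model of the policy bit-packing --
 * illustrative only, not the real opal_hwloc/orte definitions. */
#define EX_POLICY_MASK   0x00ff   /* low byte holds the policy value        */
#define EX_POLICY_GIVEN  0x0100   /* flag: the user explicitly set a policy */

#define EX_GET_POLICY(v)      ((v) & EX_POLICY_MASK)
#define EX_SET_POLICY(v, p)   ((v) = (((v) & ~EX_POLICY_MASK) | (p)))
#define EX_POLICY_IS_GIVEN(v) (0 != ((v) & EX_POLICY_GIVEN))

#define EX_BIND_TO_NONE  0x00     /* hypothetical policy values */
#define EX_BIND_TO_CORE  0x03

int main(void)
{
    unsigned int binding = 0;     /* nothing requested by the user */

    if (!EX_POLICY_IS_GIVEN(binding) &&
        EX_BIND_TO_NONE == EX_GET_POLICY(binding)) {
        /* no user directive: fall back to a default, just as the commit
         * falls back to bind-to core (or hwthread) when cpus-per-proc > 1 */
        EX_SET_POLICY(binding, EX_BIND_TO_CORE);
    }
    printf("effective policy = 0x%02x\n", EX_GET_POLICY(binding));
    return 0;
}

Keeping the "given" flag separate from the policy value is what lets the new code distinguish "the user asked for this binding and it conflicts" (error) from "no binding was requested, so pick a sensible default" (silent fix-up).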

View File

@@ -106,9 +106,19 @@ void orte_rmaps_base_map_job(int fd, short args, void *cbdata)
    } else {
        /* default based on number of procs */
        if (nprocs <= 2) {
            opal_output_verbose(5, orte_rmaps_base_framework.framework_output,
                                "mca:rmaps mapping not given - using byslot");
            ORTE_SET_MAPPING_POLICY(map->mapping, ORTE_MAPPING_BYSLOT);
            if (1 < orte_rmaps_base.cpus_per_rank) {
                /* assigning multiple cpus to a rank requires that we map to
                 * objects that have multiple cpus in them, so default
                 * to byslot if nothing else was specified by the user.
                 */
                opal_output_verbose(5, orte_rmaps_base_framework.framework_output,
                                    "mca:rmaps mapping not given - using byslot");
                ORTE_SET_MAPPING_POLICY(map->mapping, ORTE_MAPPING_BYSLOT);
            } else {
                opal_output_verbose(5, orte_rmaps_base_framework.framework_output,
                                    "mca:rmaps mapping not given - using bycore");
                ORTE_SET_MAPPING_POLICY(map->mapping, ORTE_MAPPING_BYCORE);
            }
        } else {
            opal_output_verbose(5, orte_rmaps_base_framework.framework_output,
                                "mca:rmaps mapping not given - using bysocket");
@@ -167,9 +177,19 @@ void orte_rmaps_base_map_job(int fd, short args, void *cbdata)
    if (!ORTE_MAPPING_POLICY_IS_SET(jdata->map->mapping)) {
        /* default based on number of procs */
        if (nprocs <= 2) {
            opal_output_verbose(5, orte_rmaps_base_framework.framework_output,
                                "mca:rmaps mapping not set by user - using byslot");
            ORTE_SET_MAPPING_POLICY(jdata->map->mapping, ORTE_MAPPING_BYSLOT);
            if (1 < orte_rmaps_base.cpus_per_rank) {
                /* assigning multiple cpus to a rank requires that we map to
                 * objects that have multiple cpus in them, so default
                 * to byslot if nothing else was specified by the user.
                 */
                opal_output_verbose(5, orte_rmaps_base_framework.framework_output,
                                    "mca:rmaps mapping not given - using byslot");
                ORTE_SET_MAPPING_POLICY(jdata->map->mapping, ORTE_MAPPING_BYSLOT);
            } else {
                opal_output_verbose(5, orte_rmaps_base_framework.framework_output,
                                    "mca:rmaps mapping not given - using bycore");
                ORTE_SET_MAPPING_POLICY(jdata->map->mapping, ORTE_MAPPING_BYCORE);
            }
        } else {
            opal_output_verbose(5, orte_rmaps_base_framework.framework_output,
                                "mca:rmaps mapping not set by user - using bysocket");