Make the cpus-per-proc equivalent a little more intuitive:
* Allow users to specify just a modifier for map-by instead of requiring that they also specify a policy. Thus, we now accept --map-by :pe=3 as indicating that we should use the default mapping policy, but bind 3 cpus/proc.
* If users specify a pe's/proc value but no policy, default to --map-by NUMA to ensure we have access to multiple cpus for the request. This won't guarantee we have access to enough cpus to meet the request, but it gives us a chance. In addition, we know that binding a proc to multiple cpus will work best if those cpus are all in the same NUMA, so this provides some degree of optimized behavior.

Per a request from Jeff, define "oversubscribe" for binding as a synonym for the "overload" modifier.

cmr=v1.8.2:reviewer=rhc

This commit was SVN r31967.
Этот коммит содержится в:
родитель
8db76e9c6f
Коммит
06dbfa3098
@ -520,38 +520,44 @@ int opal_hwloc_base_set_binding_policy(opal_binding_policy_t *policy, char *spec
|
||||
if (1 < opal_argv_count(tmpvals)) {
|
||||
quals = opal_argv_split(tmpvals[1], ',');
|
||||
for (i=0; NULL != quals[i]; i++) {
|
||||
if (0 == strcasecmp(quals[i], "if-supported")) {
|
||||
if (0 == strncasecmp(quals[i], "if-supported", strlen(quals[i]))) {
|
||||
tmp |= OPAL_BIND_IF_SUPPORTED;
|
||||
} else if (0 == strcasecmp(quals[i], "overload-allowed")) {
|
||||
} else if (0 == strncasecmp(quals[i], "overload-allowed", strlen(quals[i])) ||
|
||||
0 == strncasecmp(quals[i], "oversubscribe-allowed", strlen(quals[i]))) {
|
||||
tmp |= OPAL_BIND_ALLOW_OVERLOAD;
|
||||
} else {
|
||||
/* unknown option */
|
||||
opal_output(0, "Unknown qualifier to orte_process_binding: %s", spec);
|
||||
opal_output(0, "Unknown qualifier to binding policy: %s", spec);
|
||||
return OPAL_ERR_BAD_PARAM;
|
||||
}
|
||||
}
|
||||
opal_argv_free(quals);
|
||||
}
|
||||
if (0 == strcasecmp(tmpvals[0], "hwthread")) {
|
||||
OPAL_SET_BINDING_POLICY(tmp, OPAL_BIND_TO_HWTHREAD);
|
||||
} else if (0 == strcasecmp(tmpvals[0], "core")) {
|
||||
if (NULL == tmpvals[0]) {
|
||||
OPAL_SET_BINDING_POLICY(tmp, OPAL_BIND_TO_CORE);
|
||||
} else if (0 == strcasecmp(tmpvals[0], "l1cache")) {
|
||||
OPAL_SET_BINDING_POLICY(tmp, OPAL_BIND_TO_L1CACHE);
|
||||
} else if (0 == strcasecmp(tmpvals[0], "l2cache")) {
|
||||
OPAL_SET_BINDING_POLICY(tmp, OPAL_BIND_TO_L2CACHE);
|
||||
} else if (0 == strcasecmp(tmpvals[0], "l3cache")) {
|
||||
OPAL_SET_BINDING_POLICY(tmp, OPAL_BIND_TO_L3CACHE);
|
||||
} else if (0 == strcasecmp(tmpvals[0], "socket")) {
|
||||
OPAL_SET_BINDING_POLICY(tmp, OPAL_BIND_TO_SOCKET);
|
||||
} else if (0 == strcasecmp(tmpvals[0], "numa")) {
|
||||
OPAL_SET_BINDING_POLICY(tmp, OPAL_BIND_TO_NUMA);
|
||||
} else if (0 == strcasecmp(tmpvals[0], "board")) {
|
||||
OPAL_SET_BINDING_POLICY(tmp, OPAL_BIND_TO_BOARD);
|
||||
tmp &= ~OPAL_BIND_GIVEN;
|
||||
} else {
|
||||
opal_show_help("help-opal-hwloc-base.txt", "invalid binding_policy", true, "binding", spec);
|
||||
opal_argv_free(tmpvals);
|
||||
return OPAL_ERR_BAD_PARAM;
|
||||
if (0 == strcasecmp(tmpvals[0], "hwthread")) {
|
||||
OPAL_SET_BINDING_POLICY(tmp, OPAL_BIND_TO_HWTHREAD);
|
||||
} else if (0 == strcasecmp(tmpvals[0], "core")) {
|
||||
OPAL_SET_BINDING_POLICY(tmp, OPAL_BIND_TO_CORE);
|
||||
} else if (0 == strcasecmp(tmpvals[0], "l1cache")) {
|
||||
OPAL_SET_BINDING_POLICY(tmp, OPAL_BIND_TO_L1CACHE);
|
||||
} else if (0 == strcasecmp(tmpvals[0], "l2cache")) {
|
||||
OPAL_SET_BINDING_POLICY(tmp, OPAL_BIND_TO_L2CACHE);
|
||||
} else if (0 == strcasecmp(tmpvals[0], "l3cache")) {
|
||||
OPAL_SET_BINDING_POLICY(tmp, OPAL_BIND_TO_L3CACHE);
|
||||
} else if (0 == strcasecmp(tmpvals[0], "socket")) {
|
||||
OPAL_SET_BINDING_POLICY(tmp, OPAL_BIND_TO_SOCKET);
|
||||
} else if (0 == strcasecmp(tmpvals[0], "numa")) {
|
||||
OPAL_SET_BINDING_POLICY(tmp, OPAL_BIND_TO_NUMA);
|
||||
} else if (0 == strcasecmp(tmpvals[0], "board")) {
|
||||
OPAL_SET_BINDING_POLICY(tmp, OPAL_BIND_TO_BOARD);
|
||||
} else {
|
||||
opal_show_help("help-opal-hwloc-base.txt", "invalid binding_policy", true, "binding", spec);
|
||||
opal_argv_free(tmpvals);
|
||||
return OPAL_ERR_BAD_PARAM;
|
||||
}
|
||||
}
|
||||
opal_argv_free(tmpvals);
|
||||
}
|
||||
|
@ -303,4 +303,22 @@ not set by the mapper code:
|
||||
Please contact the OMPI developers for assistance. Meantime,
|
||||
you will still be able to run your application without binding
|
||||
by specifying "--bind-to none" on your command line.
|
||||
#
|
||||
[mapping-too-low-init]
|
||||
A request for multiple cpus-per-proc was given, but a directive
|
||||
was also given to map to an object level that cannot support that
|
||||
directive.
|
||||
|
||||
Please specify a mapping level that has more than one cpu, or
|
||||
else let us define a default mapping that will allow multiple
|
||||
cpus-per-proc.
|
||||
#
|
||||
[unrecog-modifier]
|
||||
A modifier was given to the --map-by directive that is not
|
||||
recognized:
|
||||
|
||||
Modifier: %s
|
||||
|
||||
Please see "mpirun --help" for a description of supported
|
||||
modifiers.
|
||||
|
||||
|
@ -285,8 +285,8 @@ static int orte_rmaps_base_open(mca_base_open_flag_t flags)
|
||||
if (1 < orte_rmaps_base.cpus_per_rank) {
|
||||
orte_show_help("help-orte-rmaps-base.txt", "deprecated", true,
|
||||
"--cpus-per-proc, -cpus-per-proc, --cpus-per-rank, -cpus-per-rank",
|
||||
"--map-by <obj>:PE=N",
|
||||
"rmaps_base_cpus_per_proc", "rmaps_base_mapping_policy=<obj>:PE=N");
|
||||
"--map-by <obj>:PE=N, default <obj>=NUMA",
|
||||
"rmaps_base_cpus_per_proc", "rmaps_base_mapping_policy=<obj>:PE=N, default <obj>=NUMA");
|
||||
}
|
||||
|
||||
if (ORTE_SUCCESS != (rc = orte_rmaps_base_set_mapping_policy(&orte_rmaps_base.mapping,
|
||||
@ -407,6 +407,20 @@ static int orte_rmaps_base_open(mca_base_open_flag_t flags)
|
||||
OPAL_SET_BINDING_POLICY(opal_hwloc_binding_policy, OPAL_BIND_TO_CORE);
|
||||
}
|
||||
}
|
||||
/* we also need to ensure we are mapping to a high-enough level to have
|
||||
* multiple cpus beneath it - by default, we'll go to the NUMA level */
|
||||
if (ORTE_MAPPING_GIVEN & ORTE_GET_MAPPING_DIRECTIVE(orte_rmaps_base.mapping)) {
|
||||
if (ORTE_GET_MAPPING_POLICY(orte_rmaps_base.mapping) >= ORTE_MAPPING_BYCORE) {
|
||||
orte_show_help("help-orte-rmaps-base.txt", "mapping-too-low-init", true);
|
||||
return ORTE_ERR_SILENT;
|
||||
}
|
||||
} else {
|
||||
opal_output_verbose(5, orte_rmaps_base_framework.framework_output,
|
||||
"%s rmaps:base pe/rank set - setting mapping to BYNUMA",
|
||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME));
|
||||
ORTE_SET_MAPPING_POLICY(orte_rmaps_base.mapping, ORTE_MAPPING_BYNUMA);
|
||||
ORTE_SET_MAPPING_DIRECTIVE(orte_rmaps_base.mapping, ORTE_MAPPING_GIVEN);
|
||||
}
|
||||
#endif
|
||||
}
|
||||
|
||||
@ -565,6 +579,10 @@ static int check_modifiers(char *ck, orte_mapping_policy_t *tmp)
|
||||
return ORTE_ERR_SILENT;
|
||||
}
|
||||
orte_rmaps_base.cpus_per_rank = strtol(ptr, NULL, 10);
|
||||
opal_output_verbose(5, orte_rmaps_base_framework.framework_output,
|
||||
"%s rmaps:base setting pe/rank to %d",
|
||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
|
||||
orte_rmaps_base.cpus_per_rank);
|
||||
found = true;
|
||||
} else if (0 == strncasecmp(ck2[i], "oversubscribe", strlen(ck2[i]))) {
|
||||
ORTE_UNSET_MAPPING_DIRECTIVE(*tmp, ORTE_MAPPING_NO_OVERSUBSCRIBE);
|
||||
@ -577,7 +595,8 @@ static int check_modifiers(char *ck, orte_mapping_policy_t *tmp)
|
||||
} else {
|
||||
/* unrecognized modifier */
|
||||
opal_argv_free(ck2);
|
||||
return ORTE_ERR_BAD_PARAM;
|
||||
orte_show_help("help-orte-rmaps-base.txt", "unrecog-modifier", true, ck2[i]);
|
||||
return ORTE_ERR_SILENT;
|
||||
}
|
||||
}
|
||||
opal_argv_free(ck2);
|
||||
@ -603,6 +622,11 @@ int orte_rmaps_base_set_mapping_policy(orte_mapping_policy_t *policy,
|
||||
tmp = 0;
|
||||
*device = NULL;
|
||||
|
||||
opal_output_verbose(5, orte_rmaps_base_framework.framework_output,
|
||||
"%s rmaps:base set policy with %s",
|
||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
|
||||
(NULL == inspec) ? "NULL" : inspec);
|
||||
|
||||
if (NULL == inspec) {
|
||||
ORTE_SET_MAPPING_POLICY(tmp, ORTE_MAPPING_BYSOCKET);
|
||||
} else {
|
||||
@ -610,9 +634,28 @@ int orte_rmaps_base_set_mapping_policy(orte_mapping_policy_t *policy,
|
||||
/* see if a colon was included - if so, then we have a policy + modifier */
|
||||
ck = strchr(spec, ':');
|
||||
if (NULL != ck) {
|
||||
/* if the colon is the first character of the string, then we
|
||||
* just have modifiers on the default mapping policy */
|
||||
if (ck == spec) {
|
||||
ck++;
|
||||
opal_output_verbose(5, orte_rmaps_base_framework.framework_output,
|
||||
"%s rmaps:base only modifiers %s provided - assuming bysocket mapping",
|
||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), ck);
|
||||
ORTE_SET_MAPPING_POLICY(tmp, ORTE_MAPPING_BYSOCKET);
|
||||
if (ORTE_ERR_SILENT == (rc = check_modifiers(ck, &tmp)) &&
|
||||
ORTE_ERR_BAD_PARAM != rc) {
|
||||
free(spec);
|
||||
return ORTE_ERR_SILENT;
|
||||
}
|
||||
free(spec);
|
||||
goto setpolicy;
|
||||
}
|
||||
/* split the string */
|
||||
*ck = '\0';
|
||||
ck++;
|
||||
opal_output_verbose(5, orte_rmaps_base_framework.framework_output,
|
||||
"%s rmaps:base policy %s modifiers %s provided",
|
||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), spec, ck);
|
||||
/* if the policy is "dist", then we set the policy to that value
|
||||
* and save the second argument as the device
|
||||
*/
|
||||
@ -721,9 +764,7 @@ int orte_rmaps_base_set_mapping_policy(orte_mapping_policy_t *policy,
|
||||
ORTE_SET_MAPPING_DIRECTIVE(tmp, ORTE_MAPPING_GIVEN);
|
||||
}
|
||||
|
||||
#if OPAL_HAVE_HWLOC
|
||||
setpolicy:
|
||||
#endif
|
||||
*policy = tmp;
|
||||
|
||||
return ORTE_SUCCESS;
|
||||
|
Загрузка…
Ссылка в новой задаче
Block a user