1
1

By setting a default mapping/ranking/binding policy other than "none", we introduced a problem for users on Macs and on any other machine where sockets aren't defined and/or binding is not supported. Fix that by checking whether the user explicitly specified the failing policy; if not (i.e., the policy was only the default), fall back to the old behavior of mapping/ranking by slot with no binding.

Refs trac:3977

This commit was SVN r29933.

The following Trac tickets were found above:
  Ticket 3977 --> https://svn.open-mpi.org/trac/ompi/ticket/3977
Этот коммит содержится в:
Ralph Castain 2013-12-17 14:50:10 +00:00
родитель 0995a6f3b9
Коммит 53cd00fe16
4 изменённых файлов: 217 добавлений и 78 удалений

Просмотреть файл

@ -121,7 +121,8 @@ static int bind_upwards(orte_job_t *jdata,
*/
if (!support->cpubind->set_thisproc_cpubind &&
!support->cpubind->set_thisthread_cpubind) {
if (!OPAL_BINDING_REQUIRED(opal_hwloc_binding_policy)) {
if (!OPAL_BINDING_REQUIRED(opal_hwloc_binding_policy) ||
!(OPAL_BIND_GIVEN & opal_hwloc_binding_policy)) {
/* we are not required to bind, so ignore this */
continue;
}
@ -276,7 +277,8 @@ static int bind_downwards(orte_job_t *jdata,
*/
if (!support->cpubind->set_thisproc_cpubind &&
!support->cpubind->set_thisthread_cpubind) {
if (!OPAL_BINDING_REQUIRED(opal_hwloc_binding_policy)) {
if (!OPAL_BINDING_REQUIRED(opal_hwloc_binding_policy) ||
!(OPAL_BIND_GIVEN & opal_hwloc_binding_policy)) {
/* we are not required to bind, so ignore this */
continue;
}
@ -440,7 +442,8 @@ static int bind_in_place(orte_job_t *jdata,
*/
if (!support->cpubind->set_thisproc_cpubind &&
!support->cpubind->set_thisthread_cpubind) {
if (!OPAL_BINDING_REQUIRED(opal_hwloc_binding_policy)) {
if (!OPAL_BINDING_REQUIRED(opal_hwloc_binding_policy) ||
!(OPAL_BIND_GIVEN & opal_hwloc_binding_policy)) {
/* we are not required to bind, so ignore this */
continue;
}

Просмотреть файл

@ -97,6 +97,9 @@ static int rank_span(orte_job_t *jdata,
opal_output_verbose(5, orte_rmaps_base_framework.framework_output,
"mca:rmaps:rank_span: found %d objects on node %s with %d procs",
num_objs, node->name, (int)node->num_procs);
if (0 == num_objs) {
return ORTE_ERR_NOT_SUPPORTED;
}
/* for each object */
for (i=0; i < num_objs && cnt < app->num_procs; i++) {
@ -205,6 +208,9 @@ static int rank_fill(orte_job_t *jdata,
opal_output_verbose(5, orte_rmaps_base_framework.framework_output,
"mca:rmaps:rank_fill: found %d objects on node %s with %d procs",
num_objs, node->name, (int)node->num_procs);
if (0 == num_objs) {
return ORTE_ERR_NOT_SUPPORTED;
}
/* for each object */
for (i=0; i < num_objs && cnt < app->num_procs; i++) {
@ -321,6 +327,9 @@ static int rank_by(orte_job_t *jdata,
opal_output_verbose(5, orte_rmaps_base_framework.framework_output,
"mca:rmaps:rank_by: found %d objects on node %s with %d procs",
num_objs, node->name, (int)node->num_procs);
if (0 == num_objs) {
return ORTE_ERR_NOT_SUPPORTED;
}
/* collect all the objects */
for (i=0; i < num_objs; i++) {
obj = opal_hwloc_base_get_obj_by_type(node->topology, target,
@ -420,6 +429,118 @@ int orte_rmaps_base_compute_vpids(orte_job_t *jdata,
map = jdata->map;
/* start with the rank-by object options - if the object isn't
* included in the topology, then we obviously cannot rank by it.
* However, if this was the default ranking policy (as opposed to
* something given by the user), then fall back to rank-by slot
*/
#if OPAL_HAVE_HWLOC
if (ORTE_RANK_BY_NUMA == ORTE_GET_RANKING_POLICY(map->ranking)) {
opal_output_verbose(5, orte_rmaps_base_framework.framework_output,
"mca:rmaps: computing ranks by NUMA for job %s",
ORTE_JOBID_PRINT(jdata->jobid));
if (ORTE_SUCCESS != (rc = rank_by(jdata, app, nodes, HWLOC_OBJ_NODE, 0))) {
if (ORTE_ERR_NOT_SUPPORTED == rc &&
!(ORTE_RANKING_GIVEN & ORTE_GET_RANKING_DIRECTIVE(map->ranking))) {
ORTE_SET_RANKING_POLICY(map->ranking, ORTE_RANK_BY_SLOT);
goto rankbyslot;
}
ORTE_ERROR_LOG(rc);
}
return rc;
}
if (ORTE_RANK_BY_SOCKET == ORTE_GET_RANKING_POLICY(map->ranking)) {
opal_output_verbose(5, orte_rmaps_base_framework.framework_output,
"mca:rmaps: computing ranks by socket for job %s",
ORTE_JOBID_PRINT(jdata->jobid));
if (ORTE_SUCCESS != (rc = rank_by(jdata, app, nodes, HWLOC_OBJ_SOCKET, 0))) {
if (ORTE_ERR_NOT_SUPPORTED == rc &&
!(ORTE_RANKING_GIVEN & ORTE_GET_RANKING_DIRECTIVE(map->ranking))) {
ORTE_SET_RANKING_POLICY(map->ranking, ORTE_RANK_BY_SLOT);
goto rankbyslot;
}
ORTE_ERROR_LOG(rc);
}
return rc;
}
if (ORTE_RANK_BY_L3CACHE == ORTE_GET_RANKING_POLICY(map->ranking)) {
opal_output_verbose(5, orte_rmaps_base_framework.framework_output,
"mca:rmaps: computing ranks by L3cache for job %s",
ORTE_JOBID_PRINT(jdata->jobid));
if (ORTE_SUCCESS != (rc = rank_by(jdata, app, nodes, HWLOC_OBJ_CACHE, 3))) {
if (ORTE_ERR_NOT_SUPPORTED == rc &&
!(ORTE_RANKING_GIVEN & ORTE_GET_RANKING_DIRECTIVE(map->ranking))) {
ORTE_SET_RANKING_POLICY(map->ranking, ORTE_RANK_BY_SLOT);
goto rankbyslot;
}
ORTE_ERROR_LOG(rc);
}
return rc;
}
if (ORTE_RANK_BY_L2CACHE == ORTE_GET_RANKING_POLICY(map->ranking)) {
opal_output_verbose(5, orte_rmaps_base_framework.framework_output,
"mca:rmaps: computing ranks by L2cache for job %s",
ORTE_JOBID_PRINT(jdata->jobid));
if (ORTE_SUCCESS != (rc = rank_by(jdata, app, nodes, HWLOC_OBJ_CACHE, 2))) {
if (ORTE_ERR_NOT_SUPPORTED == rc &&
!(ORTE_RANKING_GIVEN & ORTE_GET_RANKING_DIRECTIVE(map->ranking))) {
ORTE_SET_RANKING_POLICY(map->ranking, ORTE_RANK_BY_SLOT);
goto rankbyslot;
}
ORTE_ERROR_LOG(rc);
}
return rc;
}
if (ORTE_RANK_BY_L1CACHE == ORTE_GET_RANKING_POLICY(map->ranking)) {
opal_output_verbose(5, orte_rmaps_base_framework.framework_output,
"mca:rmaps: computing ranks by L1cache for job %s",
ORTE_JOBID_PRINT(jdata->jobid));
if (ORTE_SUCCESS != (rc = rank_by(jdata, app, nodes, HWLOC_OBJ_CACHE, 1))) {
if (ORTE_ERR_NOT_SUPPORTED == rc &&
!(ORTE_RANKING_GIVEN & ORTE_GET_RANKING_DIRECTIVE(map->ranking))) {
ORTE_SET_RANKING_POLICY(map->ranking, ORTE_RANK_BY_SLOT);
goto rankbyslot;
}
ORTE_ERROR_LOG(rc);
}
return rc;
}
if (ORTE_RANK_BY_CORE == ORTE_GET_RANKING_POLICY(map->ranking)) {
opal_output_verbose(5, orte_rmaps_base_framework.framework_output,
"mca:rmaps: computing ranks by core for job %s",
ORTE_JOBID_PRINT(jdata->jobid));
if (ORTE_SUCCESS != (rc = rank_by(jdata, app, nodes, HWLOC_OBJ_CORE, 0))) {
if (ORTE_ERR_NOT_SUPPORTED == rc &&
!(ORTE_RANKING_GIVEN & ORTE_GET_RANKING_DIRECTIVE(map->ranking))) {
ORTE_SET_RANKING_POLICY(map->ranking, ORTE_RANK_BY_SLOT);
goto rankbyslot;
}
ORTE_ERROR_LOG(rc);
}
return rc;
}
if (ORTE_RANK_BY_HWTHREAD == ORTE_GET_RANKING_POLICY(map->ranking)) {
opal_output_verbose(5, orte_rmaps_base_framework.framework_output,
"mca:rmaps: computing ranks by hwthread for job %s",
ORTE_JOBID_PRINT(jdata->jobid));
if (ORTE_SUCCESS != (rc = rank_by(jdata, app, nodes, HWLOC_OBJ_PU, 0))) {
if (ORTE_ERR_NOT_SUPPORTED == rc &&
!(ORTE_RANKING_GIVEN & ORTE_GET_RANKING_DIRECTIVE(map->ranking))) {
ORTE_SET_RANKING_POLICY(map->ranking, ORTE_RANK_BY_SLOT);
goto rankbyslot;
}
ORTE_ERROR_LOG(rc);
}
return rc;
}
#endif
if (ORTE_RANK_BY_NODE == ORTE_GET_RANKING_POLICY(map->ranking) ||
ORTE_RANK_BY_BOARD == ORTE_GET_RANKING_POLICY(map->ranking)) {
opal_output_verbose(5, orte_rmaps_base_framework.framework_output,
@ -480,6 +601,7 @@ int orte_rmaps_base_compute_vpids(orte_job_t *jdata,
return ORTE_SUCCESS;
}
rankbyslot:
if (ORTE_RANK_BY_SLOT == ORTE_GET_RANKING_POLICY(map->ranking)) {
/* assign the ranks sequentially */
opal_output_verbose(5, orte_rmaps_base_framework.framework_output,
@ -523,78 +645,6 @@ int orte_rmaps_base_compute_vpids(orte_job_t *jdata,
return ORTE_SUCCESS;
}
#if OPAL_HAVE_HWLOC
if (ORTE_RANK_BY_NUMA == ORTE_GET_RANKING_POLICY(map->ranking)) {
opal_output_verbose(5, orte_rmaps_base_framework.framework_output,
"mca:rmaps: computing ranks by NUMA for job %s",
ORTE_JOBID_PRINT(jdata->jobid));
if (ORTE_SUCCESS != (rc = rank_by(jdata, app, nodes, HWLOC_OBJ_NODE, 0))) {
ORTE_ERROR_LOG(rc);
}
return rc;
}
if (ORTE_RANK_BY_SOCKET == ORTE_GET_RANKING_POLICY(map->ranking)) {
opal_output_verbose(5, orte_rmaps_base_framework.framework_output,
"mca:rmaps: computing ranks by socket for job %s",
ORTE_JOBID_PRINT(jdata->jobid));
if (ORTE_SUCCESS != (rc = rank_by(jdata, app, nodes, HWLOC_OBJ_SOCKET, 0))) {
ORTE_ERROR_LOG(rc);
}
return rc;
}
if (ORTE_RANK_BY_L3CACHE == ORTE_GET_RANKING_POLICY(map->ranking)) {
opal_output_verbose(5, orte_rmaps_base_framework.framework_output,
"mca:rmaps: computing ranks by L3cache for job %s",
ORTE_JOBID_PRINT(jdata->jobid));
if (ORTE_SUCCESS != (rc = rank_by(jdata, app, nodes, HWLOC_OBJ_CACHE, 3))) {
ORTE_ERROR_LOG(rc);
}
return rc;
}
if (ORTE_RANK_BY_L2CACHE == ORTE_GET_RANKING_POLICY(map->ranking)) {
opal_output_verbose(5, orte_rmaps_base_framework.framework_output,
"mca:rmaps: computing ranks by L2cache for job %s",
ORTE_JOBID_PRINT(jdata->jobid));
if (ORTE_SUCCESS != (rc = rank_by(jdata, app, nodes, HWLOC_OBJ_CACHE, 2))) {
ORTE_ERROR_LOG(rc);
}
return rc;
}
if (ORTE_RANK_BY_L1CACHE == ORTE_GET_RANKING_POLICY(map->ranking)) {
opal_output_verbose(5, orte_rmaps_base_framework.framework_output,
"mca:rmaps: computing ranks by L1cache for job %s",
ORTE_JOBID_PRINT(jdata->jobid));
if (ORTE_SUCCESS != (rc = rank_by(jdata, app, nodes, HWLOC_OBJ_CACHE, 1))) {
ORTE_ERROR_LOG(rc);
}
return rc;
}
if (ORTE_RANK_BY_CORE == ORTE_GET_RANKING_POLICY(map->ranking)) {
opal_output_verbose(5, orte_rmaps_base_framework.framework_output,
"mca:rmaps: computing ranks by core for job %s",
ORTE_JOBID_PRINT(jdata->jobid));
if (ORTE_SUCCESS != (rc = rank_by(jdata, app, nodes, HWLOC_OBJ_CORE, 0))) {
ORTE_ERROR_LOG(rc);
}
return rc;
}
if (ORTE_RANK_BY_HWTHREAD == ORTE_GET_RANKING_POLICY(map->ranking)) {
opal_output_verbose(5, orte_rmaps_base_framework.framework_output,
"mca:rmaps: computing ranks by hwthread for job %s",
ORTE_JOBID_PRINT(jdata->jobid));
if (ORTE_SUCCESS != (rc = rank_by(jdata, app, nodes, HWLOC_OBJ_PU, 0))) {
ORTE_ERROR_LOG(rc);
}
return rc;
}
#endif
return ORTE_ERR_NOT_IMPLEMENTED;
}

Просмотреть файл

@ -143,24 +143,80 @@ static int orte_rmaps_rr_map(orte_job_t *jdata)
} else if (ORTE_MAPPING_BYHWTHREAD == ORTE_GET_MAPPING_POLICY(jdata->map->mapping)) {
rc = orte_rmaps_rr_byobj(jdata, app, &node_list, num_slots,
app->num_procs, HWLOC_OBJ_PU, 0);
if (ORTE_ERR_NOT_SUPPORTED == rc) {
/* if the mapper couldn't map by this object because
* it isn't available, but the error allows us to try
* byslot, then do so
*/
rc = orte_rmaps_rr_byslot(jdata, app, &node_list, num_slots,
app->num_procs);
}
} else if (ORTE_MAPPING_BYCORE == ORTE_GET_MAPPING_POLICY(jdata->map->mapping)) {
rc = orte_rmaps_rr_byobj(jdata, app, &node_list, num_slots,
app->num_procs, HWLOC_OBJ_CORE, 0);
if (ORTE_ERR_NOT_SUPPORTED == rc) {
/* if the mapper couldn't map by this object because
* it isn't available, but the error allows us to try
* byslot, then do so
*/
rc = orte_rmaps_rr_byslot(jdata, app, &node_list, num_slots,
app->num_procs);
}
} else if (ORTE_MAPPING_BYL1CACHE == ORTE_GET_MAPPING_POLICY(jdata->map->mapping)) {
rc = orte_rmaps_rr_byobj(jdata, app, &node_list, num_slots,
app->num_procs, HWLOC_OBJ_CACHE, 1);
if (ORTE_ERR_NOT_SUPPORTED == rc) {
/* if the mapper couldn't map by this object because
* it isn't available, but the error allows us to try
* byslot, then do so
*/
rc = orte_rmaps_rr_byslot(jdata, app, &node_list, num_slots,
app->num_procs);
}
} else if (ORTE_MAPPING_BYL2CACHE == ORTE_GET_MAPPING_POLICY(jdata->map->mapping)) {
rc = orte_rmaps_rr_byobj(jdata, app, &node_list, num_slots,
app->num_procs, HWLOC_OBJ_CACHE, 2);
if (ORTE_ERR_NOT_SUPPORTED == rc) {
/* if the mapper couldn't map by this object because
* it isn't available, but the error allows us to try
* byslot, then do so
*/
rc = orte_rmaps_rr_byslot(jdata, app, &node_list, num_slots,
app->num_procs);
}
} else if (ORTE_MAPPING_BYL3CACHE == ORTE_GET_MAPPING_POLICY(jdata->map->mapping)) {
rc = orte_rmaps_rr_byobj(jdata, app, &node_list, num_slots,
app->num_procs, HWLOC_OBJ_CACHE, 3);
if (ORTE_ERR_NOT_SUPPORTED == rc) {
/* if the mapper couldn't map by this object because
* it isn't available, but the error allows us to try
* byslot, then do so
*/
rc = orte_rmaps_rr_byslot(jdata, app, &node_list, num_slots,
app->num_procs);
}
} else if (ORTE_MAPPING_BYSOCKET == ORTE_GET_MAPPING_POLICY(jdata->map->mapping)) {
rc = orte_rmaps_rr_byobj(jdata, app, &node_list, num_slots,
app->num_procs, HWLOC_OBJ_SOCKET, 0);
if (ORTE_ERR_NOT_SUPPORTED == rc) {
/* if the mapper couldn't map by this object because
* it isn't available, but the error allows us to try
* byslot, then do so
*/
rc = orte_rmaps_rr_byslot(jdata, app, &node_list, num_slots,
app->num_procs);
}
} else if (ORTE_MAPPING_BYNUMA == ORTE_GET_MAPPING_POLICY(jdata->map->mapping)) {
rc = orte_rmaps_rr_byobj(jdata, app, &node_list, num_slots,
app->num_procs, HWLOC_OBJ_NODE, 0);
if (ORTE_ERR_NOT_SUPPORTED == rc) {
/* if the mapper couldn't map by this object because
* it isn't available, but the error allows us to try
* byslot, then do so
*/
rc = orte_rmaps_rr_byslot(jdata, app, &node_list, num_slots,
app->num_procs);
}
#endif
} else {
/* unrecognized mapping directive */

Просмотреть файл

@ -539,12 +539,42 @@ int orte_rmaps_rr_byobj(orte_job_t *jdata,
"mca:rmaps:rr:byobj: nprocs-to-assign %d for %d objs on node %s", num_procs_to_assign, nobjs, node->name);
/* if there are no objects of this type, then report the error
* and abort - this can happen, for example, on systems that
* don't report "sockets" as an independent object
* don't report "sockets" as an independent object. However, IF
* this object is the default one - i.e., not specified by the
* user - then we can fall back to mapping by slot
*/
if (0 == nobjs) {
orte_show_help("help-orte-rmaps-base.txt", "orte-rmaps-base:no-objects",
if (ORTE_MAPPING_GIVEN & ORTE_GET_MAPPING_DIRECTIVE(orte_rmaps_base.mapping)) {
orte_show_help("help-orte-rmaps-base.txt", "orte-rmaps-base:no-objects",
true, hwloc_obj_type_string(target), node->name);
return ORTE_ERR_SILENT;
return ORTE_ERR_SILENT;
} else {
/* this was the default mapping policy, so clear the map
* of any prior work and indicate that map-by slot is reqd
*/
for (i=0; i < jdata->map->nodes->size; i++) {
if (NULL == (node = (orte_node_t*)opal_pointer_array_get_item(jdata->map->nodes, i))) {
continue;
}
for (idx=0; idx < node->procs->size; idx++) {
if (NULL == (proc = (orte_proc_t*)opal_pointer_array_get_item(node->procs, idx))) {
continue;
}
if (proc->name.jobid != jdata->jobid) {
continue;
}
--node->num_procs;
OBJ_RELEASE(proc);
opal_pointer_array_set_item(node->procs, idx, NULL);
}
if (0 == node->num_procs) {
node->mapped = false;
OBJ_RELEASE(node);
opal_pointer_array_set_item(jdata->map->nodes, i, NULL);
}
}
return ORTE_ERR_NOT_SUPPORTED;
}
}
/* compute the number of procs to go on each object */