From 9ae80596f653155516251c98a402e907106383d2 Mon Sep 17 00:00:00 2001 From: Ralph Castain Date: Wed, 25 Apr 2018 11:25:39 -0700 Subject: [PATCH] Fix rank-by option and improve npernode/skt This fixes a problem reported by @bgoglin where rank-by was incorrectly generating values when ranking by a type of object (e.g., socket). It also corrects the handling of the pernode, npernode, and npersocket options - these should only set the #procs and the default mapping pattern. They specifically should not prohibit the user from requesting a different mapping. Thus, the following should be valid: mpirun -npernode 2 --map-by socket ... should put 2 procs on each node, mapping them by-socket on each node. Signed-off-by: Ralph Castain --- orte/mca/rmaps/base/rmaps_base_frame.c | 61 +++++++------------ orte/mca/rmaps/base/rmaps_base_map_job.c | 42 ++++++++----- orte/mca/rmaps/base/rmaps_base_ranking.c | 4 +- orte/mca/rmaps/round_robin/rmaps_rr_mappers.c | 48 +++++++++++++-- 4 files changed, 94 insertions(+), 61 deletions(-) diff --git a/orte/mca/rmaps/base/rmaps_base_frame.c b/orte/mca/rmaps/base/rmaps_base_frame.c index df63ed7963..1210e05e89 100644 --- a/orte/mca/rmaps/base/rmaps_base_frame.c +++ b/orte/mca/rmaps/base/rmaps_base_frame.c @@ -12,7 +12,7 @@ * Copyright (c) 2006-2015 Cisco Systems, Inc. All rights reserved. * Copyright (c) 2011-2013 Los Alamos National Security, LLC. * All rights reserved. - * Copyright (c) 2014-2017 Intel, Inc. All rights reserved. + * Copyright (c) 2014-2018 Intel, Inc. All rights reserved. * Copyright (c) 2014-2015 Research Organization for Information Science * and Technology (RIST). All rights reserved. * $COPYRIGHT$ @@ -427,51 +427,36 @@ static int orte_rmaps_base_open(mca_base_open_flag_t flags) } if (orte_rmaps_base_pernode) { - /* there is no way to resolve this conflict, so if something else was - * given, we have no choice but to error out - */ - if (ORTE_MAPPING_GIVEN & ORTE_GET_MAPPING_DIRECTIVE(orte_rmaps_base.mapping)) { - orte_show_help("help-orte-rmaps-base.txt", "redefining-policy", true, "mapping", - "bynode", orte_rmaps_base_print_mapping(orte_rmaps_base.mapping)); - return ORTE_ERR_SILENT; + /* if the user didn't specify a mapping directive, then match it */ + if (!(ORTE_MAPPING_GIVEN & ORTE_GET_MAPPING_DIRECTIVE(orte_rmaps_base.mapping))) { + /* ensure we set the mapping policy to ppr */ + ORTE_SET_MAPPING_POLICY(orte_rmaps_base.mapping, ORTE_MAPPING_PPR); + ORTE_SET_MAPPING_DIRECTIVE(orte_rmaps_base.mapping, ORTE_MAPPING_GIVEN); + /* define the ppr */ + orte_rmaps_base.ppr = strdup("1:node"); } - /* ensure we set the mapping policy to ppr */ - ORTE_SET_MAPPING_POLICY(orte_rmaps_base.mapping, ORTE_MAPPING_PPR); - ORTE_SET_MAPPING_DIRECTIVE(orte_rmaps_base.mapping, ORTE_MAPPING_GIVEN); - /* define the ppr */ - orte_rmaps_base.ppr = strdup("1:node"); } if (0 < orte_rmaps_base_n_pernode) { - /* there is no way to resolve this conflict, so if something else was - * given, we have no choice but to error out - */ - if (ORTE_MAPPING_GIVEN & ORTE_GET_MAPPING_DIRECTIVE(orte_rmaps_base.mapping)) { - orte_show_help("help-orte-rmaps-base.txt", "redefining-policy", true, "mapping", - "bynode", orte_rmaps_base_print_mapping(orte_rmaps_base.mapping)); - return ORTE_ERR_SILENT; - } - /* ensure we set the mapping policy to ppr */ - ORTE_SET_MAPPING_POLICY(orte_rmaps_base.mapping, ORTE_MAPPING_PPR); - ORTE_SET_MAPPING_DIRECTIVE(orte_rmaps_base.mapping, ORTE_MAPPING_GIVEN); - /* define the ppr */ - asprintf(&orte_rmaps_base.ppr, "%d:node", orte_rmaps_base_n_pernode); + /* if the user didn't specify a mapping directive, then match it */ + if (!(ORTE_MAPPING_GIVEN & ORTE_GET_MAPPING_DIRECTIVE(orte_rmaps_base.mapping))) { + /* ensure we set the mapping policy to ppr */ + ORTE_SET_MAPPING_POLICY(orte_rmaps_base.mapping, ORTE_MAPPING_PPR); + ORTE_SET_MAPPING_DIRECTIVE(orte_rmaps_base.mapping, ORTE_MAPPING_GIVEN); + /* define the ppr */ + asprintf(&orte_rmaps_base.ppr, "%d:node", orte_rmaps_base_n_pernode); + } } if (0 < orte_rmaps_base_n_persocket) { - /* there is no way to resolve this conflict, so if something else was - * given, we have no choice but to error out - */ - if (ORTE_MAPPING_GIVEN & ORTE_GET_MAPPING_DIRECTIVE(orte_rmaps_base.mapping)) { - orte_show_help("help-orte-rmaps-base.txt", "redefining-policy", true, "mapping", - "bynode", orte_rmaps_base_print_mapping(orte_rmaps_base.mapping)); - return ORTE_ERR_SILENT; + /* if the user didn't specify a mapping directive, then match it */ + if (!(ORTE_MAPPING_GIVEN & ORTE_GET_MAPPING_DIRECTIVE(orte_rmaps_base.mapping))) { + /* ensure we set the mapping policy to ppr */ + ORTE_SET_MAPPING_POLICY(orte_rmaps_base.mapping, ORTE_MAPPING_PPR); + ORTE_SET_MAPPING_DIRECTIVE(orte_rmaps_base.mapping, ORTE_MAPPING_GIVEN); + /* define the ppr */ + asprintf(&orte_rmaps_base.ppr, "%d:socket", orte_rmaps_base_n_persocket); } - /* ensure we set the mapping policy to ppr */ - ORTE_SET_MAPPING_POLICY(orte_rmaps_base.mapping, ORTE_MAPPING_PPR); - ORTE_SET_MAPPING_DIRECTIVE(orte_rmaps_base.mapping, ORTE_MAPPING_GIVEN); - /* define the ppr */ - asprintf(&orte_rmaps_base.ppr, "%d:socket", orte_rmaps_base_n_persocket); } /* Should we schedule on the local node or not? */ diff --git a/orte/mca/rmaps/base/rmaps_base_map_job.c b/orte/mca/rmaps/base/rmaps_base_map_job.c index 925c2305db..6fdc6cc1a1 100644 --- a/orte/mca/rmaps/base/rmaps_base_map_job.c +++ b/orte/mca/rmaps/base/rmaps_base_map_job.c @@ -49,7 +49,7 @@ void orte_rmaps_base_map_job(int fd, short args, void *cbdata) orte_job_t *jdata; orte_node_t *node; int rc, i, ppx = 0; - bool did_map, given, pernode = false; + bool did_map, given, pernode = false, persocket = false; orte_rmaps_base_selected_module_t *mod; orte_job_t *parent; orte_vpid_t nprocs; @@ -75,6 +75,17 @@ void orte_rmaps_base_map_job(int fd, short args, void *cbdata) } else { pernode = false; } + } else { + if (orte_rmaps_base_pernode) { + ppx = 1; + pernode = true; + } else if (0 < orte_rmaps_base_n_pernode) { + ppx = orte_rmaps_base_n_pernode; + pernode = true; + } else if (0 < orte_rmaps_base_n_persocket) { + ppx = orte_rmaps_base_n_persocket; + persocket = true; + } } if (0 == jdata->map->cpus_per_rank) { jdata->map->cpus_per_rank = orte_rmaps_base.cpus_per_rank; @@ -89,18 +100,15 @@ void orte_rmaps_base_map_job(int fd, short args, void *cbdata) orte_std_cntr_t slots; OBJ_CONSTRUCT(&nodes, opal_list_t); orte_rmaps_base_get_target_nodes(&nodes, &slots, app, ORTE_MAPPING_BYNODE, true, true); - if (NULL != jdata->map->ppr) { - if (pernode) { - nprocs += ppx * opal_list_get_size(&nodes); - } else { - /* must be procs/socket, so add in #sockets for each node */ - slots = 0; - OPAL_LIST_FOREACH(node, &nodes, orte_node_t) { - slots += ppx * opal_hwloc_base_get_nbobjs_by_type(node->topology->topo, - HWLOC_OBJ_SOCKET, 0, - OPAL_HWLOC_AVAILABLE); - } - nprocs += slots; + slots = 0; + if (pernode) { + slots = ppx * opal_list_get_size(&nodes); + } else if (persocket) { + /* add in #sockets for each node */ + OPAL_LIST_FOREACH(node, &nodes, orte_node_t) { + slots += ppx * opal_hwloc_base_get_nbobjs_by_type(node->topology->topo, + HWLOC_OBJ_SOCKET, 0, + OPAL_HWLOC_AVAILABLE); } } else { /* if we are in a managed allocation, then all is good - otherwise, @@ -126,13 +134,15 @@ void orte_rmaps_base_map_job(int fd, short args, void *cbdata) ORTE_ACTIVATE_JOB_STATE(jdata, ORTE_JOB_STATE_MAP_FAILED); return; } + OPAL_LIST_FOREACH(node, &nodes, orte_node_t) { + slots += node->slots; + } } - nprocs += slots; } + app->num_procs = slots; OPAL_LIST_DESTRUCT(&nodes); - } else { - nprocs += app->num_procs; } + nprocs += app->num_procs; } } diff --git a/orte/mca/rmaps/base/rmaps_base_ranking.c b/orte/mca/rmaps/base/rmaps_base_ranking.c index 9eaea79ccf..a2eb75b0f9 100644 --- a/orte/mca/rmaps/base/rmaps_base_ranking.c +++ b/orte/mca/rmaps/base/rmaps_base_ranking.c @@ -378,9 +378,9 @@ static int rank_by(orte_job_t *jdata, * algorithm, but this works for now. */ i = 0; - while (cnt < app->num_procs) { + while (cnt < app->num_procs && i < node->num_procs) { /* get the next object */ - obj = (hwloc_obj_t)opal_pointer_array_get_item(&objs, i); + obj = (hwloc_obj_t)opal_pointer_array_get_item(&objs, i % num_objs); if (NULL == obj) { break; } diff --git a/orte/mca/rmaps/round_robin/rmaps_rr_mappers.c b/orte/mca/rmaps/round_robin/rmaps_rr_mappers.c index 9cc335e307..dda356208b 100644 --- a/orte/mca/rmaps/round_robin/rmaps_rr_mappers.c +++ b/orte/mca/rmaps/round_robin/rmaps_rr_mappers.c @@ -85,8 +85,16 @@ int orte_rmaps_rr_byslot(orte_job_t *jdata, node->name); continue; } - /* assign a number of procs equal to the number of available slots */ - num_procs_to_assign = node->slots - node->slots_inuse; + if (orte_rmaps_base_pernode) { + num_procs_to_assign = 1; + } else if (0 < orte_rmaps_base_n_pernode) { + num_procs_to_assign = orte_rmaps_base_n_pernode; + } else if (0 < orte_rmaps_base_n_persocket) { + num_procs_to_assign = orte_rmaps_base_n_persocket * opal_hwloc_base_get_nbobjs_by_type(node->topology->topo, HWLOC_OBJ_PACKAGE, 0, OPAL_HWLOC_AVAILABLE); + } else { + /* assign a number of procs equal to the number of available slots */ + num_procs_to_assign = node->slots - node->slots_inuse; + } opal_output_verbose(2, orte_rmaps_base_framework.framework_output, "mca:rmaps:rr:slot assigning %d procs to node %s", (int)num_procs_to_assign, node->name); @@ -292,7 +300,13 @@ int orte_rmaps_rr_bynode(orte_job_t *jdata, opal_pointer_array_add(jdata->map->nodes, node); ++(jdata->map->num_nodes); } - if (oversubscribed) { + if (orte_rmaps_base_pernode) { + num_procs_to_assign = 1; + } else if (0 < orte_rmaps_base_n_pernode) { + num_procs_to_assign = orte_rmaps_base_n_pernode; + } else if (0 < orte_rmaps_base_n_persocket) { + num_procs_to_assign = orte_rmaps_base_n_persocket * opal_hwloc_base_get_nbobjs_by_type(node->topology->topo, HWLOC_OBJ_PACKAGE, 0, OPAL_HWLOC_AVAILABLE); + } else if (oversubscribed) { /* compute the number of procs to go on this node */ if (add_one) { if (0 == nxtra_nodes) { @@ -518,7 +532,19 @@ int orte_rmaps_rr_byobj(orte_job_t *jdata, start = (jdata->bkmark_obj + 1) % nobjs; } /* compute the number of procs to go on this node */ - nprocs = node->slots - node->slots_inuse; + if (orte_rmaps_base_pernode) { + nprocs = 1; + } else if (0 < orte_rmaps_base_n_pernode) { + nprocs = orte_rmaps_base_n_pernode; + } else if (0 < orte_rmaps_base_n_persocket) { + if (HWLOC_OBJ_PACKAGE == target) { + nprocs = orte_rmaps_base_n_persocket * nobjs; + } else { + nprocs = orte_rmaps_base_n_persocket * opal_hwloc_base_get_nbobjs_by_type(node->topology->topo, HWLOC_OBJ_PACKAGE, 0, OPAL_HWLOC_AVAILABLE); + } + } else { + nprocs = node->slots - node->slots_inuse; + } opal_output_verbose(2, orte_rmaps_base_framework.framework_output, "mca:rmaps:rr: calculated nprocs %d", nprocs); if (nprocs < 1) { @@ -708,7 +734,19 @@ static int byobj_span(orte_job_t *jdata, return ORTE_ERR_SILENT; } /* determine how many to map */ - nprocs = navg; + if (orte_rmaps_base_pernode) { + nprocs = 1; + } else if (0 < orte_rmaps_base_n_pernode) { + nprocs = orte_rmaps_base_n_pernode; + } else if (0 < orte_rmaps_base_n_persocket) { + if (HWLOC_OBJ_PACKAGE == target) { + nprocs = orte_rmaps_base_n_persocket * nobjs; + } else { + nprocs = orte_rmaps_base_n_persocket * opal_hwloc_base_get_nbobjs_by_type(node->topology->topo, HWLOC_OBJ_PACKAGE, 0, OPAL_HWLOC_AVAILABLE); + } + } else { + nprocs = navg; + } if (0 < nxtra_objs) { nprocs++; nxtra_objs--;