diff --git a/orte/mca/odls/default/odls_default_module.c b/orte/mca/odls/default/odls_default_module.c index 9d18126c3b..01dd2afdd0 100644 --- a/orte/mca/odls/default/odls_default_module.c +++ b/orte/mca/odls/default/odls_default_module.c @@ -496,7 +496,7 @@ static int do_child(orte_app_context_t* context, msg = "failed to convert bitmap list to hwloc bitmap"; } if (OPAL_BINDING_REQUIRED(jobdat->map->binding) && - (OPAL_BIND_GIVEN & jobdat->map->binding)) { + OPAL_BINDING_POLICY_IS_SET(jobdat->map->binding)) { /* If binding is required and a binding directive was explicitly * given (i.e., we are not binding due to a default policy), * send an error up the pipe (which exits -- it doesn't return). @@ -517,7 +517,7 @@ static int do_child(orte_app_context_t* context, /* bind as specified */ rc = hwloc_set_cpubind(opal_hwloc_topology, cpuset, 0); /* if we got an error and this wasn't a default binding policy, then report it */ - if (rc < 0 && (OPAL_BIND_GIVEN & jobdat->map->binding)) { + if (rc < 0 && OPAL_BINDING_POLICY_IS_SET(jobdat->map->binding)) { char *tmp = NULL; if (errno == ENOSYS) { msg = "hwloc indicates cpu binding not supported"; @@ -579,7 +579,7 @@ static int do_child(orte_app_context_t* context, * anything unless the user actually specified the binding policy */ rc = opal_hwloc_base_set_process_membind_policy(); - if (ORTE_SUCCESS != rc && (OPAL_BIND_GIVEN & jobdat->map->binding)) { + if (ORTE_SUCCESS != rc && OPAL_BINDING_POLICY_IS_SET(jobdat->map->binding)) { if (errno == ENOSYS) { msg = "hwloc indicates memory binding not supported"; } else if (errno == EXDEV) { diff --git a/orte/mca/rmaps/base/rmaps_base_binding.c b/orte/mca/rmaps/base/rmaps_base_binding.c index aeec6e2ed1..a675c422b8 100644 --- a/orte/mca/rmaps/base/rmaps_base_binding.c +++ b/orte/mca/rmaps/base/rmaps_base_binding.c @@ -179,7 +179,7 @@ static int bind_upwards(orte_job_t *jdata, */ if (ncpus < data->num_bound && !OPAL_BIND_OVERLOAD_ALLOWED(jdata->map->binding) && - (OPAL_BIND_GIVEN & 
opal_hwloc_binding_policy)) { + OPAL_BINDING_POLICY_IS_SET(jdata->map->binding)) { orte_show_help("help-orte-rmaps-base.txt", "rmaps:binding-overload", true, opal_hwloc_base_print_binding(map->binding), node->name, data->num_bound, ncpus); @@ -294,7 +294,7 @@ static int bind_downwards(orte_job_t *jdata, */ if (ncpus < data->num_bound && !OPAL_BIND_OVERLOAD_ALLOWED(jdata->map->binding)) { - if (OPAL_BIND_GIVEN & opal_hwloc_binding_policy) { + if (OPAL_BINDING_POLICY_IS_SET(jdata->map->binding)) { orte_show_help("help-orte-rmaps-base.txt", "rmaps:binding-overload", true, opal_hwloc_base_print_binding(map->binding), node->name, data->num_bound, ncpus); @@ -382,8 +382,8 @@ static int bind_in_place(orte_job_t *jdata, */ if (!support->cpubind->set_thisproc_cpubind && !support->cpubind->set_thisthread_cpubind) { - if (!OPAL_BINDING_REQUIRED(opal_hwloc_binding_policy) || - !(OPAL_BIND_GIVEN & opal_hwloc_binding_policy)) { + if (!OPAL_BINDING_REQUIRED(map->binding) || + !OPAL_BINDING_POLICY_IS_SET(map->binding)) { /* we are not required to bind, so ignore this */ continue; } @@ -400,7 +400,7 @@ static int bind_in_place(orte_job_t *jdata, */ if (!support->membind->set_thisproc_membind && !support->membind->set_thisthread_membind && - (OPAL_BIND_GIVEN & opal_hwloc_binding_policy)) { + OPAL_BINDING_POLICY_IS_SET(map->binding)) { if (OPAL_HWLOC_BASE_MBFA_WARN == opal_hwloc_base_mbfa && !membind_warned) { orte_show_help("help-orte-rmaps-base.txt", "rmaps:membind-not-supported", true, node->name); membind_warned = true; @@ -416,7 +416,7 @@ static int bind_in_place(orte_job_t *jdata, * computing a binding due to our default policy, and no cores are found * on this node, just silently skip it - we will not bind */ - if (!(OPAL_BIND_GIVEN & opal_hwloc_binding_policy) && + if (!OPAL_BINDING_POLICY_IS_SET(map->binding) && HWLOC_TYPE_DEPTH_UNKNOWN == hwloc_get_type_depth(node->topology, HWLOC_OBJ_CORE)) { opal_output_verbose(5, orte_rmaps_base_framework.framework_output, "Unable to 
bind-to core by default on node %s as no cores detected", @@ -466,7 +466,7 @@ static int bind_in_place(orte_job_t *jdata, */ if (ncpus < data->num_bound && !OPAL_BIND_OVERLOAD_ALLOWED(jdata->map->binding) && - (OPAL_BIND_GIVEN & opal_hwloc_binding_policy)) { + OPAL_BINDING_POLICY_IS_SET(jdata->map->binding)) { orte_show_help("help-orte-rmaps-base.txt", "rmaps:binding-overload", true, opal_hwloc_base_print_binding(map->binding), node->name, data->num_bound, ncpus); @@ -763,8 +763,8 @@ int orte_rmaps_base_compute_bindings(orte_job_t *jdata) */ if (!support->cpubind->set_thisproc_cpubind && !support->cpubind->set_thisthread_cpubind) { - if (!OPAL_BINDING_REQUIRED(opal_hwloc_binding_policy) || - !(OPAL_BIND_GIVEN & opal_hwloc_binding_policy)) { + if (!OPAL_BINDING_REQUIRED(jdata->map->binding) || + !OPAL_BINDING_POLICY_IS_SET(jdata->map->binding)) { /* we are not required to bind, so ignore this */ continue; } @@ -782,7 +782,7 @@ int orte_rmaps_base_compute_bindings(orte_job_t *jdata) */ if (!support->membind->set_thisproc_membind && !support->membind->set_thisthread_membind && - (OPAL_BIND_GIVEN & opal_hwloc_binding_policy)) { + OPAL_BINDING_POLICY_IS_SET(jdata->map->binding)) { if (OPAL_HWLOC_BASE_MBFA_WARN == opal_hwloc_base_mbfa && !membind_warned) { orte_show_help("help-orte-rmaps-base.txt", "rmaps:membind-not-supported", true, node->name); membind_warned = true; @@ -799,7 +799,7 @@ int orte_rmaps_base_compute_bindings(orte_job_t *jdata) * computing a binding due to our default policy, and no cores are found * on this node, just silently skip it - we will not bind */ - if (!(OPAL_BIND_GIVEN & opal_hwloc_binding_policy) && + if (!OPAL_BINDING_POLICY_IS_SET(jdata->map->binding) && HWLOC_TYPE_DEPTH_UNKNOWN == hwloc_get_type_depth(node->topology, HWLOC_OBJ_CORE)) { opal_output_verbose(5, orte_rmaps_base_framework.framework_output, "Unable to bind-to core by default on node %s as no cores detected", diff --git a/orte/mca/rmaps/base/rmaps_base_frame.c 
b/orte/mca/rmaps/base/rmaps_base_frame.c index 0bf46c470e..04c60a1bdb 100644 --- a/orte/mca/rmaps/base/rmaps_base_frame.c +++ b/orte/mca/rmaps/base/rmaps_base_frame.c @@ -393,13 +393,7 @@ static int orte_rmaps_base_open(mca_base_open_flag_t flags) * bind to those cpus - any other binding policy is an * error */ - if (!(OPAL_BIND_GIVEN & OPAL_GET_BINDING_POLICY(opal_hwloc_binding_policy))) { - if (opal_hwloc_use_hwthreads_as_cpus) { - OPAL_SET_BINDING_POLICY(opal_hwloc_binding_policy, OPAL_BIND_TO_HWTHREAD); - } else { - OPAL_SET_BINDING_POLICY(opal_hwloc_binding_policy, OPAL_BIND_TO_CORE); - } - } else { + if (OPAL_BINDING_POLICY_IS_SET(opal_hwloc_binding_policy)) { if (opal_hwloc_use_hwthreads_as_cpus) { if (OPAL_BIND_TO_HWTHREAD != OPAL_GET_BINDING_POLICY(opal_hwloc_binding_policy)) { orte_show_help("help-orte-rmaps-base.txt", "mismatch-binding", true, @@ -415,15 +409,21 @@ static int orte_rmaps_base_open(mca_base_open_flag_t flags) "bind-to core"); return ORTE_ERR_SILENT; } + } else { + if (opal_hwloc_use_hwthreads_as_cpus) { + OPAL_SET_BINDING_POLICY(opal_hwloc_binding_policy, OPAL_BIND_TO_HWTHREAD); + } else { + OPAL_SET_BINDING_POLICY(opal_hwloc_binding_policy, OPAL_BIND_TO_CORE); + } } #endif } if (orte_rmaps_base_pernode) { orte_show_help("help-orte-rmaps-base.txt", "deprecated", true, - "--pernode, -pernode", "--map-by node:PPR=1", + "--pernode, -pernode", "--map-by ppr:1:node", "rmaps_base_pernode, rmaps_ppr_pernode", - "rmaps_base_mapping_policy=node:PPR=1"); + "rmaps_base_mapping_policy=ppr:1:node"); /* there is no way to resolve this conflict, so if something else was * given, we have no choice but to error out */ @@ -441,9 +441,9 @@ static int orte_rmaps_base_open(mca_base_open_flag_t flags) if (0 < orte_rmaps_base_n_pernode) { orte_show_help("help-orte-rmaps-base.txt", "deprecated", true, - "--npernode, -npernode", "--map-by node:PPR=N", + "--npernode, -npernode", "--map-by ppr:N:node", "rmaps_base_n_pernode, rmaps_ppr_n_pernode", - 
"rmaps_base_mapping_policy=node:PPR=N"); + "rmaps_base_mapping_policy=ppr:N:node"); /* there is no way to resolve this conflict, so if something else was * given, we have no choice but to error out */ @@ -461,9 +461,9 @@ static int orte_rmaps_base_open(mca_base_open_flag_t flags) if (0 < orte_rmaps_base_n_persocket) { orte_show_help("help-orte-rmaps-base.txt", "deprecated", true, - "--npersocket, -npersocket", "--map-by socket:PPR=N", + "--npersocket, -npersocket", "--map-by ppr:N:socket", "rmaps_base_n_persocket, rmaps_ppr_n_persocket", - "rmaps_base_mapping_policy=socket:PPR=N"); + "rmaps_base_mapping_policy=ppr:N:socket"); /* there is no way to resolve this conflict, so if something else was * given, we have no choice but to error out */ diff --git a/orte/mca/rmaps/round_robin/rmaps_rr_mappers.c b/orte/mca/rmaps/round_robin/rmaps_rr_mappers.c index 82bc786b4f..00f70513bc 100644 --- a/orte/mca/rmaps/round_robin/rmaps_rr_mappers.c +++ b/orte/mca/rmaps/round_robin/rmaps_rr_mappers.c @@ -253,6 +253,8 @@ int orte_rmaps_rr_bynode(orte_job_t *jdata, /* compute how many extra procs to put on each node */ balance = (float)(((int)app->num_procs - nprocs_mapped) - (navg * nnodes)) / (float)nnodes; extra_procs_to_assign = (int)balance; + nxtra_nodes = 0; + add_one = false; if (0 < (balance - (float)extra_procs_to_assign)) { /* compute how many nodes need an extra proc */ nxtra_nodes = ((int)app->num_procs - nprocs_mapped) - ((navg + extra_procs_to_assign) * nnodes); @@ -289,18 +291,21 @@ int orte_rmaps_rr_bynode(orte_job_t *jdata, OBJ_RETAIN(node); /* maintain accounting on object */ ++(jdata->map->num_nodes); } - /* compute the number of procs to go on this node */ - if (add_one) { - if (0 == nxtra_nodes) { - --extra_procs_to_assign; - add_one = false; - } else { - --nxtra_nodes; - } - } if (oversubscribed) { + /* compute the number of procs to go on this node */ + if (add_one) { + if (0 == nxtra_nodes) { + --extra_procs_to_assign; + add_one = false; + } else { + 
--nxtra_nodes; + } + } /* everybody just takes their share */ num_procs_to_assign = navg + extra_procs_to_assign; + } else if (node->slots <= node->slots_inuse) { + /* since we are not oversubscribed, ignore this node */ + continue; } else { /* if we are not oversubscribed, then there are enough * slots to handle all the procs. However, not every @@ -308,44 +313,38 @@ int orte_rmaps_rr_bynode(orte_job_t *jdata, * have to track how many procs to "shift" elsewhere * to make up the difference */ - if (node->slots <= node->slots_inuse) { - /* if there are no extras to take, then we can - * ignore this node - */ - num_procs_to_assign = 0; - /* update how many we are lagging behind */ - lag += navg + extra_procs_to_assign; - } else { - /* add in the extras */ - lag += extra_procs_to_assign; - /* if slots < avg (adjusted for cpus/proc), then take all */ - if ((node->slots - node->slots_inuse) < (navg * orte_rmaps_base.cpus_per_rank)) { - num_procs_to_assign = (node->slots - node->slots_inuse)/orte_rmaps_base.cpus_per_rank; - /* update how many we are lagging behind */ - lag += navg - num_procs_to_assign; - OPAL_OUTPUT_VERBOSE((20, orte_rmaps_base_framework.framework_output, - "%s NODE %s LAGGING %d AVG %d ASSIGN %d EXTRA %d", - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), node->name, - lag, navg, num_procs_to_assign, extra_procs_to_assign)); + + /* compute the number of procs to go on this node */ + if (add_one) { + if (0 == nxtra_nodes) { + --extra_procs_to_assign; + add_one = false; } else { - /* take the avg plus as much of the "lag" as we can */ - delta = 0; - if (0 < lag) { - delta = ((node->slots - node->slots_inuse)/orte_rmaps_base.cpus_per_rank) - navg; - if (lag < delta) { - delta = lag; - } - lag -= delta; - } - num_procs_to_assign = navg + delta; - OPAL_OUTPUT_VERBOSE((20, orte_rmaps_base_framework.framework_output, - "%s NODE %s DELTA %d LAGGING %d AVG %d ASSIGN %d EXTRA %d", - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), node->name, delta, - lag, navg, num_procs_to_assign, 
extra_procs_to_assign)); + --nxtra_nodes; } } - nnodes++; // track how many nodes remain available + /* add in the extras */ + lag += extra_procs_to_assign; + /* if slots < avg (adjusted for cpus/proc), then we can't put anything here */ + if ((node->slots - node->slots_inuse) < (navg * orte_rmaps_base.cpus_per_rank)) { + continue; + } + /* take the avg plus as much of the "lag" as we can */ + delta = 0; + if (0 < lag) { + delta = ((node->slots - node->slots_inuse)/orte_rmaps_base.cpus_per_rank) - navg; + if (lag < delta) { + delta = lag; + } + lag -= delta; + } + num_procs_to_assign = navg + delta; + OPAL_OUTPUT_VERBOSE((20, orte_rmaps_base_framework.framework_output, + "%s NODE %s DELTA %d LAGGING %d AVG %d ASSIGN %d EXTRA %d", + ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), node->name, delta, + lag, navg, num_procs_to_assign, extra_procs_to_assign)); } + nnodes++; // track how many nodes remain available OPAL_OUTPUT_VERBOSE((20, orte_rmaps_base_framework.framework_output, "%s NODE %s ASSIGNING %d", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), node->name, @@ -386,6 +385,9 @@ int orte_rmaps_rr_bynode(orte_job_t *jdata, obj = hwloc_get_root_obj(node->topology); } #endif + OPAL_OUTPUT_VERBOSE((20, orte_rmaps_base_framework.framework_output, + "%s ADDING PROC TO NODE %s", + ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), node->name)); if (NULL == (proc = orte_rmaps_base_setup_proc(jdata, node, app->idx))) { return ORTE_ERR_OUT_OF_RESOURCE; }