Based on Tetsuya's patch, with some changes: correct the map-by node case where multiple cpus/rank are requested and the result is a non-integer match with the number of slots. Also correct the tests for whether a binding policy was given so that they use the proper macro.
Refs trac:4296. This commit was SVN r30857. The following Trac tickets were found above: Ticket 4296 --> https://svn.open-mpi.org/trac/ompi/ticket/4296
Этот коммит содержится в:
родитель
4572bd58e5
Коммит
61a21e4f31
@ -496,7 +496,7 @@ static int do_child(orte_app_context_t* context,
|
|||||||
msg = "failed to convert bitmap list to hwloc bitmap";
|
msg = "failed to convert bitmap list to hwloc bitmap";
|
||||||
}
|
}
|
||||||
if (OPAL_BINDING_REQUIRED(jobdat->map->binding) &&
|
if (OPAL_BINDING_REQUIRED(jobdat->map->binding) &&
|
||||||
(OPAL_BIND_GIVEN & jobdat->map->binding)) {
|
OPAL_BINDING_POLICY_IS_SET(jobdat->map->binding)) {
|
||||||
/* If binding is required and a binding directive was explicitly
|
/* If binding is required and a binding directive was explicitly
|
||||||
* given (i.e., we are not binding due to a default policy),
|
* given (i.e., we are not binding due to a default policy),
|
||||||
* send an error up the pipe (which exits -- it doesn't return).
|
* send an error up the pipe (which exits -- it doesn't return).
|
||||||
@ -517,7 +517,7 @@ static int do_child(orte_app_context_t* context,
|
|||||||
/* bind as specified */
|
/* bind as specified */
|
||||||
rc = hwloc_set_cpubind(opal_hwloc_topology, cpuset, 0);
|
rc = hwloc_set_cpubind(opal_hwloc_topology, cpuset, 0);
|
||||||
/* if we got an error and this wasn't a default binding policy, then report it */
|
/* if we got an error and this wasn't a default binding policy, then report it */
|
||||||
if (rc < 0 && (OPAL_BIND_GIVEN & jobdat->map->binding)) {
|
if (rc < 0 && OPAL_BINDING_POLICY_IS_SET(jobdat->map->binding)) {
|
||||||
char *tmp = NULL;
|
char *tmp = NULL;
|
||||||
if (errno == ENOSYS) {
|
if (errno == ENOSYS) {
|
||||||
msg = "hwloc indicates cpu binding not supported";
|
msg = "hwloc indicates cpu binding not supported";
|
||||||
@ -579,7 +579,7 @@ static int do_child(orte_app_context_t* context,
|
|||||||
* anything unless the user actually specified the binding policy
|
* anything unless the user actually specified the binding policy
|
||||||
*/
|
*/
|
||||||
rc = opal_hwloc_base_set_process_membind_policy();
|
rc = opal_hwloc_base_set_process_membind_policy();
|
||||||
if (ORTE_SUCCESS != rc && (OPAL_BIND_GIVEN & jobdat->map->binding)) {
|
if (ORTE_SUCCESS != rc && OPAL_BINDING_POLICY_IS_SET(jobdat->map->binding)) {
|
||||||
if (errno == ENOSYS) {
|
if (errno == ENOSYS) {
|
||||||
msg = "hwloc indicates memory binding not supported";
|
msg = "hwloc indicates memory binding not supported";
|
||||||
} else if (errno == EXDEV) {
|
} else if (errno == EXDEV) {
|
||||||
|
@ -179,7 +179,7 @@ static int bind_upwards(orte_job_t *jdata,
|
|||||||
*/
|
*/
|
||||||
if (ncpus < data->num_bound &&
|
if (ncpus < data->num_bound &&
|
||||||
!OPAL_BIND_OVERLOAD_ALLOWED(jdata->map->binding) &&
|
!OPAL_BIND_OVERLOAD_ALLOWED(jdata->map->binding) &&
|
||||||
(OPAL_BIND_GIVEN & opal_hwloc_binding_policy)) {
|
OPAL_BINDING_POLICY_IS_SET(jdata->map->binding)) {
|
||||||
orte_show_help("help-orte-rmaps-base.txt", "rmaps:binding-overload", true,
|
orte_show_help("help-orte-rmaps-base.txt", "rmaps:binding-overload", true,
|
||||||
opal_hwloc_base_print_binding(map->binding), node->name,
|
opal_hwloc_base_print_binding(map->binding), node->name,
|
||||||
data->num_bound, ncpus);
|
data->num_bound, ncpus);
|
||||||
@ -294,7 +294,7 @@ static int bind_downwards(orte_job_t *jdata,
|
|||||||
*/
|
*/
|
||||||
if (ncpus < data->num_bound &&
|
if (ncpus < data->num_bound &&
|
||||||
!OPAL_BIND_OVERLOAD_ALLOWED(jdata->map->binding)) {
|
!OPAL_BIND_OVERLOAD_ALLOWED(jdata->map->binding)) {
|
||||||
if (OPAL_BIND_GIVEN & opal_hwloc_binding_policy) {
|
if (OPAL_BINDING_POLICY_IS_SET(jdata->map->binding)) {
|
||||||
orte_show_help("help-orte-rmaps-base.txt", "rmaps:binding-overload", true,
|
orte_show_help("help-orte-rmaps-base.txt", "rmaps:binding-overload", true,
|
||||||
opal_hwloc_base_print_binding(map->binding), node->name,
|
opal_hwloc_base_print_binding(map->binding), node->name,
|
||||||
data->num_bound, ncpus);
|
data->num_bound, ncpus);
|
||||||
@ -382,8 +382,8 @@ static int bind_in_place(orte_job_t *jdata,
|
|||||||
*/
|
*/
|
||||||
if (!support->cpubind->set_thisproc_cpubind &&
|
if (!support->cpubind->set_thisproc_cpubind &&
|
||||||
!support->cpubind->set_thisthread_cpubind) {
|
!support->cpubind->set_thisthread_cpubind) {
|
||||||
if (!OPAL_BINDING_REQUIRED(opal_hwloc_binding_policy) ||
|
if (!OPAL_BINDING_REQUIRED(map->binding) ||
|
||||||
!(OPAL_BIND_GIVEN & opal_hwloc_binding_policy)) {
|
!OPAL_BINDING_POLICY_IS_SET(map->binding)) {
|
||||||
/* we are not required to bind, so ignore this */
|
/* we are not required to bind, so ignore this */
|
||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
@ -400,7 +400,7 @@ static int bind_in_place(orte_job_t *jdata,
|
|||||||
*/
|
*/
|
||||||
if (!support->membind->set_thisproc_membind &&
|
if (!support->membind->set_thisproc_membind &&
|
||||||
!support->membind->set_thisthread_membind &&
|
!support->membind->set_thisthread_membind &&
|
||||||
(OPAL_BIND_GIVEN & opal_hwloc_binding_policy)) {
|
OPAL_BINDING_POLICY_IS_SET(map->binding)) {
|
||||||
if (OPAL_HWLOC_BASE_MBFA_WARN == opal_hwloc_base_mbfa && !membind_warned) {
|
if (OPAL_HWLOC_BASE_MBFA_WARN == opal_hwloc_base_mbfa && !membind_warned) {
|
||||||
orte_show_help("help-orte-rmaps-base.txt", "rmaps:membind-not-supported", true, node->name);
|
orte_show_help("help-orte-rmaps-base.txt", "rmaps:membind-not-supported", true, node->name);
|
||||||
membind_warned = true;
|
membind_warned = true;
|
||||||
@ -416,7 +416,7 @@ static int bind_in_place(orte_job_t *jdata,
|
|||||||
* computing a binding due to our default policy, and no cores are found
|
* computing a binding due to our default policy, and no cores are found
|
||||||
* on this node, just silently skip it - we will not bind
|
* on this node, just silently skip it - we will not bind
|
||||||
*/
|
*/
|
||||||
if (!(OPAL_BIND_GIVEN & opal_hwloc_binding_policy) &&
|
if (!OPAL_BINDING_POLICY_IS_SET(map->binding) &&
|
||||||
HWLOC_TYPE_DEPTH_UNKNOWN == hwloc_get_type_depth(node->topology, HWLOC_OBJ_CORE)) {
|
HWLOC_TYPE_DEPTH_UNKNOWN == hwloc_get_type_depth(node->topology, HWLOC_OBJ_CORE)) {
|
||||||
opal_output_verbose(5, orte_rmaps_base_framework.framework_output,
|
opal_output_verbose(5, orte_rmaps_base_framework.framework_output,
|
||||||
"Unable to bind-to core by default on node %s as no cores detected",
|
"Unable to bind-to core by default on node %s as no cores detected",
|
||||||
@ -466,7 +466,7 @@ static int bind_in_place(orte_job_t *jdata,
|
|||||||
*/
|
*/
|
||||||
if (ncpus < data->num_bound &&
|
if (ncpus < data->num_bound &&
|
||||||
!OPAL_BIND_OVERLOAD_ALLOWED(jdata->map->binding) &&
|
!OPAL_BIND_OVERLOAD_ALLOWED(jdata->map->binding) &&
|
||||||
(OPAL_BIND_GIVEN & opal_hwloc_binding_policy)) {
|
OPAL_BINDING_POLICY_IS_SET(jdata->map->binding)) {
|
||||||
orte_show_help("help-orte-rmaps-base.txt", "rmaps:binding-overload", true,
|
orte_show_help("help-orte-rmaps-base.txt", "rmaps:binding-overload", true,
|
||||||
opal_hwloc_base_print_binding(map->binding), node->name,
|
opal_hwloc_base_print_binding(map->binding), node->name,
|
||||||
data->num_bound, ncpus);
|
data->num_bound, ncpus);
|
||||||
@ -763,8 +763,8 @@ int orte_rmaps_base_compute_bindings(orte_job_t *jdata)
|
|||||||
*/
|
*/
|
||||||
if (!support->cpubind->set_thisproc_cpubind &&
|
if (!support->cpubind->set_thisproc_cpubind &&
|
||||||
!support->cpubind->set_thisthread_cpubind) {
|
!support->cpubind->set_thisthread_cpubind) {
|
||||||
if (!OPAL_BINDING_REQUIRED(opal_hwloc_binding_policy) ||
|
if (!OPAL_BINDING_REQUIRED(jdata->map->binding) ||
|
||||||
!(OPAL_BIND_GIVEN & opal_hwloc_binding_policy)) {
|
!OPAL_BINDING_POLICY_IS_SET(jdata->map->binding)) {
|
||||||
/* we are not required to bind, so ignore this */
|
/* we are not required to bind, so ignore this */
|
||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
@ -782,7 +782,7 @@ int orte_rmaps_base_compute_bindings(orte_job_t *jdata)
|
|||||||
*/
|
*/
|
||||||
if (!support->membind->set_thisproc_membind &&
|
if (!support->membind->set_thisproc_membind &&
|
||||||
!support->membind->set_thisthread_membind &&
|
!support->membind->set_thisthread_membind &&
|
||||||
(OPAL_BIND_GIVEN & opal_hwloc_binding_policy)) {
|
OPAL_BINDING_POLICY_IS_SET(jdata->map->binding)) {
|
||||||
if (OPAL_HWLOC_BASE_MBFA_WARN == opal_hwloc_base_mbfa && !membind_warned) {
|
if (OPAL_HWLOC_BASE_MBFA_WARN == opal_hwloc_base_mbfa && !membind_warned) {
|
||||||
orte_show_help("help-orte-rmaps-base.txt", "rmaps:membind-not-supported", true, node->name);
|
orte_show_help("help-orte-rmaps-base.txt", "rmaps:membind-not-supported", true, node->name);
|
||||||
membind_warned = true;
|
membind_warned = true;
|
||||||
@ -799,7 +799,7 @@ int orte_rmaps_base_compute_bindings(orte_job_t *jdata)
|
|||||||
* computing a binding due to our default policy, and no cores are found
|
* computing a binding due to our default policy, and no cores are found
|
||||||
* on this node, just silently skip it - we will not bind
|
* on this node, just silently skip it - we will not bind
|
||||||
*/
|
*/
|
||||||
if (!(OPAL_BIND_GIVEN & opal_hwloc_binding_policy) &&
|
if (!OPAL_BINDING_POLICY_IS_SET(jdata->map->binding) &&
|
||||||
HWLOC_TYPE_DEPTH_UNKNOWN == hwloc_get_type_depth(node->topology, HWLOC_OBJ_CORE)) {
|
HWLOC_TYPE_DEPTH_UNKNOWN == hwloc_get_type_depth(node->topology, HWLOC_OBJ_CORE)) {
|
||||||
opal_output_verbose(5, orte_rmaps_base_framework.framework_output,
|
opal_output_verbose(5, orte_rmaps_base_framework.framework_output,
|
||||||
"Unable to bind-to core by default on node %s as no cores detected",
|
"Unable to bind-to core by default on node %s as no cores detected",
|
||||||
|
@ -393,13 +393,7 @@ static int orte_rmaps_base_open(mca_base_open_flag_t flags)
|
|||||||
* bind to those cpus - any other binding policy is an
|
* bind to those cpus - any other binding policy is an
|
||||||
* error
|
* error
|
||||||
*/
|
*/
|
||||||
if (!(OPAL_BIND_GIVEN & OPAL_GET_BINDING_POLICY(opal_hwloc_binding_policy))) {
|
if (OPAL_BINDING_POLICY_IS_SET(opal_hwloc_binding_policy)) {
|
||||||
if (opal_hwloc_use_hwthreads_as_cpus) {
|
|
||||||
OPAL_SET_BINDING_POLICY(opal_hwloc_binding_policy, OPAL_BIND_TO_HWTHREAD);
|
|
||||||
} else {
|
|
||||||
OPAL_SET_BINDING_POLICY(opal_hwloc_binding_policy, OPAL_BIND_TO_CORE);
|
|
||||||
}
|
|
||||||
} else {
|
|
||||||
if (opal_hwloc_use_hwthreads_as_cpus) {
|
if (opal_hwloc_use_hwthreads_as_cpus) {
|
||||||
if (OPAL_BIND_TO_HWTHREAD != OPAL_GET_BINDING_POLICY(opal_hwloc_binding_policy)) {
|
if (OPAL_BIND_TO_HWTHREAD != OPAL_GET_BINDING_POLICY(opal_hwloc_binding_policy)) {
|
||||||
orte_show_help("help-orte-rmaps-base.txt", "mismatch-binding", true,
|
orte_show_help("help-orte-rmaps-base.txt", "mismatch-binding", true,
|
||||||
@ -415,15 +409,21 @@ static int orte_rmaps_base_open(mca_base_open_flag_t flags)
|
|||||||
"bind-to core");
|
"bind-to core");
|
||||||
return ORTE_ERR_SILENT;
|
return ORTE_ERR_SILENT;
|
||||||
}
|
}
|
||||||
|
} else {
|
||||||
|
if (opal_hwloc_use_hwthreads_as_cpus) {
|
||||||
|
OPAL_SET_BINDING_POLICY(opal_hwloc_binding_policy, OPAL_BIND_TO_HWTHREAD);
|
||||||
|
} else {
|
||||||
|
OPAL_SET_BINDING_POLICY(opal_hwloc_binding_policy, OPAL_BIND_TO_CORE);
|
||||||
|
}
|
||||||
}
|
}
|
||||||
#endif
|
#endif
|
||||||
}
|
}
|
||||||
|
|
||||||
if (orte_rmaps_base_pernode) {
|
if (orte_rmaps_base_pernode) {
|
||||||
orte_show_help("help-orte-rmaps-base.txt", "deprecated", true,
|
orte_show_help("help-orte-rmaps-base.txt", "deprecated", true,
|
||||||
"--pernode, -pernode", "--map-by node:PPR=1",
|
"--pernode, -pernode", "--map-by ppr:1:node",
|
||||||
"rmaps_base_pernode, rmaps_ppr_pernode",
|
"rmaps_base_pernode, rmaps_ppr_pernode",
|
||||||
"rmaps_base_mapping_policy=node:PPR=1");
|
"rmaps_base_mapping_policy=ppr:1:node");
|
||||||
/* there is no way to resolve this conflict, so if something else was
|
/* there is no way to resolve this conflict, so if something else was
|
||||||
* given, we have no choice but to error out
|
* given, we have no choice but to error out
|
||||||
*/
|
*/
|
||||||
@ -441,9 +441,9 @@ static int orte_rmaps_base_open(mca_base_open_flag_t flags)
|
|||||||
|
|
||||||
if (0 < orte_rmaps_base_n_pernode) {
|
if (0 < orte_rmaps_base_n_pernode) {
|
||||||
orte_show_help("help-orte-rmaps-base.txt", "deprecated", true,
|
orte_show_help("help-orte-rmaps-base.txt", "deprecated", true,
|
||||||
"--npernode, -npernode", "--map-by node:PPR=N",
|
"--npernode, -npernode", "--map-by ppr:N:node",
|
||||||
"rmaps_base_n_pernode, rmaps_ppr_n_pernode",
|
"rmaps_base_n_pernode, rmaps_ppr_n_pernode",
|
||||||
"rmaps_base_mapping_policy=node:PPR=N");
|
"rmaps_base_mapping_policy=ppr:N:node");
|
||||||
/* there is no way to resolve this conflict, so if something else was
|
/* there is no way to resolve this conflict, so if something else was
|
||||||
* given, we have no choice but to error out
|
* given, we have no choice but to error out
|
||||||
*/
|
*/
|
||||||
@ -461,9 +461,9 @@ static int orte_rmaps_base_open(mca_base_open_flag_t flags)
|
|||||||
|
|
||||||
if (0 < orte_rmaps_base_n_persocket) {
|
if (0 < orte_rmaps_base_n_persocket) {
|
||||||
orte_show_help("help-orte-rmaps-base.txt", "deprecated", true,
|
orte_show_help("help-orte-rmaps-base.txt", "deprecated", true,
|
||||||
"--npersocket, -npersocket", "--map-by socket:PPR=N",
|
"--npersocket, -npersocket", "--map-by ppr:N:socket",
|
||||||
"rmaps_base_n_persocket, rmaps_ppr_n_persocket",
|
"rmaps_base_n_persocket, rmaps_ppr_n_persocket",
|
||||||
"rmaps_base_mapping_policy=socket:PPR=N");
|
"rmaps_base_mapping_policy=ppr:N:socket");
|
||||||
/* there is no way to resolve this conflict, so if something else was
|
/* there is no way to resolve this conflict, so if something else was
|
||||||
* given, we have no choice but to error out
|
* given, we have no choice but to error out
|
||||||
*/
|
*/
|
||||||
|
@ -253,6 +253,8 @@ int orte_rmaps_rr_bynode(orte_job_t *jdata,
|
|||||||
/* compute how many extra procs to put on each node */
|
/* compute how many extra procs to put on each node */
|
||||||
balance = (float)(((int)app->num_procs - nprocs_mapped) - (navg * nnodes)) / (float)nnodes;
|
balance = (float)(((int)app->num_procs - nprocs_mapped) - (navg * nnodes)) / (float)nnodes;
|
||||||
extra_procs_to_assign = (int)balance;
|
extra_procs_to_assign = (int)balance;
|
||||||
|
nxtra_nodes = 0;
|
||||||
|
add_one = false;
|
||||||
if (0 < (balance - (float)extra_procs_to_assign)) {
|
if (0 < (balance - (float)extra_procs_to_assign)) {
|
||||||
/* compute how many nodes need an extra proc */
|
/* compute how many nodes need an extra proc */
|
||||||
nxtra_nodes = ((int)app->num_procs - nprocs_mapped) - ((navg + extra_procs_to_assign) * nnodes);
|
nxtra_nodes = ((int)app->num_procs - nprocs_mapped) - ((navg + extra_procs_to_assign) * nnodes);
|
||||||
@ -289,18 +291,21 @@ int orte_rmaps_rr_bynode(orte_job_t *jdata,
|
|||||||
OBJ_RETAIN(node); /* maintain accounting on object */
|
OBJ_RETAIN(node); /* maintain accounting on object */
|
||||||
++(jdata->map->num_nodes);
|
++(jdata->map->num_nodes);
|
||||||
}
|
}
|
||||||
/* compute the number of procs to go on this node */
|
|
||||||
if (add_one) {
|
|
||||||
if (0 == nxtra_nodes) {
|
|
||||||
--extra_procs_to_assign;
|
|
||||||
add_one = false;
|
|
||||||
} else {
|
|
||||||
--nxtra_nodes;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
if (oversubscribed) {
|
if (oversubscribed) {
|
||||||
|
/* compute the number of procs to go on this node */
|
||||||
|
if (add_one) {
|
||||||
|
if (0 == nxtra_nodes) {
|
||||||
|
--extra_procs_to_assign;
|
||||||
|
add_one = false;
|
||||||
|
} else {
|
||||||
|
--nxtra_nodes;
|
||||||
|
}
|
||||||
|
}
|
||||||
/* everybody just takes their share */
|
/* everybody just takes their share */
|
||||||
num_procs_to_assign = navg + extra_procs_to_assign;
|
num_procs_to_assign = navg + extra_procs_to_assign;
|
||||||
|
} else if (node->slots <= node->slots_inuse) {
|
||||||
|
/* since we are not oversubcribed, ignore this node */
|
||||||
|
continue;
|
||||||
} else {
|
} else {
|
||||||
/* if we are not oversubscribed, then there are enough
|
/* if we are not oversubscribed, then there are enough
|
||||||
* slots to handle all the procs. However, not every
|
* slots to handle all the procs. However, not every
|
||||||
@ -308,44 +313,38 @@ int orte_rmaps_rr_bynode(orte_job_t *jdata,
|
|||||||
* have to track how many procs to "shift" elsewhere
|
* have to track how many procs to "shift" elsewhere
|
||||||
* to make up the difference
|
* to make up the difference
|
||||||
*/
|
*/
|
||||||
if (node->slots <= node->slots_inuse) {
|
|
||||||
/* if there are no extras to take, then we can
|
/* compute the number of procs to go on this node */
|
||||||
* ignore this node
|
if (add_one) {
|
||||||
*/
|
if (0 == nxtra_nodes) {
|
||||||
num_procs_to_assign = 0;
|
--extra_procs_to_assign;
|
||||||
/* update how many we are lagging behind */
|
add_one = false;
|
||||||
lag += navg + extra_procs_to_assign;
|
|
||||||
} else {
|
|
||||||
/* add in the extras */
|
|
||||||
lag += extra_procs_to_assign;
|
|
||||||
/* if slots < avg (adjusted for cpus/proc), then take all */
|
|
||||||
if ((node->slots - node->slots_inuse) < (navg * orte_rmaps_base.cpus_per_rank)) {
|
|
||||||
num_procs_to_assign = (node->slots - node->slots_inuse)/orte_rmaps_base.cpus_per_rank;
|
|
||||||
/* update how many we are lagging behind */
|
|
||||||
lag += navg - num_procs_to_assign;
|
|
||||||
OPAL_OUTPUT_VERBOSE((20, orte_rmaps_base_framework.framework_output,
|
|
||||||
"%s NODE %s LAGGING %d AVG %d ASSIGN %d EXTRA %d",
|
|
||||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), node->name,
|
|
||||||
lag, navg, num_procs_to_assign, extra_procs_to_assign));
|
|
||||||
} else {
|
} else {
|
||||||
/* take the avg plus as much of the "lag" as we can */
|
--nxtra_nodes;
|
||||||
delta = 0;
|
|
||||||
if (0 < lag) {
|
|
||||||
delta = ((node->slots - node->slots_inuse)/orte_rmaps_base.cpus_per_rank) - navg;
|
|
||||||
if (lag < delta) {
|
|
||||||
delta = lag;
|
|
||||||
}
|
|
||||||
lag -= delta;
|
|
||||||
}
|
|
||||||
num_procs_to_assign = navg + delta;
|
|
||||||
OPAL_OUTPUT_VERBOSE((20, orte_rmaps_base_framework.framework_output,
|
|
||||||
"%s NODE %s DELTA %d LAGGING %d AVG %d ASSIGN %d EXTRA %d",
|
|
||||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), node->name, delta,
|
|
||||||
lag, navg, num_procs_to_assign, extra_procs_to_assign));
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
nnodes++; // track how many nodes remain available
|
/* add in the extras */
|
||||||
|
lag += extra_procs_to_assign;
|
||||||
|
/* if slots < avg (adjusted for cpus/proc), then we can't put anything here */
|
||||||
|
if ((node->slots - node->slots_inuse) < (navg * orte_rmaps_base.cpus_per_rank)) {
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
/* take the avg plus as much of the "lag" as we can */
|
||||||
|
delta = 0;
|
||||||
|
if (0 < lag) {
|
||||||
|
delta = ((node->slots - node->slots_inuse)/orte_rmaps_base.cpus_per_rank) - navg;
|
||||||
|
if (lag < delta) {
|
||||||
|
delta = lag;
|
||||||
|
}
|
||||||
|
lag -= delta;
|
||||||
|
}
|
||||||
|
num_procs_to_assign = navg + delta;
|
||||||
|
OPAL_OUTPUT_VERBOSE((20, orte_rmaps_base_framework.framework_output,
|
||||||
|
"%s NODE %s DELTA %d LAGGING %d AVG %d ASSIGN %d EXTRA %d",
|
||||||
|
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), node->name, delta,
|
||||||
|
lag, navg, num_procs_to_assign, extra_procs_to_assign));
|
||||||
}
|
}
|
||||||
|
nnodes++; // track how many nodes remain available
|
||||||
OPAL_OUTPUT_VERBOSE((20, orte_rmaps_base_framework.framework_output,
|
OPAL_OUTPUT_VERBOSE((20, orte_rmaps_base_framework.framework_output,
|
||||||
"%s NODE %s ASSIGNING %d",
|
"%s NODE %s ASSIGNING %d",
|
||||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), node->name,
|
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), node->name,
|
||||||
@ -386,6 +385,9 @@ int orte_rmaps_rr_bynode(orte_job_t *jdata,
|
|||||||
obj = hwloc_get_root_obj(node->topology);
|
obj = hwloc_get_root_obj(node->topology);
|
||||||
}
|
}
|
||||||
#endif
|
#endif
|
||||||
|
OPAL_OUTPUT_VERBOSE((20, orte_rmaps_base_framework.framework_output,
|
||||||
|
"%s ADDING PROC TO NODE %s",
|
||||||
|
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), node->name));
|
||||||
if (NULL == (proc = orte_rmaps_base_setup_proc(jdata, node, app->idx))) {
|
if (NULL == (proc = orte_rmaps_base_setup_proc(jdata, node, app->idx))) {
|
||||||
return ORTE_ERR_OUT_OF_RESOURCE;
|
return ORTE_ERR_OUT_OF_RESOURCE;
|
||||||
}
|
}
|
||||||
|
Загрузка…
x
Ссылка в новой задаче
Block a user