Modify the OMPI paffinity and mapping system to support socket-level mapping and binding. Mostly refactors existing code, with modifications to the odls_default module to support the new capabilities.
Adds several new mpirun options:
* -bysocket - assign ranks on a node by socket. Effectively load balances the procs assigned to a node across the available sockets. Note that ranks can still be bound to a specific core within the socket, or to the entire socket - the mapping is independent of the binding.
* -bind-to-socket - bind each rank to all the cores on the socket to which it is assigned.
* -bind-to-core - currently the default behavior (maintained from prior default).
* -npersocket N - launch N procs for every socket on a node. Note that this implies we know how many sockets are on a node. Mpirun will determine its local values. These can be overridden by provided values, either via MCA param or in a hostfile.
Similar features/options are provided at the board level for multi-board nodes. Documentation to follow...
This commit was SVN r21791.
Этот коммит содержится в:
родитель
007cbe74f4
Коммит
1dc12046f1
opal/mca/paffinity
orte
mca
odls
plm/base
rmaps
base
base.h
rmaps_base_common_mappers.c
rmaps_base_map_job.c
rmaps_base_open.c
rmaps_base_support_fns.c
rmaps_private.h
load_balance
rank_file
resilient
rmaps_types.h
round_robin
seq
topo
runtime
tools/orterun
util
@ -108,6 +108,11 @@
|
|||||||
#define OPAL_PROC_ON_LOCAL_CU(n) ((n) & OPAL_PROC_ON_CU)
|
#define OPAL_PROC_ON_LOCAL_CU(n) ((n) & OPAL_PROC_ON_CU)
|
||||||
#define OPAL_PROC_ON_LOCAL_CLUSTER(n) ((n) & OPAL_PROC_ON_CLUSTER)
|
#define OPAL_PROC_ON_LOCAL_CLUSTER(n) ((n) & OPAL_PROC_ON_CLUSTER)
|
||||||
|
|
||||||
|
/* Process binding modes */
|
||||||
|
#define OPAL_PAFFINITY_DO_NOT_BIND 0x01
|
||||||
|
#define OPAL_PAFFINITY_BIND_TO_CORE 0x02
|
||||||
|
#define OPAL_PAFFINITY_BIND_TO_SOCKET 0x04
|
||||||
|
#define OPAL_PAFFINITY_BIND_TO_BOARD 0x08
|
||||||
/* ******************************************************************** */
|
/* ******************************************************************** */
|
||||||
|
|
||||||
|
|
||||||
|
@ -54,6 +54,7 @@
|
|||||||
#include "orte/mca/ess/base/base.h"
|
#include "orte/mca/ess/base/base.h"
|
||||||
#include "orte/mca/plm/base/base.h"
|
#include "orte/mca/plm/base/base.h"
|
||||||
#include "orte/mca/routed/base/base.h"
|
#include "orte/mca/routed/base/base.h"
|
||||||
|
#include "orte/mca/rmaps/rmaps_types.h"
|
||||||
|
|
||||||
#include "orte/util/context_fns.h"
|
#include "orte/util/context_fns.h"
|
||||||
#include "orte/util/name_fns.h"
|
#include "orte/util/name_fns.h"
|
||||||
@ -326,6 +327,24 @@ int orte_odls_base_default_get_add_procs_data(opal_buffer_t *data,
|
|||||||
return rc;
|
return rc;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/* pack the map & binding policy for this job */
|
||||||
|
if (ORTE_SUCCESS != (rc = opal_dss.pack(data, &map->policy, 1, ORTE_MAPPING_POLICY))) {
|
||||||
|
ORTE_ERROR_LOG(rc);
|
||||||
|
return rc;
|
||||||
|
}
|
||||||
|
|
||||||
|
/* pack the cpus_per_rank for this job */
|
||||||
|
if (ORTE_SUCCESS != (rc = opal_dss.pack(data, &map->cpus_per_rank, 1, OPAL_INT16))) {
|
||||||
|
ORTE_ERROR_LOG(rc);
|
||||||
|
return rc;
|
||||||
|
}
|
||||||
|
|
||||||
|
/* pack the stride for this job */
|
||||||
|
if (ORTE_SUCCESS != (rc = opal_dss.pack(data, &map->stride, 1, OPAL_INT16))) {
|
||||||
|
ORTE_ERROR_LOG(rc);
|
||||||
|
return rc;
|
||||||
|
}
|
||||||
|
|
||||||
/* pack the control flags for this job */
|
/* pack the control flags for this job */
|
||||||
if (ORTE_SUCCESS != (rc = opal_dss.pack(data, &jdata->controls, 1, ORTE_JOB_CONTROL))) {
|
if (ORTE_SUCCESS != (rc = opal_dss.pack(data, &jdata->controls, 1, ORTE_JOB_CONTROL))) {
|
||||||
ORTE_ERROR_LOG(rc);
|
ORTE_ERROR_LOG(rc);
|
||||||
@ -744,6 +763,24 @@ int orte_odls_base_default_construct_child_list(opal_buffer_t *data,
|
|||||||
ORTE_ERROR_LOG(rc);
|
ORTE_ERROR_LOG(rc);
|
||||||
goto REPORT_ERROR;
|
goto REPORT_ERROR;
|
||||||
}
|
}
|
||||||
|
/* unpack the mapping policy for the job */
|
||||||
|
cnt=1;
|
||||||
|
if (ORTE_SUCCESS != (rc = opal_dss.unpack(data, &jobdat->policy, &cnt, ORTE_MAPPING_POLICY))) {
|
||||||
|
ORTE_ERROR_LOG(rc);
|
||||||
|
goto REPORT_ERROR;
|
||||||
|
}
|
||||||
|
/* unpack the cpus/rank for the job */
|
||||||
|
cnt=1;
|
||||||
|
if (ORTE_SUCCESS != (rc = opal_dss.unpack(data, &jobdat->cpus_per_rank, &cnt, OPAL_INT16))) {
|
||||||
|
ORTE_ERROR_LOG(rc);
|
||||||
|
goto REPORT_ERROR;
|
||||||
|
}
|
||||||
|
/* unpack the stride for the job */
|
||||||
|
cnt=1;
|
||||||
|
if (ORTE_SUCCESS != (rc = opal_dss.unpack(data, &jobdat->stride, &cnt, OPAL_INT16))) {
|
||||||
|
ORTE_ERROR_LOG(rc);
|
||||||
|
goto REPORT_ERROR;
|
||||||
|
}
|
||||||
/* unpack the control flags for the job */
|
/* unpack the control flags for the job */
|
||||||
cnt=1;
|
cnt=1;
|
||||||
if (ORTE_SUCCESS != (rc = opal_dss.unpack(data, &jobdat->controls, &cnt, ORTE_JOB_CONTROL))) {
|
if (ORTE_SUCCESS != (rc = opal_dss.unpack(data, &jobdat->controls, &cnt, ORTE_JOB_CONTROL))) {
|
||||||
@ -1745,7 +1782,7 @@ int orte_odls_base_default_launch_local(orte_jobid_t job,
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
rc = fork_local(app, child, app->env, jobdat->controls, jobdat->stdin_target);
|
rc = fork_local(app, child, app->env, jobdat);
|
||||||
/* reaquire lock so we don't double unlock... */
|
/* reaquire lock so we don't double unlock... */
|
||||||
OPAL_THREAD_LOCK(&orte_odls_globals.mutex);
|
OPAL_THREAD_LOCK(&orte_odls_globals.mutex);
|
||||||
if (ORTE_SUCCESS != rc) {
|
if (ORTE_SUCCESS != rc) {
|
||||||
@ -1791,12 +1828,22 @@ CLEANUP:
|
|||||||
"%s odls:launch reporting job %s launch status",
|
"%s odls:launch reporting job %s launch status",
|
||||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
|
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
|
||||||
ORTE_JOBID_PRINT(job)));
|
ORTE_JOBID_PRINT(job)));
|
||||||
/* pack the launch results */
|
|
||||||
if (ORTE_SUCCESS != (ret = pack_state_update(&alert, true, jobdat))) {
|
|
||||||
ORTE_ERROR_LOG(ret);
|
|
||||||
}
|
|
||||||
|
|
||||||
if (!launch_failed) {
|
/* if the launch failed, we need to flag all the procs from this job
|
||||||
|
* that didn't launch as having failed, or else we will hang
|
||||||
|
*/
|
||||||
|
if (launch_failed) {
|
||||||
|
OPAL_THREAD_UNLOCK(&orte_odls_globals.mutex);
|
||||||
|
for (item = opal_list_get_first(&orte_local_children);
|
||||||
|
item != opal_list_get_end(&orte_local_children);
|
||||||
|
item = opal_list_get_next(item)) {
|
||||||
|
child = (orte_odls_child_t*)item;
|
||||||
|
if (child->name->jobid == jobdat->jobid &&
|
||||||
|
ORTE_PROC_STATE_LAUNCHED >= child->state) {
|
||||||
|
child->state = ORTE_PROC_STATE_FAILED_TO_START;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
} else {
|
||||||
/* if the launch succeeded, check to see if we need to
|
/* if the launch succeeded, check to see if we need to
|
||||||
* co-locate any debugger daemons so that they get launched
|
* co-locate any debugger daemons so that they get launched
|
||||||
* before we report anything to the HNP. This ensures that
|
* before we report anything to the HNP. This ensures that
|
||||||
@ -1813,11 +1860,14 @@ CLEANUP:
|
|||||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
|
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
|
||||||
(ORTE_JOB_CONTROL_FORWARD_OUTPUT & orte_odls_globals.debugger->controls) ? "output forwarded" : "no output"));
|
(ORTE_JOB_CONTROL_FORWARD_OUTPUT & orte_odls_globals.debugger->controls) ? "output forwarded" : "no output"));
|
||||||
|
|
||||||
fork_local(orte_odls_globals.debugger->apps[0], NULL, NULL,
|
fork_local(orte_odls_globals.debugger->apps[0], NULL, NULL, orte_odls_globals.debugger);
|
||||||
orte_odls_globals.debugger->controls, ORTE_VPID_INVALID);
|
|
||||||
orte_odls_globals.debugger_launched = true;
|
orte_odls_globals.debugger_launched = true;
|
||||||
}
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/* pack the launch results */
|
||||||
|
if (ORTE_SUCCESS != (ret = pack_state_update(&alert, true, jobdat))) {
|
||||||
|
ORTE_ERROR_LOG(ret);
|
||||||
}
|
}
|
||||||
|
|
||||||
/* if we are the HNP, then we would rather not send this to ourselves -
|
/* if we are the HNP, then we would rather not send this to ourselves -
|
||||||
|
@ -107,6 +107,9 @@ static void orte_odls_job_constructor(orte_odls_job_t *ptr)
|
|||||||
ptr->launch_msg_processed = false;
|
ptr->launch_msg_processed = false;
|
||||||
ptr->apps = NULL;
|
ptr->apps = NULL;
|
||||||
ptr->num_apps = 0;
|
ptr->num_apps = 0;
|
||||||
|
ptr->policy = 0;
|
||||||
|
ptr->cpus_per_rank = 1;
|
||||||
|
ptr->stride = 1;
|
||||||
ptr->controls = 0;
|
ptr->controls = 0;
|
||||||
ptr->stdin_target = ORTE_VPID_INVALID;
|
ptr->stdin_target = ORTE_VPID_INVALID;
|
||||||
ptr->total_slots_alloc = 0;
|
ptr->total_slots_alloc = 0;
|
||||||
@ -232,6 +235,12 @@ int orte_odls_base_open(void)
|
|||||||
opal_argv_append_nosize(&orte_odls_globals.xtermcmd, "-e");
|
opal_argv_append_nosize(&orte_odls_globals.xtermcmd, "-e");
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/* see if the user wants us to report bindings */
|
||||||
|
mca_base_param_reg_int_name("odls", "base_report_bindings",
|
||||||
|
"Report process bindings [default: no]",
|
||||||
|
false, false, (int)false, &i);
|
||||||
|
orte_odls_globals.report_bindings = OPAL_INT_TO_BOOL(i);
|
||||||
|
|
||||||
/* Open up all available components */
|
/* Open up all available components */
|
||||||
|
|
||||||
if (ORTE_SUCCESS !=
|
if (ORTE_SUCCESS !=
|
||||||
|
@ -64,6 +64,8 @@ typedef struct {
|
|||||||
opal_list_t xterm_ranks;
|
opal_list_t xterm_ranks;
|
||||||
/* the xterm cmd to be used */
|
/* the xterm cmd to be used */
|
||||||
char **xtermcmd;
|
char **xtermcmd;
|
||||||
|
/* whether or not to report bindings */
|
||||||
|
bool report_bindings;
|
||||||
} orte_odls_globals_t;
|
} orte_odls_globals_t;
|
||||||
|
|
||||||
ORTE_DECLSPEC extern orte_odls_globals_t orte_odls_globals;
|
ORTE_DECLSPEC extern orte_odls_globals_t orte_odls_globals;
|
||||||
@ -89,8 +91,7 @@ orte_odls_base_default_construct_child_list(opal_buffer_t *data,
|
|||||||
typedef int (*orte_odls_base_fork_local_proc_fn_t)(orte_app_context_t *context,
|
typedef int (*orte_odls_base_fork_local_proc_fn_t)(orte_app_context_t *context,
|
||||||
orte_odls_child_t *child,
|
orte_odls_child_t *child,
|
||||||
char **environ_copy,
|
char **environ_copy,
|
||||||
orte_job_controls_t controls,
|
orte_odls_job_t *jobdat);
|
||||||
orte_vpid_t stdin_target);
|
|
||||||
|
|
||||||
ORTE_DECLSPEC int
|
ORTE_DECLSPEC int
|
||||||
orte_odls_base_default_launch_local(orte_jobid_t job,
|
orte_odls_base_default_launch_local(orte_jobid_t job,
|
||||||
|
@ -78,6 +78,10 @@ that the specification had improper syntax.
|
|||||||
An invalid node rank was obtained - this is probably something
|
An invalid node rank was obtained - this is probably something
|
||||||
that should be reported to the OMPI developers.
|
that should be reported to the OMPI developers.
|
||||||
#
|
#
|
||||||
|
[odls-default:invalid-local-rank]
|
||||||
|
An invalid local rank was obtained - this is probably something
|
||||||
|
that should be reported to the OMPI developers.
|
||||||
|
#
|
||||||
[odls-default:invalid-phys-cpu]
|
[odls-default:invalid-phys-cpu]
|
||||||
An invalid physical processor id was returned when attempting to
|
An invalid physical processor id was returned when attempting to
|
||||||
set processor affinity. This is probably something that should be
|
set processor affinity. This is probably something that should be
|
||||||
|
@ -43,6 +43,9 @@ int orte_odls_default_component_query(mca_base_module_t **module, int *priority)
|
|||||||
extern orte_odls_base_module_t orte_odls_default_module;
|
extern orte_odls_base_module_t orte_odls_default_module;
|
||||||
ORTE_MODULE_DECLSPEC extern orte_odls_base_component_t mca_odls_default_component;
|
ORTE_MODULE_DECLSPEC extern orte_odls_base_component_t mca_odls_default_component;
|
||||||
|
|
||||||
|
/* dedicated debug output flag */
|
||||||
|
ORTE_MODULE_DECLSPEC extern bool orte_odls_default_report_bindings;
|
||||||
|
|
||||||
END_C_DECLS
|
END_C_DECLS
|
||||||
|
|
||||||
#endif /* ORTE_ODLS_H */
|
#endif /* ORTE_ODLS_H */
|
||||||
|
@ -35,6 +35,9 @@
|
|||||||
#include "orte/mca/odls/base/odls_private.h"
|
#include "orte/mca/odls/base/odls_private.h"
|
||||||
#include "orte/mca/odls/default/odls_default.h"
|
#include "orte/mca/odls/default/odls_default.h"
|
||||||
|
|
||||||
|
/* instantiate a module-global variable */
|
||||||
|
bool orte_odls_default_report_bindings;
|
||||||
|
|
||||||
/*
|
/*
|
||||||
* Instantiate the public struct with all of our public information
|
* Instantiate the public struct with all of our public information
|
||||||
* and pointers to our public functions in it
|
* and pointers to our public functions in it
|
||||||
@ -66,7 +69,6 @@ orte_odls_base_component_t mca_odls_default_component = {
|
|||||||
|
|
||||||
int orte_odls_default_component_open(void)
|
int orte_odls_default_component_open(void)
|
||||||
{
|
{
|
||||||
/* nothing to do */
|
|
||||||
return ORTE_SUCCESS;
|
return ORTE_SUCCESS;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -176,8 +176,7 @@ int orte_odls_default_kill_local_procs(opal_pointer_array_t *procs, bool set_sta
|
|||||||
static int odls_default_fork_local_proc(orte_app_context_t* context,
|
static int odls_default_fork_local_proc(orte_app_context_t* context,
|
||||||
orte_odls_child_t *child,
|
orte_odls_child_t *child,
|
||||||
char **environ_copy,
|
char **environ_copy,
|
||||||
orte_job_controls_t controls,
|
orte_odls_job_t *jobdat)
|
||||||
orte_vpid_t stdin_target)
|
|
||||||
{
|
{
|
||||||
orte_iof_base_io_conf_t opts;
|
orte_iof_base_io_conf_t opts;
|
||||||
int rc;
|
int rc;
|
||||||
@ -185,6 +184,12 @@ static int odls_default_fork_local_proc(orte_app_context_t* context,
|
|||||||
int i, p[2];
|
int i, p[2];
|
||||||
pid_t pid;
|
pid_t pid;
|
||||||
bool paffinity_enabled = false;
|
bool paffinity_enabled = false;
|
||||||
|
opal_paffinity_base_cpu_set_t mask;
|
||||||
|
orte_node_rank_t nrank;
|
||||||
|
int16_t n;
|
||||||
|
orte_local_rank_t lrank;
|
||||||
|
int target_socket, npersocket;
|
||||||
|
int logical_cpu, phys_core, phys_cpu;
|
||||||
|
|
||||||
if (NULL != child) {
|
if (NULL != child) {
|
||||||
/* should pull this information from MPIRUN instead of going with
|
/* should pull this information from MPIRUN instead of going with
|
||||||
@ -193,7 +198,7 @@ static int odls_default_fork_local_proc(orte_app_context_t* context,
|
|||||||
|
|
||||||
/* do we want to setup stdin? */
|
/* do we want to setup stdin? */
|
||||||
if (NULL != child &&
|
if (NULL != child &&
|
||||||
(stdin_target == ORTE_VPID_WILDCARD || child->name->vpid == stdin_target)) {
|
(jobdat->stdin_target == ORTE_VPID_WILDCARD || child->name->vpid == jobdat->stdin_target)) {
|
||||||
opts.connect_stdin = true;
|
opts.connect_stdin = true;
|
||||||
} else {
|
} else {
|
||||||
opts.connect_stdin = false;
|
opts.connect_stdin = false;
|
||||||
@ -291,39 +296,144 @@ static int odls_default_fork_local_proc(orte_app_context_t* context,
|
|||||||
exit(1);
|
exit(1);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
/* Otherwise, if opal_paffinity_alone was set, use that scheme */
|
/* Otherwise, if opal_paffinity_alone was set and a binding is specified, use that scheme */
|
||||||
else if (opal_paffinity_alone) {
|
else if (opal_paffinity_alone && !(ORTE_BIND_TO_NONE & jobdat->policy)) {
|
||||||
opal_paffinity_base_cpu_set_t mask;
|
OPAL_OUTPUT_VERBOSE((5, orte_odls_globals.output,
|
||||||
int phys_cpu;
|
"%s odls:default:fork setting paffinity for child %s using policy %04x",
|
||||||
orte_node_rank_t nrank;
|
|
||||||
OPAL_OUTPUT_VERBOSE((2, orte_odls_globals.output,
|
|
||||||
"%s odls:default:fork setting paffinity for child %s",
|
|
||||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
|
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
|
||||||
ORTE_NAME_PRINT(child->name)));
|
ORTE_NAME_PRINT(child->name), jobdat->policy));
|
||||||
if (ORTE_NODE_RANK_INVALID == (nrank = orte_ess.get_node_rank(child->name))) {
|
if (ORTE_BIND_TO_CORE & jobdat->policy) {
|
||||||
orte_show_help("help-odls-default.txt",
|
/* we want to bind this proc to a specific core, or multiple cores
|
||||||
"odls-default:invalid-node-rank", true);
|
* if the cpus_per_rank is > 0
|
||||||
rc = ORTE_ERR_FATAL;
|
*/
|
||||||
write(p[1], &rc, sizeof(int));
|
OPAL_OUTPUT_VERBOSE((5, orte_odls_globals.output,
|
||||||
exit(1);
|
"%s odls:default:fork binding child %s to core(s) cpus/rank %d stride %d",
|
||||||
|
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
|
||||||
|
ORTE_NAME_PRINT(child->name),
|
||||||
|
(int)jobdat->cpus_per_rank, (int)jobdat->stride));
|
||||||
|
if (ORTE_NODE_RANK_INVALID == (nrank = orte_ess.get_node_rank(child->name))) {
|
||||||
|
orte_show_help("help-odls-default.txt",
|
||||||
|
"odls-default:invalid-node-rank", true);
|
||||||
|
rc = ORTE_ERR_FATAL;
|
||||||
|
write(p[1], &rc, sizeof(int));
|
||||||
|
exit(1);
|
||||||
|
}
|
||||||
|
OPAL_PAFFINITY_CPU_ZERO(mask);
|
||||||
|
/* my starting core has to be offset by cpus_per_rank */
|
||||||
|
logical_cpu = nrank * jobdat->cpus_per_rank;
|
||||||
|
for (n=0; n < jobdat->cpus_per_rank; n++) {
|
||||||
|
phys_cpu = opal_paffinity_base_get_physical_processor_id(logical_cpu);
|
||||||
|
if (0 > phys_cpu) {
|
||||||
|
orte_show_help("help-odls-default.txt",
|
||||||
|
"odls-default:invalid-phys-cpu", true);
|
||||||
|
rc = ORTE_ERR_FATAL;
|
||||||
|
write(p[1], &rc, sizeof(int));
|
||||||
|
exit(1);
|
||||||
|
}
|
||||||
|
OPAL_PAFFINITY_CPU_SET(phys_cpu, mask);
|
||||||
|
logical_cpu += jobdat->stride;
|
||||||
|
}
|
||||||
|
if (orte_odls_globals.report_bindings) {
|
||||||
|
opal_output(0, "%s odls:default:fork binding child %s to cpus %04lx",
|
||||||
|
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
|
||||||
|
ORTE_NAME_PRINT(child->name), mask.bitmask[0]);
|
||||||
|
}
|
||||||
|
if (OPAL_SUCCESS != (rc = opal_paffinity_base_set(mask))) {
|
||||||
|
orte_show_help("help-odls-default.txt",
|
||||||
|
"odls-default:failed-set-paff", true);
|
||||||
|
write(p[1], &rc, sizeof(int));
|
||||||
|
exit(1);
|
||||||
|
}
|
||||||
|
paffinity_enabled = true;
|
||||||
|
} else if (ORTE_BIND_TO_SOCKET & jobdat->policy) {
|
||||||
|
/* bind this proc to a socket */
|
||||||
|
OPAL_OUTPUT_VERBOSE((5, orte_odls_globals.output,
|
||||||
|
"%s odls:default:fork binding child %s to socket",
|
||||||
|
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
|
||||||
|
ORTE_NAME_PRINT(child->name)));
|
||||||
|
/* layout this process across the sockets based on
|
||||||
|
* the provided mapping policy
|
||||||
|
*/
|
||||||
|
if (ORTE_LOCAL_RANK_INVALID == (lrank = orte_ess.get_local_rank(child->name))) {
|
||||||
|
orte_show_help("help-odls-default.txt",
|
||||||
|
"odls-default:invalid-local-rank", true);
|
||||||
|
rc = ORTE_ERR_FATAL;
|
||||||
|
write(p[1], &rc, sizeof(int));
|
||||||
|
exit(1);
|
||||||
|
}
|
||||||
|
if (ORTE_MAPPING_NPERXXX & jobdat->policy) {
|
||||||
|
/* we need to balance the children from this job across the sockets */
|
||||||
|
npersocket = jobdat->num_local_procs / orte_default_num_sockets_per_board;
|
||||||
|
if (ORTE_MAPPING_BYSOCKET & jobdat->policy) {
|
||||||
|
target_socket = opal_paffinity_base_get_physical_socket_id(lrank % npersocket);
|
||||||
|
} else {
|
||||||
|
target_socket = opal_paffinity_base_get_physical_socket_id(lrank / npersocket);
|
||||||
|
}
|
||||||
|
OPAL_OUTPUT_VERBOSE((5, orte_odls_globals.output,
|
||||||
|
"%s odls:default:fork npersocket %d target socket %d",
|
||||||
|
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
|
||||||
|
npersocket, target_socket));
|
||||||
|
} else if (ORTE_MAPPING_BYSOCKET & jobdat->policy) {
|
||||||
|
/* this corresponds to a mapping policy where
|
||||||
|
* local rank 0 goes on socket 0, and local
|
||||||
|
* rank 1 goes on socket 1, etc. - round robin
|
||||||
|
* until all ranks are mapped
|
||||||
|
*
|
||||||
|
* NOTE: we already know our number of sockets
|
||||||
|
* from when we initialized
|
||||||
|
*/
|
||||||
|
OPAL_OUTPUT_VERBOSE((5, orte_odls_globals.output,
|
||||||
|
"bysocket lrank %d numsocks %d logical socket %d", (int)lrank,
|
||||||
|
(int)orte_default_num_sockets_per_board,
|
||||||
|
(int)(lrank % orte_default_num_sockets_per_board)));
|
||||||
|
target_socket = opal_paffinity_base_get_physical_socket_id(lrank % orte_default_num_sockets_per_board);
|
||||||
|
} else {
|
||||||
|
/* use a byslot-like policy where local rank 0 goes on
|
||||||
|
* socket 0, and local rank 1 goes on socket 0, etc.
|
||||||
|
* following round-robing until all ranks mapped
|
||||||
|
*/
|
||||||
|
OPAL_OUTPUT_VERBOSE((5, orte_odls_globals.output,
|
||||||
|
"byslot lrank %d numsocks %d logical socket %d", (int)lrank,
|
||||||
|
(int)orte_default_num_sockets_per_board,
|
||||||
|
(int)(lrank / orte_default_num_cores_per_socket)));
|
||||||
|
target_socket = opal_paffinity_base_get_physical_socket_id(lrank / orte_default_num_cores_per_socket);
|
||||||
|
}
|
||||||
|
OPAL_PAFFINITY_CPU_ZERO(mask);
|
||||||
|
for (n=0; n < orte_default_num_cores_per_socket; n++) {
|
||||||
|
phys_core = opal_paffinity_base_get_physical_core_id(target_socket, n);
|
||||||
|
if (0 > phys_core) {
|
||||||
|
orte_show_help("help-odls-default.txt",
|
||||||
|
"odls-default:invalid-phys-cpu", true);
|
||||||
|
rc = ORTE_ERR_FATAL;
|
||||||
|
write(p[1], &rc, sizeof(int));
|
||||||
|
exit(1);
|
||||||
|
}
|
||||||
|
if (ORTE_SUCCESS != opal_paffinity_base_get_map_to_processor_id(target_socket, phys_core, &phys_cpu)) {
|
||||||
|
orte_show_help("help-odls-default.txt",
|
||||||
|
"odls-default:invalid-phys-cpu", true);
|
||||||
|
rc = ORTE_ERR_FATAL;
|
||||||
|
write(p[1], &rc, sizeof(int));
|
||||||
|
exit(1);
|
||||||
|
}
|
||||||
|
OPAL_OUTPUT_VERBOSE((5, orte_odls_globals.output,
|
||||||
|
"%s odls:default:fork mapping phys socket %d core %d to phys_cpu %d",
|
||||||
|
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
|
||||||
|
target_socket, n, phys_cpu));
|
||||||
|
OPAL_PAFFINITY_CPU_SET(phys_cpu, mask);
|
||||||
|
}
|
||||||
|
if (orte_odls_globals.report_bindings) {
|
||||||
|
opal_output(0, "%s odls:default:fork binding child %s to socket %d cpus %04lx",
|
||||||
|
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
|
||||||
|
ORTE_NAME_PRINT(child->name), target_socket, mask.bitmask[0]);
|
||||||
|
}
|
||||||
|
if (OPAL_SUCCESS != (rc = opal_paffinity_base_set(mask))) {
|
||||||
|
orte_show_help("help-odls-default.txt",
|
||||||
|
"odls-default:failed-set-paff", true);
|
||||||
|
write(p[1], &rc, sizeof(int));
|
||||||
|
exit(1);
|
||||||
|
}
|
||||||
|
paffinity_enabled = true;
|
||||||
}
|
}
|
||||||
OPAL_PAFFINITY_CPU_ZERO(mask);
|
|
||||||
phys_cpu = opal_paffinity_base_get_physical_processor_id(nrank);
|
|
||||||
if (0 > phys_cpu) {
|
|
||||||
orte_show_help("help-odls-default.txt",
|
|
||||||
"odls-default:invalid-phys-cpu", true);
|
|
||||||
rc = ORTE_ERR_FATAL;
|
|
||||||
write(p[1], &rc, sizeof(int));
|
|
||||||
exit(1);
|
|
||||||
}
|
|
||||||
OPAL_PAFFINITY_CPU_SET(phys_cpu, mask);
|
|
||||||
if (OPAL_SUCCESS != (rc = opal_paffinity_base_set(mask))) {
|
|
||||||
orte_show_help("help-odls-default.txt",
|
|
||||||
"odls-default:failed-set-paff", true);
|
|
||||||
write(p[1], &rc, sizeof(int));
|
|
||||||
exit(1);
|
|
||||||
}
|
|
||||||
paffinity_enabled = true;
|
|
||||||
}
|
}
|
||||||
/* If we were able to set processor affinity, try setting up
|
/* If we were able to set processor affinity, try setting up
|
||||||
* memory affinity
|
* memory affinity
|
||||||
@ -335,15 +445,15 @@ static int odls_default_fork_local_proc(orte_app_context_t* context,
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
} else if (!(ORTE_JOB_CONTROL_FORWARD_OUTPUT & controls)) {
|
} else if (!(ORTE_JOB_CONTROL_FORWARD_OUTPUT & jobdat->controls)) {
|
||||||
/* tie stdin/out/err/internal to /dev/null */
|
/* tie stdin/out/err/internal to /dev/null */
|
||||||
int fdnull;
|
int fdnull;
|
||||||
for (i=0; i < 3; i++) {
|
for (i=0; i < 3; i++) {
|
||||||
fdnull = open("/dev/null", O_RDONLY, 0);
|
fdnull = open("/dev/null", O_RDONLY, 0);
|
||||||
if(fdnull > i) {
|
if(fdnull > i) {
|
||||||
dup2(fdnull, i);
|
dup2(fdnull, i);
|
||||||
}
|
}
|
||||||
close(fdnull);
|
close(fdnull);
|
||||||
}
|
}
|
||||||
fdnull = open("/dev/null", O_RDONLY, 0);
|
fdnull = open("/dev/null", O_RDONLY, 0);
|
||||||
if(fdnull > opts.p_internal[1]) {
|
if(fdnull > opts.p_internal[1]) {
|
||||||
@ -396,7 +506,7 @@ static int odls_default_fork_local_proc(orte_app_context_t* context,
|
|||||||
exit(1);
|
exit(1);
|
||||||
} else {
|
} else {
|
||||||
|
|
||||||
if (NULL != child && (ORTE_JOB_CONTROL_FORWARD_OUTPUT & controls)) {
|
if (NULL != child && (ORTE_JOB_CONTROL_FORWARD_OUTPUT & jobdat->controls)) {
|
||||||
/* connect endpoints IOF */
|
/* connect endpoints IOF */
|
||||||
rc = orte_iof_base_setup_parent(child->name, &opts);
|
rc = orte_iof_base_setup_parent(child->name, &opts);
|
||||||
if(ORTE_SUCCESS != rc) {
|
if(ORTE_SUCCESS != rc) {
|
||||||
@ -447,7 +557,7 @@ static int odls_default_fork_local_proc(orte_app_context_t* context,
|
|||||||
"%s odls:default:fork got code %d back from child",
|
"%s odls:default:fork got code %d back from child",
|
||||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), i));
|
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), i));
|
||||||
close(p[0]);
|
close(p[0]);
|
||||||
return i;
|
return ORTE_ERR_FAILED_TO_START;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -101,27 +101,30 @@ ORTE_DECLSPEC OBJ_CLASS_DECLARATION(orte_odls_child_t);
|
|||||||
* List object to locally store job related info
|
* List object to locally store job related info
|
||||||
*/
|
*/
|
||||||
typedef struct orte_odls_job_t {
|
typedef struct orte_odls_job_t {
|
||||||
opal_list_item_t super; /* required to place this on a list */
|
opal_list_item_t super; /* required to place this on a list */
|
||||||
orte_job_state_t state; /* state of the job */
|
orte_job_state_t state; /* state of the job */
|
||||||
orte_jobid_t jobid; /* jobid for this data */
|
orte_jobid_t jobid; /* jobid for this data */
|
||||||
bool launch_msg_processed; /* launch msg has been fully processed */
|
bool launch_msg_processed; /* launch msg has been fully processed */
|
||||||
orte_app_context_t **apps; /* app_contexts for this job */
|
orte_app_context_t **apps; /* app_contexts for this job */
|
||||||
orte_std_cntr_t num_apps; /* number of app_contexts */
|
orte_std_cntr_t num_apps; /* number of app_contexts */
|
||||||
orte_job_controls_t controls; /* control flags for job */
|
orte_mapping_policy_t policy; /* mapping policy */
|
||||||
orte_vpid_t stdin_target; /* where stdin is to go */
|
int16_t cpus_per_rank; /* number of cpus/rank */
|
||||||
orte_std_cntr_t total_slots_alloc;
|
int16_t stride; /* step size between cores of multi-core/rank procs */
|
||||||
orte_std_cntr_t num_nodes; /* number of nodes involved in the job */
|
orte_job_controls_t controls; /* control flags for job */
|
||||||
orte_vpid_t num_procs;
|
orte_vpid_t stdin_target; /* where stdin is to go */
|
||||||
int32_t num_local_procs;
|
orte_std_cntr_t total_slots_alloc;
|
||||||
char *regexp; /* the regular expression describing the job */
|
orte_std_cntr_t num_nodes; /* number of nodes involved in the job */
|
||||||
opal_byte_object_t *pmap; /* local copy of pidmap byte object */
|
orte_vpid_t num_procs;
|
||||||
opal_buffer_t collection_bucket;
|
int32_t num_local_procs;
|
||||||
opal_buffer_t local_collection;
|
char *regexp; /* the regular expression describing the job */
|
||||||
orte_grpcomm_coll_t collective_type;
|
opal_byte_object_t *pmap; /* local copy of pidmap byte object */
|
||||||
int32_t num_contributors;
|
opal_buffer_t collection_bucket;
|
||||||
int num_participating;
|
opal_buffer_t local_collection;
|
||||||
int num_collected;
|
orte_grpcomm_coll_t collective_type;
|
||||||
struct timeval launch_msg_recvd; /* when the launch msg for this job was recvd - for timing purposes only */
|
int32_t num_contributors;
|
||||||
|
int num_participating;
|
||||||
|
int num_collected;
|
||||||
|
struct timeval launch_msg_recvd; /* when the launch msg for this job was recvd - for timing purposes only */
|
||||||
} orte_odls_job_t;
|
} orte_odls_job_t;
|
||||||
ORTE_DECLSPEC OBJ_CLASS_DECLARATION(orte_odls_job_t);
|
ORTE_DECLSPEC OBJ_CLASS_DECLARATION(orte_odls_job_t);
|
||||||
|
|
||||||
|
@ -95,8 +95,7 @@ static int odls_process_kill_local_procs(opal_pointer_array_t *procs, bool set_s
|
|||||||
static int odls_process_fork_local_proc(orte_app_context_t* context,
|
static int odls_process_fork_local_proc(orte_app_context_t* context,
|
||||||
orte_odls_child_t *child,
|
orte_odls_child_t *child,
|
||||||
char **environ_copy,
|
char **environ_copy,
|
||||||
orte_job_controls_t controls,
|
orte_odls_job_t *jobdat)
|
||||||
orte_vpid_t stdin_target)
|
|
||||||
{
|
{
|
||||||
pid_t pid;
|
pid_t pid;
|
||||||
orte_iof_base_io_conf_t opts;
|
orte_iof_base_io_conf_t opts;
|
||||||
@ -124,7 +123,7 @@ static int odls_process_fork_local_proc(orte_app_context_t* context,
|
|||||||
opts.usepty = OPAL_ENABLE_PTY_SUPPORT;
|
opts.usepty = OPAL_ENABLE_PTY_SUPPORT;
|
||||||
|
|
||||||
/* do we want to setup stdin? */
|
/* do we want to setup stdin? */
|
||||||
if (stdin_target == ORTE_VPID_WILDCARD || child->name->vpid == stdin_target) {
|
if (jobdat->stdin_target == ORTE_VPID_WILDCARD || child->name->vpid == jobdat->stdin_target) {
|
||||||
opts.connect_stdin = true;
|
opts.connect_stdin = true;
|
||||||
} else {
|
} else {
|
||||||
opts.connect_stdin = false;
|
opts.connect_stdin = false;
|
||||||
|
@ -50,6 +50,7 @@
|
|||||||
#include "orte/mca/rml/rml.h"
|
#include "orte/mca/rml/rml.h"
|
||||||
#include "orte/mca/rml/rml_types.h"
|
#include "orte/mca/rml/rml_types.h"
|
||||||
#include "orte/mca/errmgr/errmgr.h"
|
#include "orte/mca/errmgr/errmgr.h"
|
||||||
|
#include "orte/mca/rmaps/rmaps_types.h"
|
||||||
#include "orte/runtime/orte_globals.h"
|
#include "orte/runtime/orte_globals.h"
|
||||||
#include "orte/util/show_help.h"
|
#include "orte/util/show_help.h"
|
||||||
#include "orte/runtime/orte_wait.h"
|
#include "orte/runtime/orte_wait.h"
|
||||||
|
@ -31,6 +31,8 @@
|
|||||||
#include "opal/class/opal_list.h"
|
#include "opal/class/opal_list.h"
|
||||||
#include "opal/mca/mca.h"
|
#include "opal/mca/mca.h"
|
||||||
|
|
||||||
|
#include "orte/runtime/orte_globals.h"
|
||||||
|
|
||||||
#include "orte/mca/rmaps/rmaps.h"
|
#include "orte/mca/rmaps/rmaps.h"
|
||||||
|
|
||||||
BEGIN_C_DECLS
|
BEGIN_C_DECLS
|
||||||
@ -56,14 +58,18 @@ typedef struct {
|
|||||||
opal_list_t available_components;
|
opal_list_t available_components;
|
||||||
/** selected module */
|
/** selected module */
|
||||||
orte_rmaps_base_module_t *active_module;
|
orte_rmaps_base_module_t *active_module;
|
||||||
/* user specified mapping policy */
|
|
||||||
uint8_t policy;
|
|
||||||
/** whether or not we allow oversubscription of nodes */
|
/** whether or not we allow oversubscription of nodes */
|
||||||
bool oversubscribe;
|
bool oversubscribe;
|
||||||
/** do we want one ppn if num_procs not specified */
|
|
||||||
bool pernode;
|
|
||||||
/** number of ppn for n_per_node mode */
|
/** number of ppn for n_per_node mode */
|
||||||
int npernode;
|
int npernode;
|
||||||
|
/* number of procs/board */
|
||||||
|
int nperboard;
|
||||||
|
/* number of procs/socket */
|
||||||
|
int npersocket;
|
||||||
|
/* cpus per rank */
|
||||||
|
int cpus_per_rank;
|
||||||
|
/* stride */
|
||||||
|
int stride;
|
||||||
/* do not allow use of the localhost */
|
/* do not allow use of the localhost */
|
||||||
bool no_use_local;
|
bool no_use_local;
|
||||||
/* display the map after it is computed */
|
/* display the map after it is computed */
|
||||||
|
@ -123,15 +123,14 @@ opal_list_item_t* orte_rmaps_base_get_starting_point(opal_list_t *node_list, ort
|
|||||||
*/
|
*/
|
||||||
int orte_rmaps_base_map_byslot(orte_job_t *jdata, orte_app_context_t *app,
|
int orte_rmaps_base_map_byslot(orte_job_t *jdata, orte_app_context_t *app,
|
||||||
opal_list_t *node_list, orte_vpid_t num_procs,
|
opal_list_t *node_list, orte_vpid_t num_procs,
|
||||||
orte_vpid_t vpid_start, opal_list_item_t *cur_node_item,
|
opal_list_item_t *cur_node_item)
|
||||||
orte_vpid_t ppn)
|
|
||||||
{
|
{
|
||||||
int rc=ORTE_SUCCESS;
|
int rc=ORTE_SUCCESS;
|
||||||
int i;
|
int i;
|
||||||
orte_node_t *node;
|
orte_node_t *node;
|
||||||
opal_list_item_t *next;
|
opal_list_item_t *next;
|
||||||
orte_vpid_t num_alloc = 0;
|
orte_vpid_t num_alloc = 0;
|
||||||
int num_slots_to_take;
|
int num_procs_to_assign, num_possible_procs;
|
||||||
|
|
||||||
/* This loop continues until all procs have been mapped or we run
|
/* This loop continues until all procs have been mapped or we run
|
||||||
out of resources. We determine that we have "run out of
|
out of resources. We determine that we have "run out of
|
||||||
@ -185,21 +184,37 @@ int orte_rmaps_base_map_byslot(orte_job_t *jdata, orte_app_context_t *app,
|
|||||||
* to do so after oversubscribing).
|
* to do so after oversubscribing).
|
||||||
*/
|
*/
|
||||||
if (node->slots_inuse >= node->slots_alloc || 0 == node->slots_inuse) {
|
if (node->slots_inuse >= node->slots_alloc || 0 == node->slots_inuse) {
|
||||||
num_slots_to_take = (node->slots_alloc == 0) ? 1 : node->slots_alloc;
|
if (0 == node->slots_alloc) {
|
||||||
|
num_procs_to_assign = 1;
|
||||||
|
} else {
|
||||||
|
num_possible_procs = node->slots_alloc / jdata->map->cpus_per_rank;
|
||||||
|
if (0 == num_possible_procs) {
|
||||||
|
num_procs_to_assign = 1;
|
||||||
|
} else {
|
||||||
|
num_procs_to_assign = num_possible_procs;
|
||||||
|
}
|
||||||
|
}
|
||||||
} else {
|
} else {
|
||||||
num_slots_to_take = node->slots_alloc - node->slots_inuse;
|
num_possible_procs = (node->slots_alloc - node->slots_inuse) / jdata->map->cpus_per_rank;
|
||||||
|
if (0 == num_possible_procs) {
|
||||||
|
num_procs_to_assign = 1;
|
||||||
|
} else {
|
||||||
|
num_procs_to_assign = num_possible_procs;
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
/* check if we are in npernode mode - if so, then set the num_slots_to_take
|
/* check if we are in npernode mode - if so, then set the num_procs_to_assign
|
||||||
* to the num_per_node
|
* to the num_per_node
|
||||||
*/
|
*/
|
||||||
if (jdata->map->pernode) {
|
if (0 < jdata->map->npernode) {
|
||||||
num_slots_to_take = jdata->map->npernode;
|
num_procs_to_assign = jdata->map->npernode;
|
||||||
}
|
}
|
||||||
|
|
||||||
for( i = 0; i < num_slots_to_take; ++i) {
|
for( i = 0; i < num_procs_to_assign; ++i) {
|
||||||
if (ORTE_SUCCESS != (rc = orte_rmaps_base_claim_slot(jdata, node, vpid_start + num_alloc, NULL, app->idx,
|
if (ORTE_SUCCESS != (rc = orte_rmaps_base_claim_slot(jdata, node,
|
||||||
node_list, jdata->map->oversubscribe, true))) {
|
jdata->map->cpus_per_rank, app->idx,
|
||||||
|
node_list, jdata->map->oversubscribe,
|
||||||
|
true, NULL))) {
|
||||||
/** if the code is ORTE_ERR_NODE_FULLY_USED, then we know this
|
/** if the code is ORTE_ERR_NODE_FULLY_USED, then we know this
|
||||||
* really isn't an error - we just need to break from the loop
|
* really isn't an error - we just need to break from the loop
|
||||||
* since the node is fully used up. For now, just don't report
|
* since the node is fully used up. For now, just don't report
|
||||||
@ -220,8 +235,7 @@ int orte_rmaps_base_map_byslot(orte_job_t *jdata, orte_app_context_t *app,
|
|||||||
}
|
}
|
||||||
|
|
||||||
/* if we have fully used up this node, then break from the loop */
|
/* if we have fully used up this node, then break from the loop */
|
||||||
if (ORTE_ERR_NODE_FULLY_USED == rc ||
|
if (ORTE_ERR_NODE_FULLY_USED == rc) {
|
||||||
(orte_rmaps_base.loadbalance && node->num_procs >= ppn)) {
|
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@ -231,17 +245,13 @@ int orte_rmaps_base_map_byslot(orte_job_t *jdata, orte_app_context_t *app,
|
|||||||
* node is NOT max'd out
|
* node is NOT max'd out
|
||||||
*
|
*
|
||||||
*/
|
*/
|
||||||
if (i < (num_slots_to_take-1) && ORTE_ERR_NODE_FULLY_USED != rc &&
|
if (i < (num_procs_to_assign-1) && ORTE_ERR_NODE_FULLY_USED != rc) {
|
||||||
(orte_rmaps_base.loadbalance && node->num_procs < ppn)) {
|
|
||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
cur_node_item = next;
|
cur_node_item = next;
|
||||||
}
|
}
|
||||||
|
|
||||||
complete:
|
complete:
|
||||||
/* update the starting vpid */
|
|
||||||
vpid_start += num_procs;
|
|
||||||
|
|
||||||
/* save the bookmark */
|
/* save the bookmark */
|
||||||
jdata->bookmark = (orte_node_t*)cur_node_item;
|
jdata->bookmark = (orte_node_t*)cur_node_item;
|
||||||
|
|
||||||
@ -250,7 +260,7 @@ complete:
|
|||||||
|
|
||||||
int orte_rmaps_base_map_bynode(orte_job_t *jdata, orte_app_context_t *app,
|
int orte_rmaps_base_map_bynode(orte_job_t *jdata, orte_app_context_t *app,
|
||||||
opal_list_t *node_list, orte_vpid_t num_procs,
|
opal_list_t *node_list, orte_vpid_t num_procs,
|
||||||
orte_vpid_t vpid_start, opal_list_item_t *cur_node_item)
|
opal_list_item_t *cur_node_item)
|
||||||
{
|
{
|
||||||
int rc = ORTE_SUCCESS;
|
int rc = ORTE_SUCCESS;
|
||||||
opal_list_item_t *next;
|
opal_list_item_t *next;
|
||||||
@ -297,8 +307,8 @@ int orte_rmaps_base_map_bynode(orte_job_t *jdata, orte_app_context_t *app,
|
|||||||
|
|
||||||
/* Allocate a slot on this node */
|
/* Allocate a slot on this node */
|
||||||
node = (orte_node_t*) cur_node_item;
|
node = (orte_node_t*) cur_node_item;
|
||||||
if (ORTE_SUCCESS != (rc = orte_rmaps_base_claim_slot(jdata, node, vpid_start + num_alloc, NULL, app->idx,
|
if (ORTE_SUCCESS != (rc = orte_rmaps_base_claim_slot(jdata, node, jdata->map->cpus_per_rank, app->idx,
|
||||||
node_list, jdata->map->oversubscribe, true))) {
|
node_list, jdata->map->oversubscribe, true, NULL))) {
|
||||||
/** if the code is ORTE_ERR_NODE_FULLY_USED, then we know this
|
/** if the code is ORTE_ERR_NODE_FULLY_USED, then we know this
|
||||||
* really isn't an error - we just need to break from the loop
|
* really isn't an error - we just need to break from the loop
|
||||||
* since the node is fully used up. For now, just don't report
|
* since the node is fully used up. For now, just don't report
|
||||||
|
@ -67,9 +67,12 @@ int orte_rmaps_base_map_job(orte_job_t *jdata)
|
|||||||
return ORTE_ERR_OUT_OF_RESOURCE;
|
return ORTE_ERR_OUT_OF_RESOURCE;
|
||||||
}
|
}
|
||||||
/* load it with the system defaults */
|
/* load it with the system defaults */
|
||||||
map->policy = orte_rmaps_base.policy;
|
map->policy = orte_default_mapping_policy;
|
||||||
map->pernode = orte_rmaps_base.pernode;
|
|
||||||
map->npernode = orte_rmaps_base.npernode;
|
map->npernode = orte_rmaps_base.npernode;
|
||||||
|
map->nperboard = orte_rmaps_base.nperboard;
|
||||||
|
map->npersocket = orte_rmaps_base.npersocket;
|
||||||
|
map->cpus_per_rank = orte_rmaps_base.cpus_per_rank;
|
||||||
|
map->stride = orte_rmaps_base.stride;
|
||||||
map->oversubscribe = orte_rmaps_base.oversubscribe;
|
map->oversubscribe = orte_rmaps_base.oversubscribe;
|
||||||
map->display_map = orte_rmaps_base.display_map;
|
map->display_map = orte_rmaps_base.display_map;
|
||||||
/* assign the map object to this job */
|
/* assign the map object to this job */
|
||||||
|
@ -30,7 +30,9 @@
|
|||||||
#include "opal/util/output.h"
|
#include "opal/util/output.h"
|
||||||
#include "opal/mca/base/base.h"
|
#include "opal/mca/base/base.h"
|
||||||
#include "opal/mca/base/mca_base_param.h"
|
#include "opal/mca/base/mca_base_param.h"
|
||||||
|
#include "opal/mca/paffinity/paffinity.h"
|
||||||
|
|
||||||
|
#include "orte/runtime/orte_globals.h"
|
||||||
|
|
||||||
#include "orte/mca/rmaps/base/rmaps_private.h"
|
#include "orte/mca/rmaps/base/rmaps_private.h"
|
||||||
|
|
||||||
@ -92,39 +94,74 @@ int orte_rmaps_base_open(void)
|
|||||||
|
|
||||||
/* Are we scheduling by node or by slot? */
|
/* Are we scheduling by slot, socket, board, or node? */
|
||||||
param = mca_base_param_reg_string_name("rmaps", "base_schedule_policy",
|
param = mca_base_param_reg_string_name("rmaps", "base_schedule_policy",
|
||||||
"Scheduling Policy for RMAPS. [slot | node]",
|
"Scheduling Policy for RMAPS. [slot (default) | socket | board | node]",
|
||||||
false, false, "unspec", &policy);
|
false, false, "unspec", &policy);
|
||||||
|
|
||||||
if (0 == strcmp(policy, "unspec")) {
|
if (0 == strcmp(policy, "socket")) {
|
||||||
orte_rmaps_base.policy = ORTE_RMAPS_BYSLOT; /* default to byslot */
|
ORTE_SET_MAPPING_POLICY(ORTE_MAPPING_BYSOCKET);
|
||||||
|
} else if (0 == strcmp(policy, "board")) {
|
||||||
|
ORTE_SET_MAPPING_POLICY(ORTE_MAPPING_BYBOARD);
|
||||||
} else if (0 == strcmp(policy, "node")) {
|
} else if (0 == strcmp(policy, "node")) {
|
||||||
orte_rmaps_base.policy = ORTE_RMAPS_BYNODE;
|
ORTE_SET_MAPPING_POLICY(ORTE_MAPPING_BYNODE);
|
||||||
} else {
|
|
||||||
orte_rmaps_base.policy = ORTE_RMAPS_BYSLOT; /* default to byslot */
|
|
||||||
}
|
}
|
||||||
|
/* if nothing was specified, leave it alone - we already set it
|
||||||
|
* in orterun
|
||||||
|
*/
|
||||||
|
|
||||||
/* Do we want one ppn if num_procs not specified */
|
/* check for procs/xxx directives */
|
||||||
param = mca_base_param_reg_int_name("rmaps", "base_pernode",
|
param = mca_base_param_reg_int_name("rmaps", "base_pernode",
|
||||||
"Launch one ppn as directed",
|
"Launch one ppn as directed",
|
||||||
false, false, (int)false, &value);
|
false, false, (int)false, &value);
|
||||||
orte_rmaps_base.pernode = OPAL_INT_TO_BOOL(value);
|
if (value) {
|
||||||
|
|
||||||
/* if pernode is set, we do not allow npernode to also be set - instead
|
|
||||||
* we default the npernode value to 1
|
|
||||||
*/
|
|
||||||
if (orte_rmaps_base.pernode) {
|
|
||||||
orte_rmaps_base.npernode = 1;
|
orte_rmaps_base.npernode = 1;
|
||||||
} else {
|
|
||||||
/* Do we want n ppn */
|
|
||||||
param = mca_base_param_reg_int_name("rmaps", "base_n_pernode",
|
|
||||||
"Launch n procs/node",
|
|
||||||
false, false, 0, &value);
|
|
||||||
orte_rmaps_base.npernode = value;
|
|
||||||
if (0 < orte_rmaps_base.npernode) {
|
|
||||||
orte_rmaps_base.pernode = true;
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/* #procs/node */
|
||||||
|
param = mca_base_param_reg_int_name("rmaps", "base_n_pernode",
|
||||||
|
"Launch n procs/node",
|
||||||
|
false, false, -1, &value);
|
||||||
|
if (0 < value) {
|
||||||
|
orte_rmaps_base.npernode = value;
|
||||||
|
}
|
||||||
|
|
||||||
|
/* #procs/board */
|
||||||
|
param = mca_base_param_reg_int_name("rmaps", "base_n_perboard",
|
||||||
|
"Launch n procs/board",
|
||||||
|
false, false, -1, &orte_rmaps_base.nperboard);
|
||||||
|
if (0 < orte_rmaps_base.nperboard) {
|
||||||
|
ORTE_ADD_MAPPING_POLICY(ORTE_MAPPING_NPERXXX);
|
||||||
|
}
|
||||||
|
|
||||||
|
/* #procs/socket */
|
||||||
|
param = mca_base_param_reg_int_name("rmaps", "base_n_persocket",
|
||||||
|
"Launch n procs/socket",
|
||||||
|
false, false, -1, &orte_rmaps_base.npersocket);
|
||||||
|
if (0 < orte_rmaps_base.npersocket) {
|
||||||
|
ORTE_ADD_MAPPING_POLICY(ORTE_MAPPING_NPERXXX);
|
||||||
|
}
|
||||||
|
|
||||||
|
/* Do we want to loadbalance the job */
|
||||||
|
param = mca_base_param_reg_int_name("rmaps", "base_loadbalance",
|
||||||
|
"Balance total number of procs across all allocated nodes",
|
||||||
|
false, false, (int)false, &value);
|
||||||
|
orte_rmaps_base.loadbalance = OPAL_INT_TO_BOOL(value);
|
||||||
|
|
||||||
|
/* #cpus/rank to use */
|
||||||
|
param = mca_base_param_reg_int_name("rmaps", "base_cpus_per_rank",
|
||||||
|
"Number of cpus to use for each rank [1-2**15 (default=1)]",
|
||||||
|
false, false, 1, &value);
|
||||||
|
orte_rmaps_base.cpus_per_rank = value;
|
||||||
|
/* if the cpus/rank > 1, then we have to bind to cores */
|
||||||
|
if (1 < orte_rmaps_base.cpus_per_rank) {
|
||||||
|
ORTE_SET_BINDING_POLICY(ORTE_BIND_TO_CORE);
|
||||||
|
}
|
||||||
|
|
||||||
|
/* stride to use */
|
||||||
|
param = mca_base_param_reg_int_name("rmaps", "base_stride",
|
||||||
|
"When binding multiple cores to a rank, the step size to use between cores [1-2**15 (default: 1)]",
|
||||||
|
false, false, 1, &value);
|
||||||
|
orte_rmaps_base.stride = value;
|
||||||
|
|
||||||
/* did the user provide a slot list? */
|
/* did the user provide a slot list? */
|
||||||
param = mca_base_param_reg_string_name("rmaps", "base_slot_list",
|
param = mca_base_param_reg_string_name("rmaps", "base_slot_list",
|
||||||
"List of processor IDs to bind MPI processes to (e.g., used in conjunction with rank files) [default=NULL]",
|
"List of processor IDs to bind MPI processes to (e.g., used in conjunction with rank files) [default=NULL]",
|
||||||
@ -136,7 +173,7 @@ int orte_rmaps_base_open(void)
|
|||||||
"If false, allow scheduling MPI applications on the same node as mpirun (default). If true, do not schedule any MPI applications on the same node as mpirun",
|
"If false, allow scheduling MPI applications on the same node as mpirun (default). If true, do not schedule any MPI applications on the same node as mpirun",
|
||||||
false, false, (int)false, &value);
|
false, false, (int)false, &value);
|
||||||
if (value) {
|
if (value) {
|
||||||
orte_rmaps_base.policy |= ORTE_RMAPS_NO_USE_LOCAL;
|
orte_default_mapping_policy |= ORTE_MAPPING_NO_USE_LOCAL;
|
||||||
}
|
}
|
||||||
|
|
||||||
/* Should we oversubscribe or not? */
|
/* Should we oversubscribe or not? */
|
||||||
@ -150,16 +187,6 @@ int orte_rmaps_base_open(void)
|
|||||||
orte_rmaps_base.oversubscribe = true;
|
orte_rmaps_base.oversubscribe = true;
|
||||||
}
|
}
|
||||||
|
|
||||||
/* Do we want to loadbalance the job */
|
|
||||||
param = mca_base_param_reg_int_name("rmaps", "base_loadbalance",
|
|
||||||
"Balance total number of procs across all allocated nodes",
|
|
||||||
false, false, (int)false, &value);
|
|
||||||
orte_rmaps_base.loadbalance = OPAL_INT_TO_BOOL(value);
|
|
||||||
/* if we are doing npernode or pernode, then we cannot loadbalance */
|
|
||||||
if (orte_rmaps_base.pernode) {
|
|
||||||
orte_rmaps_base.loadbalance = false;
|
|
||||||
}
|
|
||||||
|
|
||||||
/* should we display the map after determining it? */
|
/* should we display the map after determining it? */
|
||||||
mca_base_param_reg_int_name("rmaps", "base_display_map",
|
mca_base_param_reg_int_name("rmaps", "base_display_map",
|
||||||
"Whether to display the process map after it is computed",
|
"Whether to display the process map after it is computed",
|
||||||
|
@ -41,7 +41,7 @@
|
|||||||
* Query the registry for all nodes allocated to a specified app_context
|
* Query the registry for all nodes allocated to a specified app_context
|
||||||
*/
|
*/
|
||||||
int orte_rmaps_base_get_target_nodes(opal_list_t *allocated_nodes, orte_std_cntr_t *total_num_slots,
|
int orte_rmaps_base_get_target_nodes(opal_list_t *allocated_nodes, orte_std_cntr_t *total_num_slots,
|
||||||
orte_app_context_t *app, uint8_t policy)
|
orte_app_context_t *app, orte_mapping_policy_t policy)
|
||||||
{
|
{
|
||||||
opal_list_item_t *item, *next;
|
opal_list_item_t *item, *next;
|
||||||
orte_node_t *node;
|
orte_node_t *node;
|
||||||
@ -169,7 +169,7 @@ int orte_rmaps_base_get_target_nodes(opal_list_t *allocated_nodes, orte_std_cntr
|
|||||||
/* If the "no local" option was set, then remove the local node
|
/* If the "no local" option was set, then remove the local node
|
||||||
* from the list
|
* from the list
|
||||||
*/
|
*/
|
||||||
if (policy & ORTE_RMAPS_NO_USE_LOCAL) {
|
if (policy & ORTE_MAPPING_NO_USE_LOCAL) {
|
||||||
/* we don't need to check through the entire list as
|
/* we don't need to check through the entire list as
|
||||||
* the head node - if it is on the list at all - will
|
* the head node - if it is on the list at all - will
|
||||||
* always be in the first position
|
* always be in the first position
|
||||||
@ -267,9 +267,9 @@ PROCESS:
|
|||||||
* in the mapper
|
* in the mapper
|
||||||
*/
|
*/
|
||||||
OPAL_OUTPUT_VERBOSE((5, orte_rmaps_base.rmaps_output,
|
OPAL_OUTPUT_VERBOSE((5, orte_rmaps_base.rmaps_output,
|
||||||
"%s rmaps:base: mapping proc %s to node %s",
|
"%s rmaps:base: mapping proc for job %s to node %s",
|
||||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
|
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
|
||||||
ORTE_NAME_PRINT(&proc->name),
|
ORTE_JOBID_PRINT(proc->name.jobid),
|
||||||
(NULL == node->name) ? "NULL" : node->name));
|
(NULL == node->name) ? "NULL" : node->name));
|
||||||
|
|
||||||
if (0 > (rc = opal_pointer_array_add(node->procs, (void*)proc))) {
|
if (0 > (rc = opal_pointer_array_add(node->procs, (void*)proc))) {
|
||||||
@ -289,88 +289,56 @@ PROCESS:
|
|||||||
*/
|
*/
|
||||||
int orte_rmaps_base_claim_slot(orte_job_t *jdata,
|
int orte_rmaps_base_claim_slot(orte_job_t *jdata,
|
||||||
orte_node_t *current_node,
|
orte_node_t *current_node,
|
||||||
orte_vpid_t vpid,
|
int32_t cpus_per_rank,
|
||||||
char *slot_list,
|
|
||||||
orte_std_cntr_t app_idx,
|
orte_std_cntr_t app_idx,
|
||||||
opal_list_t *nodes,
|
opal_list_t *nodes,
|
||||||
bool oversubscribe,
|
bool oversubscribe,
|
||||||
bool remove_from_list)
|
bool remove_from_list,
|
||||||
|
orte_proc_t **returnproc)
|
||||||
{
|
{
|
||||||
orte_proc_t *proc, *proc_from_job;
|
orte_proc_t *proc;
|
||||||
bool oversub;
|
bool oversub;
|
||||||
int rc;
|
int rc;
|
||||||
int n;
|
|
||||||
|
|
||||||
OPAL_OUTPUT_VERBOSE((5, orte_rmaps_base.rmaps_output,
|
/* if we were given a proc, just use it */
|
||||||
"%s rmaps:base:claim_slot: checking for existence of vpid %s",
|
if (NULL != returnproc && NULL != *returnproc) {
|
||||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
|
proc = *returnproc;
|
||||||
ORTE_VPID_PRINT(vpid)));
|
} else {
|
||||||
|
/* create mapped_proc object */
|
||||||
/* does this proc already exist within the job? */
|
|
||||||
proc = NULL;
|
|
||||||
for (n=0; n < jdata->procs->size; n++) {
|
|
||||||
if (NULL == (proc_from_job = (orte_proc_t*)opal_pointer_array_get_item(jdata->procs, n))) {
|
|
||||||
continue;
|
|
||||||
}
|
|
||||||
if (proc_from_job->name.vpid == vpid) {
|
|
||||||
/* already have it! */
|
|
||||||
proc = proc_from_job;
|
|
||||||
|
|
||||||
OPAL_OUTPUT_VERBOSE((5, orte_rmaps_base.rmaps_output,
|
|
||||||
"%s rmaps:base:claim_slot: found existing proc %s",
|
|
||||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
|
|
||||||
ORTE_NAME_PRINT(&proc->name)));
|
|
||||||
|
|
||||||
if (NULL != proc->slot_list) {
|
|
||||||
/* cleanout stale info */
|
|
||||||
free(proc->slot_list);
|
|
||||||
}
|
|
||||||
break;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
if (NULL == proc) {
|
|
||||||
/* need to create mapped_proc object */
|
|
||||||
proc = OBJ_NEW(orte_proc_t);
|
proc = OBJ_NEW(orte_proc_t);
|
||||||
if (NULL == proc) {
|
if (NULL == proc) {
|
||||||
ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE);
|
ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE);
|
||||||
return ORTE_ERR_OUT_OF_RESOURCE;
|
return ORTE_ERR_OUT_OF_RESOURCE;
|
||||||
}
|
}
|
||||||
/* create the process name */
|
/* set the jobid */
|
||||||
proc->name.jobid = jdata->jobid;
|
proc->name.jobid = jdata->jobid;
|
||||||
proc->name.vpid = vpid;
|
/* we do not set the vpid here - this will be done
|
||||||
|
* during a second phase
|
||||||
|
*/
|
||||||
proc->app_idx = app_idx;
|
proc->app_idx = app_idx;
|
||||||
OPAL_OUTPUT_VERBOSE((5, orte_rmaps_base.rmaps_output,
|
OPAL_OUTPUT_VERBOSE((5, orte_rmaps_base.rmaps_output,
|
||||||
"%s rmaps:base:claim_slot: created new proc %s",
|
"%s rmaps:base:claim_slot: created new proc %s",
|
||||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
|
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
|
||||||
ORTE_NAME_PRINT(&proc->name)));
|
ORTE_NAME_PRINT(&proc->name)));
|
||||||
/* add this proc to the job's data - we don't have to worry here
|
|
||||||
* about keeping the array left-justified as all vpids
|
/* provide returned proc, if requested */
|
||||||
* from 0 to num_procs will be filled
|
if (NULL != returnproc) {
|
||||||
*/
|
*returnproc = proc;
|
||||||
if (ORTE_SUCCESS != (rc = opal_pointer_array_set_item(jdata->procs,
|
|
||||||
(int)vpid,
|
|
||||||
(void*)proc))) {
|
|
||||||
ORTE_ERROR_LOG(rc);
|
|
||||||
OBJ_RELEASE(proc);
|
|
||||||
return rc;
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
OBJ_RETAIN(current_node); /* maintain accounting on object */
|
OBJ_RETAIN(current_node); /* maintain accounting on object */
|
||||||
|
|
||||||
if ( NULL != slot_list) {
|
|
||||||
proc->slot_list = strdup(slot_list);
|
|
||||||
}
|
|
||||||
proc->node = current_node;
|
proc->node = current_node;
|
||||||
proc->nodename = current_node->name;
|
proc->nodename = current_node->name;
|
||||||
|
|
||||||
OPAL_OUTPUT_VERBOSE((5, orte_rmaps_base.rmaps_output,
|
OPAL_OUTPUT_VERBOSE((5, orte_rmaps_base.rmaps_output,
|
||||||
"%s rmaps:base:claim_slot mapping rank %d in job %s to node %s",
|
"%s rmaps:base:claim_slot mapping proc in job %s to node %s",
|
||||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
|
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
|
||||||
vpid, ORTE_JOBID_PRINT(jdata->jobid), current_node->name));
|
ORTE_JOBID_PRINT(jdata->jobid), current_node->name));
|
||||||
|
|
||||||
/* Be sure to demarcate this slot as claimed for the node */
|
/* Be sure to demarcate the slots for this proc as claimed from the node */
|
||||||
current_node->slots_inuse++;
|
current_node->slots_inuse += cpus_per_rank;
|
||||||
|
|
||||||
/* see if this node is oversubscribed now */
|
/* see if this node is oversubscribed now */
|
||||||
if (current_node->slots_inuse > current_node->slots) {
|
if (current_node->slots_inuse > current_node->slots) {
|
||||||
@ -415,8 +383,68 @@ int orte_rmaps_base_claim_slot(orte_job_t *jdata,
|
|||||||
return ORTE_SUCCESS;
|
return ORTE_SUCCESS;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
int orte_rmaps_base_compute_vpids(orte_job_t *jdata)
|
||||||
|
{
|
||||||
|
orte_job_map_t *map;
|
||||||
|
orte_vpid_t vpid;
|
||||||
|
int i, j;
|
||||||
|
orte_node_t *node;
|
||||||
|
orte_proc_t *proc;
|
||||||
|
int rc;
|
||||||
|
|
||||||
int orte_rmaps_base_compute_usage(orte_job_t *jdata)
|
map = jdata->map;
|
||||||
|
|
||||||
|
if (ORTE_MAPPING_BYSLOT & map->policy ||
|
||||||
|
ORTE_MAPPING_BYSOCKET & map->policy ||
|
||||||
|
ORTE_MAPPING_BYBOARD & map->policy) {
|
||||||
|
/* assign the ranks sequentially */
|
||||||
|
vpid = 0;
|
||||||
|
for (i=0; i < map->nodes->size; i++) {
|
||||||
|
if (NULL == (node = (orte_node_t*)opal_pointer_array_get_item(map->nodes, i))) {
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
for (j=0; j < node->procs->size; j++) {
|
||||||
|
if (NULL == (proc = (orte_proc_t*)opal_pointer_array_get_item(node->procs, j))) {
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
proc->name.vpid = vpid++;
|
||||||
|
if (ORTE_SUCCESS != (rc = opal_pointer_array_set_item(jdata->procs,
|
||||||
|
proc->name.vpid, proc))) {
|
||||||
|
ORTE_ERROR_LOG(rc);
|
||||||
|
return rc;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return ORTE_SUCCESS;
|
||||||
|
}
|
||||||
|
|
||||||
|
if (ORTE_MAPPING_BYNODE & map->policy) {
|
||||||
|
/* assign the ranks round-robin across nodes */
|
||||||
|
for (i=0; i < map->nodes->size; i++) {
|
||||||
|
if (NULL == (node = (orte_node_t*)opal_pointer_array_get_item(map->nodes, i))) {
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
vpid = i;
|
||||||
|
for (j=0; j < node->procs->size; j++) {
|
||||||
|
if (NULL == (proc = (orte_proc_t*)opal_pointer_array_get_item(node->procs, j))) {
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
proc->name.vpid = vpid;
|
||||||
|
vpid += map->num_nodes;
|
||||||
|
if (ORTE_SUCCESS != (rc = opal_pointer_array_set_item(jdata->procs,
|
||||||
|
proc->name.vpid, proc))) {
|
||||||
|
ORTE_ERROR_LOG(rc);
|
||||||
|
return rc;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return ORTE_SUCCESS;
|
||||||
|
}
|
||||||
|
|
||||||
|
return ORTE_ERR_NOT_IMPLEMENTED;
|
||||||
|
}
|
||||||
|
|
||||||
|
int orte_rmaps_base_compute_local_ranks(orte_job_t *jdata)
|
||||||
{
|
{
|
||||||
orte_std_cntr_t i;
|
orte_std_cntr_t i;
|
||||||
int j, k;
|
int j, k;
|
||||||
@ -501,8 +529,8 @@ int orte_rmaps_base_compute_usage(orte_job_t *jdata)
|
|||||||
* we don't, then it would be possible for procs to conflict
|
* we don't, then it would be possible for procs to conflict
|
||||||
* when opening static ports, should that be enabled.
|
* when opening static ports, should that be enabled.
|
||||||
*/
|
*/
|
||||||
void orte_rmaps_base_update_usage(orte_job_t *jdata, orte_node_t *oldnode,
|
void orte_rmaps_base_update_local_ranks(orte_job_t *jdata, orte_node_t *oldnode,
|
||||||
orte_node_t *newnode, orte_proc_t *newproc)
|
orte_node_t *newnode, orte_proc_t *newproc)
|
||||||
{
|
{
|
||||||
int k;
|
int k;
|
||||||
orte_node_rank_t node_rank;
|
orte_node_rank_t node_rank;
|
||||||
|
@ -61,7 +61,7 @@ int orte_rmaps_base_add_proc_to_map(orte_job_map_t *map, orte_node_t *node,
|
|||||||
ORTE_DECLSPEC int orte_rmaps_base_get_target_nodes(opal_list_t* node_list,
|
ORTE_DECLSPEC int orte_rmaps_base_get_target_nodes(opal_list_t* node_list,
|
||||||
orte_std_cntr_t *total_num_slots,
|
orte_std_cntr_t *total_num_slots,
|
||||||
orte_app_context_t *app,
|
orte_app_context_t *app,
|
||||||
uint8_t policy);
|
orte_mapping_policy_t policy);
|
||||||
ORTE_DECLSPEC int orte_rmaps_base_get_target_procs(opal_list_t *procs);
|
ORTE_DECLSPEC int orte_rmaps_base_get_target_procs(opal_list_t *procs);
|
||||||
|
|
||||||
ORTE_DECLSPEC int orte_rmaps_base_update_node_usage(opal_list_t *nodes);
|
ORTE_DECLSPEC int orte_rmaps_base_update_node_usage(opal_list_t *nodes);
|
||||||
@ -72,17 +72,19 @@ ORTE_DECLSPEC int orte_rmaps_base_get_mapped_targets(opal_list_t *mapped_node_li
|
|||||||
|
|
||||||
ORTE_DECLSPEC int orte_rmaps_base_claim_slot(orte_job_t *jdata,
|
ORTE_DECLSPEC int orte_rmaps_base_claim_slot(orte_job_t *jdata,
|
||||||
orte_node_t *current_node,
|
orte_node_t *current_node,
|
||||||
orte_vpid_t vpid,
|
int32_t stride,
|
||||||
char *slot_list,
|
|
||||||
orte_std_cntr_t app_idx,
|
orte_std_cntr_t app_idx,
|
||||||
opal_list_t *nodes,
|
opal_list_t *nodes,
|
||||||
bool oversubscribe,
|
bool oversubscribe,
|
||||||
bool remove_from_list);
|
bool remove_from_list,
|
||||||
|
orte_proc_t **returnproc);
|
||||||
|
|
||||||
ORTE_DECLSPEC int orte_rmaps_base_compute_usage(orte_job_t *jdata);
|
ORTE_DECLSPEC int orte_rmaps_base_compute_vpids(orte_job_t *jdata);
|
||||||
|
|
||||||
ORTE_DECLSPEC void orte_rmaps_base_update_usage(orte_job_t *jdata, orte_node_t *oldnode,
|
ORTE_DECLSPEC int orte_rmaps_base_compute_local_ranks(orte_job_t *jdata);
|
||||||
orte_node_t *newnode, orte_proc_t *newproc);
|
|
||||||
|
ORTE_DECLSPEC void orte_rmaps_base_update_local_ranks(orte_job_t *jdata, orte_node_t *oldnode,
|
||||||
|
orte_node_t *newnode, orte_proc_t *newproc);
|
||||||
|
|
||||||
ORTE_DECLSPEC int orte_rmaps_base_rearrange_map(orte_app_context_t *app, orte_job_map_t *map, opal_list_t *procs);
|
ORTE_DECLSPEC int orte_rmaps_base_rearrange_map(orte_app_context_t *app, orte_job_map_t *map, opal_list_t *procs);
|
||||||
|
|
||||||
@ -93,12 +95,11 @@ ORTE_DECLSPEC opal_list_item_t* orte_rmaps_base_get_starting_point(opal_list_t *
|
|||||||
|
|
||||||
ORTE_DECLSPEC int orte_rmaps_base_map_byslot(orte_job_t *jdata, orte_app_context_t *app,
|
ORTE_DECLSPEC int orte_rmaps_base_map_byslot(orte_job_t *jdata, orte_app_context_t *app,
|
||||||
opal_list_t *node_list, orte_vpid_t num_procs,
|
opal_list_t *node_list, orte_vpid_t num_procs,
|
||||||
orte_vpid_t vpid_start, opal_list_item_t *cur_node_item,
|
opal_list_item_t *cur_node_item);
|
||||||
orte_vpid_t ppn);
|
|
||||||
|
|
||||||
ORTE_DECLSPEC int orte_rmaps_base_map_bynode(orte_job_t *jdata, orte_app_context_t *app,
|
ORTE_DECLSPEC int orte_rmaps_base_map_bynode(orte_job_t *jdata, orte_app_context_t *app,
|
||||||
opal_list_t *node_list, orte_vpid_t num_procs,
|
opal_list_t *node_list, orte_vpid_t num_procs,
|
||||||
orte_vpid_t vpid_start, opal_list_item_t *cur_node_item);
|
opal_list_item_t *cur_node_item);
|
||||||
|
|
||||||
|
|
||||||
END_C_DECLS
|
END_C_DECLS
|
||||||
|
45
orte/mca/rmaps/load_balance/Makefile.am
Обычный файл
45
orte/mca/rmaps/load_balance/Makefile.am
Обычный файл
@ -0,0 +1,45 @@
|
|||||||
|
#
|
||||||
|
# Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
|
||||||
|
# University Research and Technology
|
||||||
|
# Corporation. All rights reserved.
|
||||||
|
# Copyright (c) 2004-2005 The University of Tennessee and The University
|
||||||
|
# of Tennessee Research Foundation. All rights
|
||||||
|
# reserved.
|
||||||
|
# Copyright (c) 2004-2009 High Performance Computing Center Stuttgart,
|
||||||
|
# University of Stuttgart. All rights reserved.
|
||||||
|
# Copyright (c) 2004-2005 The Regents of the University of California.
|
||||||
|
# All rights reserved.
|
||||||
|
# $COPYRIGHT$
|
||||||
|
#
|
||||||
|
# Additional copyrights may follow
|
||||||
|
#
|
||||||
|
# $HEADER$
|
||||||
|
#
|
||||||
|
|
||||||
|
dist_pkgdata_DATA = help-orte-rmaps-lb.txt
|
||||||
|
|
||||||
|
sources = \
|
||||||
|
rmaps_lb.c \
|
||||||
|
rmaps_lb.h \
|
||||||
|
rmaps_lb_component.c
|
||||||
|
|
||||||
|
# Make the output library in this directory, and name it either
|
||||||
|
# mca_<type>_<name>.la (for DSO builds) or libmca_<type>_<name>.la
|
||||||
|
# (for static builds).
|
||||||
|
|
||||||
|
if OMPI_BUILD_rmaps_load_balance_DSO
|
||||||
|
component_noinst =
|
||||||
|
component_install = mca_rmaps_load_balance.la
|
||||||
|
else
|
||||||
|
component_noinst = libmca_rmaps_load_balance.la
|
||||||
|
component_install =
|
||||||
|
endif
|
||||||
|
|
||||||
|
mcacomponentdir = $(pkglibdir)
|
||||||
|
mcacomponent_LTLIBRARIES = $(component_install)
|
||||||
|
mca_rmaps_load_balance_la_SOURCES = $(sources)
|
||||||
|
mca_rmaps_load_balance_la_LDFLAGS = -module -avoid-version
|
||||||
|
|
||||||
|
noinst_LTLIBRARIES = $(component_noinst)
|
||||||
|
libmca_rmaps_load_balance_la_SOURCES =$(sources)
|
||||||
|
libmca_rmaps_load_balance_la_LDFLAGS = -module -avoid-version
|
24
orte/mca/rmaps/load_balance/configure.params
Обычный файл
24
orte/mca/rmaps/load_balance/configure.params
Обычный файл
@ -0,0 +1,24 @@
|
|||||||
|
# -*- shell-script -*-
|
||||||
|
#
|
||||||
|
# Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
|
||||||
|
# University Research and Technology
|
||||||
|
# Corporation. All rights reserved.
|
||||||
|
# Copyright (c) 2004-2005 The University of Tennessee and The University
|
||||||
|
# of Tennessee Research Foundation. All rights
|
||||||
|
# reserved.
|
||||||
|
# Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
|
||||||
|
# University of Stuttgart. All rights reserved.
|
||||||
|
# Copyright (c) 2004-2005 The Regents of the University of California.
|
||||||
|
# All rights reserved.
|
||||||
|
# Copyright (c) 2007 Los Alamos National Security, LLC. All rights
|
||||||
|
# reserved.
|
||||||
|
# $COPYRIGHT$
|
||||||
|
#
|
||||||
|
# Additional copyrights may follow
|
||||||
|
#
|
||||||
|
# $HEADER$
|
||||||
|
#
|
||||||
|
|
||||||
|
# Specific to this module
|
||||||
|
|
||||||
|
PARAM_CONFIG_FILES="Makefile"
|
53
orte/mca/rmaps/load_balance/help-orte-rmaps-lb.txt
Обычный файл
53
orte/mca/rmaps/load_balance/help-orte-rmaps-lb.txt
Обычный файл
@ -0,0 +1,53 @@
|
|||||||
|
# -*- text -*-
|
||||||
|
#
|
||||||
|
# Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
|
||||||
|
# University Research and Technology
|
||||||
|
# Corporation. All rights reserved.
|
||||||
|
# Copyright (c) 2004-2005 The University of Tennessee and The University
|
||||||
|
# of Tennessee Research Foundation. All rights
|
||||||
|
# reserved.
|
||||||
|
# Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
|
||||||
|
# University of Stuttgart. All rights reserved.
|
||||||
|
# Copyright (c) 2004-2005 The Regents of the University of California.
|
||||||
|
# All rights reserved.
|
||||||
|
# $COPYRIGHT$
|
||||||
|
#
|
||||||
|
# Additional copyrights may follow
|
||||||
|
#
|
||||||
|
# $HEADER$
|
||||||
|
#
|
||||||
|
# This is the US/English help file for Open RTE's load_balance RMAPS component.
|
||||||
|
#
|
||||||
|
[orte-rmaps-rr:alloc-error]
|
||||||
|
There are not enough slots available in the system to satisfy the %d slots
|
||||||
|
that were requested by the application:
|
||||||
|
%s
|
||||||
|
|
||||||
|
Either request fewer slots for your application, or make more slots available
|
||||||
|
for use.
|
||||||
|
[orte-rmaps-rr:multi-apps-and-zero-np]
|
||||||
|
RMAPS found multiple applications to be launched, with
|
||||||
|
at least one that failed to specify the number of processes to execute.
|
||||||
|
When specifying multiple applications, you must specify how many processes
|
||||||
|
of each to launch via the -np argument.
|
||||||
|
|
||||||
|
[orte-rmaps-rr:per-node-and-too-many-procs]
|
||||||
|
There are not enough nodes in your allocation to satisfy your request to launch
|
||||||
|
%d processes on a per-node basis - only %d nodes were available.
|
||||||
|
|
||||||
|
Either request fewer processes, or obtain a larger allocation.
|
||||||
|
[orte-rmaps-rr:n-per-node-and-too-many-procs]
|
||||||
|
There are not enough nodes in your allocation to satisfy your request to launch
|
||||||
|
%d processes on a %d per-node basis - only %d nodes with a total of %d slots were available.
|
||||||
|
|
||||||
|
Either request fewer processes, or obtain a larger allocation.
|
||||||
|
[orte-rmaps-rr:n-per-node-and-not-enough-slots]
|
||||||
|
There are not enough slots on the nodes in your allocation to satisfy your request to launch on a %d process-per-node basis - only %d slots/node were available.
|
||||||
|
|
||||||
|
Either request fewer processes/node, or obtain a larger allocation.
|
||||||
|
|
||||||
|
[orte-rmaps-rr:no-np-and-user-map]
|
||||||
|
You have specified a rank-to-node/slot mapping, but failed to provide
|
||||||
|
the number of processes to be executed. For some reason, this information
|
||||||
|
could not be obtained from the mapping you provided, so we cannot continue
|
||||||
|
with executing the specified application.
|
430
orte/mca/rmaps/load_balance/rmaps_lb.c
Обычный файл
430
orte/mca/rmaps/load_balance/rmaps_lb.c
Обычный файл
@ -0,0 +1,430 @@
|
|||||||
|
/*
|
||||||
|
* Copyright (c) 2004-2007 The Trustees of Indiana University and Indiana
|
||||||
|
* University Research and Technology
|
||||||
|
* Corporation. All rights reserved.
|
||||||
|
* Copyright (c) 2004-2006 The University of Tennessee and The University
|
||||||
|
* of Tennessee Research Foundation. All rights
|
||||||
|
* reserved.
|
||||||
|
* Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
|
||||||
|
* University of Stuttgart. All rights reserved.
|
||||||
|
* Copyright (c) 2004-2005 The Regents of the University of California.
|
||||||
|
* All rights reserved.
|
||||||
|
* Copyright (c) 2006 Cisco Systems, Inc. All rights reserved.
|
||||||
|
* $COPYRIGHT$
|
||||||
|
*
|
||||||
|
* Additional copyrights may follow
|
||||||
|
*
|
||||||
|
* $HEADER$
|
||||||
|
*/
|
||||||
|
|
||||||
|
#include "orte_config.h"
|
||||||
|
#include "orte/constants.h"
|
||||||
|
#include "orte/types.h"
|
||||||
|
|
||||||
|
#include <errno.h>
|
||||||
|
#ifdef HAVE_UNISTD_H
|
||||||
|
#include <unistd.h>
|
||||||
|
#endif /* HAVE_UNISTD_H */
|
||||||
|
#ifdef HAVE_STRING_H
|
||||||
|
#include <string.h>
|
||||||
|
#endif /* HAVE_STRING_H */
|
||||||
|
|
||||||
|
#include "opal/mca/base/mca_base_param.h"
|
||||||
|
|
||||||
|
#include "orte/util/show_help.h"
|
||||||
|
#include "orte/mca/errmgr/errmgr.h"
|
||||||
|
|
||||||
|
#include "orte/mca/rmaps/base/rmaps_private.h"
|
||||||
|
#include "orte/mca/rmaps/base/base.h"
|
||||||
|
#include "rmaps_lb.h"
|
||||||
|
|
||||||
|
static int switchyard(orte_job_t *jdata);
|
||||||
|
|
||||||
|
orte_rmaps_base_module_t orte_rmaps_load_balance_module = {
|
||||||
|
switchyard
|
||||||
|
};
|
||||||
|
|
||||||
|
/* Local functions */
|
||||||
|
static int npernode(orte_job_t *jdata);
|
||||||
|
static int nperboard(orte_job_t *jdata);
|
||||||
|
static int npersocket(orte_job_t *jdata);
|
||||||
|
static int loadbalance(orte_job_t *jdata);
|
||||||
|
|
||||||
|
/*
 * Module entry point: dispatch the job to one of the local mappers and
 * then run the common post-mapping steps.
 *
 * The mapper is chosen from the orte_rmaps_base settings, checked in this
 * order: npernode, nperboard, npersocket; if none of them is positive the
 * generic load-balancing mapper is used.
 *
 * Returns ORTE_SUCCESS, or the first non-success code produced by the
 * selected mapper or by one of the post-mapping steps.  A failure of the
 * mapper itself is returned without being logged here.
 */
static int switchyard(orte_job_t *jdata)
{
    int status;

    /* select and run the mapper */
    if (orte_rmaps_base.npernode > 0) {
        status = npernode(jdata);
    } else if (orte_rmaps_base.nperboard > 0) {
        status = nperboard(jdata);
    } else if (orte_rmaps_base.npersocket > 0) {
        status = npersocket(jdata);
    } else {
        status = loadbalance(jdata);
    }
    if (ORTE_SUCCESS != status) {
        return status;
    }

    /* compute vpids and add proc objects to the job */
    status = orte_rmaps_base_compute_vpids(jdata);
    if (ORTE_SUCCESS != status) {
        ORTE_ERROR_LOG(status);
        return status;
    }

    /* compute and save local ranks */
    status = orte_rmaps_base_compute_local_ranks(jdata);
    if (ORTE_SUCCESS != status) {
        ORTE_ERROR_LOG(status);
        return status;
    }

    /* define the daemons that we will use for this job */
    status = orte_rmaps_base_define_daemons(jdata->map);
    if (ORTE_SUCCESS != status) {
        ORTE_ERROR_LOG(status);
        return status;
    }

    return ORTE_SUCCESS;
}
|
||||||
|
|
||||||
|
|
||||||
|
/* place specified #procs on each node, up to the specified total
|
||||||
|
* number of procs (if one was given).
|
||||||
|
*/
|
||||||
|
static int npernode(orte_job_t *jdata)
|
||||||
|
{
|
||||||
|
orte_app_context_t *app;
|
||||||
|
int i, j, rc=ORTE_SUCCESS;
|
||||||
|
opal_list_t node_list;
|
||||||
|
opal_list_item_t *item;
|
||||||
|
orte_std_cntr_t num_slots;
|
||||||
|
orte_node_t *node;
|
||||||
|
int total_procs, np;
|
||||||
|
|
||||||
|
/* setup the node list */
|
||||||
|
OBJ_CONSTRUCT(&node_list, opal_list_t);
|
||||||
|
|
||||||
|
/* loop through the app_contexts */
|
||||||
|
for(i=0; i < jdata->apps->size; i++) {
|
||||||
|
if (NULL == (app = (orte_app_context_t*)opal_pointer_array_get_item(jdata->apps, i))) {
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
/* use the number of procs if one was given */
|
||||||
|
if (0 < app->num_procs) {
|
||||||
|
np = app->num_procs;
|
||||||
|
} else {
|
||||||
|
np = INT_MAX;
|
||||||
|
}
|
||||||
|
total_procs = 0;
|
||||||
|
/* for each app_context, we have to get the list of nodes that it can
|
||||||
|
* use since that can now be modified with a hostfile and/or -host
|
||||||
|
* option
|
||||||
|
*/
|
||||||
|
if(ORTE_SUCCESS != (rc = orte_rmaps_base_get_target_nodes(&node_list, &num_slots, app,
|
||||||
|
jdata->map->policy))) {
|
||||||
|
ORTE_ERROR_LOG(rc);
|
||||||
|
goto error;
|
||||||
|
}
|
||||||
|
/* loop through the list of nodes */
|
||||||
|
while (NULL != (item = opal_list_remove_first(&node_list))) {
|
||||||
|
node = (orte_node_t*)item;
|
||||||
|
/* put the specified number of procs on each node */
|
||||||
|
for (j=0; j < orte_rmaps_base.npernode && total_procs < np; j++) {
|
||||||
|
if (ORTE_SUCCESS != (rc = orte_rmaps_base_claim_slot(jdata, node,
|
||||||
|
jdata->map->cpus_per_rank, app->idx,
|
||||||
|
&node_list, jdata->map->oversubscribe,
|
||||||
|
false, NULL))) {
|
||||||
|
/** if the code is ORTE_ERR_NODE_FULLY_USED, and we still have
|
||||||
|
* more procs to place, then that is an error
|
||||||
|
*/
|
||||||
|
if (ORTE_ERR_NODE_FULLY_USED != rc ||
|
||||||
|
j < orte_rmaps_base.npernode-1) {
|
||||||
|
ORTE_ERROR_LOG(rc);
|
||||||
|
OBJ_RELEASE(node);
|
||||||
|
goto error;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
total_procs++;
|
||||||
|
}
|
||||||
|
OBJ_RELEASE(node);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
jdata->num_procs = total_procs;
|
||||||
|
|
||||||
|
error:
|
||||||
|
while (NULL != (item = opal_list_remove_first(&node_list))) {
|
||||||
|
OBJ_RELEASE(item);
|
||||||
|
}
|
||||||
|
OBJ_DESTRUCT(&node_list);
|
||||||
|
return rc;
|
||||||
|
}
|
||||||
|
|
||||||
|
/*
 * Place orte_rmaps_base.nperboard procs for every board of each node, up
 * to the specified total number of procs (if one was given).
 *
 * For every app_context of the job, the list of usable nodes is obtained;
 * for each of the node->boards boards on a node, nperboard slots are
 * claimed on that node until the app's requested number of procs
 * (app->num_procs, when positive) has been reached.
 *
 * On success jdata->num_procs is set to the number of procs placed,
 * summed over all app_contexts.  Returns ORTE_SUCCESS or the error code
 * that stopped the mapping.
 */
static int nperboard(orte_job_t *jdata)
{
    orte_app_context_t *app;
    int i, j, k, rc=ORTE_SUCCESS;
    opal_list_t node_list;
    opal_list_item_t *item;
    orte_std_cntr_t num_slots;
    orte_node_t *node;
    int app_procs, np;
    /* running total for the whole job; starts at zero so that a job
     * without any app_context records zero procs instead of garbage
     */
    int job_procs = 0;

    /* setup the node list */
    OBJ_CONSTRUCT(&node_list, opal_list_t);

    /* loop through the app_contexts */
    for(i=0; i < jdata->apps->size; i++) {
        if (NULL == (app = (orte_app_context_t*)opal_pointer_array_get_item(jdata->apps, i))) {
            continue;
        }
        /* use the number of procs if one was given */
        if (0 < app->num_procs) {
            np = app->num_procs;
        } else {
            np = INT_MAX;
        }
        /* the np limit applies per app_context, so count per app */
        app_procs = 0;
        /* for each app_context, we have to get the list of nodes that it can
         * use since that can now be modified with a hostfile and/or -host
         * option
         */
        if(ORTE_SUCCESS != (rc = orte_rmaps_base_get_target_nodes(&node_list, &num_slots, app,
                                                                  jdata->map->policy))) {
            ORTE_ERROR_LOG(rc);
            goto error;
        }
        /* loop through the list of nodes */
        while (NULL != (item = opal_list_remove_first(&node_list))) {
            node = (orte_node_t*)item;
            /* loop through the number of boards in this node */
            for (k=0; k < node->boards && app_procs < np; k++) {
                /* put the specified number of procs on each board */
                for (j=0; j < orte_rmaps_base.nperboard && app_procs < np; j++) {
                    if (ORTE_SUCCESS != (rc = orte_rmaps_base_claim_slot(jdata, node,
                                                                         jdata->map->cpus_per_rank, app->idx,
                                                                         &node_list, jdata->map->oversubscribe,
                                                                         false, NULL))) {
                        /** if the code is ORTE_ERR_NODE_FULLY_USED, and we still have
                         * more procs to place, then that is an error
                         */
                        if (ORTE_ERR_NODE_FULLY_USED != rc ||
                            j < orte_rmaps_base.nperboard-1) {
                            ORTE_ERROR_LOG(rc);
                            OBJ_RELEASE(node);
                            goto error;
                        }
                        /* tolerated: do not let this code leak out as the
                         * function result if it was the last claim made
                         */
                        rc = ORTE_SUCCESS;
                    }
                    app_procs++;
                }
            }
            OBJ_RELEASE(node);
        }
        job_procs += app_procs;
    }
    jdata->num_procs = job_procs;

error:
    /* release whatever is still on the list (non-empty only on error) */
    while (NULL != (item = opal_list_remove_first(&node_list))) {
        OBJ_RELEASE(item);
    }
    OBJ_DESTRUCT(&node_list);
    return rc;
}
|
||||||
|
|
||||||
|
|
||||||
|
/*
 * Place orte_rmaps_base.npersocket procs for every socket of each node,
 * up to the specified total number of procs (if one was given).
 *
 * For every app_context of the job, the list of usable nodes is obtained;
 * the node's boards (node->boards) and the sockets on each board
 * (node->sockets_per_board) are walked, and npersocket slots are claimed
 * on the node for each socket until the app's requested number of procs
 * (app->num_procs, when positive) has been reached.
 *
 * On success jdata->num_procs is set to the number of procs placed,
 * summed over all app_contexts.  Returns ORTE_SUCCESS or the error code
 * that stopped the mapping.
 */
static int npersocket(orte_job_t *jdata)
{
    orte_app_context_t *app;
    int i, j, k, n, rc=ORTE_SUCCESS;
    opal_list_t node_list;
    opal_list_item_t *item;
    orte_std_cntr_t num_slots;
    orte_node_t *node;
    int app_procs, np;
    /* running total for the whole job; starts at zero so that a job
     * without any app_context records zero procs instead of garbage
     */
    int job_procs = 0;

    /* setup the node list */
    OBJ_CONSTRUCT(&node_list, opal_list_t);

    /* loop through the app_contexts */
    for(i=0; i < jdata->apps->size; i++) {
        if (NULL == (app = (orte_app_context_t*)opal_pointer_array_get_item(jdata->apps, i))) {
            continue;
        }
        /* use the number of procs if one was given */
        if (0 < app->num_procs) {
            np = app->num_procs;
        } else {
            np = INT_MAX;
        }
        /* the np limit applies per app_context, so count per app */
        app_procs = 0;
        /* for each app_context, we have to get the list of nodes that it can
         * use since that can now be modified with a hostfile and/or -host
         * option
         */
        if(ORTE_SUCCESS != (rc = orte_rmaps_base_get_target_nodes(&node_list, &num_slots, app,
                                                                  jdata->map->policy))) {
            ORTE_ERROR_LOG(rc);
            goto error;
        }
        /* loop through the list of nodes */
        while (NULL != (item = opal_list_remove_first(&node_list))) {
            node = (orte_node_t*)item;
            /* loop through the number of boards in this node */
            for (k=0; k < node->boards && app_procs < np; k++) {
                /* loop through the number of sockets/board */
                for (n=0; n < node->sockets_per_board && app_procs < np; n++) {
                    /* put the specified number of procs on each socket */
                    for (j=0; j < orte_rmaps_base.npersocket && app_procs < np; j++) {
                        if (ORTE_SUCCESS != (rc = orte_rmaps_base_claim_slot(jdata, node,
                                                                             jdata->map->cpus_per_rank, app->idx,
                                                                             &node_list, jdata->map->oversubscribe,
                                                                             false, NULL))) {
                            /** if the code is ORTE_ERR_NODE_FULLY_USED, and we still have
                             * more procs to place, then that is an error
                             */
                            if (ORTE_ERR_NODE_FULLY_USED != rc ||
                                j < orte_rmaps_base.npersocket-1) {
                                ORTE_ERROR_LOG(rc);
                                OBJ_RELEASE(node);
                                goto error;
                            }
                            /* tolerated: do not let this code leak out as the
                             * function result if it was the last claim made
                             */
                            rc = ORTE_SUCCESS;
                        }
                        /* track the number of procs */
                        app_procs++;
                    }
                }
            }
            OBJ_RELEASE(node);
        }
        job_procs += app_procs;
    }
    jdata->num_procs = job_procs;

error:
    /* release whatever is still on the list (non-empty only on error) */
    while (NULL != (item = opal_list_remove_first(&node_list))) {
        OBJ_RELEASE(item);
    }
    OBJ_DESTRUCT(&node_list);
    return rc;
}
|
||||||
|
|
||||||
|
|
||||||
|
/*
|
||||||
|
* Create a load balanced mapping for the job by assigning a constant #procs/node, with
|
||||||
|
* leftovers being spread one/node starting from the first node.
|
||||||
|
*/
|
||||||
|
static int loadbalance(orte_job_t *jdata)
|
||||||
|
{
|
||||||
|
orte_app_context_t *app;
|
||||||
|
int i, j;
|
||||||
|
opal_list_t node_list;
|
||||||
|
orte_std_cntr_t num_nodes, num_slots;
|
||||||
|
int rc=ORTE_SUCCESS, total_procs;
|
||||||
|
int ppn = 0;
|
||||||
|
opal_list_item_t *item, *start;
|
||||||
|
orte_node_t *node;
|
||||||
|
|
||||||
|
/* setup */
|
||||||
|
OBJ_CONSTRUCT(&node_list, opal_list_t);
|
||||||
|
|
||||||
|
/* compute total #procs we are going to add and the total number of nodes available */
|
||||||
|
for(i=0; i < jdata->apps->size; i++) {
|
||||||
|
if (NULL == (app = (orte_app_context_t*)opal_pointer_array_get_item(jdata->apps, i))) {
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
/* get the nodes and #slots available for this app_context */
|
||||||
|
if(ORTE_SUCCESS != (rc = orte_rmaps_base_get_target_nodes(&node_list, &num_slots, app,
|
||||||
|
jdata->map->policy))) {
|
||||||
|
ORTE_ERROR_LOG(rc);
|
||||||
|
goto error;
|
||||||
|
}
|
||||||
|
if (0 == app->num_procs) {
|
||||||
|
/* set the num_procs to the #slots */
|
||||||
|
app->num_procs = num_slots;
|
||||||
|
}
|
||||||
|
num_nodes = opal_list_get_size(&node_list);
|
||||||
|
/* compute the base ppn */
|
||||||
|
ppn = app->num_procs / num_nodes;
|
||||||
|
/* if a bookmark exists from some prior mapping, set us to start there */
|
||||||
|
start = orte_rmaps_base_get_starting_point(&node_list, jdata);
|
||||||
|
/* loop through the list of nodes until we either assign all the procs
|
||||||
|
* or return to the starting point
|
||||||
|
*/
|
||||||
|
total_procs = 0;
|
||||||
|
item = start;
|
||||||
|
do {
|
||||||
|
node = (orte_node_t*)item;
|
||||||
|
/* put the specified number of procs on each node */
|
||||||
|
for (j=0; j < ppn; j++) {
|
||||||
|
if (ORTE_SUCCESS != (rc = orte_rmaps_base_claim_slot(jdata, node,
|
||||||
|
jdata->map->cpus_per_rank, app->idx,
|
||||||
|
&node_list, jdata->map->oversubscribe,
|
||||||
|
false, NULL))) {
|
||||||
|
/** if the code is ORTE_ERR_NODE_FULLY_USED, and we still have
|
||||||
|
* more procs to place, then that is an error
|
||||||
|
*/
|
||||||
|
if (ORTE_ERR_NODE_FULLY_USED != rc ||
|
||||||
|
j < ppn-1) {
|
||||||
|
ORTE_ERROR_LOG(rc);
|
||||||
|
goto error;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
total_procs++;
|
||||||
|
}
|
||||||
|
/* move to next node */
|
||||||
|
if (opal_list_get_end(&node_list) == opal_list_get_next(item)) {
|
||||||
|
item = opal_list_get_first(&node_list);
|
||||||
|
}
|
||||||
|
else {
|
||||||
|
item = opal_list_get_next(item);
|
||||||
|
}
|
||||||
|
} while (item != start);
|
||||||
|
|
||||||
|
/* save the bookmark */
|
||||||
|
jdata->bookmark = node;
|
||||||
|
|
||||||
|
/* if we haven't assigned all the procs, then loop through the list
|
||||||
|
* again, assigning 1 per node until all are assigned
|
||||||
|
*/
|
||||||
|
item = start;
|
||||||
|
while (total_procs < app->num_procs) {
|
||||||
|
node = (orte_node_t*)item;
|
||||||
|
if (ORTE_SUCCESS != (rc = orte_rmaps_base_claim_slot(jdata, node,
|
||||||
|
jdata->map->cpus_per_rank, app->idx,
|
||||||
|
&node_list, jdata->map->oversubscribe,
|
||||||
|
false, NULL))) {
|
||||||
|
/* if the code is not ORTE_ERR_NODE_FULLY_USED, then that is an error */
|
||||||
|
if (ORTE_ERR_NODE_FULLY_USED != rc) {
|
||||||
|
ORTE_ERROR_LOG(rc);
|
||||||
|
goto error;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
total_procs++;
|
||||||
|
/* move to next node */
|
||||||
|
if (opal_list_get_end(&node_list) == opal_list_get_next(item)) {
|
||||||
|
item = opal_list_get_first(&node_list);
|
||||||
|
}
|
||||||
|
else {
|
||||||
|
item = opal_list_get_next(item);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
/* save the bookmark */
|
||||||
|
jdata->bookmark = node;
|
||||||
|
|
||||||
|
/* cleanup */
|
||||||
|
while (NULL != (item = opal_list_remove_first(&node_list))) {
|
||||||
|
OBJ_RELEASE(item);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
/* record the number of procs */
|
||||||
|
jdata->num_procs = total_procs;
|
||||||
|
|
||||||
|
error:
|
||||||
|
while(NULL != (item = opal_list_remove_first(&node_list))) {
|
||||||
|
OBJ_RELEASE(item);
|
||||||
|
}
|
||||||
|
OBJ_DESTRUCT(&node_list);
|
||||||
|
|
||||||
|
return rc;
|
||||||
|
}
|
||||||
|
|
37
orte/mca/rmaps/load_balance/rmaps_lb.h
Обычный файл
37
orte/mca/rmaps/load_balance/rmaps_lb.h
Обычный файл
@ -0,0 +1,37 @@
|
|||||||
|
/*
|
||||||
|
* Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
|
||||||
|
* University Research and Technology
|
||||||
|
* Corporation. All rights reserved.
|
||||||
|
* Copyright (c) 2004-2006 The University of Tennessee and The University
|
||||||
|
* of Tennessee Research Foundation. All rights
|
||||||
|
* reserved.
|
||||||
|
* Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
|
||||||
|
* University of Stuttgart. All rights reserved.
|
||||||
|
* Copyright (c) 2004-2005 The Regents of the University of California.
|
||||||
|
* All rights reserved.
|
||||||
|
* $COPYRIGHT$
|
||||||
|
*
|
||||||
|
* Additional copyrights may follow
|
||||||
|
*
|
||||||
|
* $HEADER$
|
||||||
|
*/
|
||||||
|
/**
|
||||||
|
* @file
|
||||||
|
*
|
||||||
|
* Resource Mapping
|
||||||
|
*/
|
||||||
|
#ifndef ORTE_RMAPS_LB_H
|
||||||
|
#define ORTE_RMAPS_LB_H
|
||||||
|
|
||||||
|
#include "orte_config.h"
|
||||||
|
#include "orte/mca/rmaps/rmaps.h"
|
||||||
|
|
||||||
|
BEGIN_C_DECLS
|
||||||
|
|
||||||
|
ORTE_MODULE_DECLSPEC extern orte_rmaps_base_component_t mca_rmaps_load_balance_component;
|
||||||
|
extern orte_rmaps_base_module_t orte_rmaps_load_balance_module;
|
||||||
|
|
||||||
|
|
||||||
|
END_C_DECLS
|
||||||
|
|
||||||
|
#endif
|
96
orte/mca/rmaps/load_balance/rmaps_lb_component.c
Обычный файл
96
orte/mca/rmaps/load_balance/rmaps_lb_component.c
Обычный файл
@ -0,0 +1,96 @@
|
|||||||
|
/*
|
||||||
|
* Copyright (c) 2004-2008 The Trustees of Indiana University and Indiana
|
||||||
|
* University Research and Technology
|
||||||
|
* Corporation. All rights reserved.
|
||||||
|
* Copyright (c) 2004-2005 The University of Tennessee and The University
|
||||||
|
* of Tennessee Research Foundation. All rights
|
||||||
|
* reserved.
|
||||||
|
* Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
|
||||||
|
* University of Stuttgart. All rights reserved.
|
||||||
|
* Copyright (c) 2004-2005 The Regents of the University of California.
|
||||||
|
* All rights reserved.
|
||||||
|
* $COPYRIGHT$
|
||||||
|
*
|
||||||
|
* Additional copyrights may follow
|
||||||
|
*
|
||||||
|
* $HEADER$
|
||||||
|
*/
|
||||||
|
|
||||||
|
#include "orte_config.h"
|
||||||
|
#include "orte/constants.h"
|
||||||
|
|
||||||
|
#include "opal/mca/base/base.h"
|
||||||
|
#include "opal/mca/base/mca_base_param.h"
|
||||||
|
|
||||||
|
#include "orte/mca/rmaps/base/base.h"
|
||||||
|
#include "rmaps_lb.h"
|
||||||
|
|
||||||
|
/*
|
||||||
|
* Local functions
|
||||||
|
*/
|
||||||
|
|
||||||
|
static int orte_rmaps_lb_open(void);
|
||||||
|
static int orte_rmaps_lb_close(void);
|
||||||
|
static int orte_rmaps_lb_query(mca_base_module_t **module, int *priority);
|
||||||
|
|
||||||
|
|
||||||
|
orte_rmaps_base_component_t mca_rmaps_load_balance_component = {
|
||||||
|
{
|
||||||
|
ORTE_RMAPS_BASE_VERSION_2_0_0,
|
||||||
|
|
||||||
|
"load_balance", /* MCA component name */
|
||||||
|
ORTE_MAJOR_VERSION, /* MCA component major version */
|
||||||
|
ORTE_MINOR_VERSION, /* MCA component minor version */
|
||||||
|
ORTE_RELEASE_VERSION, /* MCA component release version */
|
||||||
|
orte_rmaps_lb_open, /* component open */
|
||||||
|
orte_rmaps_lb_close, /* component close */
|
||||||
|
orte_rmaps_lb_query /* component query */
|
||||||
|
},
|
||||||
|
{
|
||||||
|
/* The component is checkpoint ready */
|
||||||
|
MCA_BASE_METADATA_PARAM_CHECKPOINT
|
||||||
|
}
|
||||||
|
};
|
||||||
|
|
||||||
|
|
||||||
|
/**
|
||||||
|
* component open/close/init function
|
||||||
|
*/
|
||||||
|
static int orte_rmaps_lb_open(void)
|
||||||
|
{
|
||||||
|
return ORTE_SUCCESS;
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
/**
 * Component query hook: decide whether this mapper should be selected.
 *
 * If load balancing or any of the npernode / nperboard / npersocket
 * settings in orte_rmaps_base was requested, the load_balance module is
 * offered with priority 1000 and ORTE_SUCCESS is returned.  Otherwise
 * *priority is set to 0, *module to NULL, and ORTE_ERROR is returned.
 */
static int orte_rmaps_lb_query(mca_base_module_t **module, int *priority)
{
    /* the RMAPS framework is -only- opened on HNP's,
     * so no need to check for that here
     */

    /* nothing that concerns us was requested - ignore us */
    if (!orte_rmaps_base.loadbalance &&
        orte_rmaps_base.npernode <= 0 &&
        orte_rmaps_base.nperboard <= 0 &&
        orte_rmaps_base.npersocket <= 0) {
        *priority = 0;
        *module = NULL;
        return ORTE_ERROR;
    }

    /* load balancing, or some nperxxx, was requested: we must be selected */
    *priority = 1000;
    *module = (mca_base_module_t *)&orte_rmaps_load_balance_module;
    return ORTE_SUCCESS;
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Close all subsystems.
|
||||||
|
*/
|
||||||
|
|
||||||
|
static int orte_rmaps_lb_close(void)
|
||||||
|
{
|
||||||
|
return ORTE_SUCCESS;
|
||||||
|
}
|
||||||
|
|
||||||
|
|
@ -72,6 +72,7 @@ static int map_app_by_node(orte_app_context_t* app,
|
|||||||
opal_list_item_t *next;
|
opal_list_item_t *next;
|
||||||
orte_node_t *node;
|
orte_node_t *node;
|
||||||
orte_std_cntr_t num_alloc = 0;
|
orte_std_cntr_t num_alloc = 0;
|
||||||
|
orte_proc_t *proc;
|
||||||
|
|
||||||
/* This loop continues until all procs have been mapped or we run
|
/* This loop continues until all procs have been mapped or we run
|
||||||
out of resources. We determine that we have "run out of
|
out of resources. We determine that we have "run out of
|
||||||
@ -118,8 +119,8 @@ static int map_app_by_node(orte_app_context_t* app,
|
|||||||
/* Allocate a slot on this node */
|
/* Allocate a slot on this node */
|
||||||
node = (orte_node_t*) cur_node_item;
|
node = (orte_node_t*) cur_node_item;
|
||||||
/* pass the base slot list in case it was provided */
|
/* pass the base slot list in case it was provided */
|
||||||
if (ORTE_SUCCESS != (rc = orte_rmaps_base_claim_slot(jdata, node, vpid_start+num_alloc, orte_rmaps_base.slot_list, app->idx,
|
if (ORTE_SUCCESS != (rc = orte_rmaps_base_claim_slot(jdata, node, 1, app->idx,
|
||||||
nodes, jdata->map->oversubscribe, true))) {
|
nodes, jdata->map->oversubscribe, true, &proc))) {
|
||||||
/** if the code is ORTE_ERR_NODE_FULLY_USED, then we know this
|
/** if the code is ORTE_ERR_NODE_FULLY_USED, then we know this
|
||||||
* really isn't an error - we just need to break from the loop
|
* really isn't an error - we just need to break from the loop
|
||||||
* since the node is fully used up. For now, just don't report
|
* since the node is fully used up. For now, just don't report
|
||||||
@ -130,6 +131,9 @@ static int map_app_by_node(orte_app_context_t* app,
|
|||||||
return rc;
|
return rc;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
if (NULL != orte_rmaps_base.slot_list) {
|
||||||
|
proc->slot_list = strdup(orte_rmaps_base.slot_list);
|
||||||
|
}
|
||||||
++num_alloc;
|
++num_alloc;
|
||||||
cur_node_item = next;
|
cur_node_item = next;
|
||||||
}
|
}
|
||||||
@ -150,6 +154,7 @@ static int map_app_by_slot(orte_app_context_t* app,
|
|||||||
orte_std_cntr_t i, num_slots_to_take, num_alloc = 0;
|
orte_std_cntr_t i, num_slots_to_take, num_alloc = 0;
|
||||||
orte_node_t *node;
|
orte_node_t *node;
|
||||||
opal_list_item_t *next;
|
opal_list_item_t *next;
|
||||||
|
orte_proc_t *proc;
|
||||||
|
|
||||||
/* This loop continues until all procs have been mapped or we run
|
/* This loop continues until all procs have been mapped or we run
|
||||||
out of resources. We determine that we have "run out of
|
out of resources. We determine that we have "run out of
|
||||||
@ -211,7 +216,7 @@ static int map_app_by_slot(orte_app_context_t* app,
|
|||||||
/* check if we are in npernode mode - if so, then set the num_slots_to_take
|
/* check if we are in npernode mode - if so, then set the num_slots_to_take
|
||||||
* to the num_per_node
|
* to the num_per_node
|
||||||
*/
|
*/
|
||||||
if (jdata->map->pernode) {
|
if (0 < jdata->map->npernode) {
|
||||||
num_slots_to_take = jdata->map->npernode;
|
num_slots_to_take = jdata->map->npernode;
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -223,8 +228,8 @@ static int map_app_by_slot(orte_app_context_t* app,
|
|||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
/* pass the base slot list in case it was provided */
|
/* pass the base slot list in case it was provided */
|
||||||
if (ORTE_SUCCESS != (rc = orte_rmaps_base_claim_slot(jdata, node, vpid_start+num_alloc, orte_rmaps_base.slot_list, app->idx,
|
if (ORTE_SUCCESS != (rc = orte_rmaps_base_claim_slot(jdata, node, 1, app->idx,
|
||||||
nodes, jdata->map->oversubscribe, true))) {
|
nodes, jdata->map->oversubscribe, true, &proc))) {
|
||||||
/** if the code is ORTE_ERR_NODE_FULLY_USED, then we know this
|
/** if the code is ORTE_ERR_NODE_FULLY_USED, then we know this
|
||||||
* really isn't an error - we just need to break from the loop
|
* really isn't an error - we just need to break from the loop
|
||||||
* since the node is fully used up. For now, just don't report
|
* since the node is fully used up. For now, just don't report
|
||||||
@ -235,6 +240,9 @@ static int map_app_by_slot(orte_app_context_t* app,
|
|||||||
return rc;
|
return rc;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
if (NULL != orte_rmaps_base.slot_list) {
|
||||||
|
proc->slot_list = strdup(orte_rmaps_base.slot_list);
|
||||||
|
}
|
||||||
/* Update the rank */
|
/* Update the rank */
|
||||||
++num_alloc;
|
++num_alloc;
|
||||||
/* track #slots taken */
|
/* track #slots taken */
|
||||||
@ -279,6 +287,7 @@ static int orte_rmaps_rf_map(orte_job_t *jdata)
|
|||||||
orte_rmaps_rank_file_map_t *rfmap;
|
orte_rmaps_rank_file_map_t *rfmap;
|
||||||
orte_std_cntr_t slots_per_node, relative_index, tmp_cnt;
|
orte_std_cntr_t slots_per_node, relative_index, tmp_cnt;
|
||||||
int rc;
|
int rc;
|
||||||
|
orte_proc_t *proc;
|
||||||
|
|
||||||
/* convenience def */
|
/* convenience def */
|
||||||
map = jdata->map;
|
map = jdata->map;
|
||||||
@ -303,7 +312,7 @@ static int orte_rmaps_rf_map(orte_job_t *jdata)
|
|||||||
}
|
}
|
||||||
|
|
||||||
/* likewise, we only support pernode options for a single app_context */
|
/* likewise, we only support pernode options for a single app_context */
|
||||||
if (map->pernode && 1 < jdata->num_apps) {
|
if (0 < map->npernode && 1 < jdata->num_apps) {
|
||||||
orte_show_help("help-rmaps_rank_file.txt", "orte-rmaps-rf:multi-apps-and-zero-np",
|
orte_show_help("help-rmaps_rank_file.txt", "orte-rmaps-rf:multi-apps-and-zero-np",
|
||||||
true, jdata->num_apps, NULL);
|
true, jdata->num_apps, NULL);
|
||||||
rc = ORTE_ERR_SILENT;
|
rc = ORTE_ERR_SILENT;
|
||||||
@ -349,7 +358,7 @@ static int orte_rmaps_rf_map(orte_job_t *jdata)
|
|||||||
num_nodes = (orte_std_cntr_t)opal_list_get_size(&node_list);
|
num_nodes = (orte_std_cntr_t)opal_list_get_size(&node_list);
|
||||||
|
|
||||||
/* we already checked for sanity, so these are okay to just do here */
|
/* we already checked for sanity, so these are okay to just do here */
|
||||||
if (map->pernode && map->npernode == 1) {
|
if (map->npernode == 1) {
|
||||||
/* there are three use-cases that we need to deal with:
|
/* there are three use-cases that we need to deal with:
|
||||||
* (a) if -np was not provided, then we just use the number of nodes
|
* (a) if -np was not provided, then we just use the number of nodes
|
||||||
* (b) if -np was provided AND #procs > #nodes, then error out
|
* (b) if -np was provided AND #procs > #nodes, then error out
|
||||||
@ -365,7 +374,7 @@ static int orte_rmaps_rf_map(orte_job_t *jdata)
|
|||||||
rc = ORTE_ERR_SILENT;
|
rc = ORTE_ERR_SILENT;
|
||||||
goto error;
|
goto error;
|
||||||
}
|
}
|
||||||
} else if (map->pernode && map->npernode > 1) {
|
} else if (map->npernode > 1) {
|
||||||
/* first, let's check to see if there are enough slots/node to
|
/* first, let's check to see if there are enough slots/node to
|
||||||
* meet the request - error out if not
|
* meet the request - error out if not
|
||||||
*/
|
*/
|
||||||
@ -447,8 +456,9 @@ static int orte_rmaps_rf_map(orte_job_t *jdata)
|
|||||||
orte_show_help("help-rmaps_rank_file.txt","no-slot-list", true, rank, rfmap->node_name);
|
orte_show_help("help-rmaps_rank_file.txt","no-slot-list", true, rank, rfmap->node_name);
|
||||||
return ORTE_ERR_SILENT;
|
return ORTE_ERR_SILENT;
|
||||||
}
|
}
|
||||||
if (ORTE_SUCCESS != (rc = orte_rmaps_base_claim_slot(jdata, node, rank, rfmap->slot_list,
|
proc = NULL;
|
||||||
app->idx, &node_list, jdata->map->oversubscribe, true))) {
|
if (ORTE_SUCCESS != (rc = orte_rmaps_base_claim_slot(jdata, node, 1, app->idx,
|
||||||
|
&node_list, jdata->map->oversubscribe, true, &proc))) {
|
||||||
if (ORTE_ERR_NODE_FULLY_USED != rc) {
|
if (ORTE_ERR_NODE_FULLY_USED != rc) {
|
||||||
/* if this is a true error and not the node just being
|
/* if this is a true error and not the node just being
|
||||||
* full, then report the error and abort
|
* full, then report the error and abort
|
||||||
@ -457,6 +467,7 @@ static int orte_rmaps_rf_map(orte_job_t *jdata)
|
|||||||
return rc;
|
return rc;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
proc->slot_list = strdup(rfmap->slot_list);
|
||||||
jdata->num_procs++;
|
jdata->num_procs++;
|
||||||
}
|
}
|
||||||
/* update the starting point */
|
/* update the starting point */
|
||||||
@ -517,7 +528,7 @@ static int orte_rmaps_rf_map(orte_job_t *jdata)
|
|||||||
/* if no bookmark, then just start at the beginning of the list */
|
/* if no bookmark, then just start at the beginning of the list */
|
||||||
cur_node_item = opal_list_get_first(&node_list);
|
cur_node_item = opal_list_get_first(&node_list);
|
||||||
}
|
}
|
||||||
if (map->policy & ORTE_RMAPS_BYNODE) {
|
if (map->policy & ORTE_MAPPING_BYNODE) {
|
||||||
rc = map_app_by_node(app, jdata, vpid_start, &node_list);
|
rc = map_app_by_node(app, jdata, vpid_start, &node_list);
|
||||||
} else {
|
} else {
|
||||||
rc = map_app_by_slot(app, jdata, vpid_start, &node_list);
|
rc = map_app_by_slot(app, jdata, vpid_start, &node_list);
|
||||||
@ -542,8 +553,14 @@ static int orte_rmaps_rf_map(orte_job_t *jdata)
|
|||||||
/* update the job's number of procs */
|
/* update the job's number of procs */
|
||||||
jdata->num_procs = total_procs;
|
jdata->num_procs = total_procs;
|
||||||
|
|
||||||
|
/* compute vpids and add proc objects to the job */
|
||||||
|
if (ORTE_SUCCESS != (rc = orte_rmaps_base_compute_vpids(jdata))) {
|
||||||
|
ORTE_ERROR_LOG(rc);
|
||||||
|
return rc;
|
||||||
|
}
|
||||||
|
|
||||||
/* compute and save convenience values */
|
/* compute and save convenience values */
|
||||||
if (ORTE_SUCCESS != (rc = orte_rmaps_base_compute_usage(jdata))) {
|
if (ORTE_SUCCESS != (rc = orte_rmaps_base_compute_local_ranks(jdata))) {
|
||||||
ORTE_ERROR_LOG(rc);
|
ORTE_ERROR_LOG(rc);
|
||||||
return rc;
|
return rc;
|
||||||
}
|
}
|
||||||
|
@ -37,7 +37,6 @@
|
|||||||
* Local variable
|
* Local variable
|
||||||
*/
|
*/
|
||||||
static opal_list_item_t *cur_node_item = NULL;
|
static opal_list_item_t *cur_node_item = NULL;
|
||||||
static orte_vpid_t vpid_start = 0;
|
|
||||||
|
|
||||||
static char *orte_getline(FILE *fp);
|
static char *orte_getline(FILE *fp);
|
||||||
|
|
||||||
@ -51,24 +50,22 @@ static int rr_map_default(orte_job_t *jdata, orte_app_context_t *app,
|
|||||||
cur_node_item = orte_rmaps_base_get_starting_point(node_list, jdata);
|
cur_node_item = orte_rmaps_base_get_starting_point(node_list, jdata);
|
||||||
|
|
||||||
/* now perform the mapping */
|
/* now perform the mapping */
|
||||||
if (ORTE_RMAPS_BYNODE & jdata->map->policy) {
|
if (ORTE_MAPPING_BYNODE & jdata->map->policy) {
|
||||||
if (ORTE_SUCCESS != (rc = orte_rmaps_base_map_bynode(jdata, app, node_list,
|
if (ORTE_SUCCESS != (rc = orte_rmaps_base_map_bynode(jdata, app, node_list,
|
||||||
num_procs, vpid_start,
|
num_procs, cur_node_item))) {
|
||||||
cur_node_item))) {
|
|
||||||
ORTE_ERROR_LOG(rc);
|
ORTE_ERROR_LOG(rc);
|
||||||
return rc;
|
return rc;
|
||||||
}
|
}
|
||||||
} else {
|
} else {
|
||||||
if (ORTE_SUCCESS != (rc = orte_rmaps_base_map_byslot(jdata, app, node_list,
|
if (ORTE_SUCCESS != (rc = orte_rmaps_base_map_byslot(jdata, app, node_list,
|
||||||
num_procs, vpid_start,
|
num_procs, cur_node_item))) {
|
||||||
cur_node_item, 0))) {
|
|
||||||
ORTE_ERROR_LOG(rc);
|
ORTE_ERROR_LOG(rc);
|
||||||
return rc;
|
return rc;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
/* update the starting vpid */
|
/* update number of procs */
|
||||||
vpid_start += num_procs;
|
jdata->num_procs += num_procs;
|
||||||
|
|
||||||
return ORTE_SUCCESS;
|
return ORTE_SUCCESS;
|
||||||
}
|
}
|
||||||
@ -123,7 +120,7 @@ static int orte_rmaps_resilient_map(orte_job_t *jdata)
|
|||||||
float avgload, minload;
|
float avgload, minload;
|
||||||
orte_node_t *node, *nd=NULL, *oldnode;
|
orte_node_t *node, *nd=NULL, *oldnode;
|
||||||
orte_rmaps_res_ftgrp_t *ftgrp, *target;
|
orte_rmaps_res_ftgrp_t *ftgrp, *target;
|
||||||
orte_vpid_t totprocs, lowprocs;
|
orte_vpid_t totprocs, lowprocs, num_assigned;
|
||||||
FILE *fp;
|
FILE *fp;
|
||||||
char *ftinput;
|
char *ftinput;
|
||||||
int grp;
|
int grp;
|
||||||
@ -275,8 +272,8 @@ static int orte_rmaps_resilient_map(orte_job_t *jdata)
|
|||||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
|
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
|
||||||
nd->name));
|
nd->name));
|
||||||
/* put proc on the found node */
|
/* put proc on the found node */
|
||||||
if (ORTE_SUCCESS != (rc = orte_rmaps_base_claim_slot(jdata, nd, proc->name.vpid, NULL, proc->app_idx,
|
if (ORTE_SUCCESS != (rc = orte_rmaps_base_claim_slot(jdata, nd, jdata->map->cpus_per_rank, proc->app_idx,
|
||||||
NULL, jdata->map->oversubscribe, false))) {
|
NULL, jdata->map->oversubscribe, false, &proc))) {
|
||||||
/** if the code is ORTE_ERR_NODE_FULLY_USED, then we know this
|
/** if the code is ORTE_ERR_NODE_FULLY_USED, then we know this
|
||||||
* really isn't an error
|
* really isn't an error
|
||||||
*/
|
*/
|
||||||
@ -290,7 +287,7 @@ static int orte_rmaps_resilient_map(orte_job_t *jdata)
|
|||||||
/* update the node and local ranks so static ports can
|
/* update the node and local ranks so static ports can
|
||||||
* be properly selected if active
|
* be properly selected if active
|
||||||
*/
|
*/
|
||||||
orte_rmaps_base_update_usage(jdata, oldnode, nd, proc);
|
orte_rmaps_base_update_local_ranks(jdata, oldnode, nd, proc);
|
||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
/* if we did find a target, re-map the proc to the lightest loaded
|
/* if we did find a target, re-map the proc to the lightest loaded
|
||||||
@ -313,8 +310,8 @@ static int orte_rmaps_resilient_map(orte_job_t *jdata)
|
|||||||
ORTE_NAME_PRINT(&proc->name), target->ftgrp, nd->name));
|
ORTE_NAME_PRINT(&proc->name), target->ftgrp, nd->name));
|
||||||
OBJ_RELEASE(proc->node); /* required to maintain bookkeeping */
|
OBJ_RELEASE(proc->node); /* required to maintain bookkeeping */
|
||||||
/* put proc on the found node */
|
/* put proc on the found node */
|
||||||
if (ORTE_SUCCESS != (rc = orte_rmaps_base_claim_slot(jdata, nd, proc->name.vpid, NULL, proc->app_idx,
|
if (ORTE_SUCCESS != (rc = orte_rmaps_base_claim_slot(jdata, nd, jdata->map->cpus_per_rank, proc->app_idx,
|
||||||
NULL, jdata->map->oversubscribe, false))) {
|
NULL, jdata->map->oversubscribe, false, &proc))) {
|
||||||
/** if the code is ORTE_ERR_NODE_FULLY_USED, then we know this
|
/** if the code is ORTE_ERR_NODE_FULLY_USED, then we know this
|
||||||
* really isn't an error
|
* really isn't an error
|
||||||
*/
|
*/
|
||||||
@ -328,7 +325,7 @@ static int orte_rmaps_resilient_map(orte_job_t *jdata)
|
|||||||
/* update the node and local ranks so static ports can
|
/* update the node and local ranks so static ports can
|
||||||
* be properly selected if active
|
* be properly selected if active
|
||||||
*/
|
*/
|
||||||
orte_rmaps_base_update_usage(jdata, oldnode, nd, proc);
|
orte_rmaps_base_update_local_ranks(jdata, oldnode, nd, proc);
|
||||||
}
|
}
|
||||||
/* define the daemons that we will use for this job */
|
/* define the daemons that we will use for this job */
|
||||||
if (ORTE_SUCCESS != (rc = orte_rmaps_base_define_daemons(jdata->map))) {
|
if (ORTE_SUCCESS != (rc = orte_rmaps_base_define_daemons(jdata->map))) {
|
||||||
@ -354,7 +351,6 @@ static int orte_rmaps_resilient_map(orte_job_t *jdata)
|
|||||||
ORTE_JOBID_PRINT(jdata->jobid)));
|
ORTE_JOBID_PRINT(jdata->jobid)));
|
||||||
|
|
||||||
/* start at the beginning... */
|
/* start at the beginning... */
|
||||||
vpid_start = 0;
|
|
||||||
jdata->num_procs = 0;
|
jdata->num_procs = 0;
|
||||||
map = jdata->map;
|
map = jdata->map;
|
||||||
|
|
||||||
@ -363,6 +359,7 @@ static int orte_rmaps_resilient_map(orte_job_t *jdata)
|
|||||||
if (NULL == (app = (orte_app_context_t*)opal_pointer_array_get_item(jdata->apps, i))) {
|
if (NULL == (app = (orte_app_context_t*)opal_pointer_array_get_item(jdata->apps, i))) {
|
||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
|
num_assigned = 0;
|
||||||
/* for each app_context, we have to get the list of nodes that it can
|
/* for each app_context, we have to get the list of nodes that it can
|
||||||
* use since that can now be modified with a hostfile and/or -host
|
* use since that can now be modified with a hostfile and/or -host
|
||||||
* option
|
* option
|
||||||
@ -434,7 +431,7 @@ static int orte_rmaps_resilient_map(orte_job_t *jdata)
|
|||||||
OPAL_OUTPUT_VERBOSE((2, orte_rmaps_base.rmaps_output,
|
OPAL_OUTPUT_VERBOSE((2, orte_rmaps_base.rmaps_output,
|
||||||
"%s rmaps:resilient: no available fault group - mapping rr",
|
"%s rmaps:resilient: no available fault group - mapping rr",
|
||||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
|
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
|
||||||
if (ORTE_SUCCESS != (rc = rr_map_default(jdata, app, &node_list, app->num_procs-vpid_start))) {
|
if (ORTE_SUCCESS != (rc = rr_map_default(jdata, app, &node_list, app->num_procs-num_assigned))) {
|
||||||
goto error;
|
goto error;
|
||||||
}
|
}
|
||||||
goto cleanup;
|
goto cleanup;
|
||||||
@ -455,8 +452,8 @@ static int orte_rmaps_resilient_map(orte_job_t *jdata)
|
|||||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
|
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
|
||||||
target->ftgrp, nd->name));
|
target->ftgrp, nd->name));
|
||||||
/* put proc on that node */
|
/* put proc on that node */
|
||||||
if (ORTE_SUCCESS != (rc = orte_rmaps_base_claim_slot(jdata, nd, vpid_start, NULL, app->idx,
|
if (ORTE_SUCCESS != (rc = orte_rmaps_base_claim_slot(jdata, nd, jdata->map->cpus_per_rank, app->idx,
|
||||||
&node_list, jdata->map->oversubscribe, false))) {
|
&node_list, jdata->map->oversubscribe, false, NULL))) {
|
||||||
/** if the code is ORTE_ERR_NODE_FULLY_USED, then we know this
|
/** if the code is ORTE_ERR_NODE_FULLY_USED, then we know this
|
||||||
* really isn't an error
|
* really isn't an error
|
||||||
*/
|
*/
|
||||||
@ -466,7 +463,7 @@ static int orte_rmaps_resilient_map(orte_job_t *jdata)
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
/* track number of procs mapped */
|
/* track number of procs mapped */
|
||||||
vpid_start++;
|
num_assigned++;
|
||||||
|
|
||||||
/* flag this fault group as used */
|
/* flag this fault group as used */
|
||||||
target->used = true;
|
target->used = true;
|
||||||
@ -484,6 +481,8 @@ static int orte_rmaps_resilient_map(orte_job_t *jdata)
|
|||||||
}
|
}
|
||||||
|
|
||||||
cleanup:
|
cleanup:
|
||||||
|
/* track number of procs */
|
||||||
|
jdata->num_procs += app->num_procs;
|
||||||
/* cleanup the node list - it can differ from one app_context
|
/* cleanup the node list - it can differ from one app_context
|
||||||
* to another, so we have to get it every time
|
* to another, so we have to get it every time
|
||||||
*/
|
*/
|
||||||
@ -493,11 +492,14 @@ static int orte_rmaps_resilient_map(orte_job_t *jdata)
|
|||||||
OBJ_DESTRUCT(&node_list);
|
OBJ_DESTRUCT(&node_list);
|
||||||
}
|
}
|
||||||
|
|
||||||
/* update the number of procs in the job */
|
/* compute vpids and add proc objects to the job */
|
||||||
jdata->num_procs = vpid_start;
|
if (ORTE_SUCCESS != (rc = orte_rmaps_base_compute_vpids(jdata))) {
|
||||||
|
ORTE_ERROR_LOG(rc);
|
||||||
|
return rc;
|
||||||
|
}
|
||||||
|
|
||||||
/* compute and save convenience values */
|
/* compute and save local ranks */
|
||||||
if (ORTE_SUCCESS != (rc = orte_rmaps_base_compute_usage(jdata))) {
|
if (ORTE_SUCCESS != (rc = orte_rmaps_base_compute_local_ranks(jdata))) {
|
||||||
ORTE_ERROR_LOG(rc);
|
ORTE_ERROR_LOG(rc);
|
||||||
return rc;
|
return rc;
|
||||||
}
|
}
|
||||||
|
@ -25,32 +25,27 @@
|
|||||||
|
|
||||||
#include "opal/class/opal_pointer_array.h"
|
#include "opal/class/opal_pointer_array.h"
|
||||||
|
|
||||||
|
#include "orte/runtime/orte_globals.h"
|
||||||
|
|
||||||
/*
|
/*
|
||||||
* General MAP types - instanced in runtime/orte_globals_class_instances.h
|
* General MAP types - instanced in runtime/orte_globals_class_instances.h
|
||||||
*/
|
*/
|
||||||
|
|
||||||
BEGIN_C_DECLS
|
BEGIN_C_DECLS
|
||||||
|
|
||||||
/*
|
|
||||||
* Define flags indicating the policy used to perform the map
|
|
||||||
*/
|
|
||||||
#define ORTE_RMAPS_NOPOL 0x00
|
|
||||||
#define ORTE_RMAPS_BYNODE 0x01
|
|
||||||
#define ORTE_RMAPS_BYSLOT 0x02
|
|
||||||
#define ORTE_RMAPS_BYUSER 0x04
|
|
||||||
#define ORTE_RMAPS_NO_USE_LOCAL 0x08
|
|
||||||
|
|
||||||
|
|
||||||
/*
|
/*
|
||||||
* Structure that represents the mapping of a job to an
|
* Structure that represents the mapping of a job to an
|
||||||
* allocated set of resources.
|
* allocated set of resources.
|
||||||
*/
|
*/
|
||||||
struct orte_job_map_t {
|
struct orte_job_map_t {
|
||||||
opal_object_t super;
|
opal_object_t super;
|
||||||
/* save the mapping configuration */
|
/* user-specified mapping params */
|
||||||
uint8_t policy;
|
orte_mapping_policy_t policy;
|
||||||
bool pernode;
|
int npernode;
|
||||||
orte_std_cntr_t npernode;
|
int nperboard;
|
||||||
|
int npersocket;
|
||||||
|
int16_t cpus_per_rank;
|
||||||
|
int16_t stride;
|
||||||
bool oversubscribe;
|
bool oversubscribe;
|
||||||
bool display_map;
|
bool display_map;
|
||||||
bool cpu_lists;
|
bool cpu_lists;
|
||||||
|
@ -48,56 +48,13 @@ static int orte_rmaps_rr_map(orte_job_t *jdata)
|
|||||||
int i;
|
int i;
|
||||||
opal_list_t node_list;
|
opal_list_t node_list;
|
||||||
opal_list_item_t *item;
|
opal_list_item_t *item;
|
||||||
orte_vpid_t vpid_start;
|
|
||||||
orte_std_cntr_t num_nodes, num_slots;
|
orte_std_cntr_t num_nodes, num_slots;
|
||||||
int rc;
|
int rc;
|
||||||
orte_std_cntr_t slots_per_node;
|
|
||||||
int ppn = 0;
|
|
||||||
opal_list_item_t *cur_node_item;
|
opal_list_item_t *cur_node_item;
|
||||||
|
|
||||||
/* start at the beginning... */
|
/* start at the beginning... */
|
||||||
vpid_start = 0;
|
|
||||||
jdata->num_procs = 0;
|
jdata->num_procs = 0;
|
||||||
|
|
||||||
/* if loadbalancing is requested, then we need to compute
|
|
||||||
* the #procs/node - note that this cannot be done
|
|
||||||
* if we are doing pernode or if #procs was not given
|
|
||||||
*/
|
|
||||||
if (orte_rmaps_base.loadbalance && !jdata->map->pernode) {
|
|
||||||
float res;
|
|
||||||
/* compute total #procs we are going to add */
|
|
||||||
for(i=0; i < jdata->apps->size; i++) {
|
|
||||||
if (NULL == (app = (orte_app_context_t*)opal_pointer_array_get_item(jdata->apps, i))) {
|
|
||||||
continue;
|
|
||||||
}
|
|
||||||
if (0 == app->num_procs) {
|
|
||||||
/* can't do it - tell user and quit */
|
|
||||||
orte_show_help("help-orte-rmaps-rr.txt",
|
|
||||||
"orte-rmaps-rr:loadbalance-and-zero-np",
|
|
||||||
true);
|
|
||||||
rc = ORTE_ERR_SILENT;
|
|
||||||
goto error;
|
|
||||||
}
|
|
||||||
ppn += app->num_procs;
|
|
||||||
}
|
|
||||||
/* get the total avail nodes and the number
|
|
||||||
* of procs already using them
|
|
||||||
*/
|
|
||||||
num_nodes=0;
|
|
||||||
for (i=0; i < orte_node_pool->size; i++) {
|
|
||||||
if (NULL == opal_pointer_array_get_item(orte_node_pool, i)) {
|
|
||||||
continue;
|
|
||||||
}
|
|
||||||
num_nodes++;
|
|
||||||
}
|
|
||||||
/* compute the balance */
|
|
||||||
res = ((float)ppn / num_nodes);
|
|
||||||
ppn = ppn / num_nodes;
|
|
||||||
if (0 < (res-ppn)) {
|
|
||||||
ppn++;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
/* cycle through the app_contexts, mapping them sequentially */
|
/* cycle through the app_contexts, mapping them sequentially */
|
||||||
for(i=0; i < jdata->apps->size; i++) {
|
for(i=0; i < jdata->apps->size; i++) {
|
||||||
if (NULL == (app = (orte_app_context_t*)opal_pointer_array_get_item(jdata->apps, i))) {
|
if (NULL == (app = (orte_app_context_t*)opal_pointer_array_get_item(jdata->apps, i))) {
|
||||||
@ -130,83 +87,22 @@ static int orte_rmaps_rr_map(orte_job_t *jdata)
|
|||||||
/* if a bookmark exists from some prior mapping, set us to start there */
|
/* if a bookmark exists from some prior mapping, set us to start there */
|
||||||
cur_node_item = orte_rmaps_base_get_starting_point(&node_list, jdata);
|
cur_node_item = orte_rmaps_base_get_starting_point(&node_list, jdata);
|
||||||
|
|
||||||
if (jdata->map->pernode && jdata->map->npernode == 1) {
|
if (0 == app->num_procs) {
|
||||||
/* there are three use-cases that we need to deal with:
|
/* set the num_procs to equal the number of slots on these mapped nodes */
|
||||||
* (a) if -np was not provided, then we just use the number of nodes
|
|
||||||
* (b) if -np was provided AND #procs > #nodes, then error out
|
|
||||||
* (c) if -np was provided AND #procs <= #nodes, then launch
|
|
||||||
* the specified #procs one/node. In this case, we just
|
|
||||||
* leave app->num_procs alone
|
|
||||||
*/
|
|
||||||
if (0 == app->num_procs) {
|
|
||||||
app->num_procs = num_nodes;
|
|
||||||
} else if (app->num_procs > num_nodes) {
|
|
||||||
orte_show_help("help-orte-rmaps-rr.txt", "orte-rmaps-rr:per-node-and-too-many-procs",
|
|
||||||
true, app->num_procs, num_nodes, NULL);
|
|
||||||
rc = ORTE_ERR_SILENT;
|
|
||||||
goto error;
|
|
||||||
}
|
|
||||||
} else if (jdata->map->pernode && jdata->map->npernode > 1) {
|
|
||||||
/* first, let's check to see if there are enough slots/node to
|
|
||||||
* meet the request - error out if not
|
|
||||||
*/
|
|
||||||
slots_per_node = num_slots / num_nodes;
|
|
||||||
if (jdata->map->npernode > slots_per_node) {
|
|
||||||
orte_show_help("help-orte-rmaps-rr.txt", "orte-rmaps-rr:n-per-node-and-not-enough-slots",
|
|
||||||
true, jdata->map->npernode, slots_per_node, NULL);
|
|
||||||
rc = ORTE_ERR_SILENT;
|
|
||||||
goto error;
|
|
||||||
}
|
|
||||||
/* there are three use-cases that we need to deal with:
|
|
||||||
* (a) if -np was not provided, then we just use the n/node * #nodes
|
|
||||||
* (b) if -np was provided AND #procs > (n/node * #nodes), then error out
|
|
||||||
* (c) if -np was provided AND #procs <= (n/node * #nodes), then launch
|
|
||||||
* the specified #procs n/node. In this case, we just
|
|
||||||
* leave app->num_procs alone
|
|
||||||
*/
|
|
||||||
if (0 == app->num_procs) {
|
|
||||||
/* set the num_procs to equal the specified num/node * the number of nodes */
|
|
||||||
app->num_procs = jdata->map->npernode * num_nodes;
|
|
||||||
} else if (app->num_procs > (jdata->map->npernode * num_nodes)) {
|
|
||||||
orte_show_help("help-orte-rmaps-rr.txt", "orte-rmaps-rr:n-per-node-and-too-many-procs",
|
|
||||||
true, app->num_procs, jdata->map->npernode, num_nodes, num_slots, NULL);
|
|
||||||
rc = ORTE_ERR_SILENT;
|
|
||||||
goto error;
|
|
||||||
}
|
|
||||||
} else if (0 == app->num_procs) {
|
|
||||||
if (jdata->map->policy & ORTE_RMAPS_BYUSER) {
|
|
||||||
/* we can't handle this - it should have been set when we got
|
|
||||||
* the map info. If it wasn't, then we can only error out
|
|
||||||
*/
|
|
||||||
orte_show_help("help-orte-rmaps-rr.txt", "orte-rmaps-rr:no-np-and-user-map",
|
|
||||||
true, app->num_procs, jdata->map->npernode, num_nodes, num_slots, NULL);
|
|
||||||
rc = ORTE_ERR_SILENT;
|
|
||||||
goto error;
|
|
||||||
}
|
|
||||||
/** set the num_procs to equal the number of slots on these mapped nodes */
|
|
||||||
app->num_procs = num_slots;
|
app->num_procs = num_slots;
|
||||||
}
|
}
|
||||||
|
|
||||||
/** track the total number of processes we mapped */
|
/* track the total number of processes we mapped */
|
||||||
jdata->num_procs += app->num_procs;
|
jdata->num_procs += app->num_procs;
|
||||||
|
|
||||||
/* Make assignments */
|
/* Make assignments */
|
||||||
if (jdata->map->policy & ORTE_RMAPS_BYUSER) {
|
if (jdata->map->policy & ORTE_MAPPING_BYNODE) {
|
||||||
rc = ORTE_ERR_NOT_IMPLEMENTED;
|
|
||||||
goto error;
|
|
||||||
} else if (jdata->map->policy & ORTE_RMAPS_BYNODE) {
|
|
||||||
rc = orte_rmaps_base_map_bynode(jdata, app, &node_list,
|
rc = orte_rmaps_base_map_bynode(jdata, app, &node_list,
|
||||||
app->num_procs, vpid_start,
|
app->num_procs, cur_node_item);
|
||||||
cur_node_item);
|
|
||||||
} else {
|
} else {
|
||||||
rc = orte_rmaps_base_map_byslot(jdata, app, &node_list,
|
rc = orte_rmaps_base_map_byslot(jdata, app, &node_list,
|
||||||
app->num_procs, vpid_start,
|
app->num_procs, cur_node_item);
|
||||||
cur_node_item, ppn);
|
|
||||||
}
|
}
|
||||||
|
|
||||||
/* update the starting vpid for the next app_context */
|
|
||||||
vpid_start += app->num_procs;
|
|
||||||
|
|
||||||
if (ORTE_SUCCESS != rc) {
|
if (ORTE_SUCCESS != rc) {
|
||||||
ORTE_ERROR_LOG(rc);
|
ORTE_ERROR_LOG(rc);
|
||||||
goto error;
|
goto error;
|
||||||
@ -221,8 +117,14 @@ static int orte_rmaps_rr_map(orte_job_t *jdata)
|
|||||||
OBJ_DESTRUCT(&node_list);
|
OBJ_DESTRUCT(&node_list);
|
||||||
}
|
}
|
||||||
|
|
||||||
/* compute and save convenience values */
|
/* compute vpids and add proc objects to the job */
|
||||||
if (ORTE_SUCCESS != (rc = orte_rmaps_base_compute_usage(jdata))) {
|
if (ORTE_SUCCESS != (rc = orte_rmaps_base_compute_vpids(jdata))) {
|
||||||
|
ORTE_ERROR_LOG(rc);
|
||||||
|
return rc;
|
||||||
|
}
|
||||||
|
|
||||||
|
/* compute and save local ranks */
|
||||||
|
if (ORTE_SUCCESS != (rc = orte_rmaps_base_compute_local_ranks(jdata))) {
|
||||||
ORTE_ERROR_LOG(rc);
|
ORTE_ERROR_LOG(rc);
|
||||||
return rc;
|
return rc;
|
||||||
}
|
}
|
||||||
|
@ -59,13 +59,14 @@ static int orte_rmaps_seq_map(orte_job_t *jdata)
|
|||||||
orte_job_map_t *map;
|
orte_job_map_t *map;
|
||||||
orte_app_context_t *app;
|
orte_app_context_t *app;
|
||||||
orte_std_cntr_t i, j;
|
orte_std_cntr_t i, j;
|
||||||
opal_list_item_t *item, *next, *cur_node_item;
|
opal_list_item_t *item;
|
||||||
orte_node_t *node, *nd;
|
orte_node_t *node, *nd, *save;
|
||||||
orte_vpid_t vpid;
|
orte_vpid_t vpid;
|
||||||
orte_std_cntr_t num_nodes;
|
orte_std_cntr_t num_nodes;
|
||||||
int rc;
|
int rc;
|
||||||
opal_list_t *default_node_list=NULL;
|
opal_list_t *default_node_list=NULL;
|
||||||
opal_list_t *node_list=NULL;
|
opal_list_t *node_list=NULL;
|
||||||
|
orte_proc_t *proc;
|
||||||
|
|
||||||
OPAL_OUTPUT_VERBOSE((1, orte_rmaps_base.rmaps_output,
|
OPAL_OUTPUT_VERBOSE((1, orte_rmaps_base.rmaps_output,
|
||||||
"%s rmaps:seq mapping job %s",
|
"%s rmaps:seq mapping job %s",
|
||||||
@ -87,6 +88,9 @@ static int orte_rmaps_seq_map(orte_job_t *jdata)
|
|||||||
/* start at the beginning... */
|
/* start at the beginning... */
|
||||||
vpid = 0;
|
vpid = 0;
|
||||||
jdata->num_procs = 0;
|
jdata->num_procs = 0;
|
||||||
|
if (NULL != default_node_list) {
|
||||||
|
save = (orte_node_t*)opal_list_get_first(default_node_list);
|
||||||
|
}
|
||||||
|
|
||||||
/* cycle through the app_contexts, mapping them sequentially */
|
/* cycle through the app_contexts, mapping them sequentially */
|
||||||
for(i=0; i < jdata->num_apps; i++) {
|
for(i=0; i < jdata->num_apps; i++) {
|
||||||
@ -103,12 +107,14 @@ static int orte_rmaps_seq_map(orte_job_t *jdata)
|
|||||||
ORTE_ERROR_LOG(rc);
|
ORTE_ERROR_LOG(rc);
|
||||||
goto error;
|
goto error;
|
||||||
}
|
}
|
||||||
|
nd = (orte_node_t*)opal_list_get_first(node_list);
|
||||||
} else {
|
} else {
|
||||||
node_list = default_node_list;
|
node_list = default_node_list;
|
||||||
|
nd = save;
|
||||||
}
|
}
|
||||||
|
|
||||||
/* check for nolocal and remove the head node, if required */
|
/* check for nolocal and remove the head node, if required */
|
||||||
if (map->policy & ORTE_RMAPS_NO_USE_LOCAL) {
|
if (map->policy & ORTE_MAPPING_NO_USE_LOCAL) {
|
||||||
for (item = opal_list_get_first(node_list);
|
for (item = opal_list_get_first(node_list);
|
||||||
item != opal_list_get_end(node_list);
|
item != opal_list_get_end(node_list);
|
||||||
item = opal_list_get_next(item) ) {
|
item = opal_list_get_next(item) ) {
|
||||||
@ -132,43 +138,17 @@ static int orte_rmaps_seq_map(orte_job_t *jdata)
|
|||||||
return ORTE_ERR_SILENT;
|
return ORTE_ERR_SILENT;
|
||||||
}
|
}
|
||||||
|
|
||||||
/* if a bookmark exists from some prior mapping, set us to start there */
|
|
||||||
cur_node_item = orte_rmaps_base_get_starting_point(node_list, jdata);
|
|
||||||
|
|
||||||
/* if num_procs wasn't specified, set it now */
|
/* if num_procs wasn't specified, set it now */
|
||||||
if (0 == app->num_procs) {
|
if (0 == app->num_procs) {
|
||||||
app->num_procs = num_nodes;
|
app->num_procs = num_nodes;
|
||||||
}
|
}
|
||||||
|
|
||||||
for (i=0; i < app->num_procs; i++) {
|
for (i=0; i < app->num_procs; i++) {
|
||||||
/* see if any nodes remain unused and available. We need to do this check
|
|
||||||
* each time since we may remove nodes from the list (as they become fully
|
|
||||||
* used) as we cycle through the loop
|
|
||||||
*/
|
|
||||||
if(0 >= opal_list_get_size(node_list) ) {
|
|
||||||
/* Everything is at max usage! :( */
|
|
||||||
orte_show_help("help-orte-rmaps-seq.txt", "orte-rmaps-seq:alloc-error",
|
|
||||||
true, app->num_procs, app->app);
|
|
||||||
return ORTE_ERR_SILENT;
|
|
||||||
}
|
|
||||||
|
|
||||||
/* Save the next node we can use before claiming slots, since
|
|
||||||
* we may need to prune the nodes list removing overused nodes.
|
|
||||||
* Wrap around to beginning if we are at the end of the list
|
|
||||||
*/
|
|
||||||
if (opal_list_get_end(node_list) == opal_list_get_next(cur_node_item)) {
|
|
||||||
next = opal_list_get_first(node_list);
|
|
||||||
}
|
|
||||||
else {
|
|
||||||
next = opal_list_get_next(cur_node_item);
|
|
||||||
}
|
|
||||||
|
|
||||||
/* find this node on the global array - this is necessary so
|
/* find this node on the global array - this is necessary so
|
||||||
* that our mapping gets saved on that array as the objects
|
* that our mapping gets saved on that array as the objects
|
||||||
* returned by the hostfile function are -not- on the array
|
* returned by the hostfile function are -not- on the array
|
||||||
*/
|
*/
|
||||||
node = NULL;
|
node = NULL;
|
||||||
nd = (orte_node_t*)cur_node_item;
|
|
||||||
for (j=0; j < orte_node_pool->size; j++) {
|
for (j=0; j < orte_node_pool->size; j++) {
|
||||||
if (NULL == (node = (orte_node_t*)opal_pointer_array_get_item(orte_node_pool, j))) {
|
if (NULL == (node = (orte_node_t*)opal_pointer_array_get_item(orte_node_pool, j))) {
|
||||||
continue;
|
continue;
|
||||||
@ -186,42 +166,46 @@ static int orte_rmaps_seq_map(orte_job_t *jdata)
|
|||||||
goto error;
|
goto error;
|
||||||
}
|
}
|
||||||
|
|
||||||
/* assign next vpid to this node - do NOT allow claim_slot to remove
|
/* assign proc to this node - do NOT allow claim_slot to remove
|
||||||
* an oversubscribed node from the list!
|
* an oversubscribed node from the list!
|
||||||
*/
|
*/
|
||||||
if (ORTE_SUCCESS != (rc = orte_rmaps_base_claim_slot(jdata, node,
|
if (ORTE_SUCCESS != (rc = orte_rmaps_base_claim_slot(jdata, node,
|
||||||
vpid, NULL, app->idx,
|
jdata->map->cpus_per_rank, app->idx,
|
||||||
node_list,
|
node_list,
|
||||||
jdata->map->oversubscribe,
|
jdata->map->oversubscribe,
|
||||||
false))) {
|
false, &proc))) {
|
||||||
if (ORTE_ERR_NODE_FULLY_USED != rc) {
|
if (ORTE_ERR_NODE_FULLY_USED != rc) {
|
||||||
ORTE_ERROR_LOG(rc);
|
ORTE_ERROR_LOG(rc);
|
||||||
goto error;
|
goto error;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
/* increment the vpid */
|
/* assign the vpid */
|
||||||
vpid++;
|
proc->name.vpid = vpid++;
|
||||||
|
/* add to the jdata proc array */
|
||||||
|
if (ORTE_SUCCESS != (rc = opal_pointer_array_set_item(jdata->procs, proc->name.vpid, proc))) {
|
||||||
|
ORTE_ERROR_LOG(rc);
|
||||||
|
goto error;
|
||||||
|
}
|
||||||
/* move to next node */
|
/* move to next node */
|
||||||
cur_node_item = next;
|
nd = (orte_node_t*)opal_list_get_next((opal_list_item_t*)nd);
|
||||||
}
|
}
|
||||||
|
|
||||||
/** track the total number of processes we mapped */
|
/** track the total number of processes we mapped */
|
||||||
jdata->num_procs += app->num_procs;
|
jdata->num_procs += app->num_procs;
|
||||||
|
|
||||||
/* update the bookmark */
|
|
||||||
jdata->bookmark = (orte_node_t*)cur_node_item;
|
|
||||||
|
|
||||||
/* cleanup the node list if it came from this app_context */
|
/* cleanup the node list if it came from this app_context */
|
||||||
if (node_list != default_node_list) {
|
if (node_list != default_node_list) {
|
||||||
while(NULL != (item = opal_list_remove_first(node_list))) {
|
while (NULL != (item = opal_list_remove_first(node_list))) {
|
||||||
OBJ_RELEASE(item);
|
OBJ_RELEASE(item);
|
||||||
}
|
}
|
||||||
OBJ_RELEASE(node_list);
|
OBJ_RELEASE(node_list);
|
||||||
|
} else {
|
||||||
|
save = nd;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
/* compute and save convenience values */
|
/* compute and save local ranks */
|
||||||
if (ORTE_SUCCESS != (rc = orte_rmaps_base_compute_usage(jdata))) {
|
if (ORTE_SUCCESS != (rc = orte_rmaps_base_compute_local_ranks(jdata))) {
|
||||||
ORTE_ERROR_LOG(rc);
|
ORTE_ERROR_LOG(rc);
|
||||||
return rc;
|
return rc;
|
||||||
}
|
}
|
||||||
|
@ -110,8 +110,8 @@ static int map_app_by_node(
|
|||||||
|
|
||||||
/* Allocate a slot on this node */
|
/* Allocate a slot on this node */
|
||||||
node = (orte_node_t*) cur_node_item;
|
node = (orte_node_t*) cur_node_item;
|
||||||
if (ORTE_SUCCESS != (rc = orte_rmaps_base_claim_slot(jdata, node, vpid_start + num_alloc, NULL, app->idx,
|
if (ORTE_SUCCESS != (rc = orte_rmaps_base_claim_slot(jdata, node, 1, app->idx,
|
||||||
nodes, jdata->map->oversubscribe, true))) {
|
nodes, jdata->map->oversubscribe, true, NULL))) {
|
||||||
/** if the code is ORTE_ERR_NODE_FULLY_USED, then we know this
|
/** if the code is ORTE_ERR_NODE_FULLY_USED, then we know this
|
||||||
* really isn't an error - we just need to break from the loop
|
* really isn't an error - we just need to break from the loop
|
||||||
* since the node is fully used up. For now, just don't report
|
* since the node is fully used up. For now, just don't report
|
||||||
@ -212,13 +212,13 @@ static int map_app_by_slot(
|
|||||||
/* check if we are in npernode mode - if so, then set the num_slots_to_take
|
/* check if we are in npernode mode - if so, then set the num_slots_to_take
|
||||||
* to the num_per_node
|
* to the num_per_node
|
||||||
*/
|
*/
|
||||||
if (jdata->map->pernode) {
|
if (0 < jdata->map->npernode) {
|
||||||
num_slots_to_take = jdata->map->npernode;
|
num_slots_to_take = jdata->map->npernode;
|
||||||
}
|
}
|
||||||
|
|
||||||
for( i = 0; i < num_slots_to_take; ++i) {
|
for( i = 0; i < num_slots_to_take; ++i) {
|
||||||
if (ORTE_SUCCESS != (rc = orte_rmaps_base_claim_slot(jdata, node, vpid_start + num_alloc, NULL, app->idx,
|
if (ORTE_SUCCESS != (rc = orte_rmaps_base_claim_slot(jdata, node, 1, app->idx,
|
||||||
nodes, jdata->map->oversubscribe, true))) {
|
nodes, jdata->map->oversubscribe, true, NULL))) {
|
||||||
/** if the code is ORTE_ERR_NODE_FULLY_USED, then we know this
|
/** if the code is ORTE_ERR_NODE_FULLY_USED, then we know this
|
||||||
* really isn't an error - we just need to break from the loop
|
* really isn't an error - we just need to break from the loop
|
||||||
* since the node is fully used up. For now, just don't report
|
* since the node is fully used up. For now, just don't report
|
||||||
@ -426,7 +426,7 @@ static int topo_map(orte_job_t *jdata)
|
|||||||
}
|
}
|
||||||
|
|
||||||
proceed:
|
proceed:
|
||||||
if (map->pernode && map->npernode == 1) {
|
if (map->npernode == 1) {
|
||||||
/* there are three use-cases that we need to deal with:
|
/* there are three use-cases that we need to deal with:
|
||||||
* (a) if -np was not provided, then we just use the number of nodes
|
* (a) if -np was not provided, then we just use the number of nodes
|
||||||
* (b) if -np was provided AND #procs > #nodes, then error out
|
* (b) if -np was provided AND #procs > #nodes, then error out
|
||||||
@ -442,7 +442,7 @@ static int topo_map(orte_job_t *jdata)
|
|||||||
rc = ORTE_ERR_SILENT;
|
rc = ORTE_ERR_SILENT;
|
||||||
goto error;
|
goto error;
|
||||||
}
|
}
|
||||||
} else if (map->pernode && map->npernode > 1) {
|
} else if (map->npernode > 1) {
|
||||||
/* first, let's check to see if there are enough slots/node to
|
/* first, let's check to see if there are enough slots/node to
|
||||||
* meet the request - error out if not
|
* meet the request - error out if not
|
||||||
*/
|
*/
|
||||||
@ -473,11 +473,11 @@ static int topo_map(orte_job_t *jdata)
|
|||||||
/** set the num_procs to equal the number of slots on these mapped nodes - if
|
/** set the num_procs to equal the number of slots on these mapped nodes - if
|
||||||
user has specified "-bynode", then set it to the number of nodes
|
user has specified "-bynode", then set it to the number of nodes
|
||||||
*/
|
*/
|
||||||
if (map->policy & ORTE_RMAPS_BYNODE) {
|
if (map->policy & ORTE_MAPPING_BYNODE) {
|
||||||
app->num_procs = num_nodes;
|
app->num_procs = num_nodes;
|
||||||
} else if (map->policy & ORTE_RMAPS_BYSLOT) {
|
} else if (map->policy & ORTE_MAPPING_BYSLOT) {
|
||||||
app->num_procs = num_slots;
|
app->num_procs = num_slots;
|
||||||
} else if (map->policy & ORTE_RMAPS_BYUSER) {
|
} else {
|
||||||
/* we can't handle this - it should have been set when we got
|
/* we can't handle this - it should have been set when we got
|
||||||
* the map info. If it wasn't, then we can only error out
|
* the map info. If it wasn't, then we can only error out
|
||||||
*/
|
*/
|
||||||
@ -492,10 +492,7 @@ static int topo_map(orte_job_t *jdata)
|
|||||||
jdata->num_procs += app->num_procs;
|
jdata->num_procs += app->num_procs;
|
||||||
|
|
||||||
/* Make assignments */
|
/* Make assignments */
|
||||||
if (map->policy == ORTE_RMAPS_BYUSER) {
|
if (map->policy == ORTE_MAPPING_BYNODE) {
|
||||||
rc = ORTE_ERR_NOT_IMPLEMENTED;
|
|
||||||
goto error;
|
|
||||||
} else if (map->policy == ORTE_RMAPS_BYNODE) {
|
|
||||||
rc = map_app_by_node(app, jdata, vpid_start, &node_list);
|
rc = map_app_by_node(app, jdata, vpid_start, &node_list);
|
||||||
} else {
|
} else {
|
||||||
rc = map_app_by_slot(app, jdata, vpid_start, &node_list);
|
rc = map_app_by_slot(app, jdata, vpid_start, &node_list);
|
||||||
@ -522,7 +519,7 @@ static int topo_map(orte_job_t *jdata)
|
|||||||
}
|
}
|
||||||
|
|
||||||
/* compute and save convenience values */
|
/* compute and save convenience values */
|
||||||
if (ORTE_SUCCESS != (rc = orte_rmaps_base_compute_usage(jdata))) {
|
if (ORTE_SUCCESS != (rc = orte_rmaps_base_compute_local_ranks(jdata))) {
|
||||||
ORTE_ERROR_LOG(rc);
|
ORTE_ERROR_LOG(rc);
|
||||||
return rc;
|
return rc;
|
||||||
}
|
}
|
||||||
|
@ -280,7 +280,6 @@ int orte_dt_copy_map(orte_job_map_t **dest, orte_job_map_t *src, opal_data_type_
|
|||||||
|
|
||||||
/* copy data into it */
|
/* copy data into it */
|
||||||
(*dest)->policy = src->policy;
|
(*dest)->policy = src->policy;
|
||||||
(*dest)->pernode = src->pernode;
|
|
||||||
(*dest)->npernode = src->npernode;
|
(*dest)->npernode = src->npernode;
|
||||||
(*dest)->oversubscribe = src->oversubscribe;
|
(*dest)->oversubscribe = src->oversubscribe;
|
||||||
(*dest)->display_map = src->display_map;
|
(*dest)->display_map = src->display_map;
|
||||||
|
@ -407,6 +407,15 @@ int orte_dt_pack_node(opal_buffer_t *buffer, const void *src,
|
|||||||
return rc;
|
return rc;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/* do not pack the local board, socket, and core info */
|
||||||
|
|
||||||
|
/* pack the cpu set info */
|
||||||
|
if (ORTE_SUCCESS != (rc = opal_dss_pack_buffer(buffer,
|
||||||
|
(void*)(&(nodes[i]->cpu_set)), 1, OPAL_STRING))) {
|
||||||
|
ORTE_ERROR_LOG(rc);
|
||||||
|
return rc;
|
||||||
|
}
|
||||||
|
|
||||||
/* do not pack the username */
|
/* do not pack the username */
|
||||||
}
|
}
|
||||||
return ORTE_SUCCESS;
|
return ORTE_SUCCESS;
|
||||||
@ -814,13 +823,7 @@ int orte_dt_pack_map(opal_buffer_t *buffer, const void *src,
|
|||||||
|
|
||||||
for (i=0; i < num_vals; i++) {
|
for (i=0; i < num_vals; i++) {
|
||||||
/* pack the policy used to generate it */
|
/* pack the policy used to generate it */
|
||||||
if (ORTE_SUCCESS != (rc = opal_dss_pack_buffer(buffer, &(maps[i]->policy), 1, OPAL_UINT8))) {
|
if (ORTE_SUCCESS != (rc = opal_dss_pack_buffer(buffer, &(maps[i]->policy), 1, ORTE_MAPPING_POLICY))) {
|
||||||
ORTE_ERROR_LOG(rc);
|
|
||||||
return rc;
|
|
||||||
}
|
|
||||||
|
|
||||||
/* pack the pernode flag */
|
|
||||||
if (ORTE_SUCCESS != (rc = opal_dss_pack_buffer(buffer, &(maps[i]->pernode), 1, OPAL_BOOL))) {
|
|
||||||
ORTE_ERROR_LOG(rc);
|
ORTE_ERROR_LOG(rc);
|
||||||
return rc;
|
return rc;
|
||||||
}
|
}
|
||||||
|
@ -362,6 +362,11 @@ int orte_dt_print_node(char **output, char *prefix, orte_node_t *src, opal_data_
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
asprintf(&tmp2, "%s\n%s\tNum boards: %ld\tNum sockets/board: %ld\tNum cores/socket: %ld", tmp, pfx2,
|
||||||
|
(long)src->boards, (long)src->sockets_per_board, (long)src->cores_per_socket);
|
||||||
|
free(tmp);
|
||||||
|
tmp = tmp2;
|
||||||
|
|
||||||
if (NULL == src->daemon) {
|
if (NULL == src->daemon) {
|
||||||
asprintf(&tmp2, "%s\n%s\tDaemon: %s\tDaemon launched: %s", tmp, pfx2,
|
asprintf(&tmp2, "%s\n%s\tDaemon: %s\tDaemon launched: %s", tmp, pfx2,
|
||||||
"Not defined", src->daemon_launched ? "True" : "False");
|
"Not defined", src->daemon_launched ? "True" : "False");
|
||||||
@ -377,8 +382,9 @@ int orte_dt_print_node(char **output, char *prefix, orte_node_t *src, opal_data_
|
|||||||
free(tmp);
|
free(tmp);
|
||||||
tmp = tmp2;
|
tmp = tmp2;
|
||||||
|
|
||||||
asprintf(&tmp2, "%s\n%s\tNum slots allocated: %ld\tMax slots: %ld", tmp, pfx2,
|
asprintf(&tmp2, "%s\n%s\tNum slots allocated: %ld\tMax slots: %ld:\tCpu set: %s", tmp, pfx2,
|
||||||
(long)src->slots_alloc, (long)src->slots_max);
|
(long)src->slots_alloc, (long)src->slots_max,
|
||||||
|
(NULL == src->cpu_set) ? "NULL" : src->cpu_set);
|
||||||
free(tmp);
|
free(tmp);
|
||||||
tmp = tmp2;
|
tmp = tmp2;
|
||||||
|
|
||||||
@ -644,9 +650,8 @@ int orte_dt_print_map(char **output, char *prefix, orte_job_map_t *src, opal_dat
|
|||||||
asprintf(&pfx, "%s\t", pfx2);
|
asprintf(&pfx, "%s\t", pfx2);
|
||||||
|
|
||||||
if (orte_devel_level_output) {
|
if (orte_devel_level_output) {
|
||||||
asprintf(&tmp, "\n%sMap generated by mapping policy: %x\n%s\tPernode: %s\tNpernode: %ld\tOversubscribe allowed: %s\tCPU Lists: %s",
|
asprintf(&tmp, "\n%sMap generated by mapping policy: %04x\n%s\tNpernode: %ld\tOversubscribe allowed: %s\tCPU Lists: %s",
|
||||||
pfx2, src->policy, pfx2,
|
pfx2, src->policy, pfx2, (long)src->npernode,
|
||||||
(src->pernode) ? "TRUE" : "FALSE", (long)src->npernode,
|
|
||||||
(src->oversubscribe) ? "TRUE" : "FALSE",
|
(src->oversubscribe) ? "TRUE" : "FALSE",
|
||||||
(src->cpu_lists) ? "TRUE" : "FALSE");
|
(src->cpu_lists) ? "TRUE" : "FALSE");
|
||||||
|
|
||||||
|
@ -422,6 +422,16 @@ int orte_dt_unpack_node(opal_buffer_t *buffer, void *dest,
|
|||||||
return rc;
|
return rc;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/* do not unpack the board, socket, and core info */
|
||||||
|
|
||||||
|
/* unpack the cpu set */
|
||||||
|
n = 1;
|
||||||
|
if (ORTE_SUCCESS != (rc = opal_dss_unpack_buffer(buffer,
|
||||||
|
&(nodes[i]->cpu_set), &n, OPAL_STRING))) {
|
||||||
|
ORTE_ERROR_LOG(rc);
|
||||||
|
return rc;
|
||||||
|
}
|
||||||
|
|
||||||
/* do not unpack the username */
|
/* do not unpack the username */
|
||||||
}
|
}
|
||||||
return ORTE_SUCCESS;
|
return ORTE_SUCCESS;
|
||||||
@ -883,15 +893,7 @@ int orte_dt_unpack_map(opal_buffer_t *buffer, void *dest,
|
|||||||
/* unpack the policy */
|
/* unpack the policy */
|
||||||
n = 1;
|
n = 1;
|
||||||
if (ORTE_SUCCESS != (rc = opal_dss_unpack_buffer(buffer,
|
if (ORTE_SUCCESS != (rc = opal_dss_unpack_buffer(buffer,
|
||||||
&(maps[i]->policy), &n, OPAL_UINT8))) {
|
&(maps[i]->policy), &n, ORTE_MAPPING_POLICY))) {
|
||||||
ORTE_ERROR_LOG(rc);
|
|
||||||
return rc;
|
|
||||||
}
|
|
||||||
|
|
||||||
/* unpack the pernode flag */
|
|
||||||
n = 1;
|
|
||||||
if (ORTE_SUCCESS != (rc = opal_dss_unpack_buffer(buffer,
|
|
||||||
&(maps[i]->pernode), &n, OPAL_BOOL))) {
|
|
||||||
ORTE_ERROR_LOG(rc);
|
ORTE_ERROR_LOG(rc);
|
||||||
return rc;
|
return rc;
|
||||||
}
|
}
|
||||||
|
@ -27,6 +27,7 @@
|
|||||||
#endif
|
#endif
|
||||||
|
|
||||||
#include "opal/mca/base/mca_base_param.h"
|
#include "opal/mca/base/mca_base_param.h"
|
||||||
|
#include "opal/mca/paffinity/paffinity.h"
|
||||||
#include "opal/util/argv.h"
|
#include "opal/util/argv.h"
|
||||||
#include "opal/util/output.h"
|
#include "opal/util/output.h"
|
||||||
#include "opal/class/opal_pointer_array.h"
|
#include "opal/class/opal_pointer_array.h"
|
||||||
@ -132,6 +133,17 @@ bool orte_orted_exit_with_barrier = true;
|
|||||||
/* report launch progress */
|
/* report launch progress */
|
||||||
bool orte_report_launch_progress = false;
|
bool orte_report_launch_progress = false;
|
||||||
|
|
||||||
|
/* cluster hardware info */
|
||||||
|
uint8_t orte_default_num_boards;
|
||||||
|
uint8_t orte_default_num_sockets_per_board;
|
||||||
|
uint8_t orte_default_num_cores_per_socket;
|
||||||
|
|
||||||
|
/* allocation specification */
|
||||||
|
char *orte_default_cpu_set;
|
||||||
|
|
||||||
|
/* default rank assignment and binding policy */
|
||||||
|
orte_mapping_policy_t orte_default_mapping_policy = 0;
|
||||||
|
|
||||||
#endif /* !ORTE_DISABLE_FULL_RTE */
|
#endif /* !ORTE_DISABLE_FULL_RTE */
|
||||||
|
|
||||||
int orte_debug_output = -1;
|
int orte_debug_output = -1;
|
||||||
@ -670,6 +682,16 @@ static void orte_node_construct(orte_node_t* node)
|
|||||||
node->slots_inuse = 0;
|
node->slots_inuse = 0;
|
||||||
node->slots_alloc = 0;
|
node->slots_alloc = 0;
|
||||||
node->slots_max = 0;
|
node->slots_max = 0;
|
||||||
|
|
||||||
|
node->boards = orte_default_num_boards;
|
||||||
|
node->sockets_per_board = orte_default_num_sockets_per_board;
|
||||||
|
node->cores_per_socket = orte_default_num_cores_per_socket;
|
||||||
|
if (NULL != orte_default_cpu_set) {
|
||||||
|
node->cpu_set = strdup(orte_default_cpu_set);
|
||||||
|
} else {
|
||||||
|
node->cpu_set = NULL;
|
||||||
|
}
|
||||||
|
|
||||||
node->username = NULL;
|
node->username = NULL;
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -702,6 +724,10 @@ static void orte_node_destruct(orte_node_t* node)
|
|||||||
}
|
}
|
||||||
OBJ_RELEASE(node->procs);
|
OBJ_RELEASE(node->procs);
|
||||||
|
|
||||||
|
if (NULL != node->cpu_set) {
|
||||||
|
free(node->cpu_set);
|
||||||
|
node->cpu_set = NULL;
|
||||||
|
}
|
||||||
if (NULL != node->username) {
|
if (NULL != node->username) {
|
||||||
free(node->username);
|
free(node->username);
|
||||||
node->username = NULL;
|
node->username = NULL;
|
||||||
@ -871,9 +897,12 @@ OBJ_CLASS_INSTANCE(orte_jmap_t,
|
|||||||
|
|
||||||
static void orte_job_map_construct(orte_job_map_t* map)
|
static void orte_job_map_construct(orte_job_map_t* map)
|
||||||
{
|
{
|
||||||
map->policy = ORTE_RMAPS_BYSLOT; /* default to byslot mapping as per orterun options */
|
map->policy = 0;
|
||||||
map->pernode = false;
|
|
||||||
map->npernode = 0;
|
map->npernode = 0;
|
||||||
|
map->nperboard = 0;
|
||||||
|
map->npersocket = 0;
|
||||||
|
map->cpus_per_rank = 1;
|
||||||
|
map->stride = 1;
|
||||||
map->oversubscribe = true; /* default to allowing oversubscribe */
|
map->oversubscribe = true; /* default to allowing oversubscribe */
|
||||||
map->display_map = false;
|
map->display_map = false;
|
||||||
map->cpu_lists = false;
|
map->cpu_lists = false;
|
||||||
|
@ -38,7 +38,6 @@
|
|||||||
#include "opal/class/opal_value_array.h"
|
#include "opal/class/opal_value_array.h"
|
||||||
|
|
||||||
#include "orte/mca/plm/plm_types.h"
|
#include "orte/mca/plm/plm_types.h"
|
||||||
#include "orte/mca/rmaps/rmaps_types.h"
|
|
||||||
#include "orte/util/proc_info.h"
|
#include "orte/util/proc_info.h"
|
||||||
#include "orte/util/name_fns.h"
|
#include "orte/util/name_fns.h"
|
||||||
#include "orte/runtime/runtime.h"
|
#include "orte/runtime/runtime.h"
|
||||||
@ -141,6 +140,7 @@ typedef struct orte_job_t orte_job_t;
|
|||||||
* defining it - resolves potential circular definition
|
* defining it - resolves potential circular definition
|
||||||
*/
|
*/
|
||||||
struct orte_proc_t;
|
struct orte_proc_t;
|
||||||
|
struct orte_job_map_t;
|
||||||
/************/
|
/************/
|
||||||
|
|
||||||
/**
|
/**
|
||||||
@ -241,6 +241,14 @@ typedef struct {
|
|||||||
specified limit. For example, if we have two processors, we
|
specified limit. For example, if we have two processors, we
|
||||||
may want to allow up to four processes but no more. */
|
may want to allow up to four processes but no more. */
|
||||||
orte_std_cntr_t slots_max;
|
orte_std_cntr_t slots_max;
|
||||||
|
/* number of physical boards in the node - defaults to 1 */
|
||||||
|
uint8_t boards;
|
||||||
|
/* number of sockets on each board - defaults to 1 */
|
||||||
|
uint8_t sockets_per_board;
|
||||||
|
/* number of cores per socket - defaults to 1 */
|
||||||
|
uint8_t cores_per_socket;
|
||||||
|
/* cpus on this node that are assigned for our use */
|
||||||
|
char *cpu_set;
|
||||||
/** Username on this node, if specified */
|
/** Username on this node, if specified */
|
||||||
char *username;
|
char *username;
|
||||||
} orte_node_t;
|
} orte_node_t;
|
||||||
@ -258,6 +266,31 @@ typedef uint8_t orte_job_controls_t;
|
|||||||
#define ORTE_JOB_CONTROL_FORWARD_COMM 0x20
|
#define ORTE_JOB_CONTROL_FORWARD_COMM 0x20
|
||||||
#define ORTE_JOB_CONTROL_CONTINUOUS_OP 0x40
|
#define ORTE_JOB_CONTROL_CONTINUOUS_OP 0x40
|
||||||
|
|
||||||
|
typedef uint16_t orte_mapping_policy_t;
|
||||||
|
#define ORTE_MAPPING_POLICY OPAL_UINT16
|
||||||
|
/* put the rank assignment method in the upper 8 bits */
|
||||||
|
#define ORTE_MAPPING_NOPOL 0x0100
|
||||||
|
#define ORTE_MAPPING_BYNODE 0x0200
|
||||||
|
#define ORTE_MAPPING_BYSLOT 0x0400
|
||||||
|
#define ORTE_MAPPING_BYSOCKET 0x0800
|
||||||
|
#define ORTE_MAPPING_BYBOARD 0x1000
|
||||||
|
#define ORTE_MAPPING_NO_USE_LOCAL 0x2000
|
||||||
|
#define ORTE_MAPPING_NPERXXX 0x4000
|
||||||
|
/* nice macro for setting these */
|
||||||
|
#define ORTE_SET_MAPPING_POLICY(pol) \
|
||||||
|
orte_default_mapping_policy = (orte_default_mapping_policy & 0x00ff) | (pol);
|
||||||
|
#define ORTE_ADD_MAPPING_POLICY(pol) \
|
||||||
|
orte_default_mapping_policy |= (pol);
|
||||||
|
|
||||||
|
/* put the binding policy in the lower 8 bits, using the paffinity values */
|
||||||
|
#define ORTE_BIND_TO_NONE (uint16_t)OPAL_PAFFINITY_DO_NOT_BIND
|
||||||
|
#define ORTE_BIND_TO_CORE (uint16_t)OPAL_PAFFINITY_BIND_TO_CORE
|
||||||
|
#define ORTE_BIND_TO_SOCKET (uint16_t)OPAL_PAFFINITY_BIND_TO_SOCKET
|
||||||
|
#define ORTE_BIND_TO_BOARD (uint16_t)OPAL_PAFFINITY_BIND_TO_BOARD
|
||||||
|
/* nice macro for setting these */
|
||||||
|
#define ORTE_SET_BINDING_POLICY(pol) \
|
||||||
|
orte_default_mapping_policy = (orte_default_mapping_policy & 0xff00) | (pol);
|
||||||
|
|
||||||
/* error manager callback function */
|
/* error manager callback function */
|
||||||
typedef void (*orte_err_cb_fn_t)(orte_process_name_t *proc, orte_proc_state_t state, void *cbdata);
|
typedef void (*orte_err_cb_fn_t)(orte_process_name_t *proc, orte_proc_state_t state, void *cbdata);
|
||||||
|
|
||||||
@ -285,7 +318,7 @@ typedef struct {
|
|||||||
/* array of pointers to procs in this job */
|
/* array of pointers to procs in this job */
|
||||||
opal_pointer_array_t *procs;
|
opal_pointer_array_t *procs;
|
||||||
/* map of the job */
|
/* map of the job */
|
||||||
orte_job_map_t *map;
|
struct orte_job_map_t *map;
|
||||||
/* bookmark for where we are in mapping - this
|
/* bookmark for where we are in mapping - this
|
||||||
* indicates the node where we stopped
|
* indicates the node where we stopped
|
||||||
*/
|
*/
|
||||||
@ -531,6 +564,17 @@ ORTE_DECLSPEC extern bool orte_orted_exit_with_barrier;
|
|||||||
/* whether or not to report launch progress */
|
/* whether or not to report launch progress */
|
||||||
ORTE_DECLSPEC extern bool orte_report_launch_progress;
|
ORTE_DECLSPEC extern bool orte_report_launch_progress;
|
||||||
|
|
||||||
|
/* cluster hardware info */
|
||||||
|
ORTE_DECLSPEC extern uint8_t orte_default_num_boards;
|
||||||
|
ORTE_DECLSPEC extern uint8_t orte_default_num_sockets_per_board;
|
||||||
|
ORTE_DECLSPEC extern uint8_t orte_default_num_cores_per_socket;
|
||||||
|
|
||||||
|
/* allocation specification */
|
||||||
|
ORTE_DECLSPEC extern char *orte_default_cpu_set;
|
||||||
|
|
||||||
|
/* default rank assignment and binding policy */
|
||||||
|
ORTE_DECLSPEC extern orte_mapping_policy_t orte_default_mapping_policy;
|
||||||
|
|
||||||
#endif /* ORTE_DISABLE_FULL_SUPPORT */
|
#endif /* ORTE_DISABLE_FULL_SUPPORT */
|
||||||
|
|
||||||
END_C_DECLS
|
END_C_DECLS
|
||||||
|
@ -28,6 +28,7 @@
|
|||||||
#include <stdio.h>
|
#include <stdio.h>
|
||||||
|
|
||||||
#include "opal/mca/base/mca_base_param.h"
|
#include "opal/mca/base/mca_base_param.h"
|
||||||
|
#include "opal/mca/paffinity/base/base.h"
|
||||||
#include "opal/util/output.h"
|
#include "opal/util/output.h"
|
||||||
|
|
||||||
#include "orte/util/proc_info.h"
|
#include "orte/util/proc_info.h"
|
||||||
@ -38,6 +39,7 @@
|
|||||||
int orte_register_params(void)
|
int orte_register_params(void)
|
||||||
{
|
{
|
||||||
int value, tmp;
|
int value, tmp;
|
||||||
|
char *strval;
|
||||||
|
|
||||||
mca_base_param_reg_int_name("orte", "base_help_aggregate",
|
mca_base_param_reg_int_name("orte", "base_help_aggregate",
|
||||||
"If orte_base_help_aggregate is true, duplicate help messages will be aggregated rather than displayed individually. This can be helpful for parallel jobs that experience multiple identical failures; rather than print out the same help/failure message N times, display it once with a count of how many processes sent the same message.",
|
"If orte_base_help_aggregate is true, duplicate help messages will be aggregated rather than displayed individually. This can be helpful for parallel jobs that experience multiple identical failures; rather than print out the same help/failure message N times, display it once with a count of how many processes sent the same message.",
|
||||||
@ -297,6 +299,48 @@ int orte_register_params(void)
|
|||||||
orte_startup_timeout = 2000; /* default to 2 seconds */
|
orte_startup_timeout = 2000; /* default to 2 seconds */
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/* cluster hardware info */
|
||||||
|
mca_base_param_reg_int_name("orte", "num_boards",
|
||||||
|
"Number of processor boards/node (1-256) [default: 1]",
|
||||||
|
false, false, 1, &value);
|
||||||
|
orte_default_num_boards = (uint8_t)value;
|
||||||
|
if (OPAL_SUCCESS != opal_paffinity_base_get_socket_info(&value)) {
|
||||||
|
value = 1;
|
||||||
|
}
|
||||||
|
mca_base_param_reg_int_name("orte", "num_sockets",
|
||||||
|
"Number of sockets/board (1-256) [default: auto-sensed by mpirun or 1]",
|
||||||
|
false, false, value, &value);
|
||||||
|
orte_default_num_sockets_per_board = (uint8_t)value;
|
||||||
|
if (OPAL_SUCCESS != opal_paffinity_base_get_core_info(0, &value)) {
|
||||||
|
value = 1;
|
||||||
|
}
|
||||||
|
mca_base_param_reg_int_name("orte", "num_cores",
|
||||||
|
"Number of cores/socket (1-256) [default: auto-sensed by mpirun or 1]",
|
||||||
|
false, false, value, &value);
|
||||||
|
orte_default_num_cores_per_socket = (uint8_t)value;
|
||||||
|
|
||||||
|
/* cpu allocation specification */
|
||||||
|
mca_base_param_reg_string_name("orte", "cpu_set",
|
||||||
|
"Comma-separated list of ranges specifying logical cpus allocated to this job [default: none]",
|
||||||
|
false, false, NULL, &orte_default_cpu_set);
|
||||||
|
|
||||||
|
/* binding specification - this will be overridden by any cmd line directive, and
|
||||||
|
* ignored unless opal_paffinity_alone is set
|
||||||
|
*/
|
||||||
|
mca_base_param_reg_string_name("orte", "process_binding",
|
||||||
|
"Policy for binding processes [core | socket | board (default: none)]",
|
||||||
|
false, false, NULL, &strval);
|
||||||
|
if (NULL != strval) {
|
||||||
|
if (0 == strcmp(strval, "socket")) {
|
||||||
|
ORTE_SET_BINDING_POLICY(ORTE_BIND_TO_SOCKET);
|
||||||
|
} else if (0 == strcmp(strval, "board")) {
|
||||||
|
ORTE_SET_BINDING_POLICY(ORTE_BIND_TO_BOARD);
|
||||||
|
} else if (0 == strcmp(strval, "core")) {
|
||||||
|
ORTE_SET_BINDING_POLICY(ORTE_BIND_TO_CORE);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
#endif /* ORTE_DISABLE_FULL_SUPPORT */
|
#endif /* ORTE_DISABLE_FULL_SUPPORT */
|
||||||
|
|
||||||
return ORTE_SUCCESS;
|
return ORTE_SUCCESS;
|
||||||
|
@ -120,6 +120,7 @@
|
|||||||
#include "orte/mca/rml/rml_types.h"
|
#include "orte/mca/rml/rml_types.h"
|
||||||
#include "orte/mca/plm/plm.h"
|
#include "orte/mca/plm/plm.h"
|
||||||
#include "orte/mca/plm/base/plm_private.h"
|
#include "orte/mca/plm/base/plm_private.h"
|
||||||
|
#include "orte/mca/rmaps/rmaps_types.h"
|
||||||
#include "orte/runtime/orte_globals.h"
|
#include "orte/runtime/orte_globals.h"
|
||||||
#include "orte/runtime/orte_wait.h"
|
#include "orte/runtime/orte_wait.h"
|
||||||
#include "orte/util/show_help.h"
|
#include "orte/util/show_help.h"
|
||||||
@ -512,7 +513,6 @@ static void check_debugger(int fd, short event, void *arg)
|
|||||||
* one debugger daemon on each node
|
* one debugger daemon on each node
|
||||||
*/
|
*/
|
||||||
jdata->map = OBJ_NEW(orte_job_map_t);
|
jdata->map = OBJ_NEW(orte_job_map_t);
|
||||||
jdata->map->pernode = true;
|
|
||||||
jdata->map->npernode = 1;
|
jdata->map->npernode = 1;
|
||||||
/* add it to the global job pool */
|
/* add it to the global job pool */
|
||||||
ljob = ORTE_LOCAL_JOBID(jdata->jobid);
|
ljob = ORTE_LOCAL_JOBID(jdata->jobid);
|
||||||
|
@ -50,6 +50,7 @@
|
|||||||
#include "opal/event/event.h"
|
#include "opal/event/event.h"
|
||||||
#include "opal/mca/installdirs/installdirs.h"
|
#include "opal/mca/installdirs/installdirs.h"
|
||||||
#include "opal/mca/base/base.h"
|
#include "opal/mca/base/base.h"
|
||||||
|
#include "opal/mca/paffinity/base/base.h"
|
||||||
#include "opal/util/argv.h"
|
#include "opal/util/argv.h"
|
||||||
#include "opal/util/output.h"
|
#include "opal/util/output.h"
|
||||||
#include "opal/util/basename.h"
|
#include "opal/util/basename.h"
|
||||||
@ -255,10 +256,16 @@ static opal_cmd_line_init_t cmd_line_init[] = {
|
|||||||
/* Mapping options */
|
/* Mapping options */
|
||||||
{ NULL, NULL, NULL, '\0', "bynode", "bynode", 0,
|
{ NULL, NULL, NULL, '\0', "bynode", "bynode", 0,
|
||||||
&orterun_globals.by_node, OPAL_CMD_LINE_TYPE_BOOL,
|
&orterun_globals.by_node, OPAL_CMD_LINE_TYPE_BOOL,
|
||||||
"Whether to allocate/map processes round-robin by node" },
|
"Whether to assign processes round-robin by node" },
|
||||||
{ NULL, NULL, NULL, '\0', "byslot", "byslot", 0,
|
{ NULL, NULL, NULL, '\0', "byslot", "byslot", 0,
|
||||||
&orterun_globals.by_slot, OPAL_CMD_LINE_TYPE_BOOL,
|
&orterun_globals.by_slot, OPAL_CMD_LINE_TYPE_BOOL,
|
||||||
"Whether to allocate/map processes round-robin by slot (the default)" },
|
"Whether to assign processes round-robin by slot (the default)" },
|
||||||
|
{ NULL, NULL, NULL, '\0', "bysocket", "bysocket", 0,
|
||||||
|
&orterun_globals.by_socket, OPAL_CMD_LINE_TYPE_BOOL,
|
||||||
|
"Whether to assign processes round-robin by socket" },
|
||||||
|
{ NULL, NULL, NULL, '\0', "byboard", "byboard", 0,
|
||||||
|
&orterun_globals.by_slot, OPAL_CMD_LINE_TYPE_BOOL,
|
||||||
|
"Whether to assign processes round-robin by board (equivalent to bynode if only 1 board/node)" },
|
||||||
{ "rmaps", "base", "pernode", '\0', "pernode", "pernode", 0,
|
{ "rmaps", "base", "pernode", '\0', "pernode", "pernode", 0,
|
||||||
NULL, OPAL_CMD_LINE_TYPE_BOOL,
|
NULL, OPAL_CMD_LINE_TYPE_BOOL,
|
||||||
"Launch one process per available node on the specified number of nodes [no -np => use all allocated nodes]" },
|
"Launch one process per available node on the specified number of nodes [no -np => use all allocated nodes]" },
|
||||||
@ -286,6 +293,29 @@ static opal_cmd_line_init_t cmd_line_init[] = {
|
|||||||
{ "rmaps", "base", "no_schedule_local", '\0', "nolocal", "nolocal", 0,
|
{ "rmaps", "base", "no_schedule_local", '\0', "nolocal", "nolocal", 0,
|
||||||
NULL, OPAL_CMD_LINE_TYPE_BOOL,
|
NULL, OPAL_CMD_LINE_TYPE_BOOL,
|
||||||
"Do not run any MPI applications on the local node" },
|
"Do not run any MPI applications on the local node" },
|
||||||
|
{ "rmaps", "base", "cpus_per_rank", '\0', "cpus-per-rank", "cpus-per-rank", 1,
|
||||||
|
NULL, OPAL_CMD_LINE_TYPE_INT,
|
||||||
|
"Number of cpus to use for each rank [default=1]" },
|
||||||
|
{ "rmaps", "base", "n_perboard", '\0', "nperboard", "nperboard", 1,
|
||||||
|
NULL, OPAL_CMD_LINE_TYPE_INT,
|
||||||
|
"Launch n processes per board on all allocated nodes" },
|
||||||
|
{ "rmaps", "base", "n_persocket", '\0', "npersocket", "npersocket", 1,
|
||||||
|
NULL, OPAL_CMD_LINE_TYPE_INT,
|
||||||
|
"Launch n processes per socket on all allocated nodes" },
|
||||||
|
|
||||||
|
/* binding options */
|
||||||
|
{ NULL, NULL, NULL, '\0', "bind-to-core", "bind-to-core", 0,
|
||||||
|
&orterun_globals.bind_to_core, OPAL_CMD_LINE_TYPE_BOOL,
|
||||||
|
"Whether to bind processes to specific cores (the default)" },
|
||||||
|
{ NULL, NULL, NULL, '\0', "bind-to-board", "bind-to-board", 0,
|
||||||
|
&orterun_globals.bind_to_board, OPAL_CMD_LINE_TYPE_BOOL,
|
||||||
|
"Whether to bind processes to specific boards (meaningless on 1 board/node)" },
|
||||||
|
{ NULL, NULL, NULL, '\0', "bind-to-socket", "bind-to-socket", 0,
|
||||||
|
&orterun_globals.bind_to_socket, OPAL_CMD_LINE_TYPE_BOOL,
|
||||||
|
"Whether to bind processes to sockets" },
|
||||||
|
{ "rmaps", "base", "stride", '\0', "stride", "stride", 1,
|
||||||
|
NULL, OPAL_CMD_LINE_TYPE_INT,
|
||||||
|
"When binding multiple cores to a rank, the step size to use between cores [default: 1]" },
|
||||||
|
|
||||||
/* Allocation options */
|
/* Allocation options */
|
||||||
{ "ras", "base", "display_alloc", '\0', "display-allocation", "display-allocation", 0,
|
{ "ras", "base", "display_alloc", '\0', "display-allocation", "display-allocation", 0,
|
||||||
@ -294,6 +324,20 @@ static opal_cmd_line_init_t cmd_line_init[] = {
|
|||||||
{ "ras", "base", "display_devel_alloc", '\0', "display-devel-allocation", "display-devel-allocation", 0,
|
{ "ras", "base", "display_devel_alloc", '\0', "display-devel-allocation", "display-devel-allocation", 0,
|
||||||
NULL, OPAL_CMD_LINE_TYPE_BOOL,
|
NULL, OPAL_CMD_LINE_TYPE_BOOL,
|
||||||
"Display a detailed list (mostly intended for developers) of the allocation being used by this job"},
|
"Display a detailed list (mostly intended for developers) of the allocation being used by this job"},
|
||||||
|
{ "orte", "cpu", "set", '\0', "cpu-set", "cpu-set", 1,
|
||||||
|
NULL, OPAL_CMD_LINE_TYPE_STRING,
|
||||||
|
"Comma-separated list of ranges specifying logical cpus allocated to this job [default: none]"},
|
||||||
|
|
||||||
|
/* cluster hardware info */
|
||||||
|
{ "orte", "num", "boards", '\0', "num-boards", "num-boards", 1,
|
||||||
|
NULL, OPAL_CMD_LINE_TYPE_INT,
|
||||||
|
"Number of processor boards/node (1-256) [default: 1]"},
|
||||||
|
{ "orte", "num", "sockets", '\0', "num-sockets", "num-sockets", 1,
|
||||||
|
NULL, OPAL_CMD_LINE_TYPE_INT,
|
||||||
|
"Number of sockets/board (1-256) [default: 1]"},
|
||||||
|
{ "orte", "num", "cores", '\0', "num-cores", "num-cores", 1,
|
||||||
|
NULL, OPAL_CMD_LINE_TYPE_INT,
|
||||||
|
"Number of cores/socket (1-256) [default: 1]"},
|
||||||
|
|
||||||
/* mpiexec-like arguments */
|
/* mpiexec-like arguments */
|
||||||
{ NULL, NULL, NULL, '\0', "wdir", "wdir", 1,
|
{ NULL, NULL, NULL, '\0', "wdir", "wdir", 1,
|
||||||
@ -468,6 +512,7 @@ int orterun(int argc, char *argv[])
|
|||||||
ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE);
|
ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE);
|
||||||
return ORTE_ERR_OUT_OF_RESOURCE;
|
return ORTE_ERR_OUT_OF_RESOURCE;
|
||||||
}
|
}
|
||||||
|
|
||||||
/* check what user wants us to do with stdin */
|
/* check what user wants us to do with stdin */
|
||||||
if (0 == strcmp(orterun_globals.stdin_target, "all")) {
|
if (0 == strcmp(orterun_globals.stdin_target, "all")) {
|
||||||
jdata->stdin_target = ORTE_VPID_WILDCARD;
|
jdata->stdin_target = ORTE_VPID_WILDCARD;
|
||||||
@ -1144,6 +1189,11 @@ static int init_globals(void)
|
|||||||
orterun_globals.quiet = false;
|
orterun_globals.quiet = false;
|
||||||
orterun_globals.by_node = false;
|
orterun_globals.by_node = false;
|
||||||
orterun_globals.by_slot = false;
|
orterun_globals.by_slot = false;
|
||||||
|
orterun_globals.by_board = false;
|
||||||
|
orterun_globals.by_socket = false;
|
||||||
|
orterun_globals.bind_to_core = false;
|
||||||
|
orterun_globals.bind_to_board = false;
|
||||||
|
orterun_globals.bind_to_socket = false;
|
||||||
orterun_globals.debugger = false;
|
orterun_globals.debugger = false;
|
||||||
orterun_globals.num_procs = 0;
|
orterun_globals.num_procs = 0;
|
||||||
if( NULL != orterun_globals.env_val )
|
if( NULL != orterun_globals.env_val )
|
||||||
@ -1171,8 +1221,6 @@ static int init_globals(void)
|
|||||||
|
|
||||||
static int parse_globals(int argc, char* argv[], opal_cmd_line_t *cmd_line)
|
static int parse_globals(int argc, char* argv[], opal_cmd_line_t *cmd_line)
|
||||||
{
|
{
|
||||||
int id;
|
|
||||||
|
|
||||||
/* print version if requested. Do this before check for help so
|
/* print version if requested. Do this before check for help so
|
||||||
that --version --help works as one might expect. */
|
that --version --help works as one might expect. */
|
||||||
if (orterun_globals.version &&
|
if (orterun_globals.version &&
|
||||||
@ -1237,29 +1285,28 @@ static int parse_globals(int argc, char* argv[], opal_cmd_line_t *cmd_line)
|
|||||||
orte_run_debugger(orterun_basename, cmd_line, argc, argv, orterun_globals.num_procs);
|
orte_run_debugger(orterun_basename, cmd_line, argc, argv, orterun_globals.num_procs);
|
||||||
}
|
}
|
||||||
|
|
||||||
/* Allocate and map by node or by slot? Shortcut for setting an
|
/* extract any rank assignment policy directives */
|
||||||
MCA param. */
|
if (orterun_globals.by_node) {
|
||||||
|
ORTE_SET_MAPPING_POLICY(ORTE_MAPPING_BYNODE);
|
||||||
/* Don't initialize the MCA parameter here unless we have to,
|
} else if (orterun_globals.by_board) {
|
||||||
* since it really should be initialized in rmaps_base_open */
|
ORTE_SET_MAPPING_POLICY(ORTE_MAPPING_BYBOARD);
|
||||||
if (orterun_globals.by_node || orterun_globals.by_slot) {
|
} else if (orterun_globals.by_socket) {
|
||||||
char *policy = NULL;
|
ORTE_SET_MAPPING_POLICY(ORTE_MAPPING_BYSOCKET);
|
||||||
id = mca_base_param_reg_string_name("rmaps", "base_schedule_policy",
|
} else {
|
||||||
"Scheduling policy for RMAPS. [slot | node]",
|
/* byslot is the default */
|
||||||
false, false, "slot", &policy);
|
ORTE_SET_MAPPING_POLICY(ORTE_MAPPING_BYSLOT);
|
||||||
|
|
||||||
if (orterun_globals.by_node) {
|
|
||||||
orterun_globals.by_slot = false;
|
|
||||||
mca_base_param_set_string(id, "node");
|
|
||||||
} else {
|
|
||||||
orterun_globals.by_slot = true;
|
|
||||||
mca_base_param_set_string(id, "slot");
|
|
||||||
}
|
|
||||||
free(policy);
|
|
||||||
}
|
}
|
||||||
else {
|
|
||||||
/* Default */
|
/* extract any binding policy directives - they will
|
||||||
orterun_globals.by_slot = true;
|
* be ignored unless paffinity_alone is set
|
||||||
|
*/
|
||||||
|
if (orterun_globals.bind_to_socket) {
|
||||||
|
ORTE_SET_BINDING_POLICY(ORTE_BIND_TO_SOCKET);
|
||||||
|
} else if (orterun_globals.bind_to_board) {
|
||||||
|
ORTE_SET_BINDING_POLICY(ORTE_BIND_TO_BOARD);
|
||||||
|
} else {
|
||||||
|
/* default to by-core */
|
||||||
|
ORTE_SET_BINDING_POLICY(ORTE_BIND_TO_CORE);
|
||||||
}
|
}
|
||||||
|
|
||||||
return ORTE_SUCCESS;
|
return ORTE_SUCCESS;
|
||||||
|
@ -43,6 +43,11 @@ struct orterun_globals_t {
|
|||||||
bool exit;
|
bool exit;
|
||||||
bool by_node;
|
bool by_node;
|
||||||
bool by_slot;
|
bool by_slot;
|
||||||
|
bool by_board;
|
||||||
|
bool by_socket;
|
||||||
|
bool bind_to_core;
|
||||||
|
bool bind_to_board;
|
||||||
|
bool bind_to_socket;
|
||||||
bool debugger;
|
bool debugger;
|
||||||
int num_procs;
|
int num_procs;
|
||||||
char *env_val;
|
char *env_val;
|
||||||
|
@ -93,3 +93,19 @@ The requested number of empty hosts was not available - the system was short by
|
|||||||
|
|
||||||
Please recheck your allocation - further information is available on the
|
Please recheck your allocation - further information is available on the
|
||||||
orte_hosts man page.
|
orte_hosts man page.
|
||||||
|
[boards]
|
||||||
|
Open RTE detected a bad parameter in the hostfile:
|
||||||
|
%s
|
||||||
|
The boards parameter is less than 0:
|
||||||
|
boards=%d
|
||||||
|
[sockets]
|
||||||
|
Open RTE detected a bad parameter in the hostfile:
|
||||||
|
%s
|
||||||
|
The sockets parameter is less than 0:
|
||||||
|
sockets=%d
|
||||||
|
[cores]
|
||||||
|
Open RTE detected a bad parameter in the hostfile:
|
||||||
|
%s
|
||||||
|
The cores parameter is less than 0:
|
||||||
|
cores=%d
|
||||||
|
|
||||||
|
@ -261,6 +261,49 @@ static int hostfile_parse_line(int token, opal_list_t* updates, opal_list_t* exc
|
|||||||
node->username = hostfile_parse_string();
|
node->username = hostfile_parse_string();
|
||||||
break;
|
break;
|
||||||
|
|
||||||
|
case ORTE_HOSTFILE_BOARDS:
|
||||||
|
rc = hostfile_parse_int();
|
||||||
|
if (rc < 0) {
|
||||||
|
orte_show_help("help-hostfile.txt", "boards",
|
||||||
|
true,
|
||||||
|
cur_hostfile_name, rc);
|
||||||
|
OBJ_RELEASE(node);
|
||||||
|
return ORTE_ERROR;
|
||||||
|
}
|
||||||
|
node->boards = rc;
|
||||||
|
break;
|
||||||
|
|
||||||
|
case ORTE_HOSTFILE_SOCKETS_PER_BOARD:
|
||||||
|
rc = hostfile_parse_int();
|
||||||
|
if (rc < 0) {
|
||||||
|
orte_show_help("help-hostfile.txt", "sockets",
|
||||||
|
true,
|
||||||
|
cur_hostfile_name, rc);
|
||||||
|
OBJ_RELEASE(node);
|
||||||
|
return ORTE_ERROR;
|
||||||
|
}
|
||||||
|
node->sockets_per_board = rc;
|
||||||
|
break;
|
||||||
|
|
||||||
|
case ORTE_HOSTFILE_CORES_PER_SOCKET:
|
||||||
|
rc = hostfile_parse_int();
|
||||||
|
if (rc < 0) {
|
||||||
|
orte_show_help("help-hostfile.txt", "cores",
|
||||||
|
true,
|
||||||
|
cur_hostfile_name, rc);
|
||||||
|
OBJ_RELEASE(node);
|
||||||
|
return ORTE_ERROR;
|
||||||
|
}
|
||||||
|
node->cores_per_socket = rc;
|
||||||
|
break;
|
||||||
|
|
||||||
|
case ORTE_HOSTFILE_CPU_SET:
|
||||||
|
if (NULL != node->cpu_set) {
|
||||||
|
free(node->cpu_set);
|
||||||
|
}
|
||||||
|
node->cpu_set = hostfile_parse_string();
|
||||||
|
break;
|
||||||
|
|
||||||
case ORTE_HOSTFILE_COUNT:
|
case ORTE_HOSTFILE_COUNT:
|
||||||
case ORTE_HOSTFILE_CPU:
|
case ORTE_HOSTFILE_CPU:
|
||||||
case ORTE_HOSTFILE_SLOTS:
|
case ORTE_HOSTFILE_SLOTS:
|
||||||
|
@ -55,22 +55,26 @@ extern orte_hostfile_value_t orte_util_hostfile_value;
|
|||||||
#define YY_NO_UNPUT 1
|
#define YY_NO_UNPUT 1
|
||||||
#define YY_SKIP_YYWRAP 1
|
#define YY_SKIP_YYWRAP 1
|
||||||
|
|
||||||
#define ORTE_HOSTFILE_DONE 0
|
#define ORTE_HOSTFILE_DONE 0
|
||||||
#define ORTE_HOSTFILE_ERROR 1
|
#define ORTE_HOSTFILE_ERROR 1
|
||||||
#define ORTE_HOSTFILE_QUOTED_STRING 2
|
#define ORTE_HOSTFILE_QUOTED_STRING 2
|
||||||
#define ORTE_HOSTFILE_EQUAL 3
|
#define ORTE_HOSTFILE_EQUAL 3
|
||||||
#define ORTE_HOSTFILE_INT 4
|
#define ORTE_HOSTFILE_INT 4
|
||||||
#define ORTE_HOSTFILE_STRING 5
|
#define ORTE_HOSTFILE_STRING 5
|
||||||
#define ORTE_HOSTFILE_CPU 6
|
#define ORTE_HOSTFILE_CPU 6
|
||||||
#define ORTE_HOSTFILE_COUNT 7
|
#define ORTE_HOSTFILE_COUNT 7
|
||||||
#define ORTE_HOSTFILE_SLOTS 8
|
#define ORTE_HOSTFILE_SLOTS 8
|
||||||
#define ORTE_HOSTFILE_SLOTS_MAX 9
|
#define ORTE_HOSTFILE_SLOTS_MAX 9
|
||||||
#define ORTE_HOSTFILE_USERNAME 10
|
#define ORTE_HOSTFILE_USERNAME 10
|
||||||
#define ORTE_HOSTFILE_IPV4 11
|
#define ORTE_HOSTFILE_IPV4 11
|
||||||
#define ORTE_HOSTFILE_HOSTNAME 12
|
#define ORTE_HOSTFILE_HOSTNAME 12
|
||||||
#define ORTE_HOSTFILE_NEWLINE 13
|
#define ORTE_HOSTFILE_NEWLINE 13
|
||||||
#define ORTE_HOSTFILE_IPV6 14
|
#define ORTE_HOSTFILE_IPV6 14
|
||||||
#define ORTE_HOSTFILE_SLOT 15
|
#define ORTE_HOSTFILE_SLOT 15
|
||||||
#define ORTE_HOSTFILE_RELATIVE 16
|
#define ORTE_HOSTFILE_RELATIVE 16
|
||||||
|
#define ORTE_HOSTFILE_BOARDS 17
|
||||||
|
#define ORTE_HOSTFILE_SOCKETS_PER_BOARD 18
|
||||||
|
#define ORTE_HOSTFILE_CORES_PER_SOCKET 19
|
||||||
|
#define ORTE_HOSTFILE_CPU_SET 20
|
||||||
|
|
||||||
#endif
|
#endif
|
||||||
|
@ -120,6 +120,33 @@ username { orte_util_hostfile_value.sval = yytext;
|
|||||||
"user_name" { orte_util_hostfile_value.sval = yytext;
|
"user_name" { orte_util_hostfile_value.sval = yytext;
|
||||||
return ORTE_HOSTFILE_USERNAME; }
|
return ORTE_HOSTFILE_USERNAME; }
|
||||||
|
|
||||||
|
boards { orte_util_hostfile_value.sval = yytext;
|
||||||
|
return ORTE_HOSTFILE_BOARDS; }
|
||||||
|
|
||||||
|
sockets { orte_util_hostfile_value.sval = yytext;
|
||||||
|
return ORTE_HOSTFILE_SOCKETS_PER_BOARD; }
|
||||||
|
|
||||||
|
sockets_per_board { orte_util_hostfile_value.sval = yytext;
|
||||||
|
return ORTE_HOSTFILE_SOCKETS_PER_BOARD; }
|
||||||
|
|
||||||
|
"sockets-per-board" { orte_util_hostfile_value.sval = yytext;
|
||||||
|
return ORTE_HOSTFILE_SOCKETS_PER_BOARD; }
|
||||||
|
|
||||||
|
cores { orte_util_hostfile_value.sval = yytext;
|
||||||
|
return ORTE_HOSTFILE_CORES_PER_SOCKET; }
|
||||||
|
|
||||||
|
cores_per_socket { orte_util_hostfile_value.sval = yytext;
|
||||||
|
return ORTE_HOSTFILE_CORES_PER_SOCKET; }
|
||||||
|
|
||||||
|
"cores-per-socket" { orte_util_hostfile_value.sval = yytext;
|
||||||
|
return ORTE_HOSTFILE_CORES_PER_SOCKET; }
|
||||||
|
|
||||||
|
cpu_set { orte_util_hostfile_value.sval = yytext;
|
||||||
|
return ORTE_HOSTFILE_CPU_SET; }
|
||||||
|
|
||||||
|
"cpu-set" { orte_util_hostfile_value.sval = yytext;
|
||||||
|
return ORTE_HOSTFILE_CPU_SET; }
|
||||||
|
|
||||||
\+n[0-9]+ { orte_util_hostfile_value.sval = yytext;
|
\+n[0-9]+ { orte_util_hostfile_value.sval = yytext;
|
||||||
return ORTE_HOSTFILE_RELATIVE; }
|
return ORTE_HOSTFILE_RELATIVE; }
|
||||||
\+[eE][\:][0-9]+ { orte_util_hostfile_value.sval = yytext;
|
\+[eE][\:][0-9]+ { orte_util_hostfile_value.sval = yytext;
|
||||||
|
@ -47,6 +47,7 @@
|
|||||||
#include "orte/mca/errmgr/errmgr.h"
|
#include "orte/mca/errmgr/errmgr.h"
|
||||||
#include "orte/mca/odls/odls_types.h"
|
#include "orte/mca/odls/odls_types.h"
|
||||||
#include "orte/mca/rml/base/rml_contact.h"
|
#include "orte/mca/rml/base/rml_contact.h"
|
||||||
|
#include "orte/mca/rmaps/rmaps_types.h"
|
||||||
#include "orte/util/show_help.h"
|
#include "orte/util/show_help.h"
|
||||||
#include "orte/util/name_fns.h"
|
#include "orte/util/name_fns.h"
|
||||||
#include "orte/util/nidmap.h"
|
#include "orte/util/nidmap.h"
|
||||||
@ -472,20 +473,14 @@ char* orte_regex_encode_maps(orte_job_t *jdata)
|
|||||||
char suffix, sfx;
|
char suffix, sfx;
|
||||||
orte_app_context_t *app;
|
orte_app_context_t *app;
|
||||||
|
|
||||||
/* this is only supported with regular maps - i.e., when
|
/* this is only for one app_context */
|
||||||
* the mapping is byslot or bynode. Irregular maps cannot
|
if (jdata->num_apps > 1) {
|
||||||
* be expressed in a regular expression
|
|
||||||
*
|
|
||||||
* Also only supported for one app_context
|
|
||||||
*/
|
|
||||||
if (jdata->map->policy & ORTE_RMAPS_BYUSER ||
|
|
||||||
jdata->num_apps > 1) {
|
|
||||||
return NULL;
|
return NULL;
|
||||||
}
|
}
|
||||||
|
|
||||||
/* determine the mapping policy */
|
/* determine the mapping policy */
|
||||||
byslot = true;
|
byslot = true;
|
||||||
if (jdata->map->policy & ORTE_RMAPS_BYNODE) {
|
if (jdata->map->policy & ORTE_MAPPING_BYNODE) {
|
||||||
byslot = false;
|
byslot = false;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
Загрузка…
x
Ссылка в новой задаче
Block a user