Modify the OMPI paffinity and mapping system to support socket-level mapping and binding. Mostly refactors existing code, with modifications to the odls_default module to support the new capabilities.
Adds several new mpirun options:

* -bysocket - assign ranks on a node by socket, effectively load-balancing the procs assigned to a node across the available sockets. Note that ranks can still be bound to a specific core within the socket, or to the entire socket - the mapping is independent of the binding.
* -bind-to-socket - bind each rank to all the cores on the socket to which it is assigned.
* -bind-to-core - currently the default behavior (maintained from the prior default).
* -npersocket N - launch N procs for every socket on a node. Note that this implies we know how many sockets are on a node; mpirun will determine its local values, and these can be overridden by provided values, either via MCA param or in a hostfile.

Similar features/options are provided at the board level for multi-board nodes.

Documentation to follow...

This commit was SVN r21791.
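Example invocations of the new options (illustrative only; option spellings follow the list above, and the socket counts are whatever mpirun discovers or is told):

    mpirun -np 8 -bysocket -bind-to-socket ./app    # spread ranks across sockets, bind each rank to its whole socket
    mpirun -npersocket 2 -bind-to-core ./app        # two procs per socket on every node, each bound to a core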
This commit is contained in:
Parent: 007cbe74f4
Commit: 1dc12046f1
@@ -108,6 +108,11 @@
 #define OPAL_PROC_ON_LOCAL_CU(n)        ((n) & OPAL_PROC_ON_CU)
 #define OPAL_PROC_ON_LOCAL_CLUSTER(n)   ((n) & OPAL_PROC_ON_CLUSTER)
 
+/* Process binding modes */
+#define OPAL_PAFFINITY_DO_NOT_BIND      0x01
+#define OPAL_PAFFINITY_BIND_TO_CORE     0x02
+#define OPAL_PAFFINITY_BIND_TO_SOCKET   0x04
+#define OPAL_PAFFINITY_BIND_TO_BOARD    0x08
+
 /* ******************************************************************** */
@@ -54,6 +54,7 @@
 #include "orte/mca/ess/base/base.h"
 #include "orte/mca/plm/base/base.h"
 #include "orte/mca/routed/base/base.h"
+#include "orte/mca/rmaps/rmaps_types.h"
 
 #include "orte/util/context_fns.h"
 #include "orte/util/name_fns.h"
@@ -326,6 +327,24 @@ int orte_odls_base_default_get_add_procs_data(opal_buffer_t *data,
         return rc;
     }
     
+    /* pack the map & binding policy for this job */
+    if (ORTE_SUCCESS != (rc = opal_dss.pack(data, &map->policy, 1, ORTE_MAPPING_POLICY))) {
+        ORTE_ERROR_LOG(rc);
+        return rc;
+    }
+    
+    /* pack the cpus_per_rank for this job */
+    if (ORTE_SUCCESS != (rc = opal_dss.pack(data, &map->cpus_per_rank, 1, OPAL_INT16))) {
+        ORTE_ERROR_LOG(rc);
+        return rc;
+    }
+    
+    /* pack the stride for this job */
+    if (ORTE_SUCCESS != (rc = opal_dss.pack(data, &map->stride, 1, OPAL_INT16))) {
+        ORTE_ERROR_LOG(rc);
+        return rc;
+    }
+    
     /* pack the control flags for this job */
     if (ORTE_SUCCESS != (rc = opal_dss.pack(data, &jdata->controls, 1, ORTE_JOB_CONTROL))) {
         ORTE_ERROR_LOG(rc);
@@ -744,6 +763,24 @@ int orte_odls_base_default_construct_child_list(opal_buffer_t *data,
         ORTE_ERROR_LOG(rc);
         goto REPORT_ERROR;
     }
+    /* unpack the mapping policy for the job */
+    cnt=1;
+    if (ORTE_SUCCESS != (rc = opal_dss.unpack(data, &jobdat->policy, &cnt, ORTE_MAPPING_POLICY))) {
+        ORTE_ERROR_LOG(rc);
+        goto REPORT_ERROR;
+    }
+    /* unpack the cpus/rank for the job */
+    cnt=1;
+    if (ORTE_SUCCESS != (rc = opal_dss.unpack(data, &jobdat->cpus_per_rank, &cnt, OPAL_INT16))) {
+        ORTE_ERROR_LOG(rc);
+        goto REPORT_ERROR;
+    }
+    /* unpack the stride for the job */
+    cnt=1;
+    if (ORTE_SUCCESS != (rc = opal_dss.unpack(data, &jobdat->stride, &cnt, OPAL_INT16))) {
+        ORTE_ERROR_LOG(rc);
+        goto REPORT_ERROR;
+    }
     /* unpack the control flags for the job */
     cnt=1;
     if (ORTE_SUCCESS != (rc = opal_dss.unpack(data, &jobdat->controls, &cnt, ORTE_JOB_CONTROL))) {
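Note that opal_dss buffers are strictly order-sensitive, so the unpack sequence above must mirror the pack sequence added to orte_odls_base_default_get_add_procs_data exactly. A minimal sketch of the required pairing (argument lists elided):

    pack(&map->policy)         <->  unpack(&jobdat->policy)
    pack(&map->cpus_per_rank)  <->  unpack(&jobdat->cpus_per_rank)
    pack(&map->stride)         <->  unpack(&jobdat->stride)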
@@ -1745,7 +1782,7 @@ int orte_odls_base_default_launch_local(orte_jobid_t job,
             }
         }
         
-        rc = fork_local(app, child, app->env, jobdat->controls, jobdat->stdin_target);
+        rc = fork_local(app, child, app->env, jobdat);
         /* reaquire lock so we don't double unlock... */
         OPAL_THREAD_LOCK(&orte_odls_globals.mutex);
         if (ORTE_SUCCESS != rc) {
@@ -1791,12 +1828,22 @@ CLEANUP:
                          "%s odls:launch reporting job %s launch status",
                          ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                          ORTE_JOBID_PRINT(job)));
-    /* pack the launch results */
-    if (ORTE_SUCCESS != (ret = pack_state_update(&alert, true, jobdat))) {
-        ORTE_ERROR_LOG(ret);
-    }
-    
-    if (!launch_failed) {
+    /* if the launch failed, we need to flag all the procs from this job
+     * that didn't launch as having failed, or else we will hang
+     */
+    if (launch_failed) {
+        OPAL_THREAD_UNLOCK(&orte_odls_globals.mutex);
+        for (item = opal_list_get_first(&orte_local_children);
+             item != opal_list_get_end(&orte_local_children);
+             item = opal_list_get_next(item)) {
+            child = (orte_odls_child_t*)item;
+            if (child->name->jobid == jobdat->jobid &&
+                ORTE_PROC_STATE_LAUNCHED >= child->state) {
+                child->state = ORTE_PROC_STATE_FAILED_TO_START;
+            }
+        }
+    } else {
         /* if the launch succeeded, check to see if we need to
          * co-locate any debugger daemons so that they get launched
          * before we report anything to the HNP. This ensures that
@@ -1813,11 +1860,14 @@ CLEANUP:
                                  ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                                  (ORTE_JOB_CONTROL_FORWARD_OUTPUT & orte_odls_globals.debugger->controls) ? "output forwarded" : "no output"));
             
-            fork_local(orte_odls_globals.debugger->apps[0], NULL, NULL,
-                       orte_odls_globals.debugger->controls, ORTE_VPID_INVALID);
+            fork_local(orte_odls_globals.debugger->apps[0], NULL, NULL, orte_odls_globals.debugger);
             orte_odls_globals.debugger_launched = true;
         }
     }
     
+    /* pack the launch results */
+    if (ORTE_SUCCESS != (ret = pack_state_update(&alert, true, jobdat))) {
+        ORTE_ERROR_LOG(ret);
+    }
+    
     /* if we are the HNP, then we would rather not send this to ourselves -
@@ -107,6 +107,9 @@ static void orte_odls_job_constructor(orte_odls_job_t *ptr)
     ptr->launch_msg_processed = false;
     ptr->apps = NULL;
     ptr->num_apps = 0;
+    ptr->policy = 0;
+    ptr->cpus_per_rank = 1;
+    ptr->stride = 1;
     ptr->controls = 0;
     ptr->stdin_target = ORTE_VPID_INVALID;
     ptr->total_slots_alloc = 0;
@@ -232,6 +235,12 @@ int orte_odls_base_open(void)
         opal_argv_append_nosize(&orte_odls_globals.xtermcmd, "-e");
     }
     
+    /* see if the user wants us to report bindings */
+    mca_base_param_reg_int_name("odls", "base_report_bindings",
+                                "Report process bindings [default: no]",
+                                false, false, (int)false, &i);
+    orte_odls_globals.report_bindings = OPAL_INT_TO_BOOL(i);
+    
     /* Open up all available components */
     if (ORTE_SUCCESS !=
@@ -64,6 +64,8 @@ typedef struct {
     opal_list_t xterm_ranks;
     /* the xterm cmd to be used */
     char **xtermcmd;
+    /* whether or not to report bindings */
+    bool report_bindings;
 } orte_odls_globals_t;
 
 ORTE_DECLSPEC extern orte_odls_globals_t orte_odls_globals;
@@ -89,8 +91,7 @@ orte_odls_base_default_construct_child_list(opal_buffer_t *data,
 typedef int (*orte_odls_base_fork_local_proc_fn_t)(orte_app_context_t *context,
                                                    orte_odls_child_t *child,
                                                    char **environ_copy,
-                                                   orte_job_controls_t controls,
-                                                   orte_vpid_t stdin_target);
+                                                   orte_odls_job_t *jobdat);
 
 ORTE_DECLSPEC int
 orte_odls_base_default_launch_local(orte_jobid_t job,
@@ -78,6 +78,10 @@ that the specification had improper syntax.
 An invalid node rank was obtained - this is probably something
 that should be reported to the OMPI developers.
 #
+[odls-default:invalid-local-rank]
+An invalid local rank was obtained - this is probably something
+that should be reported to the OMPI developers.
+#
 [odls-default:invalid-phys-cpu]
 An invalid physical processor id was returned when attempting to
 set processor affinity. This is probably something that should be
@@ -43,6 +43,9 @@ int orte_odls_default_component_query(mca_base_module_t **module, int *priority)
 extern orte_odls_base_module_t orte_odls_default_module;
 ORTE_MODULE_DECLSPEC extern orte_odls_base_component_t mca_odls_default_component;
 
+/* dedicated debug output flag */
+ORTE_MODULE_DECLSPEC extern bool orte_odls_default_report_bindings;
+
 END_C_DECLS
 
 #endif /* ORTE_ODLS_H */
@@ -35,6 +35,9 @@
 #include "orte/mca/odls/base/odls_private.h"
 #include "orte/mca/odls/default/odls_default.h"
 
+/* instantiate a module-global variable */
+bool orte_odls_default_report_bindings;
+
 /*
  * Instantiate the public struct with all of our public information
  * and pointers to our public functions in it
@@ -66,7 +69,6 @@ orte_odls_base_component_t mca_odls_default_component = {
 
 int orte_odls_default_component_open(void)
 {
     /* nothing to do */
     return ORTE_SUCCESS;
 }
 
@@ -176,8 +176,7 @@ int orte_odls_default_kill_local_procs(opal_pointer_array_t *procs, bool set_sta
 static int odls_default_fork_local_proc(orte_app_context_t* context,
                                         orte_odls_child_t *child,
                                         char **environ_copy,
-                                        orte_job_controls_t controls,
-                                        orte_vpid_t stdin_target)
+                                        orte_odls_job_t *jobdat)
 {
     orte_iof_base_io_conf_t opts;
     int rc;
@@ -185,6 +184,12 @@ static int odls_default_fork_local_proc(orte_app_context_t* context,
     int i, p[2];
     pid_t pid;
     bool paffinity_enabled = false;
+    opal_paffinity_base_cpu_set_t mask;
+    orte_node_rank_t nrank;
+    int16_t n;
+    orte_local_rank_t lrank;
+    int target_socket, npersocket;
+    int logical_cpu, phys_core, phys_cpu;
     
     if (NULL != child) {
         /* should pull this information from MPIRUN instead of going with
@@ -193,7 +198,7 @@ static int odls_default_fork_local_proc(orte_app_context_t* context,
     
     /* do we want to setup stdin? */
     if (NULL != child &&
-        (stdin_target == ORTE_VPID_WILDCARD || child->name->vpid == stdin_target)) {
+        (jobdat->stdin_target == ORTE_VPID_WILDCARD || child->name->vpid == jobdat->stdin_target)) {
         opts.connect_stdin = true;
     } else {
         opts.connect_stdin = false;
@@ -291,39 +296,144 @@ static int odls_default_fork_local_proc(orte_app_context_t* context,
                     exit(1);
                 }
             }
-            /* Otherwise, if opal_paffinity_alone was set, use that scheme */
-            else if (opal_paffinity_alone) {
-                opal_paffinity_base_cpu_set_t mask;
-                int phys_cpu;
-                orte_node_rank_t nrank;
-                OPAL_OUTPUT_VERBOSE((2, orte_odls_globals.output,
-                                     "%s odls:default:fork setting paffinity for child %s",
+            /* Otherwise, if opal_paffinity_alone was set and a binding is specified, use that scheme */
+            else if (opal_paffinity_alone && !(ORTE_BIND_TO_NONE & jobdat->policy)) {
+                OPAL_OUTPUT_VERBOSE((5, orte_odls_globals.output,
+                                     "%s odls:default:fork setting paffinity for child %s using policy %04x",
                                      ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
-                                     ORTE_NAME_PRINT(child->name)));
-                if (ORTE_NODE_RANK_INVALID == (nrank = orte_ess.get_node_rank(child->name))) {
-                    orte_show_help("help-odls-default.txt",
-                                   "odls-default:invalid-node-rank", true);
-                    rc = ORTE_ERR_FATAL;
-                    write(p[1], &rc, sizeof(int));
-                    exit(1);
+                                     ORTE_NAME_PRINT(child->name), jobdat->policy));
+                if (ORTE_BIND_TO_CORE & jobdat->policy) {
+                    /* we want to bind this proc to a specific core, or multiple cores
+                     * if the cpus_per_rank is > 0
+                     */
+                    OPAL_OUTPUT_VERBOSE((5, orte_odls_globals.output,
+                                         "%s odls:default:fork binding child %s to core(s) cpus/rank %d stride %d",
+                                         ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
+                                         ORTE_NAME_PRINT(child->name),
+                                         (int)jobdat->cpus_per_rank, (int)jobdat->stride));
+                    if (ORTE_NODE_RANK_INVALID == (nrank = orte_ess.get_node_rank(child->name))) {
+                        orte_show_help("help-odls-default.txt",
+                                       "odls-default:invalid-node-rank", true);
+                        rc = ORTE_ERR_FATAL;
+                        write(p[1], &rc, sizeof(int));
+                        exit(1);
+                    }
+                    OPAL_PAFFINITY_CPU_ZERO(mask);
+                    /* my starting core has to be offset by cpus_per_rank */
+                    logical_cpu = nrank * jobdat->cpus_per_rank;
+                    for (n=0; n < jobdat->cpus_per_rank; n++) {
+                        phys_cpu = opal_paffinity_base_get_physical_processor_id(logical_cpu);
+                        if (0 > phys_cpu) {
+                            orte_show_help("help-odls-default.txt",
+                                           "odls-default:invalid-phys-cpu", true);
+                            rc = ORTE_ERR_FATAL;
+                            write(p[1], &rc, sizeof(int));
+                            exit(1);
+                        }
+                        OPAL_PAFFINITY_CPU_SET(phys_cpu, mask);
+                        logical_cpu += jobdat->stride;
+                    }
+                    if (orte_odls_globals.report_bindings) {
+                        opal_output(0, "%s odls:default:fork binding child %s to cpus %04lx",
+                                    ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
+                                    ORTE_NAME_PRINT(child->name), mask.bitmask[0]);
+                    }
+                    if (OPAL_SUCCESS != (rc = opal_paffinity_base_set(mask))) {
+                        orte_show_help("help-odls-default.txt",
+                                       "odls-default:failed-set-paff", true);
+                        write(p[1], &rc, sizeof(int));
+                        exit(1);
+                    }
+                    paffinity_enabled = true;
+                } else if (ORTE_BIND_TO_SOCKET & jobdat->policy) {
+                    /* bind this proc to a socket */
+                    OPAL_OUTPUT_VERBOSE((5, orte_odls_globals.output,
+                                         "%s odls:default:fork binding child %s to socket",
+                                         ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
+                                         ORTE_NAME_PRINT(child->name)));
+                    /* layout this process across the sockets based on
+                     * the provided mapping policy
+                     */
+                    if (ORTE_LOCAL_RANK_INVALID == (lrank = orte_ess.get_local_rank(child->name))) {
+                        orte_show_help("help-odls-default.txt",
+                                       "odls-default:invalid-local-rank", true);
+                        rc = ORTE_ERR_FATAL;
+                        write(p[1], &rc, sizeof(int));
+                        exit(1);
+                    }
+                    if (ORTE_MAPPING_NPERXXX & jobdat->policy) {
+                        /* we need to balance the children from this job across the sockets */
+                        npersocket = jobdat->num_local_procs / orte_default_num_sockets_per_board;
+                        if (ORTE_MAPPING_BYSOCKET & jobdat->policy) {
+                            target_socket = opal_paffinity_base_get_physical_socket_id(lrank % npersocket);
+                        } else {
+                            target_socket = opal_paffinity_base_get_physical_socket_id(lrank / npersocket);
+                        }
+                        OPAL_OUTPUT_VERBOSE((5, orte_odls_globals.output,
+                                             "%s odls:default:fork npersocket %d target socket %d",
+                                             ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
+                                             npersocket, target_socket));
+                    } else if (ORTE_MAPPING_BYSOCKET & jobdat->policy) {
+                        /* this corresponds to a mapping policy where
+                         * local rank 0 goes on socket 0, and local
+                         * rank 1 goes on socket 1, etc. - round robin
+                         * until all ranks are mapped
+                         *
+                         * NOTE: we already know our number of sockets
+                         * from when we initialized
+                         */
+                        OPAL_OUTPUT_VERBOSE((5, orte_odls_globals.output,
+                                             "bysocket lrank %d numsocks %d logical socket %d", (int)lrank,
+                                             (int)orte_default_num_sockets_per_board,
+                                             (int)(lrank % orte_default_num_sockets_per_board)));
+                        target_socket = opal_paffinity_base_get_physical_socket_id(lrank % orte_default_num_sockets_per_board);
+                    } else {
+                        /* use a byslot-like policy where local rank 0 goes on
+                         * socket 0, and local rank 1 goes on socket 0, etc.
+                         * following round-robin until all ranks mapped
+                         */
+                        OPAL_OUTPUT_VERBOSE((5, orte_odls_globals.output,
+                                             "byslot lrank %d numsocks %d logical socket %d", (int)lrank,
+                                             (int)orte_default_num_sockets_per_board,
+                                             (int)(lrank / orte_default_num_cores_per_socket)));
+                        target_socket = opal_paffinity_base_get_physical_socket_id(lrank / orte_default_num_cores_per_socket);
+                    }
+                    OPAL_PAFFINITY_CPU_ZERO(mask);
+                    for (n=0; n < orte_default_num_cores_per_socket; n++) {
+                        phys_core = opal_paffinity_base_get_physical_core_id(target_socket, n);
+                        if (0 > phys_core) {
+                            orte_show_help("help-odls-default.txt",
+                                           "odls-default:invalid-phys-cpu", true);
+                            rc = ORTE_ERR_FATAL;
+                            write(p[1], &rc, sizeof(int));
+                            exit(1);
+                        }
+                        if (ORTE_SUCCESS != opal_paffinity_base_get_map_to_processor_id(target_socket, phys_core, &phys_cpu)) {
+                            orte_show_help("help-odls-default.txt",
+                                           "odls-default:invalid-phys-cpu", true);
+                            rc = ORTE_ERR_FATAL;
+                            write(p[1], &rc, sizeof(int));
+                            exit(1);
+                        }
+                        OPAL_OUTPUT_VERBOSE((5, orte_odls_globals.output,
+                                             "%s odls:default:fork mapping phys socket %d core %d to phys_cpu %d",
+                                             ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
+                                             target_socket, n, phys_cpu));
+                        OPAL_PAFFINITY_CPU_SET(phys_cpu, mask);
+                    }
+                    if (orte_odls_globals.report_bindings) {
+                        opal_output(0, "%s odls:default:fork binding child %s to socket %d cpus %04lx",
+                                    ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
+                                    ORTE_NAME_PRINT(child->name), target_socket, mask.bitmask[0]);
+                    }
+                    if (OPAL_SUCCESS != (rc = opal_paffinity_base_set(mask))) {
+                        orte_show_help("help-odls-default.txt",
+                                       "odls-default:failed-set-paff", true);
+                        write(p[1], &rc, sizeof(int));
+                        exit(1);
+                    }
+                    paffinity_enabled = true;
                 }
-                OPAL_PAFFINITY_CPU_ZERO(mask);
-                phys_cpu = opal_paffinity_base_get_physical_processor_id(nrank);
-                if (0 > phys_cpu) {
-                    orte_show_help("help-odls-default.txt",
-                                   "odls-default:invalid-phys-cpu", true);
-                    rc = ORTE_ERR_FATAL;
-                    write(p[1], &rc, sizeof(int));
-                    exit(1);
-                }
-                OPAL_PAFFINITY_CPU_SET(phys_cpu, mask);
-                if (OPAL_SUCCESS != (rc = opal_paffinity_base_set(mask))) {
-                    orte_show_help("help-odls-default.txt",
-                                   "odls-default:failed-set-paff", true);
-                    write(p[1], &rc, sizeof(int));
-                    exit(1);
-                }
-                paffinity_enabled = true;
             }
             /* If we were able to set processor affinity, try setting up
              * memory affinity
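The socket-selection arithmetic in the hunk above reduces to two formulas on the local rank. A minimal standalone sketch (illustrative only - num_sockets and cores_per_socket stand in for orte_default_num_sockets_per_board and orte_default_num_cores_per_socket, and the real code additionally translates the logical IDs through the paffinity physical-ID lookups):

    /* bysocket: local ranks round-robin across sockets, so consecutive
     * ranks land on different sockets.
     * byslot (default): sockets fill in order, so consecutive ranks
     * share a socket until its cores are consumed.
     */
    static int pick_logical_socket(int lrank, int num_sockets,
                                   int cores_per_socket, int bysocket)
    {
        if (bysocket) {
            return lrank % num_sockets;
        }
        return lrank / cores_per_socket;
    }

For example, with 2 sockets of 4 cores each, local ranks 0-3 land on sockets 0,1,0,1 under -bysocket, but all on socket 0 under the byslot default.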
@@ -335,15 +445,15 @@ static int odls_default_fork_local_proc(orte_app_context_t* context,
             }
         }
         
-    } else if (!(ORTE_JOB_CONTROL_FORWARD_OUTPUT & controls)) {
+    } else if (!(ORTE_JOB_CONTROL_FORWARD_OUTPUT & jobdat->controls)) {
         /* tie stdin/out/err/internal to /dev/null */
         int fdnull;
         for (i=0; i < 3; i++) {
             fdnull = open("/dev/null", O_RDONLY, 0);
             if (fdnull > i) {
                 dup2(fdnull, i);
             }
             close(fdnull);
         }
         fdnull = open("/dev/null", O_RDONLY, 0);
         if (fdnull > opts.p_internal[1]) {
@@ -396,7 +506,7 @@ static int odls_default_fork_local_proc(orte_app_context_t* context,
         exit(1);
     } else {
         
-        if (NULL != child && (ORTE_JOB_CONTROL_FORWARD_OUTPUT & controls)) {
+        if (NULL != child && (ORTE_JOB_CONTROL_FORWARD_OUTPUT & jobdat->controls)) {
             /* connect endpoints IOF */
             rc = orte_iof_base_setup_parent(child->name, &opts);
             if (ORTE_SUCCESS != rc) {
@@ -447,7 +557,7 @@ static int odls_default_fork_local_proc(orte_app_context_t* context,
                                  "%s odls:default:fork got code %d back from child",
                                  ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), i));
             close(p[0]);
-            return i;
+            return ORTE_ERR_FAILED_TO_START;
         }
     }
@@ -101,27 +101,30 @@ ORTE_DECLSPEC OBJ_CLASS_DECLARATION(orte_odls_child_t);
  * List object to locally store job related info
  */
 typedef struct orte_odls_job_t {
     opal_list_item_t super;             /* required to place this on a list */
     orte_job_state_t state;             /* state of the job */
     orte_jobid_t jobid;                 /* jobid for this data */
     bool launch_msg_processed;          /* launch msg has been fully processed */
     orte_app_context_t **apps;          /* app_contexts for this job */
     orte_std_cntr_t num_apps;           /* number of app_contexts */
+    orte_mapping_policy_t policy;       /* mapping policy */
+    int16_t cpus_per_rank;              /* number of cpus/rank */
+    int16_t stride;                     /* step size between cores of multi-core/rank procs */
     orte_job_controls_t controls;       /* control flags for job */
     orte_vpid_t stdin_target;           /* where stdin is to go */
     orte_std_cntr_t total_slots_alloc;
     orte_std_cntr_t num_nodes;          /* number of nodes involved in the job */
     orte_vpid_t num_procs;
     int32_t num_local_procs;
     char *regexp;                       /* the regular expression describing the job */
     opal_byte_object_t *pmap;           /* local copy of pidmap byte object */
     opal_buffer_t collection_bucket;
     opal_buffer_t local_collection;
     orte_grpcomm_coll_t collective_type;
     int32_t num_contributors;
     int num_participating;
     int num_collected;
     struct timeval launch_msg_recvd;    /* when the launch msg for this job was recvd - for timing purposes only */
 } orte_odls_job_t;
 ORTE_DECLSPEC OBJ_CLASS_DECLARATION(orte_odls_job_t);
@@ -95,8 +95,7 @@ static int odls_process_kill_local_procs(opal_pointer_array_t *procs, bool set_s
 static int odls_process_fork_local_proc(orte_app_context_t* context,
                                         orte_odls_child_t *child,
                                         char **environ_copy,
-                                        orte_job_controls_t controls,
-                                        orte_vpid_t stdin_target)
+                                        orte_odls_job_t *jobdat)
 {
     pid_t pid;
     orte_iof_base_io_conf_t opts;
@@ -124,7 +123,7 @@ static int odls_process_fork_local_proc(orte_app_context_t* context,
     opts.usepty = OPAL_ENABLE_PTY_SUPPORT;
     
     /* do we want to setup stdin? */
-    if (stdin_target == ORTE_VPID_WILDCARD || child->name->vpid == stdin_target) {
+    if (jobdat->stdin_target == ORTE_VPID_WILDCARD || child->name->vpid == jobdat->stdin_target) {
         opts.connect_stdin = true;
     } else {
         opts.connect_stdin = false;
@@ -50,6 +50,7 @@
 #include "orte/mca/rml/rml.h"
 #include "orte/mca/rml/rml_types.h"
 #include "orte/mca/errmgr/errmgr.h"
+#include "orte/mca/rmaps/rmaps_types.h"
 #include "orte/runtime/orte_globals.h"
 #include "orte/util/show_help.h"
 #include "orte/runtime/orte_wait.h"
@@ -31,6 +31,8 @@
 #include "opal/class/opal_list.h"
 #include "opal/mca/mca.h"
 
+#include "orte/runtime/orte_globals.h"
+
 #include "orte/mca/rmaps/rmaps.h"
 
 BEGIN_C_DECLS
@@ -56,14 +58,18 @@ typedef struct {
     opal_list_t available_components;
     /** selected module */
     orte_rmaps_base_module_t *active_module;
-    /* user specified mapping policy */
-    uint8_t policy;
     /** whether or not we allow oversubscription of nodes */
     bool oversubscribe;
-    /** do we want one ppn if num_procs not specified */
-    bool pernode;
     /** number of ppn for n_per_node mode */
     int npernode;
+    /* number of procs/board */
+    int nperboard;
+    /* number of procs/socket */
+    int npersocket;
+    /* cpus per rank */
+    int cpus_per_rank;
+    /* stride */
+    int stride;
     /* do not allow use of the localhost */
     bool no_use_local;
     /* display the map after it is computed */
@@ -123,15 +123,14 @@ opal_list_item_t* orte_rmaps_base_get_starting_point(opal_list_t *node_list, ort
  */
 int orte_rmaps_base_map_byslot(orte_job_t *jdata, orte_app_context_t *app,
                                opal_list_t *node_list, orte_vpid_t num_procs,
-                               orte_vpid_t vpid_start, opal_list_item_t *cur_node_item,
-                               orte_vpid_t ppn)
+                               opal_list_item_t *cur_node_item)
 {
     int rc=ORTE_SUCCESS;
     int i;
     orte_node_t *node;
     opal_list_item_t *next;
     orte_vpid_t num_alloc = 0;
-    int num_slots_to_take;
+    int num_procs_to_assign, num_possible_procs;
     
     /* This loop continues until all procs have been mapped or we run
        out of resources. We determine that we have "run out of
@@ -185,21 +184,37 @@ int orte_rmaps_base_map_byslot(orte_job_t *jdata, orte_app_context_t *app,
          * to do so after oversubscribing).
          */
         if (node->slots_inuse >= node->slots_alloc || 0 == node->slots_inuse) {
-            num_slots_to_take = (node->slots_alloc == 0) ? 1 : node->slots_alloc;
+            if (0 == node->slots_alloc) {
+                num_procs_to_assign = 1;
+            } else {
+                num_possible_procs = node->slots_alloc / jdata->map->cpus_per_rank;
+                if (0 == num_possible_procs) {
+                    num_procs_to_assign = 1;
+                } else {
+                    num_procs_to_assign = num_possible_procs;
+                }
+            }
         } else {
-            num_slots_to_take = node->slots_alloc - node->slots_inuse;
+            num_possible_procs = (node->slots_alloc - node->slots_inuse) / jdata->map->cpus_per_rank;
+            if (0 == num_possible_procs) {
+                num_procs_to_assign = 1;
+            } else {
+                num_procs_to_assign = num_possible_procs;
+            }
         }
         
         /* check if we are in npernode mode - if so, then set the num_slots_to_take
          * to the num_per_node
          */
-        if (jdata->map->pernode) {
-            num_slots_to_take = jdata->map->npernode;
+        if (0 < jdata->map->npernode) {
+            num_procs_to_assign = jdata->map->npernode;
         }
         
-        for( i = 0; i < num_slots_to_take; ++i) {
-            if (ORTE_SUCCESS != (rc = orte_rmaps_base_claim_slot(jdata, node, vpid_start + num_alloc, NULL, app->idx,
-                                                                 node_list, jdata->map->oversubscribe, true))) {
+        for( i = 0; i < num_procs_to_assign; ++i) {
+            if (ORTE_SUCCESS != (rc = orte_rmaps_base_claim_slot(jdata, node,
+                                                                 jdata->map->cpus_per_rank, app->idx,
+                                                                 node_list, jdata->map->oversubscribe,
+                                                                 true, NULL))) {
                 /** if the code is ORTE_ERR_NODE_FULLY_USED, then we know this
                  * really isn't an error - we just need to break from the loop
                  * since the node is fully used up. For now, just don't report
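As a worked example of the new accounting (hypothetical numbers): a node with slots_alloc = 8 and cpus_per_rank = 2 gives num_possible_procs = 8 / 2 = 4, so four ranks are assigned there and each claims two slots. If cpus_per_rank exceeds the free slots, the integer division yields 0 and one proc is still assigned rather than skipping the node entirely.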
@@ -220,8 +235,7 @@ int orte_rmaps_base_map_byslot(orte_job_t *jdata, orte_app_context_t *app,
             }
             
             /* if we have fully used up this node, then break from the loop */
-            if (ORTE_ERR_NODE_FULLY_USED == rc ||
-                (orte_rmaps_base.loadbalance && node->num_procs >= ppn)) {
+            if (ORTE_ERR_NODE_FULLY_USED == rc) {
                 break;
             }
         }
@@ -231,17 +245,13 @@ int orte_rmaps_base_map_byslot(orte_job_t *jdata, orte_app_context_t *app,
          * node is NOT max'd out
          *
          */
-        if (i < (num_slots_to_take-1) && ORTE_ERR_NODE_FULLY_USED != rc &&
-            (orte_rmaps_base.loadbalance && node->num_procs < ppn)) {
+        if (i < (num_procs_to_assign-1) && ORTE_ERR_NODE_FULLY_USED != rc) {
             continue;
         }
         cur_node_item = next;
     }
     
 complete:
-    /* update the starting vpid */
-    vpid_start += num_procs;
-    
     /* save the bookmark */
     jdata->bookmark = (orte_node_t*)cur_node_item;
@@ -250,7 +260,7 @@ complete:
 
 int orte_rmaps_base_map_bynode(orte_job_t *jdata, orte_app_context_t *app,
                                opal_list_t *node_list, orte_vpid_t num_procs,
-                               orte_vpid_t vpid_start, opal_list_item_t *cur_node_item)
+                               opal_list_item_t *cur_node_item)
 {
     int rc = ORTE_SUCCESS;
     opal_list_item_t *next;
@@ -297,8 +307,8 @@ int orte_rmaps_base_map_bynode(orte_job_t *jdata, orte_app_context_t *app,
         
         /* Allocate a slot on this node */
         node = (orte_node_t*) cur_node_item;
-        if (ORTE_SUCCESS != (rc = orte_rmaps_base_claim_slot(jdata, node, vpid_start + num_alloc, NULL, app->idx,
-                                                             node_list, jdata->map->oversubscribe, true))) {
+        if (ORTE_SUCCESS != (rc = orte_rmaps_base_claim_slot(jdata, node, jdata->map->cpus_per_rank, app->idx,
+                                                             node_list, jdata->map->oversubscribe, true, NULL))) {
             /** if the code is ORTE_ERR_NODE_FULLY_USED, then we know this
              * really isn't an error - we just need to break from the loop
              * since the node is fully used up. For now, just don't report
@@ -67,9 +67,12 @@ int orte_rmaps_base_map_job(orte_job_t *jdata)
         return ORTE_ERR_OUT_OF_RESOURCE;
     }
     /* load it with the system defaults */
-    map->policy = orte_rmaps_base.policy;
-    map->pernode = orte_rmaps_base.pernode;
+    map->policy = orte_default_mapping_policy;
     map->npernode = orte_rmaps_base.npernode;
+    map->nperboard = orte_rmaps_base.nperboard;
+    map->npersocket = orte_rmaps_base.npersocket;
+    map->cpus_per_rank = orte_rmaps_base.cpus_per_rank;
+    map->stride = orte_rmaps_base.stride;
     map->oversubscribe = orte_rmaps_base.oversubscribe;
     map->display_map = orte_rmaps_base.display_map;
     /* assign the map object to this job */
@@ -30,7 +30,9 @@
 #include "opal/util/output.h"
 #include "opal/mca/base/base.h"
 #include "opal/mca/base/mca_base_param.h"
+#include "opal/mca/paffinity/paffinity.h"
 
+#include "orte/runtime/orte_globals.h"
 
 #include "orte/mca/rmaps/base/rmaps_private.h"
@@ -92,39 +94,74 @@ int orte_rmaps_base_open(void)
 
     /* Are we scheduling by node or by slot? */
     param = mca_base_param_reg_string_name("rmaps", "base_schedule_policy",
-                                           "Scheduling Policy for RMAPS. [slot | node]",
+                                           "Scheduling Policy for RMAPS. [slot (default) | socket | board | node]",
                                            false, false, "unspec", &policy);
     
-    if (0 == strcmp(policy, "unspec")) {
-        orte_rmaps_base.policy = ORTE_RMAPS_BYSLOT;  /* default to byslot */
+    if (0 == strcmp(policy, "socket")) {
+        ORTE_SET_MAPPING_POLICY(ORTE_MAPPING_BYSOCKET);
+    } else if (0 == strcmp(policy, "board")) {
+        ORTE_SET_MAPPING_POLICY(ORTE_MAPPING_BYBOARD);
     } else if (0 == strcmp(policy, "node")) {
-        orte_rmaps_base.policy = ORTE_RMAPS_BYNODE;
-    } else {
-        orte_rmaps_base.policy = ORTE_RMAPS_BYSLOT;  /* default to byslot */
+        ORTE_SET_MAPPING_POLICY(ORTE_MAPPING_BYNODE);
     }
+    /* if nothing was specified, leave it alone - we already set it
+     * in orterun
+     */
     
-    /* Do we want one ppn if num_procs not specified */
+    /* check for procs/xxx directives */
     param = mca_base_param_reg_int_name("rmaps", "base_pernode",
                                         "Launch one ppn as directed",
                                         false, false, (int)false, &value);
-    orte_rmaps_base.pernode = OPAL_INT_TO_BOOL(value);
-    
-    /* if pernode is set, we do not allow npernode to also be set - instead
-     * we default the npernode value to 1
-     */
-    if (orte_rmaps_base.pernode) {
+    if (value) {
         orte_rmaps_base.npernode = 1;
-    } else {
-        /* Do we want n ppn */
-        param = mca_base_param_reg_int_name("rmaps", "base_n_pernode",
-                                            "Launch n procs/node",
-                                            false, false, 0, &value);
-        orte_rmaps_base.npernode = value;
-        if (0 < orte_rmaps_base.npernode) {
-            orte_rmaps_base.pernode = true;
-        }
     }
     
+    /* #procs/node */
+    param = mca_base_param_reg_int_name("rmaps", "base_n_pernode",
+                                        "Launch n procs/node",
+                                        false, false, -1, &value);
+    if (0 < value) {
+        orte_rmaps_base.npernode = value;
+    }
+    
+    /* #procs/board */
+    param = mca_base_param_reg_int_name("rmaps", "base_n_perboard",
+                                        "Launch n procs/board",
+                                        false, false, -1, &orte_rmaps_base.nperboard);
+    if (0 < orte_rmaps_base.nperboard) {
+        ORTE_ADD_MAPPING_POLICY(ORTE_MAPPING_NPERXXX);
+    }
+    
+    /* #procs/socket */
+    param = mca_base_param_reg_int_name("rmaps", "base_n_persocket",
+                                        "Launch n procs/socket",
+                                        false, false, -1, &orte_rmaps_base.npersocket);
+    if (0 < orte_rmaps_base.npersocket) {
+        ORTE_ADD_MAPPING_POLICY(ORTE_MAPPING_NPERXXX);
+    }
+    
+    /* Do we want to loadbalance the job */
+    param = mca_base_param_reg_int_name("rmaps", "base_loadbalance",
+                                        "Balance total number of procs across all allocated nodes",
+                                        false, false, (int)false, &value);
+    orte_rmaps_base.loadbalance = OPAL_INT_TO_BOOL(value);
+    
+    /* #cpus/rank to use */
+    param = mca_base_param_reg_int_name("rmaps", "base_cpus_per_rank",
+                                        "Number of cpus to use for each rank [1-2**15 (default=1)]",
+                                        false, false, 1, &value);
+    orte_rmaps_base.cpus_per_rank = value;
+    /* if the cpus/rank > 1, then we have to bind to cores */
+    if (1 < orte_rmaps_base.cpus_per_rank) {
+        ORTE_SET_BINDING_POLICY(ORTE_BIND_TO_CORE);
+    }
+    
+    /* stride to use */
+    param = mca_base_param_reg_int_name("rmaps", "base_stride",
+                                        "When binding multiple cores to a rank, the step size to use between cores [1-2**15 (default: 1)]",
+                                        false, false, 1, &value);
+    orte_rmaps_base.stride = value;
+    
     /* did the user provide a slot list? */
     param = mca_base_param_reg_string_name("rmaps", "base_slot_list",
                                            "List of processor IDs to bind MPI processes to (e.g., used in conjunction with rank files) [default=NULL]",
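These MCA parameters mirror the new mpirun options, so the same placement can be requested either way - for example (hypothetical invocations, parameter names as registered above):

    mpirun -np 8 -mca rmaps_base_n_persocket 2 ./app
    mpirun -np 8 -mca rmaps_base_cpus_per_rank 2 -mca rmaps_base_stride 2 ./app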
@@ -136,7 +173,7 @@ int orte_rmaps_base_open(void)
                                         "If false, allow scheduling MPI applications on the same node as mpirun (default). If true, do not schedule any MPI applications on the same node as mpirun",
                                         false, false, (int)false, &value);
     if (value) {
-        orte_rmaps_base.policy |= ORTE_RMAPS_NO_USE_LOCAL;
+        orte_default_mapping_policy |= ORTE_MAPPING_NO_USE_LOCAL;
     }
     
     /* Should we oversubscribe or not? */
@@ -150,16 +187,6 @@ int orte_rmaps_base_open(void)
         orte_rmaps_base.oversubscribe = true;
     }
     
-    /* Do we want to loadbalance the job */
-    param = mca_base_param_reg_int_name("rmaps", "base_loadbalance",
-                                        "Balance total number of procs across all allocated nodes",
-                                        false, false, (int)false, &value);
-    orte_rmaps_base.loadbalance = OPAL_INT_TO_BOOL(value);
-    /* if we are doing npernode or pernode, then we cannot loadbalance */
-    if (orte_rmaps_base.pernode) {
-        orte_rmaps_base.loadbalance = false;
-    }
-    
     /* should we display the map after determining it? */
     mca_base_param_reg_int_name("rmaps", "base_display_map",
                                 "Whether to display the process map after it is computed",
@@ -41,7 +41,7 @@
  * Query the registry for all nodes allocated to a specified app_context
  */
 int orte_rmaps_base_get_target_nodes(opal_list_t *allocated_nodes, orte_std_cntr_t *total_num_slots,
-                                     orte_app_context_t *app, uint8_t policy)
+                                     orte_app_context_t *app, orte_mapping_policy_t policy)
 {
     opal_list_item_t *item, *next;
     orte_node_t *node;
@@ -169,7 +169,7 @@ int orte_rmaps_base_get_target_nodes(opal_list_t *allocated_nodes, orte_std_cntr
     /* If the "no local" option was set, then remove the local node
      * from the list
      */
-    if (policy & ORTE_RMAPS_NO_USE_LOCAL) {
+    if (policy & ORTE_MAPPING_NO_USE_LOCAL) {
         /* we don't need to check through the entire list as
          * the head node - if it is on the list at all - will
          * always be in the first position
@@ -267,9 +267,9 @@ PROCESS:
      * in the mapper
      */
     OPAL_OUTPUT_VERBOSE((5, orte_rmaps_base.rmaps_output,
-                         "%s rmaps:base: mapping proc %s to node %s",
+                         "%s rmaps:base: mapping proc for job %s to node %s",
                          ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
-                         ORTE_NAME_PRINT(&proc->name),
+                         ORTE_JOBID_PRINT(proc->name.jobid),
                          (NULL == node->name) ? "NULL" : node->name));
     
     if (0 > (rc = opal_pointer_array_add(node->procs, (void*)proc))) {
@@ -289,88 +289,56 @@ PROCESS:
  */
 int orte_rmaps_base_claim_slot(orte_job_t *jdata,
                                orte_node_t *current_node,
-                               orte_vpid_t vpid,
-                               char *slot_list,
+                               int32_t cpus_per_rank,
                                orte_std_cntr_t app_idx,
                                opal_list_t *nodes,
                                bool oversubscribe,
-                               bool remove_from_list)
+                               bool remove_from_list,
+                               orte_proc_t **returnproc)
 {
-    orte_proc_t *proc, *proc_from_job;
+    orte_proc_t *proc;
     bool oversub;
     int rc;
-    int n;
-    
-    OPAL_OUTPUT_VERBOSE((5, orte_rmaps_base.rmaps_output,
-                         "%s rmaps:base:claim_slot: checking for existence of vpid %s",
-                         ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
-                         ORTE_VPID_PRINT(vpid)));
-    
-    /* does this proc already exist within the job? */
-    proc = NULL;
-    for (n=0; n < jdata->procs->size; n++) {
-        if (NULL == (proc_from_job = (orte_proc_t*)opal_pointer_array_get_item(jdata->procs, n))) {
-            continue;
-        }
-        if (proc_from_job->name.vpid == vpid) {
-            /* already have it! */
-            proc = proc_from_job;
-            
-            OPAL_OUTPUT_VERBOSE((5, orte_rmaps_base.rmaps_output,
-                                 "%s rmaps:base:claim_slot: found existing proc %s",
-                                 ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
-                                 ORTE_NAME_PRINT(&proc->name)));
-            
-            if (NULL != proc->slot_list) {
-                /* cleanout stale info */
-                free(proc->slot_list);
-            }
-            break;
-        }
-    }
-    if (NULL == proc) {
-        /* need to create mapped_proc object */
+    /* if we were given a proc, just use it */
+    if (NULL != returnproc && NULL != *returnproc) {
+        proc = *returnproc;
+    } else {
+        /* create mapped_proc object */
         proc = OBJ_NEW(orte_proc_t);
         if (NULL == proc) {
            ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE);
            return ORTE_ERR_OUT_OF_RESOURCE;
         }
-        /* create the process name */
+        /* set the jobid */
         proc->name.jobid = jdata->jobid;
-        proc->name.vpid = vpid;
+        /* we do not set the vpid here - this will be done
+         * during a second phase
+         */
         proc->app_idx = app_idx;
         OPAL_OUTPUT_VERBOSE((5, orte_rmaps_base.rmaps_output,
                              "%s rmaps:base:claim_slot: created new proc %s",
                              ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                              ORTE_NAME_PRINT(&proc->name)));
-        /* add this proc to the job's data - we don't have to worry here
-         * about keeping the array left-justified as all vpids
-         * from 0 to num_procs will be filled
-         */
-        if (ORTE_SUCCESS != (rc = opal_pointer_array_set_item(jdata->procs,
-                                                              (int)vpid,
-                                                              (void*)proc))) {
-            ORTE_ERROR_LOG(rc);
-            OBJ_RELEASE(proc);
-            return rc;
+        /* provide returned proc, if requested */
+        if (NULL != returnproc) {
+            *returnproc = proc;
         }
     }
     
     OBJ_RETAIN(current_node);  /* maintain accounting on object */
     
-    if ( NULL != slot_list) {
-        proc->slot_list = strdup(slot_list);
-    }
     proc->node = current_node;
     proc->nodename = current_node->name;
     
     OPAL_OUTPUT_VERBOSE((5, orte_rmaps_base.rmaps_output,
-                         "%s rmaps:base:claim_slot mapping rank %d in job %s to node %s",
+                         "%s rmaps:base:claim_slot mapping proc in job %s to node %s",
                          ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
-                         vpid, ORTE_JOBID_PRINT(jdata->jobid), current_node->name));
+                         ORTE_JOBID_PRINT(jdata->jobid), current_node->name));
     
-    /* Be sure to demarcate this slot as claimed for the node */
-    current_node->slots_inuse++;
+    /* Be sure to demarcate the slots for this proc as claimed from the node */
+    current_node->slots_inuse += cpus_per_rank;
     
     /* see if this node is oversubscribed now */
     if (current_node->slots_inuse > current_node->slots) {
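With this change a rank is charged for every cpu it will consume: with a hypothetical cpus_per_rank = 2, an 8-slot node is fully used after 4 ranks instead of 8, which is what lets the byslot mapper compute num_possible_procs by integer division.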
@@ -415,8 +383,68 @@ int orte_rmaps_base_claim_slot(orte_job_t *jdata,
     return ORTE_SUCCESS;
 }
 
+int orte_rmaps_base_compute_vpids(orte_job_t *jdata)
+{
+    orte_job_map_t *map;
+    orte_vpid_t vpid;
+    int i, j;
+    orte_node_t *node;
+    orte_proc_t *proc;
+    int rc;
+    
+    map = jdata->map;
+    
+    if (ORTE_MAPPING_BYSLOT & map->policy ||
+        ORTE_MAPPING_BYSOCKET & map->policy ||
+        ORTE_MAPPING_BYBOARD & map->policy) {
+        /* assign the ranks sequentially */
+        vpid = 0;
+        for (i=0; i < map->nodes->size; i++) {
+            if (NULL == (node = (orte_node_t*)opal_pointer_array_get_item(map->nodes, i))) {
+                continue;
+            }
+            for (j=0; j < node->procs->size; j++) {
+                if (NULL == (proc = (orte_proc_t*)opal_pointer_array_get_item(node->procs, j))) {
+                    continue;
+                }
+                proc->name.vpid = vpid++;
+                if (ORTE_SUCCESS != (rc = opal_pointer_array_set_item(jdata->procs,
+                                                                      proc->name.vpid, proc))) {
+                    ORTE_ERROR_LOG(rc);
+                    return rc;
+                }
+            }
+        }
+        return ORTE_SUCCESS;
+    }
+    
+    if (ORTE_MAPPING_BYNODE & map->policy) {
+        /* assign the ranks round-robin across nodes */
+        for (i=0; i < map->nodes->size; i++) {
+            if (NULL == (node = (orte_node_t*)opal_pointer_array_get_item(map->nodes, i))) {
+                continue;
+            }
+            vpid = i;
+            for (j=0; j < node->procs->size; j++) {
+                if (NULL == (proc = (orte_proc_t*)opal_pointer_array_get_item(node->procs, j))) {
+                    continue;
+                }
+                proc->name.vpid = vpid;
+                vpid += map->num_nodes;
+                if (ORTE_SUCCESS != (rc = opal_pointer_array_set_item(jdata->procs,
+                                                                      proc->name.vpid, proc))) {
+                    ORTE_ERROR_LOG(rc);
+                    return rc;
+                }
+            }
+        }
+        return ORTE_SUCCESS;
+    }
+    
+    return ORTE_ERR_NOT_IMPLEMENTED;
+}
+
-int orte_rmaps_base_compute_usage(orte_job_t *jdata)
+int orte_rmaps_base_compute_local_ranks(orte_job_t *jdata)
 {
     orte_std_cntr_t i;
     int j, k;
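To illustrate the two ranking rules with a hypothetical map of 2 nodes holding 2 procs each: the sequential rule (byslot/bysocket/byboard) assigns vpids 0,1 to node 0 and 2,3 to node 1, while the bynode rule starts each node at its index and strides by num_nodes, giving 0,2 on node 0 and 1,3 on node 1.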
@@ -501,8 +529,8 @@ int orte_rmaps_base_compute_usage(orte_job_t *jdata)
  * we don't, then it would be possible for procs to conflict
  * when opening static ports, should that be enabled.
  */
-void orte_rmaps_base_update_usage(orte_job_t *jdata, orte_node_t *oldnode,
-                                  orte_node_t *newnode, orte_proc_t *newproc)
+void orte_rmaps_base_update_local_ranks(orte_job_t *jdata, orte_node_t *oldnode,
+                                        orte_node_t *newnode, orte_proc_t *newproc)
 {
     int k;
     orte_node_rank_t node_rank;
@@ -61,7 +61,7 @@ int orte_rmaps_base_add_proc_to_map(orte_job_map_t *map, orte_node_t *node,
 ORTE_DECLSPEC int orte_rmaps_base_get_target_nodes(opal_list_t* node_list,
                                                    orte_std_cntr_t *total_num_slots,
                                                    orte_app_context_t *app,
-                                                   uint8_t policy);
+                                                   orte_mapping_policy_t policy);
 ORTE_DECLSPEC int orte_rmaps_base_get_target_procs(opal_list_t *procs);
 
 ORTE_DECLSPEC int orte_rmaps_base_update_node_usage(opal_list_t *nodes);
@@ -72,17 +72,19 @@ ORTE_DECLSPEC int orte_rmaps_base_get_mapped_targets(opal_list_t *mapped_node_li
 
 ORTE_DECLSPEC int orte_rmaps_base_claim_slot(orte_job_t *jdata,
                                              orte_node_t *current_node,
-                                             orte_vpid_t vpid,
-                                             char *slot_list,
+                                             int32_t stride,
                                              orte_std_cntr_t app_idx,
                                              opal_list_t *nodes,
                                              bool oversubscribe,
-                                             bool remove_from_list);
+                                             bool remove_from_list,
+                                             orte_proc_t **returnproc);
 
-ORTE_DECLSPEC int orte_rmaps_base_compute_usage(orte_job_t *jdata);
+ORTE_DECLSPEC int orte_rmaps_base_compute_vpids(orte_job_t *jdata);
 
-ORTE_DECLSPEC void orte_rmaps_base_update_usage(orte_job_t *jdata, orte_node_t *oldnode,
-                                                orte_node_t *newnode, orte_proc_t *newproc);
+ORTE_DECLSPEC int orte_rmaps_base_compute_local_ranks(orte_job_t *jdata);
+
+ORTE_DECLSPEC void orte_rmaps_base_update_local_ranks(orte_job_t *jdata, orte_node_t *oldnode,
+                                                      orte_node_t *newnode, orte_proc_t *newproc);
 
 ORTE_DECLSPEC int orte_rmaps_base_rearrange_map(orte_app_context_t *app, orte_job_map_t *map, opal_list_t *procs);
@@ -93,12 +95,11 @@ ORTE_DECLSPEC opal_list_item_t* orte_rmaps_base_get_starting_point(opal_list_t *
 
 ORTE_DECLSPEC int orte_rmaps_base_map_byslot(orte_job_t *jdata, orte_app_context_t *app,
                                              opal_list_t *node_list, orte_vpid_t num_procs,
-                                             orte_vpid_t vpid_start, opal_list_item_t *cur_node_item,
-                                             orte_vpid_t ppn);
+                                             opal_list_item_t *cur_node_item);
 
 ORTE_DECLSPEC int orte_rmaps_base_map_bynode(orte_job_t *jdata, orte_app_context_t *app,
                                              opal_list_t *node_list, orte_vpid_t num_procs,
-                                             orte_vpid_t vpid_start, opal_list_item_t *cur_node_item);
+                                             opal_list_item_t *cur_node_item);
 
 
 END_C_DECLS
New file: orte/mca/rmaps/load_balance/Makefile.am (45 lines)
@@ -0,0 +1,45 @@
#
# Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
#                         University Research and Technology
#                         Corporation.  All rights reserved.
# Copyright (c) 2004-2005 The University of Tennessee and The University
#                         of Tennessee Research Foundation.  All rights
#                         reserved.
# Copyright (c) 2004-2009 High Performance Computing Center Stuttgart,
#                         University of Stuttgart.  All rights reserved.
# Copyright (c) 2004-2005 The Regents of the University of California.
#                         All rights reserved.
# $COPYRIGHT$
#
# Additional copyrights may follow
#
# $HEADER$
#

dist_pkgdata_DATA = help-orte-rmaps-lb.txt

sources = \
        rmaps_lb.c \
        rmaps_lb.h \
        rmaps_lb_component.c

# Make the output library in this directory, and name it either
# mca_<type>_<name>.la (for DSO builds) or libmca_<type>_<name>.la
# (for static builds).

if OMPI_BUILD_rmaps_load_balance_DSO
component_noinst =
component_install = mca_rmaps_load_balance.la
else
component_noinst = libmca_rmaps_load_balance.la
component_install =
endif

mcacomponentdir = $(pkglibdir)
mcacomponent_LTLIBRARIES = $(component_install)
mca_rmaps_load_balance_la_SOURCES = $(sources)
mca_rmaps_load_balance_la_LDFLAGS = -module -avoid-version

noinst_LTLIBRARIES = $(component_noinst)
libmca_rmaps_load_balance_la_SOURCES = $(sources)
libmca_rmaps_load_balance_la_LDFLAGS = -module -avoid-version
New file: orte/mca/rmaps/load_balance/configure.params (24 lines)
@@ -0,0 +1,24 @@
# -*- shell-script -*-
#
# Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
#                         University Research and Technology
#                         Corporation.  All rights reserved.
# Copyright (c) 2004-2005 The University of Tennessee and The University
#                         of Tennessee Research Foundation.  All rights
#                         reserved.
# Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
#                         University of Stuttgart.  All rights reserved.
# Copyright (c) 2004-2005 The Regents of the University of California.
#                         All rights reserved.
# Copyright (c) 2007      Los Alamos National Security, LLC.  All rights
#                         reserved.
# $COPYRIGHT$
#
# Additional copyrights may follow
#
# $HEADER$
#

# Specific to this module

PARAM_CONFIG_FILES="Makefile"
New file: orte/mca/rmaps/load_balance/help-orte-rmaps-lb.txt (53 lines)
@@ -0,0 +1,53 @@
# -*- text -*-
#
# Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
#                         University Research and Technology
#                         Corporation.  All rights reserved.
# Copyright (c) 2004-2005 The University of Tennessee and The University
#                         of Tennessee Research Foundation.  All rights
#                         reserved.
# Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
#                         University of Stuttgart.  All rights reserved.
# Copyright (c) 2004-2005 The Regents of the University of California.
#                         All rights reserved.
# $COPYRIGHT$
#
# Additional copyrights may follow
#
# $HEADER$
#
# This is the US/English general help file for Open RTE's orterun.
#
[orte-rmaps-rr:alloc-error]
There are not enough slots available in the system to satisfy the %d slots
that were requested by the application:
  %s

Either request fewer slots for your application, or make more slots available
for use.
[orte-rmaps-rr:multi-apps-and-zero-np]
RMAPS found multiple applications to be launched, with
at least one that failed to specify the number of processes to execute.
When specifying multiple applications, you must specify how many processes
of each to launch via the -np argument.

[orte-rmaps-rr:per-node-and-too-many-procs]
There are not enough nodes in your allocation to satisfy your request to launch
%d processes on a per-node basis - only %d nodes were available.

Either request fewer processes, or obtain a larger allocation.
[orte-rmaps-rr:n-per-node-and-too-many-procs]
There are not enough nodes in your allocation to satisfy your request to launch
%d processes on a %d per-node basis - only %d nodes with a total of %d slots were available.

Either request fewer processes, or obtain a larger allocation.
[orte-rmaps-rr:n-per-node-and-not-enough-slots]
There are not enough slots on the nodes in your allocation to satisfy your request to launch on a %d process-per-node basis - only %d slots/node were available.

Either request fewer processes/node, or obtain a larger allocation.

[orte-rmaps-rr:no-np-and-user-map]
You have specified a rank-to-node/slot mapping, but failed to provide
the number of processes to be executed. For some reason, this information
could not be obtained from the mapping you provided, so we cannot continue
with executing the specified application.
New file: orte/mca/rmaps/load_balance/rmaps_lb.c (430 lines; diff truncated below)
@ -0,0 +1,430 @@
|
||||
/*
|
||||
* Copyright (c) 2004-2007 The Trustees of Indiana University and Indiana
|
||||
* University Research and Technology
|
||||
* Corporation. All rights reserved.
|
||||
* Copyright (c) 2004-2006 The University of Tennessee and The University
|
||||
* of Tennessee Research Foundation. All rights
|
||||
* reserved.
|
||||
* Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
|
||||
* University of Stuttgart. All rights reserved.
|
||||
* Copyright (c) 2004-2005 The Regents of the University of California.
|
||||
* All rights reserved.
|
||||
* Copyright (c) 2006 Cisco Systems, Inc. All rights reserved.
|
||||
* $COPYRIGHT$
|
||||
*
|
||||
* Additional copyrights may follow
|
||||
*
|
||||
* $HEADER$
|
||||
*/
|
||||
|
||||
#include "orte_config.h"
|
||||
#include "orte/constants.h"
|
||||
#include "orte/types.h"
|
||||
|
||||
#include <errno.h>
|
||||
#ifdef HAVE_UNISTD_H
|
||||
#include <unistd.h>
|
||||
#endif /* HAVE_UNISTD_H */
|
||||
#ifdef HAVE_STRING_H
|
||||
#include <string.h>
|
||||
#endif /* HAVE_STRING_H */
|
||||
|
||||
#include "opal/mca/base/mca_base_param.h"
|
||||
|
||||
#include "orte/util/show_help.h"
|
||||
#include "orte/mca/errmgr/errmgr.h"
|
||||
|
||||
#include "orte/mca/rmaps/base/rmaps_private.h"
|
||||
#include "orte/mca/rmaps/base/base.h"
|
||||
#include "rmaps_lb.h"
|
||||
|
||||
static int switchyard(orte_job_t *jdata);
|
||||
|
||||
orte_rmaps_base_module_t orte_rmaps_load_balance_module = {
|
||||
switchyard
|
||||
};
|
||||
|
||||
/* Local functions */
|
||||
static int npernode(orte_job_t *jdata);
|
||||
static int nperboard(orte_job_t *jdata);
|
||||
static int npersocket(orte_job_t *jdata);
|
||||
static int loadbalance(orte_job_t *jdata);
|
||||
|
||||
static int switchyard(orte_job_t *jdata)
|
||||
{
|
||||
int rc;
|
||||
|
||||
if (0 < orte_rmaps_base.npernode) {
|
||||
rc = npernode(jdata);
|
||||
} else if (0 < orte_rmaps_base.nperboard) {
|
||||
rc = nperboard(jdata);
|
||||
} else if (0 < orte_rmaps_base.npersocket) {
|
||||
rc = npersocket(jdata);
|
||||
} else {
|
||||
rc = loadbalance(jdata);
|
||||
}
|
||||
|
||||
if (ORTE_SUCCESS != rc) {
|
||||
return rc;
|
||||
}
|
||||
|
||||
/* compute vpids and add proc objects to the job */
|
||||
if (ORTE_SUCCESS != (rc = orte_rmaps_base_compute_vpids(jdata))) {
|
||||
ORTE_ERROR_LOG(rc);
|
||||
return rc;
|
||||
}
|
||||
|
||||
/* compute and save local ranks */
|
||||
if (ORTE_SUCCESS != (rc = orte_rmaps_base_compute_local_ranks(jdata))) {
|
||||
ORTE_ERROR_LOG(rc);
|
||||
return rc;
|
||||
}
|
||||
|
||||
/* define the daemons that we will use for this job */
|
||||
if (ORTE_SUCCESS != (rc = orte_rmaps_base_define_daemons(jdata->map))) {
|
||||
ORTE_ERROR_LOG(rc);
|
||||
return rc;
|
||||
}
|
||||
|
||||
return ORTE_SUCCESS;
|
||||
}
|
||||
|
||||
|
||||
/* place specified #procs on each node, up to the specified total
 * number of procs (if one was given).
 */
static int npernode(orte_job_t *jdata)
{
    orte_app_context_t *app;
    int i, j, rc=ORTE_SUCCESS;
    opal_list_t node_list;
    opal_list_item_t *item;
    orte_std_cntr_t num_slots;
    orte_node_t *node;
    int total_procs, np;

    /* setup the node list */
    OBJ_CONSTRUCT(&node_list, opal_list_t);

    /* loop through the app_contexts */
    for(i=0; i < jdata->apps->size; i++) {
        if (NULL == (app = (orte_app_context_t*)opal_pointer_array_get_item(jdata->apps, i))) {
            continue;
        }
        /* use the number of procs if one was given */
        if (0 < app->num_procs) {
            np = app->num_procs;
        } else {
            np = INT_MAX;
        }
        total_procs = 0;
        /* for each app_context, we have to get the list of nodes that it can
         * use since that can now be modified with a hostfile and/or -host
         * option
         */
        if(ORTE_SUCCESS != (rc = orte_rmaps_base_get_target_nodes(&node_list, &num_slots, app,
                                                                  jdata->map->policy))) {
            ORTE_ERROR_LOG(rc);
            goto error;
        }
        /* loop through the list of nodes */
        while (NULL != (item = opal_list_remove_first(&node_list))) {
            node = (orte_node_t*)item;
            /* put the specified number of procs on each node */
            for (j=0; j < orte_rmaps_base.npernode && total_procs < np; j++) {
                if (ORTE_SUCCESS != (rc = orte_rmaps_base_claim_slot(jdata, node,
                                                                     jdata->map->cpus_per_rank, app->idx,
                                                                     &node_list, jdata->map->oversubscribe,
                                                                     false, NULL))) {
                    /** if the code is ORTE_ERR_NODE_FULLY_USED, and we still have
                     * more procs to place, then that is an error
                     */
                    if (ORTE_ERR_NODE_FULLY_USED != rc ||
                        j < orte_rmaps_base.npernode-1) {
                        ORTE_ERROR_LOG(rc);
                        OBJ_RELEASE(node);
                        goto error;
                    }
                }
                total_procs++;
            }
            OBJ_RELEASE(node);
        }
    }
    jdata->num_procs = total_procs;

error:
    while (NULL != (item = opal_list_remove_first(&node_list))) {
        OBJ_RELEASE(item);
    }
    OBJ_DESTRUCT(&node_list);
    return rc;
}
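
The loop above caps placement at both the per-node quota and any user-given -np total. A minimal standalone sketch of that arithmetic (values assumed for illustration, not taken from the commit):

#include <stdio.h>

int main(void)
{
    /* assumed values: -npernode 4 across 3 nodes with -np 10 */
    int npernode = 4, num_nodes = 3, np = 10;
    int total_procs = 0;
    for (int node = 0; node < num_nodes; node++) {
        for (int j = 0; j < npernode && total_procs < np; j++) {
            total_procs++;
        }
        printf("after node %d: %d procs mapped\n", node, total_procs);
    }
    /* prints 4, 8, 10 - the last node receives only the two remaining procs */
    return 0;
}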

static int nperboard(orte_job_t *jdata)
{
    orte_app_context_t *app;
    int i, j, k, rc=ORTE_SUCCESS;
    opal_list_t node_list;
    opal_list_item_t *item;
    orte_std_cntr_t num_slots;
    orte_node_t *node;
    int total_procs, np;

    /* setup the node list */
    OBJ_CONSTRUCT(&node_list, opal_list_t);

    /* loop through the app_contexts */
    for(i=0; i < jdata->apps->size; i++) {
        if (NULL == (app = (orte_app_context_t*)opal_pointer_array_get_item(jdata->apps, i))) {
            continue;
        }
        /* use the number of procs if one was given */
        if (0 < app->num_procs) {
            np = app->num_procs;
        } else {
            np = INT_MAX;
        }
        total_procs = 0;
        /* for each app_context, we have to get the list of nodes that it can
         * use since that can now be modified with a hostfile and/or -host
         * option
         */
        if(ORTE_SUCCESS != (rc = orte_rmaps_base_get_target_nodes(&node_list, &num_slots, app,
                                                                  jdata->map->policy))) {
            ORTE_ERROR_LOG(rc);
            goto error;
        }
        /* loop through the list of nodes */
        while (NULL != (item = opal_list_remove_first(&node_list))) {
            node = (orte_node_t*)item;
            /* loop through the number of boards in this node */
            for (k=0; k < node->boards && total_procs < np; k++) {
                /* put the specified number of procs on each board */
                for (j=0; j < orte_rmaps_base.nperboard && total_procs < np; j++) {
                    if (ORTE_SUCCESS != (rc = orte_rmaps_base_claim_slot(jdata, node,
                                                                         jdata->map->cpus_per_rank, app->idx,
                                                                         &node_list, jdata->map->oversubscribe,
                                                                         false, NULL))) {
                        /** if the code is ORTE_ERR_NODE_FULLY_USED, and we still have
                         * more procs to place, then that is an error
                         */
                        if (ORTE_ERR_NODE_FULLY_USED != rc ||
                            j < orte_rmaps_base.nperboard-1) {
                            ORTE_ERROR_LOG(rc);
                            OBJ_RELEASE(node);
                            goto error;
                        }
                    }
                    total_procs++;
                }
            }
            OBJ_RELEASE(node);
        }
    }
    jdata->num_procs = total_procs;

error:
    while (NULL != (item = opal_list_remove_first(&node_list))) {
        OBJ_RELEASE(item);
    }
    OBJ_DESTRUCT(&node_list);
    return rc;
}

static int npersocket(orte_job_t *jdata)
{
    orte_app_context_t *app;
    int i, j, k, n, rc=ORTE_SUCCESS;
    opal_list_t node_list;
    opal_list_item_t *item;
    orte_std_cntr_t num_slots;
    orte_node_t *node;
    int total_procs, np;

    /* setup the node list */
    OBJ_CONSTRUCT(&node_list, opal_list_t);

    /* loop through the app_contexts */
    for(i=0; i < jdata->apps->size; i++) {
        if (NULL == (app = (orte_app_context_t*)opal_pointer_array_get_item(jdata->apps, i))) {
            continue;
        }
        /* use the number of procs if one was given */
        if (0 < app->num_procs) {
            np = app->num_procs;
        } else {
            np = INT_MAX;
        }
        total_procs = 0;
        /* for each app_context, we have to get the list of nodes that it can
         * use since that can now be modified with a hostfile and/or -host
         * option
         */
        if(ORTE_SUCCESS != (rc = orte_rmaps_base_get_target_nodes(&node_list, &num_slots, app,
                                                                  jdata->map->policy))) {
            ORTE_ERROR_LOG(rc);
            goto error;
        }
        /* loop through the list of nodes */
        while (NULL != (item = opal_list_remove_first(&node_list))) {
            node = (orte_node_t*)item;
            /* loop through the number of boards in this node */
            for (k=0; k < node->boards && total_procs < np; k++) {
                /* loop through the number of sockets/board */
                for (n=0; n < node->sockets_per_board && total_procs < np; n++) {
                    /* put the specified number of procs on each socket */
                    for (j=0; j < orte_rmaps_base.npersocket && total_procs < np; j++) {
                        if (ORTE_SUCCESS != (rc = orte_rmaps_base_claim_slot(jdata, node,
                                                                             jdata->map->cpus_per_rank, app->idx,
                                                                             &node_list, jdata->map->oversubscribe,
                                                                             false, NULL))) {
                            /** if the code is ORTE_ERR_NODE_FULLY_USED, and we still have
                             * more procs to place, then that is an error
                             */
                            if (ORTE_ERR_NODE_FULLY_USED != rc ||
                                j < orte_rmaps_base.npersocket-1) {
                                ORTE_ERROR_LOG(rc);
                                OBJ_RELEASE(node);
                                goto error;
                            }
                        }
                        /* track the number of procs */
                        total_procs++;
                    }
                }
            }
            OBJ_RELEASE(node);
        }
    }
    jdata->num_procs = total_procs;

error:
    while (NULL != (item = opal_list_remove_first(&node_list))) {
        OBJ_RELEASE(item);
    }
    OBJ_DESTRUCT(&node_list);
    return rc;
}
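
Under -npersocket the three nested loops above give every node boards * sockets_per_board * npersocket procs, still subject to the -np cap. A tiny hedged check of that product (node shape assumed for illustration):

#include <stdio.h>

int main(void)
{
    /* assumed node shape: 2 boards, 2 sockets/board, run with -npersocket 3 */
    int boards = 2, sockets_per_board = 2, npersocket = 3;
    printf("procs mapped per node: %d\n",
           boards * sockets_per_board * npersocket);  /* prints 12 */
    return 0;
}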

/*
 * Create a load balanced mapping for the job by assigning a constant #procs/node, with
 * leftovers being spread one/node starting from the first node.
 */
static int loadbalance(orte_job_t *jdata)
{
    orte_app_context_t *app;
    int i, j;
    opal_list_t node_list;
    orte_std_cntr_t num_nodes, num_slots;
    int rc=ORTE_SUCCESS, total_procs;
    int ppn = 0;
    opal_list_item_t *item, *start;
    orte_node_t *node;

    /* setup */
    OBJ_CONSTRUCT(&node_list, opal_list_t);

    /* compute total #procs we are going to add and the total number of nodes available */
    for(i=0; i < jdata->apps->size; i++) {
        if (NULL == (app = (orte_app_context_t*)opal_pointer_array_get_item(jdata->apps, i))) {
            continue;
        }
        /* get the nodes and #slots available for this app_context */
        if(ORTE_SUCCESS != (rc = orte_rmaps_base_get_target_nodes(&node_list, &num_slots, app,
                                                                  jdata->map->policy))) {
            ORTE_ERROR_LOG(rc);
            goto error;
        }
        if (0 == app->num_procs) {
            /* set the num_procs to the #slots */
            app->num_procs = num_slots;
        }
        num_nodes = opal_list_get_size(&node_list);
        /* compute the base ppn */
        ppn = app->num_procs / num_nodes;
        /* if a bookmark exists from some prior mapping, set us to start there */
        start = orte_rmaps_base_get_starting_point(&node_list, jdata);
        /* loop through the list of nodes until we either assign all the procs
         * or return to the starting point
         */
        total_procs = 0;
        item = start;
        do {
            node = (orte_node_t*)item;
            /* put the specified number of procs on each node */
            for (j=0; j < ppn; j++) {
                if (ORTE_SUCCESS != (rc = orte_rmaps_base_claim_slot(jdata, node,
                                                                     jdata->map->cpus_per_rank, app->idx,
                                                                     &node_list, jdata->map->oversubscribe,
                                                                     false, NULL))) {
                    /** if the code is ORTE_ERR_NODE_FULLY_USED, and we still have
                     * more procs to place, then that is an error
                     */
                    if (ORTE_ERR_NODE_FULLY_USED != rc ||
                        j < ppn-1) {
                        ORTE_ERROR_LOG(rc);
                        goto error;
                    }
                }
                total_procs++;
            }
            /* move to next node */
            if (opal_list_get_end(&node_list) == opal_list_get_next(item)) {
                item = opal_list_get_first(&node_list);
            }
            else {
                item = opal_list_get_next(item);
            }
        } while (item != start);

        /* save the bookmark */
        jdata->bookmark = node;

        /* if we haven't assigned all the procs, then loop through the list
         * again, assigning 1 per node until all are assigned
         */
        item = start;
        while (total_procs < app->num_procs) {
            node = (orte_node_t*)item;
            if (ORTE_SUCCESS != (rc = orte_rmaps_base_claim_slot(jdata, node,
                                                                 jdata->map->cpus_per_rank, app->idx,
                                                                 &node_list, jdata->map->oversubscribe,
                                                                 false, NULL))) {
                /* if the code is not ORTE_ERR_NODE_FULLY_USED, then that is an error */
                if (ORTE_ERR_NODE_FULLY_USED != rc) {
                    ORTE_ERROR_LOG(rc);
                    goto error;
                }
            }
            total_procs++;
            /* move to next node */
            if (opal_list_get_end(&node_list) == opal_list_get_next(item)) {
                item = opal_list_get_first(&node_list);
            }
            else {
                item = opal_list_get_next(item);
            }
        }
        /* save the bookmark */
        jdata->bookmark = node;

        /* cleanup */
        while (NULL != (item = opal_list_remove_first(&node_list))) {
            OBJ_RELEASE(item);
        }
    }
    /* record the number of procs */
    jdata->num_procs = total_procs;

error:
    while(NULL != (item = opal_list_remove_first(&node_list))) {
        OBJ_RELEASE(item);
    }
    OBJ_DESTRUCT(&node_list);

    return rc;
}
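
The comment above fixes the distribution rule: a constant base of procs on every node, then the leftovers spread one per node from the front of the list. A minimal standalone sketch of that arithmetic (assumed values, not part of the commit):

#include <stdio.h>

int main(void)
{
    /* assumed values: 10 procs over 4 nodes */
    int num_procs = 10, num_nodes = 4;
    int ppn = num_procs / num_nodes;            /* base of 2 procs/node */
    int leftover = num_procs - ppn * num_nodes; /* 2 procs left over */
    for (int n = 0; n < num_nodes; n++) {
        /* the first 'leftover' nodes each receive one extra proc */
        printf("node %d: %d procs\n", n, ppn + (n < leftover ? 1 : 0));
    }
    return 0;
}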

37 orte/mca/rmaps/load_balance/rmaps_lb.h Normal file
@ -0,0 +1,37 @@
/*
 * Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
 *                         University Research and Technology
 *                         Corporation.  All rights reserved.
 * Copyright (c) 2004-2006 The University of Tennessee and The University
 *                         of Tennessee Research Foundation.  All rights
 *                         reserved.
 * Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
 *                         University of Stuttgart.  All rights reserved.
 * Copyright (c) 2004-2005 The Regents of the University of California.
 *                         All rights reserved.
 * $COPYRIGHT$
 *
 * Additional copyrights may follow
 *
 * $HEADER$
 */
/**
 * @file
 *
 * Resource Mapping
 */
#ifndef ORTE_RMAPS_LB_H
#define ORTE_RMAPS_LB_H

#include "orte_config.h"
#include "orte/mca/rmaps/rmaps.h"

BEGIN_C_DECLS

ORTE_MODULE_DECLSPEC extern orte_rmaps_base_component_t mca_rmaps_load_balance_component;
extern orte_rmaps_base_module_t orte_rmaps_load_balance_module;

END_C_DECLS

#endif

96 orte/mca/rmaps/load_balance/rmaps_lb_component.c Normal file
@ -0,0 +1,96 @@
/*
 * Copyright (c) 2004-2008 The Trustees of Indiana University and Indiana
 *                         University Research and Technology
 *                         Corporation.  All rights reserved.
 * Copyright (c) 2004-2005 The University of Tennessee and The University
 *                         of Tennessee Research Foundation.  All rights
 *                         reserved.
 * Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
 *                         University of Stuttgart.  All rights reserved.
 * Copyright (c) 2004-2005 The Regents of the University of California.
 *                         All rights reserved.
 * $COPYRIGHT$
 *
 * Additional copyrights may follow
 *
 * $HEADER$
 */

#include "orte_config.h"
#include "orte/constants.h"

#include "opal/mca/base/base.h"
#include "opal/mca/base/mca_base_param.h"

#include "orte/mca/rmaps/base/base.h"
#include "rmaps_lb.h"

/*
 * Local functions
 */

static int orte_rmaps_lb_open(void);
static int orte_rmaps_lb_close(void);
static int orte_rmaps_lb_query(mca_base_module_t **module, int *priority);

orte_rmaps_base_component_t mca_rmaps_load_balance_component = {
    {
        ORTE_RMAPS_BASE_VERSION_2_0_0,

        "load_balance",       /* MCA component name */
        ORTE_MAJOR_VERSION,   /* MCA component major version */
        ORTE_MINOR_VERSION,   /* MCA component minor version */
        ORTE_RELEASE_VERSION, /* MCA component release version */
        orte_rmaps_lb_open,   /* component open */
        orte_rmaps_lb_close,  /* component close */
        orte_rmaps_lb_query   /* component query */
    },
    {
        /* The component is checkpoint ready */
        MCA_BASE_METADATA_PARAM_CHECKPOINT
    }
};

/**
 * component open/close/init function
 */
static int orte_rmaps_lb_open(void)
{
    return ORTE_SUCCESS;
}

static int orte_rmaps_lb_query(mca_base_module_t **module, int *priority)
{
    /* the RMAPS framework is -only- opened on HNP's,
     * so no need to check for that here
     */

    /* if load balancing, or any nperxxx, was requested, then we must be selected */
    if (orte_rmaps_base.loadbalance ||
        0 < orte_rmaps_base.npernode ||
        0 < orte_rmaps_base.nperboard ||
        0 < orte_rmaps_base.npersocket) {
        *priority = 1000; /* must be selected */
        *module = (mca_base_module_t *)&orte_rmaps_load_balance_module;
        return ORTE_SUCCESS;
    }

    /* otherwise, ignore us */
    *priority = 0;
    *module = NULL;
    return ORTE_ERROR;
}

/**
 * Close all subsystems.
 */
static int orte_rmaps_lb_close(void)
{
    return ORTE_SUCCESS;
}
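
For context, an MCA framework selects whichever queried component reports the highest priority, which is why this component answers 1000 whenever load balancing or any nperxxx option is active and declines otherwise. A hedged standalone sketch of that selection rule (hypothetical names and priorities, not the actual MCA base code):

#include <stdio.h>

/* hypothetical stand-in for a component's query answer */
typedef struct { const char *name; int priority; } query_result_t;

int main(void)
{
    /* with -npersocket given, load_balance answers 1000 and wins;
     * without it, it declines and another mapper is chosen */
    query_result_t results[] = { {"round_robin", 70}, {"load_balance", 1000} };
    query_result_t *best = &results[0];
    for (int i = 1; i < 2; i++) {
        if (results[i].priority > best->priority) {
            best = &results[i];
        }
    }
    printf("selected mapper: %s\n", best->name);
    return 0;
}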
@ -72,6 +72,7 @@ static int map_app_by_node(orte_app_context_t* app,
    opal_list_item_t *next;
    orte_node_t *node;
    orte_std_cntr_t num_alloc = 0;
    orte_proc_t *proc;

    /* This loop continues until all procs have been mapped or we run
       out of resources. We determine that we have "run out of
@ -118,8 +119,8 @@ static int map_app_by_node(orte_app_context_t* app,
        /* Allocate a slot on this node */
        node = (orte_node_t*) cur_node_item;
        /* pass the base slot list in case it was provided */
        if (ORTE_SUCCESS != (rc = orte_rmaps_base_claim_slot(jdata, node, vpid_start+num_alloc, orte_rmaps_base.slot_list, app->idx,
                                                             nodes, jdata->map->oversubscribe, true))) {
        if (ORTE_SUCCESS != (rc = orte_rmaps_base_claim_slot(jdata, node, 1, app->idx,
                                                             nodes, jdata->map->oversubscribe, true, &proc))) {
            /** if the code is ORTE_ERR_NODE_FULLY_USED, then we know this
             * really isn't an error - we just need to break from the loop
             * since the node is fully used up. For now, just don't report
@ -130,6 +131,9 @@ static int map_app_by_node(orte_app_context_t* app,
                return rc;
            }
        }
        if (NULL != orte_rmaps_base.slot_list) {
            proc->slot_list = strdup(orte_rmaps_base.slot_list);
        }
        ++num_alloc;
        cur_node_item = next;
    }
@ -150,6 +154,7 @@ static int map_app_by_slot(orte_app_context_t* app,
    orte_std_cntr_t i, num_slots_to_take, num_alloc = 0;
    orte_node_t *node;
    opal_list_item_t *next;
    orte_proc_t *proc;

    /* This loop continues until all procs have been mapped or we run
       out of resources. We determine that we have "run out of
@ -211,7 +216,7 @@ static int map_app_by_slot(orte_app_context_t* app,
        /* check if we are in npernode mode - if so, then set the num_slots_to_take
         * to the num_per_node
         */
        if (jdata->map->pernode) {
        if (0 < jdata->map->npernode) {
            num_slots_to_take = jdata->map->npernode;
        }

@ -223,8 +228,8 @@ static int map_app_by_slot(orte_app_context_t* app,
                continue;
            }
            /* pass the base slot list in case it was provided */
            if (ORTE_SUCCESS != (rc = orte_rmaps_base_claim_slot(jdata, node, vpid_start+num_alloc, orte_rmaps_base.slot_list, app->idx,
                                                                 nodes, jdata->map->oversubscribe, true))) {
            if (ORTE_SUCCESS != (rc = orte_rmaps_base_claim_slot(jdata, node, 1, app->idx,
                                                                 nodes, jdata->map->oversubscribe, true, &proc))) {
                /** if the code is ORTE_ERR_NODE_FULLY_USED, then we know this
                 * really isn't an error - we just need to break from the loop
                 * since the node is fully used up. For now, just don't report
@ -235,6 +240,9 @@ static int map_app_by_slot(orte_app_context_t* app,
                    return rc;
                }
            }
            if (NULL != orte_rmaps_base.slot_list) {
                proc->slot_list = strdup(orte_rmaps_base.slot_list);
            }
            /* Update the rank */
            ++num_alloc;
            /* track #slots taken */
@ -279,6 +287,7 @@ static int orte_rmaps_rf_map(orte_job_t *jdata)
    orte_rmaps_rank_file_map_t *rfmap;
    orte_std_cntr_t slots_per_node, relative_index, tmp_cnt;
    int rc;
    orte_proc_t *proc;

    /* convenience def */
    map = jdata->map;
@ -303,7 +312,7 @@ static int orte_rmaps_rf_map(orte_job_t *jdata)
    }

    /* likewise, we only support pernode options for a single app_context */
    if (map->pernode && 1 < jdata->num_apps) {
    if (0 < map->npernode && 1 < jdata->num_apps) {
        orte_show_help("help-rmaps_rank_file.txt", "orte-rmaps-rf:multi-apps-and-zero-np",
                       true, jdata->num_apps, NULL);
        rc = ORTE_ERR_SILENT;
@ -349,7 +358,7 @@ static int orte_rmaps_rf_map(orte_job_t *jdata)
    num_nodes = (orte_std_cntr_t)opal_list_get_size(&node_list);

    /* we already checked for sanity, so these are okay to just do here */
    if (map->pernode && map->npernode == 1) {
    if (map->npernode == 1) {
        /* there are three use-cases that we need to deal with:
         * (a) if -np was not provided, then we just use the number of nodes
         * (b) if -np was provided AND #procs > #nodes, then error out
@ -365,7 +374,7 @@ static int orte_rmaps_rf_map(orte_job_t *jdata)
            rc = ORTE_ERR_SILENT;
            goto error;
        }
    } else if (map->pernode && map->npernode > 1) {
    } else if (map->npernode > 1) {
        /* first, let's check to see if there are enough slots/node to
         * meet the request - error out if not
         */
@ -447,8 +456,9 @@ static int orte_rmaps_rf_map(orte_job_t *jdata)
                orte_show_help("help-rmaps_rank_file.txt","no-slot-list", true, rank, rfmap->node_name);
                return ORTE_ERR_SILENT;
            }
            if (ORTE_SUCCESS != (rc = orte_rmaps_base_claim_slot(jdata, node, rank, rfmap->slot_list,
                                                                 app->idx, &node_list, jdata->map->oversubscribe, true))) {
            proc = NULL;
            if (ORTE_SUCCESS != (rc = orte_rmaps_base_claim_slot(jdata, node, 1, app->idx,
                                                                 &node_list, jdata->map->oversubscribe, true, &proc))) {
                if (ORTE_ERR_NODE_FULLY_USED != rc) {
                    /* if this is a true error and not the node just being
                     * full, then report the error and abort
@ -457,6 +467,7 @@ static int orte_rmaps_rf_map(orte_job_t *jdata)
                    return rc;
                }
            }
            proc->slot_list = strdup(rfmap->slot_list);
            jdata->num_procs++;
        }
        /* update the starting point */
@ -517,7 +528,7 @@ static int orte_rmaps_rf_map(orte_job_t *jdata)
        /* if no bookmark, then just start at the beginning of the list */
        cur_node_item = opal_list_get_first(&node_list);
    }
    if (map->policy & ORTE_RMAPS_BYNODE) {
    if (map->policy & ORTE_MAPPING_BYNODE) {
        rc = map_app_by_node(app, jdata, vpid_start, &node_list);
    } else {
        rc = map_app_by_slot(app, jdata, vpid_start, &node_list);
@ -542,8 +553,14 @@ static int orte_rmaps_rf_map(orte_job_t *jdata)
    /* update the job's number of procs */
    jdata->num_procs = total_procs;

    /* compute vpids and add proc objects to the job */
    if (ORTE_SUCCESS != (rc = orte_rmaps_base_compute_vpids(jdata))) {
        ORTE_ERROR_LOG(rc);
        return rc;
    }

    /* compute and save convenience values */
    if (ORTE_SUCCESS != (rc = orte_rmaps_base_compute_usage(jdata))) {
    if (ORTE_SUCCESS != (rc = orte_rmaps_base_compute_local_ranks(jdata))) {
        ORTE_ERROR_LOG(rc);
        return rc;
    }

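The recurring edit across these mappers replaces the old (vpid, slot_list) arguments of orte_rmaps_base_claim_slot with a cpus-per-rank count and an output proc pointer. A toy, self-contained sketch of that out-parameter pattern (hypothetical types and names; the real prototype lives in the rmaps headers, which this diff does not show):

#include <stdio.h>
#include <stdlib.h>

/* toy stand-in: the callee creates the proc object and, when asked,
 * hands it back so the caller can attach a slot_list or a vpid */
typedef struct { int vpid; } toy_proc_t;

static int toy_claim_slot(toy_proc_t **returnproc)
{
    toy_proc_t *p = malloc(sizeof(*p));
    if (NULL == p) {
        return -1;
    }
    p->vpid = -1;          /* vpid is assigned later, e.g. by compute_vpids */
    if (NULL != returnproc) {
        *returnproc = p;   /* caller wants the object back */
    } else {
        free(p);           /* toy only: the real code keeps it on the node */
    }
    return 0;
}

int main(void)
{
    toy_proc_t *proc = NULL;
    if (0 == toy_claim_slot(&proc)) {
        printf("claimed proc, vpid=%d\n", proc->vpid);
        free(proc);
    }
    return 0;
}
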
@ -37,7 +37,6 @@
 * Local variable
 */
static opal_list_item_t *cur_node_item = NULL;
static orte_vpid_t vpid_start = 0;

static char *orte_getline(FILE *fp);

@ -51,24 +50,22 @@ static int rr_map_default(orte_job_t *jdata, orte_app_context_t *app,
    cur_node_item = orte_rmaps_base_get_starting_point(node_list, jdata);

    /* now perform the mapping */
    if (ORTE_RMAPS_BYNODE & jdata->map->policy) {
    if (ORTE_MAPPING_BYNODE & jdata->map->policy) {
        if (ORTE_SUCCESS != (rc = orte_rmaps_base_map_bynode(jdata, app, node_list,
                                                             num_procs, vpid_start,
                                                             cur_node_item))) {
                                                             num_procs, cur_node_item))) {
            ORTE_ERROR_LOG(rc);
            return rc;
        }
    } else {
        if (ORTE_SUCCESS != (rc = orte_rmaps_base_map_byslot(jdata, app, node_list,
                                                             num_procs, vpid_start,
                                                             cur_node_item, 0))) {
                                                             num_procs, cur_node_item))) {
            ORTE_ERROR_LOG(rc);
            return rc;
        }
    }

    /* update the starting vpid */
    vpid_start += num_procs;
    /* update number of procs */
    jdata->num_procs += num_procs;

    return ORTE_SUCCESS;
}
@ -123,7 +120,7 @@ static int orte_rmaps_resilient_map(orte_job_t *jdata)
    float avgload, minload;
    orte_node_t *node, *nd=NULL, *oldnode;
    orte_rmaps_res_ftgrp_t *ftgrp, *target;
    orte_vpid_t totprocs, lowprocs;
    orte_vpid_t totprocs, lowprocs, num_assigned;
    FILE *fp;
    char *ftinput;
    int grp;
@ -275,8 +272,8 @@ static int orte_rmaps_resilient_map(orte_job_t *jdata)
                             ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                             nd->name));
        /* put proc on the found node */
        if (ORTE_SUCCESS != (rc = orte_rmaps_base_claim_slot(jdata, nd, proc->name.vpid, NULL, proc->app_idx,
                                                             NULL, jdata->map->oversubscribe, false))) {
        if (ORTE_SUCCESS != (rc = orte_rmaps_base_claim_slot(jdata, nd, jdata->map->cpus_per_rank, proc->app_idx,
                                                             NULL, jdata->map->oversubscribe, false, &proc))) {
            /** if the code is ORTE_ERR_NODE_FULLY_USED, then we know this
             * really isn't an error
             */
@ -290,7 +287,7 @@ static int orte_rmaps_resilient_map(orte_job_t *jdata)
        /* update the node and local ranks so static ports can
         * be properly selected if active
         */
        orte_rmaps_base_update_usage(jdata, oldnode, nd, proc);
        orte_rmaps_base_update_local_ranks(jdata, oldnode, nd, proc);
        continue;
    }
    /* if we did find a target, re-map the proc to the lightest loaded
@ -313,8 +310,8 @@ static int orte_rmaps_resilient_map(orte_job_t *jdata)
                         ORTE_NAME_PRINT(&proc->name), target->ftgrp, nd->name));
    OBJ_RELEASE(proc->node); /* required to maintain bookkeeping */
    /* put proc on the found node */
    if (ORTE_SUCCESS != (rc = orte_rmaps_base_claim_slot(jdata, nd, proc->name.vpid, NULL, proc->app_idx,
                                                         NULL, jdata->map->oversubscribe, false))) {
    if (ORTE_SUCCESS != (rc = orte_rmaps_base_claim_slot(jdata, nd, jdata->map->cpus_per_rank, proc->app_idx,
                                                         NULL, jdata->map->oversubscribe, false, &proc))) {
        /** if the code is ORTE_ERR_NODE_FULLY_USED, then we know this
         * really isn't an error
         */
@ -328,7 +325,7 @@ static int orte_rmaps_resilient_map(orte_job_t *jdata)
    /* update the node and local ranks so static ports can
     * be properly selected if active
     */
    orte_rmaps_base_update_usage(jdata, oldnode, nd, proc);
    orte_rmaps_base_update_local_ranks(jdata, oldnode, nd, proc);
}
/* define the daemons that we will use for this job */
if (ORTE_SUCCESS != (rc = orte_rmaps_base_define_daemons(jdata->map))) {
@ -354,7 +351,6 @@ static int orte_rmaps_resilient_map(orte_job_t *jdata)
                         ORTE_JOBID_PRINT(jdata->jobid)));

    /* start at the beginning... */
    vpid_start = 0;
    jdata->num_procs = 0;
    map = jdata->map;

@ -363,6 +359,7 @@ static int orte_rmaps_resilient_map(orte_job_t *jdata)
        if (NULL == (app = (orte_app_context_t*)opal_pointer_array_get_item(jdata->apps, i))) {
            continue;
        }
        num_assigned = 0;
        /* for each app_context, we have to get the list of nodes that it can
         * use since that can now be modified with a hostfile and/or -host
         * option
@ -434,7 +431,7 @@ static int orte_rmaps_resilient_map(orte_job_t *jdata)
            OPAL_OUTPUT_VERBOSE((2, orte_rmaps_base.rmaps_output,
                                 "%s rmaps:resilient: no available fault group - mapping rr",
                                 ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
            if (ORTE_SUCCESS != (rc = rr_map_default(jdata, app, &node_list, app->num_procs-vpid_start))) {
            if (ORTE_SUCCESS != (rc = rr_map_default(jdata, app, &node_list, app->num_procs-num_assigned))) {
                goto error;
            }
            goto cleanup;
@ -455,8 +452,8 @@ static int orte_rmaps_resilient_map(orte_job_t *jdata)
                                 ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                                 target->ftgrp, nd->name));
            /* put proc on that node */
            if (ORTE_SUCCESS != (rc = orte_rmaps_base_claim_slot(jdata, nd, vpid_start, NULL, app->idx,
                                                                 &node_list, jdata->map->oversubscribe, false))) {
            if (ORTE_SUCCESS != (rc = orte_rmaps_base_claim_slot(jdata, nd, jdata->map->cpus_per_rank, app->idx,
                                                                 &node_list, jdata->map->oversubscribe, false, NULL))) {
                /** if the code is ORTE_ERR_NODE_FULLY_USED, then we know this
                 * really isn't an error
                 */
@ -466,7 +463,7 @@ static int orte_rmaps_resilient_map(orte_job_t *jdata)
                }
            }
            /* track number of procs mapped */
            vpid_start++;
            num_assigned++;

            /* flag this fault group as used */
            target->used = true;
@ -484,6 +481,8 @@ static int orte_rmaps_resilient_map(orte_job_t *jdata)
        }

cleanup:
        /* track number of procs */
        jdata->num_procs += app->num_procs;
        /* cleanup the node list - it can differ from one app_context
         * to another, so we have to get it every time
         */
@ -493,11 +492,14 @@ static int orte_rmaps_resilient_map(orte_job_t *jdata)
        OBJ_DESTRUCT(&node_list);
    }

    /* update the number of procs in the job */
    jdata->num_procs = vpid_start;
    /* compute vpids and add proc objects to the job */
    if (ORTE_SUCCESS != (rc = orte_rmaps_base_compute_vpids(jdata))) {
        ORTE_ERROR_LOG(rc);
        return rc;
    }

    /* compute and save convenience values */
    if (ORTE_SUCCESS != (rc = orte_rmaps_base_compute_usage(jdata))) {
    /* compute and save local ranks */
    if (ORTE_SUCCESS != (rc = orte_rmaps_base_compute_local_ranks(jdata))) {
        ORTE_ERROR_LOG(rc);
        return rc;
    }

@ -25,32 +25,27 @@

#include "opal/class/opal_pointer_array.h"

#include "orte/runtime/orte_globals.h"

/*
 * General MAP types - instanced in runtime/orte_globals_class_instances.h
 */

BEGIN_C_DECLS

/*
 * Define flags indicating the policy used to perform the map
 */
#define ORTE_RMAPS_NOPOL        0x00
#define ORTE_RMAPS_BYNODE       0x01
#define ORTE_RMAPS_BYSLOT       0x02
#define ORTE_RMAPS_BYUSER       0x04
#define ORTE_RMAPS_NO_USE_LOCAL 0x08

/*
 * Structure that represents the mapping of a job to an
 * allocated set of resources.
 */
struct orte_job_map_t {
    opal_object_t super;
    /* save the mapping configuration */
    uint8_t policy;
    bool pernode;
    orte_std_cntr_t npernode;
    /* user-specified mapping params */
    orte_mapping_policy_t policy;
    int npernode;
    int nperboard;
    int npersocket;
    int16_t cpus_per_rank;
    int16_t stride;
    bool oversubscribe;
    bool display_map;
    bool cpu_lists;

@ -48,56 +48,13 @@ static int orte_rmaps_rr_map(orte_job_t *jdata)
    int i;
    opal_list_t node_list;
    opal_list_item_t *item;
    orte_vpid_t vpid_start;
    orte_std_cntr_t num_nodes, num_slots;
    int rc;
    orte_std_cntr_t slots_per_node;
    int ppn = 0;
    opal_list_item_t *cur_node_item;

    /* start at the beginning... */
    vpid_start = 0;
    jdata->num_procs = 0;

    /* if loadbalancing is requested, then we need to compute
     * the #procs/node - note that this cannot be done
     * if we are doing pernode or if #procs was not given
     */
    if (orte_rmaps_base.loadbalance && !jdata->map->pernode) {
        float res;
        /* compute total #procs we are going to add */
        for(i=0; i < jdata->apps->size; i++) {
            if (NULL == (app = (orte_app_context_t*)opal_pointer_array_get_item(jdata->apps, i))) {
                continue;
            }
            if (0 == app->num_procs) {
                /* can't do it - tell user and quit */
                orte_show_help("help-orte-rmaps-rr.txt",
                               "orte-rmaps-rr:loadbalance-and-zero-np",
                               true);
                rc = ORTE_ERR_SILENT;
                goto error;
            }
            ppn += app->num_procs;
        }
        /* get the total avail nodes and the number
         * of procs already using them
         */
        num_nodes=0;
        for (i=0; i < orte_node_pool->size; i++) {
            if (NULL == opal_pointer_array_get_item(orte_node_pool, i)) {
                continue;
            }
            num_nodes++;
        }
        /* compute the balance */
        res = ((float)ppn / num_nodes);
        ppn = ppn / num_nodes;
        if (0 < (res-ppn)) {
            ppn++;
        }
    }

    /* cycle through the app_contexts, mapping them sequentially */
    for(i=0; i < jdata->apps->size; i++) {
        if (NULL == (app = (orte_app_context_t*)opal_pointer_array_get_item(jdata->apps, i))) {
@ -130,83 +87,22 @@ static int orte_rmaps_rr_map(orte_job_t *jdata)
        /* if a bookmark exists from some prior mapping, set us to start there */
        cur_node_item = orte_rmaps_base_get_starting_point(&node_list, jdata);

        if (jdata->map->pernode && jdata->map->npernode == 1) {
            /* there are three use-cases that we need to deal with:
             * (a) if -np was not provided, then we just use the number of nodes
             * (b) if -np was provided AND #procs > #nodes, then error out
             * (c) if -np was provided AND #procs <= #nodes, then launch
             *     the specified #procs one/node. In this case, we just
             *     leave app->num_procs alone
             */
            if (0 == app->num_procs) {
                app->num_procs = num_nodes;
            } else if (app->num_procs > num_nodes) {
                orte_show_help("help-orte-rmaps-rr.txt", "orte-rmaps-rr:per-node-and-too-many-procs",
                               true, app->num_procs, num_nodes, NULL);
                rc = ORTE_ERR_SILENT;
                goto error;
            }
        } else if (jdata->map->pernode && jdata->map->npernode > 1) {
            /* first, let's check to see if there are enough slots/node to
             * meet the request - error out if not
             */
            slots_per_node = num_slots / num_nodes;
            if (jdata->map->npernode > slots_per_node) {
                orte_show_help("help-orte-rmaps-rr.txt", "orte-rmaps-rr:n-per-node-and-not-enough-slots",
                               true, jdata->map->npernode, slots_per_node, NULL);
                rc = ORTE_ERR_SILENT;
                goto error;
            }
            /* there are three use-cases that we need to deal with:
             * (a) if -np was not provided, then we just use the n/node * #nodes
             * (b) if -np was provided AND #procs > (n/node * #nodes), then error out
             * (c) if -np was provided AND #procs <= (n/node * #nodes), then launch
             *     the specified #procs n/node. In this case, we just
             *     leave app->num_procs alone
             */
            if (0 == app->num_procs) {
                /* set the num_procs to equal the specified num/node * the number of nodes */
                app->num_procs = jdata->map->npernode * num_nodes;
            } else if (app->num_procs > (jdata->map->npernode * num_nodes)) {
                orte_show_help("help-orte-rmaps-rr.txt", "orte-rmaps-rr:n-per-node-and-too-many-procs",
                               true, app->num_procs, jdata->map->npernode, num_nodes, num_slots, NULL);
                rc = ORTE_ERR_SILENT;
                goto error;
            }
        } else if (0 == app->num_procs) {
            if (jdata->map->policy & ORTE_RMAPS_BYUSER) {
                /* we can't handle this - it should have been set when we got
                 * the map info. If it wasn't, then we can only error out
                 */
                orte_show_help("help-orte-rmaps-rr.txt", "orte-rmaps-rr:no-np-and-user-map",
                               true, app->num_procs, jdata->map->npernode, num_nodes, num_slots, NULL);
                rc = ORTE_ERR_SILENT;
                goto error;
            }
            /** set the num_procs to equal the number of slots on these mapped nodes */
        if (0 == app->num_procs) {
            /* set the num_procs to equal the number of slots on these mapped nodes */
            app->num_procs = num_slots;
        }

        /** track the total number of processes we mapped */
        /* track the total number of processes we mapped */
        jdata->num_procs += app->num_procs;

        /* Make assignments */
        if (jdata->map->policy & ORTE_RMAPS_BYUSER) {
            rc = ORTE_ERR_NOT_IMPLEMENTED;
            goto error;
        } else if (jdata->map->policy & ORTE_RMAPS_BYNODE) {
        if (jdata->map->policy & ORTE_MAPPING_BYNODE) {
            rc = orte_rmaps_base_map_bynode(jdata, app, &node_list,
                                            app->num_procs, vpid_start,
                                            cur_node_item);
                                            app->num_procs, cur_node_item);
        } else {
            rc = orte_rmaps_base_map_byslot(jdata, app, &node_list,
                                            app->num_procs, vpid_start,
                                            cur_node_item, ppn);
                                            app->num_procs, cur_node_item);
        }

        /* update the starting vpid for the next app_context */
        vpid_start += app->num_procs;

        if (ORTE_SUCCESS != rc) {
            ORTE_ERROR_LOG(rc);
            goto error;
@ -221,8 +117,14 @@ static int orte_rmaps_rr_map(orte_job_t *jdata)
        OBJ_DESTRUCT(&node_list);
    }

    /* compute and save convenience values */
    if (ORTE_SUCCESS != (rc = orte_rmaps_base_compute_usage(jdata))) {
    /* compute vpids and add proc objects to the job */
    if (ORTE_SUCCESS != (rc = orte_rmaps_base_compute_vpids(jdata))) {
        ORTE_ERROR_LOG(rc);
        return rc;
    }

    /* compute and save local ranks */
    if (ORTE_SUCCESS != (rc = orte_rmaps_base_compute_local_ranks(jdata))) {
        ORTE_ERROR_LOG(rc);
        return rc;
    }

@ -59,13 +59,14 @@ static int orte_rmaps_seq_map(orte_job_t *jdata)
    orte_job_map_t *map;
    orte_app_context_t *app;
    orte_std_cntr_t i, j;
    opal_list_item_t *item, *next, *cur_node_item;
    orte_node_t *node, *nd;
    opal_list_item_t *item;
    orte_node_t *node, *nd, *save;
    orte_vpid_t vpid;
    orte_std_cntr_t num_nodes;
    int rc;
    opal_list_t *default_node_list=NULL;
    opal_list_t *node_list=NULL;
    orte_proc_t *proc;

    OPAL_OUTPUT_VERBOSE((1, orte_rmaps_base.rmaps_output,
                         "%s rmaps:seq mapping job %s",
@ -87,6 +88,9 @@ static int orte_rmaps_seq_map(orte_job_t *jdata)
    /* start at the beginning... */
    vpid = 0;
    jdata->num_procs = 0;
    if (NULL != default_node_list) {
        save = (orte_node_t*)opal_list_get_first(default_node_list);
    }

    /* cycle through the app_contexts, mapping them sequentially */
    for(i=0; i < jdata->num_apps; i++) {
@ -103,12 +107,14 @@ static int orte_rmaps_seq_map(orte_job_t *jdata)
                ORTE_ERROR_LOG(rc);
                goto error;
            }
            nd = (orte_node_t*)opal_list_get_first(node_list);
        } else {
            node_list = default_node_list;
            nd = save;
        }

        /* check for nolocal and remove the head node, if required */
        if (map->policy & ORTE_RMAPS_NO_USE_LOCAL) {
        if (map->policy & ORTE_MAPPING_NO_USE_LOCAL) {
            for (item = opal_list_get_first(node_list);
                 item != opal_list_get_end(node_list);
                 item = opal_list_get_next(item) ) {
@ -132,43 +138,17 @@ static int orte_rmaps_seq_map(orte_job_t *jdata)
            return ORTE_ERR_SILENT;
        }

        /* if a bookmark exists from some prior mapping, set us to start there */
        cur_node_item = orte_rmaps_base_get_starting_point(node_list, jdata);

        /* if num_procs wasn't specified, set it now */
        if (0 == app->num_procs) {
            app->num_procs = num_nodes;
        }

        for (i=0; i < app->num_procs; i++) {
            /* see if any nodes remain unused and available. We need to do this check
             * each time since we may remove nodes from the list (as they become fully
             * used) as we cycle through the loop
             */
            if(0 >= opal_list_get_size(node_list) ) {
                /* Everything is at max usage! :( */
                orte_show_help("help-orte-rmaps-seq.txt", "orte-rmaps-seq:alloc-error",
                               true, app->num_procs, app->app);
                return ORTE_ERR_SILENT;
            }

            /* Save the next node we can use before claiming slots, since
             * we may need to prune the nodes list removing overused nodes.
             * Wrap around to beginning if we are at the end of the list
             */
            if (opal_list_get_end(node_list) == opal_list_get_next(cur_node_item)) {
                next = opal_list_get_first(node_list);
            }
            else {
                next = opal_list_get_next(cur_node_item);
            }

            /* find this node on the global array - this is necessary so
             * that our mapping gets saved on that array as the objects
             * returned by the hostfile function are -not- on the array
             */
            node = NULL;
            nd = (orte_node_t*)cur_node_item;
            for (j=0; j < orte_node_pool->size; j++) {
                if (NULL == (node = (orte_node_t*)opal_pointer_array_get_item(orte_node_pool, j))) {
                    continue;
@ -186,42 +166,46 @@ static int orte_rmaps_seq_map(orte_job_t *jdata)
                goto error;
            }

            /* assign next vpid to this node - do NOT allow claim_slot to remove
            /* assign proc to this node - do NOT allow claim_slot to remove
             * an oversubscribed node from the list!
             */
            if (ORTE_SUCCESS != (rc = orte_rmaps_base_claim_slot(jdata, node,
                                                                 vpid, NULL, app->idx,
                                                                 jdata->map->cpus_per_rank, app->idx,
                                                                 node_list,
                                                                 jdata->map->oversubscribe,
                                                                 false))) {
                                                                 false, &proc))) {
                if (ORTE_ERR_NODE_FULLY_USED != rc) {
                    ORTE_ERROR_LOG(rc);
                    goto error;
                }
            }
            /* increment the vpid */
            vpid++;
            /* assign the vpid */
            proc->name.vpid = vpid++;
            /* add to the jdata proc array */
            if (ORTE_SUCCESS != (rc = opal_pointer_array_set_item(jdata->procs, proc->name.vpid, proc))) {
                ORTE_ERROR_LOG(rc);
                goto error;
            }
            /* move to next node */
            cur_node_item = next;
            nd = (orte_node_t*)opal_list_get_next((opal_list_item_t*)nd);
        }

        /** track the total number of processes we mapped */
        jdata->num_procs += app->num_procs;

        /* update the bookmark */
        jdata->bookmark = (orte_node_t*)cur_node_item;

        /* cleanup the node list if it came from this app_context */
        if (node_list != default_node_list) {
            while(NULL != (item = opal_list_remove_first(node_list))) {
            while (NULL != (item = opal_list_remove_first(node_list))) {
                OBJ_RELEASE(item);
            }
            OBJ_RELEASE(node_list);
        } else {
            save = nd;
        }
    }

    /* compute and save convenience values */
    if (ORTE_SUCCESS != (rc = orte_rmaps_base_compute_usage(jdata))) {
    /* compute and save local ranks */
    if (ORTE_SUCCESS != (rc = orte_rmaps_base_compute_local_ranks(jdata))) {
        ORTE_ERROR_LOG(rc);
        return rc;
    }

@ -110,8 +110,8 @@ static int map_app_by_node(

        /* Allocate a slot on this node */
        node = (orte_node_t*) cur_node_item;
        if (ORTE_SUCCESS != (rc = orte_rmaps_base_claim_slot(jdata, node, vpid_start + num_alloc, NULL, app->idx,
                                                             nodes, jdata->map->oversubscribe, true))) {
        if (ORTE_SUCCESS != (rc = orte_rmaps_base_claim_slot(jdata, node, 1, app->idx,
                                                             nodes, jdata->map->oversubscribe, true, NULL))) {
            /** if the code is ORTE_ERR_NODE_FULLY_USED, then we know this
             * really isn't an error - we just need to break from the loop
             * since the node is fully used up. For now, just don't report
@ -212,13 +212,13 @@ static int map_app_by_slot(
        /* check if we are in npernode mode - if so, then set the num_slots_to_take
         * to the num_per_node
         */
        if (jdata->map->pernode) {
        if (0 < jdata->map->npernode) {
            num_slots_to_take = jdata->map->npernode;
        }

        for( i = 0; i < num_slots_to_take; ++i) {
            if (ORTE_SUCCESS != (rc = orte_rmaps_base_claim_slot(jdata, node, vpid_start + num_alloc, NULL, app->idx,
                                                                 nodes, jdata->map->oversubscribe, true))) {
            if (ORTE_SUCCESS != (rc = orte_rmaps_base_claim_slot(jdata, node, 1, app->idx,
                                                                 nodes, jdata->map->oversubscribe, true, NULL))) {
                /** if the code is ORTE_ERR_NODE_FULLY_USED, then we know this
                 * really isn't an error - we just need to break from the loop
                 * since the node is fully used up. For now, just don't report
@ -426,7 +426,7 @@ static int topo_map(orte_job_t *jdata)
    }

proceed:
    if (map->pernode && map->npernode == 1) {
    if (map->npernode == 1) {
        /* there are three use-cases that we need to deal with:
         * (a) if -np was not provided, then we just use the number of nodes
         * (b) if -np was provided AND #procs > #nodes, then error out
@ -442,7 +442,7 @@ static int topo_map(orte_job_t *jdata)
            rc = ORTE_ERR_SILENT;
            goto error;
        }
    } else if (map->pernode && map->npernode > 1) {
    } else if (map->npernode > 1) {
        /* first, let's check to see if there are enough slots/node to
         * meet the request - error out if not
         */
@ -473,11 +473,11 @@ static int topo_map(orte_job_t *jdata)
        /** set the num_procs to equal the number of slots on these mapped nodes - if
            user has specified "-bynode", then set it to the number of nodes
         */
        if (map->policy & ORTE_RMAPS_BYNODE) {
        if (map->policy & ORTE_MAPPING_BYNODE) {
            app->num_procs = num_nodes;
        } else if (map->policy & ORTE_RMAPS_BYSLOT) {
        } else if (map->policy & ORTE_MAPPING_BYSLOT) {
            app->num_procs = num_slots;
        } else if (map->policy & ORTE_RMAPS_BYUSER) {
        } else {
            /* we can't handle this - it should have been set when we got
             * the map info. If it wasn't, then we can only error out
             */
@ -492,10 +492,7 @@ static int topo_map(orte_job_t *jdata)
    jdata->num_procs += app->num_procs;

    /* Make assignments */
    if (map->policy == ORTE_RMAPS_BYUSER) {
        rc = ORTE_ERR_NOT_IMPLEMENTED;
        goto error;
    } else if (map->policy == ORTE_RMAPS_BYNODE) {
    if (map->policy == ORTE_MAPPING_BYNODE) {
        rc = map_app_by_node(app, jdata, vpid_start, &node_list);
    } else {
        rc = map_app_by_slot(app, jdata, vpid_start, &node_list);
@ -522,7 +519,7 @@ static int topo_map(orte_job_t *jdata)
    }

    /* compute and save convenience values */
    if (ORTE_SUCCESS != (rc = orte_rmaps_base_compute_usage(jdata))) {
    if (ORTE_SUCCESS != (rc = orte_rmaps_base_compute_local_ranks(jdata))) {
        ORTE_ERROR_LOG(rc);
        return rc;
    }

@ -280,7 +280,6 @@ int orte_dt_copy_map(orte_job_map_t **dest, orte_job_map_t *src, opal_data_type_

    /* copy data into it */
    (*dest)->policy = src->policy;
    (*dest)->pernode = src->pernode;
    (*dest)->npernode = src->npernode;
    (*dest)->oversubscribe = src->oversubscribe;
    (*dest)->display_map = src->display_map;

@ -407,6 +407,15 @@ int orte_dt_pack_node(opal_buffer_t *buffer, const void *src,
            return rc;
        }

        /* do not pack the local board, socket, and core info */

        /* pack the cpu set info */
        if (ORTE_SUCCESS != (rc = opal_dss_pack_buffer(buffer,
                                                       (void*)(&(nodes[i]->cpu_set)), 1, OPAL_STRING))) {
            ORTE_ERROR_LOG(rc);
            return rc;
        }

        /* do not pack the username */
    }
    return ORTE_SUCCESS;
@ -814,13 +823,7 @@ int orte_dt_pack_map(opal_buffer_t *buffer, const void *src,

    for (i=0; i < num_vals; i++) {
        /* pack the policy used to generate it */
        if (ORTE_SUCCESS != (rc = opal_dss_pack_buffer(buffer, &(maps[i]->policy), 1, OPAL_UINT8))) {
            ORTE_ERROR_LOG(rc);
            return rc;
        }

        /* pack the pernode flag */
        if (ORTE_SUCCESS != (rc = opal_dss_pack_buffer(buffer, &(maps[i]->pernode), 1, OPAL_BOOL))) {
        if (ORTE_SUCCESS != (rc = opal_dss_pack_buffer(buffer, &(maps[i]->policy), 1, ORTE_MAPPING_POLICY))) {
            ORTE_ERROR_LOG(rc);
            return rc;
        }

@ -362,6 +362,11 @@ int orte_dt_print_node(char **output, char *prefix, orte_node_t *src, opal_data_
        }
    }

    asprintf(&tmp2, "%s\n%s\tNum boards: %ld\tNum sockets/board: %ld\tNum cores/socket: %ld", tmp, pfx2,
             (long)src->boards, (long)src->sockets_per_board, (long)src->cores_per_socket);
    free(tmp);
    tmp = tmp2;

    if (NULL == src->daemon) {
        asprintf(&tmp2, "%s\n%s\tDaemon: %s\tDaemon launched: %s", tmp, pfx2,
                 "Not defined", src->daemon_launched ? "True" : "False");
@ -377,8 +382,9 @@ int orte_dt_print_node(char **output, char *prefix, orte_node_t *src, opal_data_
    free(tmp);
    tmp = tmp2;

    asprintf(&tmp2, "%s\n%s\tNum slots allocated: %ld\tMax slots: %ld", tmp, pfx2,
             (long)src->slots_alloc, (long)src->slots_max);
    asprintf(&tmp2, "%s\n%s\tNum slots allocated: %ld\tMax slots: %ld:\tCpu set: %s", tmp, pfx2,
             (long)src->slots_alloc, (long)src->slots_max,
             (NULL == src->cpu_set) ? "NULL" : src->cpu_set);
    free(tmp);
    tmp = tmp2;

@ -644,9 +650,8 @@ int orte_dt_print_map(char **output, char *prefix, orte_job_map_t *src, opal_dat
    asprintf(&pfx, "%s\t", pfx2);

    if (orte_devel_level_output) {
        asprintf(&tmp, "\n%sMap generated by mapping policy: %x\n%s\tPernode: %s\tNpernode: %ld\tOversubscribe allowed: %s\tCPU Lists: %s",
                 pfx2, src->policy, pfx2,
                 (src->pernode) ? "TRUE" : "FALSE", (long)src->npernode,
        asprintf(&tmp, "\n%sMap generated by mapping policy: %04x\n%s\tNpernode: %ld\tOversubscribe allowed: %s\tCPU Lists: %s",
                 pfx2, src->policy, pfx2, (long)src->npernode,
                 (src->oversubscribe) ? "TRUE" : "FALSE",
                 (src->cpu_lists) ? "TRUE" : "FALSE");

@ -422,6 +422,16 @@ int orte_dt_unpack_node(opal_buffer_t *buffer, void *dest,
            return rc;
        }

        /* do not unpack the board, socket, and core info */

        /* unpack the cpu set */
        n = 1;
        if (ORTE_SUCCESS != (rc = opal_dss_unpack_buffer(buffer,
                                                         &(nodes[i]->cpu_set), &n, OPAL_STRING))) {
            ORTE_ERROR_LOG(rc);
            return rc;
        }

        /* do not unpack the username */
    }
    return ORTE_SUCCESS;
@ -883,15 +893,7 @@ int orte_dt_unpack_map(opal_buffer_t *buffer, void *dest,
        /* unpack the policy */
        n = 1;
        if (ORTE_SUCCESS != (rc = opal_dss_unpack_buffer(buffer,
                                                         &(maps[i]->policy), &n, OPAL_UINT8))) {
            ORTE_ERROR_LOG(rc);
            return rc;
        }

        /* unpack the pernode flag */
        n = 1;
        if (ORTE_SUCCESS != (rc = opal_dss_unpack_buffer(buffer,
                                                         &(maps[i]->pernode), &n, OPAL_BOOL))) {
                                                         &(maps[i]->policy), &n, ORTE_MAPPING_POLICY))) {
            ORTE_ERROR_LOG(rc);
            return rc;
        }

@ -27,6 +27,7 @@
#endif

#include "opal/mca/base/mca_base_param.h"
#include "opal/mca/paffinity/paffinity.h"
#include "opal/util/argv.h"
#include "opal/util/output.h"
#include "opal/class/opal_pointer_array.h"
@ -132,6 +133,17 @@ bool orte_orted_exit_with_barrier = true;
/* report launch progress */
bool orte_report_launch_progress = false;

/* cluster hardware info */
uint8_t orte_default_num_boards;
uint8_t orte_default_num_sockets_per_board;
uint8_t orte_default_num_cores_per_socket;

/* allocation specification */
char *orte_default_cpu_set;

/* default rank assignment and binding policy */
orte_mapping_policy_t orte_default_mapping_policy = 0;

#endif /* !ORTE_DISABLE_FULL_RTE */

int orte_debug_output = -1;
@ -670,6 +682,16 @@ static void orte_node_construct(orte_node_t* node)
    node->slots_inuse = 0;
    node->slots_alloc = 0;
    node->slots_max = 0;

    node->boards = orte_default_num_boards;
    node->sockets_per_board = orte_default_num_sockets_per_board;
    node->cores_per_socket = orte_default_num_cores_per_socket;
    if (NULL != orte_default_cpu_set) {
        node->cpu_set = strdup(orte_default_cpu_set);
    } else {
        node->cpu_set = NULL;
    }

    node->username = NULL;
}

@ -702,6 +724,10 @@ static void orte_node_destruct(orte_node_t* node)
    }
    OBJ_RELEASE(node->procs);

    if (NULL != node->cpu_set) {
        free(node->cpu_set);
        node->cpu_set = NULL;
    }
    if (NULL != node->username) {
        free(node->username);
        node->username = NULL;
@ -871,9 +897,12 @@ OBJ_CLASS_INSTANCE(orte_jmap_t,

static void orte_job_map_construct(orte_job_map_t* map)
{
    map->policy = ORTE_RMAPS_BYSLOT; /* default to byslot mapping as per orterun options */
    map->pernode = false;
    map->policy = 0;
    map->npernode = 0;
    map->nperboard = 0;
    map->npersocket = 0;
    map->cpus_per_rank = 1;
    map->stride = 1;
    map->oversubscribe = true; /* default to allowing oversubscribe */
    map->display_map = false;
    map->cpu_lists = false;

@ -38,7 +38,6 @@

#include "opal/class/opal_value_array.h"

#include "orte/mca/plm/plm_types.h"
#include "orte/mca/rmaps/rmaps_types.h"
#include "orte/util/proc_info.h"
#include "orte/util/name_fns.h"
#include "orte/runtime/runtime.h"
@ -141,6 +140,7 @@ typedef struct orte_job_t orte_job_t;
 * defining it - resolves potential circular definition
 */
struct orte_proc_t;
struct orte_job_map_t;
/************/

/**
@ -241,6 +241,14 @@ typedef struct {
       specified limit. For example, if we have two processors, we
       may want to allow up to four processes but no more. */
    orte_std_cntr_t slots_max;
    /* number of physical boards in the node - defaults to 1 */
    uint8_t boards;
    /* number of sockets on each board - defaults to 1 */
    uint8_t sockets_per_board;
    /* number of cores per socket - defaults to 1 */
    uint8_t cores_per_socket;
    /* cpus on this node that are assigned for our use */
    char *cpu_set;
    /** Username on this node, if specified */
    char *username;
} orte_node_t;
@ -258,6 +266,31 @@ typedef uint8_t orte_job_controls_t;
#define ORTE_JOB_CONTROL_FORWARD_COMM   0x20
#define ORTE_JOB_CONTROL_CONTINUOUS_OP  0x40

typedef uint16_t orte_mapping_policy_t;
#define ORTE_MAPPING_POLICY OPAL_UINT16
/* put the rank assignment method in the upper 8 bits */
#define ORTE_MAPPING_NOPOL          0x0100
#define ORTE_MAPPING_BYNODE         0x0200
#define ORTE_MAPPING_BYSLOT         0x0400
#define ORTE_MAPPING_BYSOCKET       0x0800
#define ORTE_MAPPING_BYBOARD        0x1000
#define ORTE_MAPPING_NO_USE_LOCAL   0x2000
#define ORTE_MAPPING_NPERXXX        0x4000
/* nice macro for setting these */
#define ORTE_SET_MAPPING_POLICY(pol) \
    orte_default_mapping_policy = (orte_default_mapping_policy & 0x00ff) | (pol);
#define ORTE_ADD_MAPPING_POLICY(pol) \
    orte_default_mapping_policy |= (pol);

/* put the binding policy in the lower 8 bits, using the paffinity values */
#define ORTE_BIND_TO_NONE           (uint16_t)OPAL_PAFFINITY_DO_NOT_BIND
#define ORTE_BIND_TO_CORE           (uint16_t)OPAL_PAFFINITY_BIND_TO_CORE
#define ORTE_BIND_TO_SOCKET         (uint16_t)OPAL_PAFFINITY_BIND_TO_SOCKET
#define ORTE_BIND_TO_BOARD          (uint16_t)OPAL_PAFFINITY_BIND_TO_BOARD
/* nice macro for setting these */
#define ORTE_SET_BINDING_POLICY(pol) \
    orte_default_mapping_policy = (orte_default_mapping_policy & 0xff00) | (pol);

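A minimal standalone sketch of how the 16-bit policy word carries both settings at once, mirroring the masks above (the binding value is assumed from the paffinity defines):

#include <stdio.h>
#include <stdint.h>

#define MAPPING_BYSOCKET 0x0800  /* upper byte: rank assignment method */
#define BIND_TO_SOCKET   0x0004  /* lower byte: assumed paffinity value */

int main(void)
{
    uint16_t policy = 0;
    policy = (uint16_t)((policy & 0x00ff) | MAPPING_BYSOCKET); /* SET_MAPPING */
    policy = (uint16_t)((policy & 0xff00) | BIND_TO_SOCKET);   /* SET_BINDING */
    printf("policy word: %04x\n", policy);                     /* prints 0804 */
    return 0;
}
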
/* error manager callback function */
typedef void (*orte_err_cb_fn_t)(orte_process_name_t *proc, orte_proc_state_t state, void *cbdata);

@ -285,7 +318,7 @@ typedef struct {
    /* array of pointers to procs in this job */
    opal_pointer_array_t *procs;
    /* map of the job */
    orte_job_map_t *map;
    struct orte_job_map_t *map;
    /* bookmark for where we are in mapping - this
     * indicates the node where we stopped
     */
@ -531,6 +564,17 @@ ORTE_DECLSPEC extern bool orte_orted_exit_with_barrier;
/* whether or not to report launch progress */
ORTE_DECLSPEC extern bool orte_report_launch_progress;

/* cluster hardware info */
ORTE_DECLSPEC extern uint8_t orte_default_num_boards;
ORTE_DECLSPEC extern uint8_t orte_default_num_sockets_per_board;
ORTE_DECLSPEC extern uint8_t orte_default_num_cores_per_socket;

/* allocation specification */
ORTE_DECLSPEC extern char *orte_default_cpu_set;

/* default rank assignment and binding policy */
ORTE_DECLSPEC extern orte_mapping_policy_t orte_default_mapping_policy;

#endif /* ORTE_DISABLE_FULL_SUPPORT */

END_C_DECLS

@ -28,6 +28,7 @@
#include <stdio.h>

#include "opal/mca/base/mca_base_param.h"
#include "opal/mca/paffinity/base/base.h"
#include "opal/util/output.h"

#include "orte/util/proc_info.h"
@ -38,6 +39,7 @@
int orte_register_params(void)
{
int value, tmp;
char *strval;

mca_base_param_reg_int_name("orte", "base_help_aggregate",
"If orte_base_help_aggregate is true, duplicate help messages will be aggregated rather than displayed individually. This can be helpful for parallel jobs that experience multiple identical failures; rather than print out the same help/failure message N times, display it once with a count of how many processes sent the same message.",
@ -297,6 +299,48 @@ int orte_register_params(void)
orte_startup_timeout = 2000; /* default to 2 seconds */
}
}

/* cluster hardware info */
mca_base_param_reg_int_name("orte", "num_boards",
"Number of processor boards/node (1-256) [default: 1]",
false, false, 1, &value);
orte_default_num_boards = (uint8_t)value;
if (OPAL_SUCCESS != opal_paffinity_base_get_socket_info(&value)) {
value = 1;
}
mca_base_param_reg_int_name("orte", "num_sockets",
"Number of sockets/board (1-256) [default: auto-sensed by mpirun or 1]",
false, false, value, &value);
orte_default_num_sockets_per_board = (uint8_t)value;
if (OPAL_SUCCESS != opal_paffinity_base_get_core_info(0, &value)) {
value = 1;
}
mca_base_param_reg_int_name("orte", "num_cores",
"Number of cores/socket (1-256) [default: auto-sensed by mpirun or 1]",
false, false, value, &value);
orte_default_num_cores_per_socket = (uint8_t)value;

/* cpu allocation specification */
mca_base_param_reg_string_name("orte", "cpu_set",
"Comma-separated list of ranges specifying logical cpus allocated to this job [default: none]",
false, false, NULL, &orte_default_cpu_set);

/* binding specification - this will be overridden by any cmd line directive, and
 * ignored unless opal_paffinity_alone is set
 */
mca_base_param_reg_string_name("orte", "process_binding",
"Policy for binding processes [core | socket | board (default: none)]",
false, false, NULL, &strval);
if (NULL != strval) {
if (0 == strcmp(strval, "socket")) {
ORTE_SET_BINDING_POLICY(ORTE_BIND_TO_SOCKET);
} else if (0 == strcmp(strval, "board")) {
ORTE_SET_BINDING_POLICY(ORTE_BIND_TO_BOARD);
} else if (0 == strcmp(strval, "core")) {
ORTE_SET_BINDING_POLICY(ORTE_BIND_TO_CORE);
}
}

#endif /* ORTE_DISABLE_FULL_SUPPORT */

return ORTE_SUCCESS;
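Given the registration names above ("orte" prefix plus "num_boards", "num_sockets", "num_cores", "cpu_set", "process_binding"), the resulting MCA parameters should be settable at launch in the usual way; for example (command lines are illustrative, not taken from this commit):

    mpirun -mca orte_num_sockets 4 -mca orte_num_cores 6 ...
    mpirun -mca orte_process_binding socket -mca opal_paffinity_alone 1 ...

Note the comment in the registration code: the binding value is ignored unless opal_paffinity_alone is set.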
@ -120,6 +120,7 @@
#include "orte/mca/rml/rml_types.h"
#include "orte/mca/plm/plm.h"
#include "orte/mca/plm/base/plm_private.h"
#include "orte/mca/rmaps/rmaps_types.h"
#include "orte/runtime/orte_globals.h"
#include "orte/runtime/orte_wait.h"
#include "orte/util/show_help.h"
@ -512,7 +513,6 @@ static void check_debugger(int fd, short event, void *arg)
 * one debugger daemon on each node
 */
jdata->map = OBJ_NEW(orte_job_map_t);
jdata->map->pernode = true;
jdata->map->npernode = 1;
/* add it to the global job pool */
ljob = ORTE_LOCAL_JOBID(jdata->jobid);
@ -50,6 +50,7 @@
#include "opal/event/event.h"
#include "opal/mca/installdirs/installdirs.h"
#include "opal/mca/base/base.h"
#include "opal/mca/paffinity/base/base.h"
#include "opal/util/argv.h"
#include "opal/util/output.h"
#include "opal/util/basename.h"
@ -255,10 +256,16 @@ static opal_cmd_line_init_t cmd_line_init[] = {
/* Mapping options */
{ NULL, NULL, NULL, '\0', "bynode", "bynode", 0,
&orterun_globals.by_node, OPAL_CMD_LINE_TYPE_BOOL,
"Whether to allocate/map processes round-robin by node" },
"Whether to assign processes round-robin by node" },
{ NULL, NULL, NULL, '\0', "byslot", "byslot", 0,
&orterun_globals.by_slot, OPAL_CMD_LINE_TYPE_BOOL,
"Whether to allocate/map processes round-robin by slot (the default)" },
"Whether to assign processes round-robin by slot (the default)" },
{ NULL, NULL, NULL, '\0', "bysocket", "bysocket", 0,
&orterun_globals.by_socket, OPAL_CMD_LINE_TYPE_BOOL,
"Whether to assign processes round-robin by socket" },
{ NULL, NULL, NULL, '\0', "byboard", "byboard", 0,
&orterun_globals.by_board, OPAL_CMD_LINE_TYPE_BOOL,
"Whether to assign processes round-robin by board (equivalent to bynode if only 1 board/node)" },
{ "rmaps", "base", "pernode", '\0', "pernode", "pernode", 0,
NULL, OPAL_CMD_LINE_TYPE_BOOL,
"Launch one process per available node on the specified number of nodes [no -np => use all allocated nodes]" },
@ -286,6 +293,29 @@ static opal_cmd_line_init_t cmd_line_init[] = {
{ "rmaps", "base", "no_schedule_local", '\0', "nolocal", "nolocal", 0,
NULL, OPAL_CMD_LINE_TYPE_BOOL,
"Do not run any MPI applications on the local node" },
{ "rmaps", "base", "cpus_per_rank", '\0', "cpus-per-rank", "cpus-per-rank", 1,
NULL, OPAL_CMD_LINE_TYPE_INT,
"Number of cpus to use for each rank [default=1]" },
{ "rmaps", "base", "n_perboard", '\0', "nperboard", "nperboard", 1,
NULL, OPAL_CMD_LINE_TYPE_INT,
"Launch n processes per board on all allocated nodes" },
{ "rmaps", "base", "n_persocket", '\0', "npersocket", "npersocket", 1,
NULL, OPAL_CMD_LINE_TYPE_INT,
"Launch n processes per socket on all allocated nodes" },

/* binding options */
{ NULL, NULL, NULL, '\0', "bind-to-core", "bind-to-core", 0,
&orterun_globals.bind_to_core, OPAL_CMD_LINE_TYPE_BOOL,
"Whether to bind processes to specific cores (the default)" },
{ NULL, NULL, NULL, '\0', "bind-to-board", "bind-to-board", 0,
&orterun_globals.bind_to_board, OPAL_CMD_LINE_TYPE_BOOL,
"Whether to bind processes to specific boards (meaningless on 1 board/node)" },
{ NULL, NULL, NULL, '\0', "bind-to-socket", "bind-to-socket", 0,
&orterun_globals.bind_to_socket, OPAL_CMD_LINE_TYPE_BOOL,
"Whether to bind processes to sockets" },
{ "rmaps", "base", "stride", '\0', "stride", "stride", 1,
NULL, OPAL_CMD_LINE_TYPE_INT,
"When binding multiple cores to a rank, the step size to use between cores [default: 1]" },

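Because mapping and binding are independent directives, the options above compose freely. Illustrative invocations (application name and rank counts hypothetical):

    mpirun -np 16 -bysocket -bind-to-core ./app     # spread ranks across sockets, pin each rank to a core
    mpirun -npersocket 2 -bind-to-socket ./app      # two ranks per socket, each bound to its whole socket
    mpirun -np 8 -cpus-per-rank 2 -stride 1 ./app   # bind two consecutive cores to each rank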
/* Allocation options */
{ "ras", "base", "display_alloc", '\0', "display-allocation", "display-allocation", 0,
@ -294,6 +324,20 @@ static opal_cmd_line_init_t cmd_line_init[] = {
{ "ras", "base", "display_devel_alloc", '\0', "display-devel-allocation", "display-devel-allocation", 0,
NULL, OPAL_CMD_LINE_TYPE_BOOL,
"Display a detailed list (mostly intended for developers) of the allocation being used by this job"},
{ "orte", "cpu", "set", '\0', "cpu-set", "cpu-set", 1,
NULL, OPAL_CMD_LINE_TYPE_STRING,
"Comma-separated list of ranges specifying logical cpus allocated to this job [default: none]"},

/* cluster hardware info */
{ "orte", "num", "boards", '\0', "num-boards", "num-boards", 1,
NULL, OPAL_CMD_LINE_TYPE_INT,
"Number of processor boards/node (1-256) [default: 1]"},
{ "orte", "num", "sockets", '\0', "num-sockets", "num-sockets", 1,
NULL, OPAL_CMD_LINE_TYPE_INT,
"Number of sockets/board (1-256) [default: 1]"},
{ "orte", "num", "cores", '\0', "num-cores", "num-cores", 1,
NULL, OPAL_CMD_LINE_TYPE_INT,
"Number of cores/socket (1-256) [default: 1]"},

/* mpiexec-like arguments */
{ NULL, NULL, NULL, '\0', "wdir", "wdir", 1,
@ -468,6 +512,7 @@ int orterun(int argc, char *argv[])
ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE);
return ORTE_ERR_OUT_OF_RESOURCE;
}

/* check what user wants us to do with stdin */
if (0 == strcmp(orterun_globals.stdin_target, "all")) {
jdata->stdin_target = ORTE_VPID_WILDCARD;
@ -1144,6 +1189,11 @@ static int init_globals(void)
orterun_globals.quiet = false;
orterun_globals.by_node = false;
orterun_globals.by_slot = false;
orterun_globals.by_board = false;
orterun_globals.by_socket = false;
orterun_globals.bind_to_core = false;
orterun_globals.bind_to_board = false;
orterun_globals.bind_to_socket = false;
orterun_globals.debugger = false;
orterun_globals.num_procs = 0;
if( NULL != orterun_globals.env_val )
@ -1171,8 +1221,6 @@ static int init_globals(void)

static int parse_globals(int argc, char* argv[], opal_cmd_line_t *cmd_line)
{
int id;

/* print version if requested. Do this before check for help so
   that --version --help works as one might expect. */
if (orterun_globals.version &&
@ -1237,29 +1285,28 @@ static int parse_globals(int argc, char* argv[], opal_cmd_line_t *cmd_line)
orte_run_debugger(orterun_basename, cmd_line, argc, argv, orterun_globals.num_procs);
}

/* Allocate and map by node or by slot? Shortcut for setting an
   MCA param. */

/* Don't initialize the MCA parameter here unless we have to,
 * since it really should be initialized in rmaps_base_open */
if (orterun_globals.by_node || orterun_globals.by_slot) {
char *policy = NULL;
id = mca_base_param_reg_string_name("rmaps", "base_schedule_policy",
"Scheduling policy for RMAPS. [slot | node]",
false, false, "slot", &policy);

if (orterun_globals.by_node) {
orterun_globals.by_slot = false;
mca_base_param_set_string(id, "node");
} else {
orterun_globals.by_slot = true;
mca_base_param_set_string(id, "slot");
}
free(policy);
/* extract any rank assignment policy directives */
if (orterun_globals.by_node) {
ORTE_SET_MAPPING_POLICY(ORTE_MAPPING_BYNODE);
} else if (orterun_globals.by_board) {
ORTE_SET_MAPPING_POLICY(ORTE_MAPPING_BYBOARD);
} else if (orterun_globals.by_socket) {
ORTE_SET_MAPPING_POLICY(ORTE_MAPPING_BYSOCKET);
} else {
/* byslot is the default */
ORTE_SET_MAPPING_POLICY(ORTE_MAPPING_BYSLOT);
}
else {
/* Default */
orterun_globals.by_slot = true;

/* extract any binding policy directives - they will
 * be ignored unless paffinity_alone is set
 */
if (orterun_globals.bind_to_socket) {
ORTE_SET_BINDING_POLICY(ORTE_BIND_TO_SOCKET);
} else if (orterun_globals.bind_to_board) {
ORTE_SET_BINDING_POLICY(ORTE_BIND_TO_BOARD);
} else {
/* default to by-core */
ORTE_SET_BINDING_POLICY(ORTE_BIND_TO_CORE);
}

return ORTE_SUCCESS;
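The ORTE_SET_MAPPING_POLICY / ORTE_SET_BINDING_POLICY macros themselves are not shown in this diff (judging by the new includes, they presumably live in rmaps_types.h). A plausible reading, assuming they OR bit flags into the global default policy, which would be consistent with the map->policy & ORTE_MAPPING_BYNODE tests later in this commit:

    /* Assumed expansion for illustration only - the actual macro in
     * rmaps_types.h may differ. The bitmask interpretation matches the
     * (policy & ORTE_MAPPING_BYNODE) tests used elsewhere in this commit. */
    #define ORTE_SET_MAPPING_POLICY(pol) \
        (orte_default_mapping_policy |= (pol))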
@ -43,6 +43,11 @@ struct orterun_globals_t {
bool exit;
bool by_node;
bool by_slot;
bool by_board;
bool by_socket;
bool bind_to_core;
bool bind_to_board;
bool bind_to_socket;
bool debugger;
int num_procs;
char *env_val;
@ -93,3 +93,19 @@ The requested number of empty hosts was not available - the system was short by

Please recheck your allocation - further information is available on the
orte_hosts man page.
[boards]
Open RTE detected a bad parameter in the hostfile:
%s
The boards parameter is less than 0:
boards=%d
[sockets]
Open RTE detected a bad parameter in the hostfile:
%s
The sockets parameter is less than 0:
sockets=%d
[cores]
Open RTE detected a bad parameter in the hostfile:
%s
The cores parameter is less than 0:
cores=%d
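These messages correspond to the new hostfile keywords parsed below. An illustrative hostfile entry supplying the topology explicitly (hostname and values hypothetical):

    node001 slots=16 boards=2 sockets_per_board=2 cores_per_socket=4 cpu_set=0-15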
@ -261,6 +261,49 @@ static int hostfile_parse_line(int token, opal_list_t* updates, opal_list_t* exc
node->username = hostfile_parse_string();
break;

case ORTE_HOSTFILE_BOARDS:
rc = hostfile_parse_int();
if (rc < 0) {
orte_show_help("help-hostfile.txt", "boards",
true,
cur_hostfile_name, rc);
OBJ_RELEASE(node);
return ORTE_ERROR;
}
node->boards = rc;
break;

case ORTE_HOSTFILE_SOCKETS_PER_BOARD:
rc = hostfile_parse_int();
if (rc < 0) {
orte_show_help("help-hostfile.txt", "sockets",
true,
cur_hostfile_name, rc);
OBJ_RELEASE(node);
return ORTE_ERROR;
}
node->sockets_per_board = rc;
break;

case ORTE_HOSTFILE_CORES_PER_SOCKET:
rc = hostfile_parse_int();
if (rc < 0) {
orte_show_help("help-hostfile.txt", "cores",
true,
cur_hostfile_name, rc);
OBJ_RELEASE(node);
return ORTE_ERROR;
}
node->cores_per_socket = rc;
break;

case ORTE_HOSTFILE_CPU_SET:
if (NULL != node->cpu_set) {
free(node->cpu_set);
}
node->cpu_set = hostfile_parse_string();
break;

case ORTE_HOSTFILE_COUNT:
case ORTE_HOSTFILE_CPU:
case ORTE_HOSTFILE_SLOTS:
@ -55,22 +55,26 @@ extern orte_hostfile_value_t orte_util_hostfile_value;
#define YY_NO_UNPUT 1
#define YY_SKIP_YYWRAP 1

#define ORTE_HOSTFILE_DONE 0
#define ORTE_HOSTFILE_ERROR 1
#define ORTE_HOSTFILE_QUOTED_STRING 2
#define ORTE_HOSTFILE_EQUAL 3
#define ORTE_HOSTFILE_INT 4
#define ORTE_HOSTFILE_STRING 5
#define ORTE_HOSTFILE_CPU 6
#define ORTE_HOSTFILE_COUNT 7
#define ORTE_HOSTFILE_SLOTS 8
#define ORTE_HOSTFILE_SLOTS_MAX 9
#define ORTE_HOSTFILE_USERNAME 10
#define ORTE_HOSTFILE_IPV4 11
#define ORTE_HOSTFILE_HOSTNAME 12
#define ORTE_HOSTFILE_NEWLINE 13
#define ORTE_HOSTFILE_IPV6 14
#define ORTE_HOSTFILE_SLOT 15
#define ORTE_HOSTFILE_RELATIVE 16
#define ORTE_HOSTFILE_BOARDS 17
#define ORTE_HOSTFILE_SOCKETS_PER_BOARD 18
#define ORTE_HOSTFILE_CORES_PER_SOCKET 19
#define ORTE_HOSTFILE_CPU_SET 20

#endif
@ -120,6 +120,33 @@ username { orte_util_hostfile_value.sval = yytext;
"user_name" { orte_util_hostfile_value.sval = yytext;
                return ORTE_HOSTFILE_USERNAME; }

boards { orte_util_hostfile_value.sval = yytext;
                return ORTE_HOSTFILE_BOARDS; }

sockets { orte_util_hostfile_value.sval = yytext;
                return ORTE_HOSTFILE_SOCKETS_PER_BOARD; }

sockets_per_board { orte_util_hostfile_value.sval = yytext;
                return ORTE_HOSTFILE_SOCKETS_PER_BOARD; }

"sockets-per-board" { orte_util_hostfile_value.sval = yytext;
                return ORTE_HOSTFILE_SOCKETS_PER_BOARD; }

cores { orte_util_hostfile_value.sval = yytext;
                return ORTE_HOSTFILE_CORES_PER_SOCKET; }

cores_per_socket { orte_util_hostfile_value.sval = yytext;
                return ORTE_HOSTFILE_CORES_PER_SOCKET; }

"cores-per-socket" { orte_util_hostfile_value.sval = yytext;
                return ORTE_HOSTFILE_CORES_PER_SOCKET; }

cpu_set { orte_util_hostfile_value.sval = yytext;
                return ORTE_HOSTFILE_CPU_SET; }

"cpu-set" { orte_util_hostfile_value.sval = yytext;
                return ORTE_HOSTFILE_CPU_SET; }

\+n[0-9]+ { orte_util_hostfile_value.sval = yytext;
                return ORTE_HOSTFILE_RELATIVE; }
\+[eE][\:][0-9]+ { orte_util_hostfile_value.sval = yytext;
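Note that the lexer rules above accept several spellings for each new keyword; per those rules, all of the following hostfile fragments are equivalent (values illustrative):

    sockets=2    sockets_per_board=2    sockets-per-board=2
    cores=4      cores_per_socket=4     cores-per-socket=4
    cpu_set=0-3  cpu-set=0-3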
@ -47,6 +47,7 @@
#include "orte/mca/errmgr/errmgr.h"
#include "orte/mca/odls/odls_types.h"
#include "orte/mca/rml/base/rml_contact.h"
#include "orte/mca/rmaps/rmaps_types.h"
#include "orte/util/show_help.h"
#include "orte/util/name_fns.h"
#include "orte/util/nidmap.h"
@ -472,20 +473,14 @@ char* orte_regex_encode_maps(orte_job_t *jdata)
char suffix, sfx;
orte_app_context_t *app;

/* this is only supported with regular maps - i.e., when
 * the mapping is byslot or bynode. Irregular maps cannot
 * be expressed in a regular expression
 *
 * Also only supported for one app_context
 */
if (jdata->map->policy & ORTE_RMAPS_BYUSER ||
jdata->num_apps > 1) {
/* this is only for one app_context */
if (jdata->num_apps > 1) {
return NULL;
}

/* determine the mapping policy */
byslot = true;
if (jdata->map->policy & ORTE_RMAPS_BYNODE) {
if (jdata->map->policy & ORTE_MAPPING_BYNODE) {
byslot = false;
}