
Modify the OMPI paffinity and mapping system to support socket-level mapping and binding. Mostly refactors existing code, with modifications to the odls_default module to support the new capabilities.

Adds several new mpirun options:

* -bysocket - assign ranks on a node by socket. This effectively load-balances the procs assigned to a node across the available sockets. Note that ranks can still be bound to a specific core within the socket, or to the entire socket - the mapping is independent of the binding (see the sketch after this list).

* -bind-to-socket - bind each rank to all the cores on the socket to which it is assigned.

* -bind-to-core - currently the default behavior (carried over from the prior default).

* -npersocket N - launch N procs for every socket on a node. Note that this implies we know how many sockets are on a node; mpirun will determine its local values, and these can be overridden by provided values, either via MCA param or in a hostfile.
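To make the socket arithmetic concrete, here is a minimal standalone sketch, not the committed code: num_sockets and cores_per_socket stand in for the orte_default_num_sockets_per_board and orte_default_num_cores_per_socket globals, and the real odls_default module additionally maps the logical index to a physical id via opal_paffinity_base_get_physical_socket_id().

    #include <stdio.h>

    /* sketch: pick a logical target socket for a local rank */
    static int target_socket(int lrank, int num_sockets,
                             int cores_per_socket, int bysocket)
    {
        if (bysocket) {
            /* -bysocket: round-robin ranks across the sockets */
            return lrank % num_sockets;
        }
        /* default byslot-like placement: fill one socket's cores,
         * then move to the next socket */
        return lrank / cores_per_socket;
    }

    int main(void)
    {
        int lrank;
        for (lrank = 0; lrank < 8; lrank++) {
            printf("lrank %d: bysocket -> socket %d, byslot -> socket %d\n",
                   lrank, target_socket(lrank, 2, 4, 1),
                   target_socket(lrank, 2, 4, 0));
        }
        return 0;
    }

With two 4-core sockets, -bysocket alternates ranks 0,1,0,1,... across the sockets, while the byslot-like default places ranks 0-3 on socket 0 and ranks 4-7 on socket 1.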

Similar features/options are provided at the board level for multi-board nodes.
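As a usage illustration, a hypothetical command line combining these options (./my_app is a placeholder executable) might look like:

    mpirun -np 8 -bysocket -bind-to-socket ./my_app

This maps the eight ranks round-robin across each node's sockets and binds each rank to all cores of its assigned socket.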

Documentation to follow...

This commit was SVN r21791.
This commit is contained in:
Ralph Castain 2009-08-11 02:51:27 +00:00
parent 007cbe74f4
commit 1dc12046f1
44 changed files: 1621 additions and 514 deletions


@ -108,6 +108,11 @@
#define OPAL_PROC_ON_LOCAL_CU(n) ((n) & OPAL_PROC_ON_CU)
#define OPAL_PROC_ON_LOCAL_CLUSTER(n) ((n) & OPAL_PROC_ON_CLUSTER)
/* Process binding modes */
#define OPAL_PAFFINITY_DO_NOT_BIND 0x01
#define OPAL_PAFFINITY_BIND_TO_CORE 0x02
#define OPAL_PAFFINITY_BIND_TO_SOCKET 0x04
#define OPAL_PAFFINITY_BIND_TO_BOARD 0x08
/* ******************************************************************** */
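These binding modes are one-hot bit flags, so callers test them with a bitwise AND against a policy word; a minimal sketch of the intended usage follows (the odls code later in this commit does the same with the corresponding ORTE_BIND_TO_* values in jobdat->policy):

    /* sketch: flags combine and test with bitwise operators */
    int binding = OPAL_PAFFINITY_BIND_TO_SOCKET;
    if (binding & OPAL_PAFFINITY_BIND_TO_SOCKET) {
        /* bind the process to every core of its assigned socket */
    }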


@ -54,6 +54,7 @@
#include "orte/mca/ess/base/base.h"
#include "orte/mca/plm/base/base.h"
#include "orte/mca/routed/base/base.h"
#include "orte/mca/rmaps/rmaps_types.h"
#include "orte/util/context_fns.h"
#include "orte/util/name_fns.h"
@ -326,6 +327,24 @@ int orte_odls_base_default_get_add_procs_data(opal_buffer_t *data,
return rc;
}
/* pack the map & binding policy for this job */
if (ORTE_SUCCESS != (rc = opal_dss.pack(data, &map->policy, 1, ORTE_MAPPING_POLICY))) {
ORTE_ERROR_LOG(rc);
return rc;
}
/* pack the cpus_per_rank for this job */
if (ORTE_SUCCESS != (rc = opal_dss.pack(data, &map->cpus_per_rank, 1, OPAL_INT16))) {
ORTE_ERROR_LOG(rc);
return rc;
}
/* pack the stride for this job */
if (ORTE_SUCCESS != (rc = opal_dss.pack(data, &map->stride, 1, OPAL_INT16))) {
ORTE_ERROR_LOG(rc);
return rc;
}
/* pack the control flags for this job */
if (ORTE_SUCCESS != (rc = opal_dss.pack(data, &jdata->controls, 1, ORTE_JOB_CONTROL))) {
ORTE_ERROR_LOG(rc);
@ -744,6 +763,24 @@ int orte_odls_base_default_construct_child_list(opal_buffer_t *data,
ORTE_ERROR_LOG(rc);
goto REPORT_ERROR;
}
/* unpack the mapping policy for the job */
cnt=1;
if (ORTE_SUCCESS != (rc = opal_dss.unpack(data, &jobdat->policy, &cnt, ORTE_MAPPING_POLICY))) {
ORTE_ERROR_LOG(rc);
goto REPORT_ERROR;
}
/* unpack the cpus/rank for the job */
cnt=1;
if (ORTE_SUCCESS != (rc = opal_dss.unpack(data, &jobdat->cpus_per_rank, &cnt, OPAL_INT16))) {
ORTE_ERROR_LOG(rc);
goto REPORT_ERROR;
}
/* unpack the stride for the job */
cnt=1;
if (ORTE_SUCCESS != (rc = opal_dss.unpack(data, &jobdat->stride, &cnt, OPAL_INT16))) {
ORTE_ERROR_LOG(rc);
goto REPORT_ERROR;
}
/* unpack the control flags for the job */
cnt=1;
if (ORTE_SUCCESS != (rc = opal_dss.unpack(data, &jobdat->controls, &cnt, ORTE_JOB_CONTROL))) {
@ -1745,7 +1782,7 @@ int orte_odls_base_default_launch_local(orte_jobid_t job,
}
}
rc = fork_local(app, child, app->env, jobdat->controls, jobdat->stdin_target);
rc = fork_local(app, child, app->env, jobdat);
/* reacquire lock so we don't double unlock... */
OPAL_THREAD_LOCK(&orte_odls_globals.mutex);
if (ORTE_SUCCESS != rc) {
@ -1791,12 +1828,22 @@ CLEANUP:
"%s odls:launch reporting job %s launch status",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
ORTE_JOBID_PRINT(job)));
/* pack the launch results */
if (ORTE_SUCCESS != (ret = pack_state_update(&alert, true, jobdat))) {
ORTE_ERROR_LOG(ret);
}
if (!launch_failed) {
/* if the launch failed, we need to flag all the procs from this job
* that didn't launch as having failed, or else we will hang
*/
if (launch_failed) {
OPAL_THREAD_UNLOCK(&orte_odls_globals.mutex);
for (item = opal_list_get_first(&orte_local_children);
item != opal_list_get_end(&orte_local_children);
item = opal_list_get_next(item)) {
child = (orte_odls_child_t*)item;
if (child->name->jobid == jobdat->jobid &&
ORTE_PROC_STATE_LAUNCHED >= child->state) {
child->state = ORTE_PROC_STATE_FAILED_TO_START;
}
}
} else {
/* if the launch succeeded, check to see if we need to
* co-locate any debugger daemons so that they get launched
* before we report anything to the HNP. This ensures that
@ -1813,13 +1860,16 @@ CLEANUP:
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
(ORTE_JOB_CONTROL_FORWARD_OUTPUT & orte_odls_globals.debugger->controls) ? "output forwarded" : "no output"));
fork_local(orte_odls_globals.debugger->apps[0], NULL, NULL,
orte_odls_globals.debugger->controls, ORTE_VPID_INVALID);
fork_local(orte_odls_globals.debugger->apps[0], NULL, NULL, orte_odls_globals.debugger);
orte_odls_globals.debugger_launched = true;
}
}
/* pack the launch results */
if (ORTE_SUCCESS != (ret = pack_state_update(&alert, true, jobdat))) {
ORTE_ERROR_LOG(ret);
}
/* if we are the HNP, then we would rather not send this to ourselves -
* instead, we queue it up for local processing
*/


@ -107,6 +107,9 @@ static void orte_odls_job_constructor(orte_odls_job_t *ptr)
ptr->launch_msg_processed = false;
ptr->apps = NULL;
ptr->num_apps = 0;
ptr->policy = 0;
ptr->cpus_per_rank = 1;
ptr->stride = 1;
ptr->controls = 0;
ptr->stdin_target = ORTE_VPID_INVALID;
ptr->total_slots_alloc = 0;
@ -232,6 +235,12 @@ int orte_odls_base_open(void)
opal_argv_append_nosize(&orte_odls_globals.xtermcmd, "-e");
}
/* see if the user wants us to report bindings */
mca_base_param_reg_int_name("odls", "base_report_bindings",
"Report process bindings [default: no]",
false, false, (int)false, &i);
orte_odls_globals.report_bindings = OPAL_INT_TO_BOOL(i);
/* Open up all available components */
if (ORTE_SUCCESS !=


@ -64,6 +64,8 @@ typedef struct {
opal_list_t xterm_ranks;
/* the xterm cmd to be used */
char **xtermcmd;
/* whether or not to report bindings */
bool report_bindings;
} orte_odls_globals_t;
ORTE_DECLSPEC extern orte_odls_globals_t orte_odls_globals;
@ -89,8 +91,7 @@ orte_odls_base_default_construct_child_list(opal_buffer_t *data,
typedef int (*orte_odls_base_fork_local_proc_fn_t)(orte_app_context_t *context,
orte_odls_child_t *child,
char **environ_copy,
orte_job_controls_t controls,
orte_vpid_t stdin_target);
orte_odls_job_t *jobdat);
ORTE_DECLSPEC int
orte_odls_base_default_launch_local(orte_jobid_t job,


@ -78,6 +78,10 @@ that the specification had improper syntax.
An invalid node rank was obtained - this is probably something
that should be reported to the OMPI developers.
#
[odls-default:invalid-local-rank]
An invalid local rank was obtained - this is probably something
that should be reported to the OMPI developers.
#
[odls-default:invalid-phys-cpu]
An invalid physical processor id was returned when attempting to
set processor affinity. This is probably something that should be


@ -43,6 +43,9 @@ int orte_odls_default_component_query(mca_base_module_t **module, int *priority)
extern orte_odls_base_module_t orte_odls_default_module;
ORTE_MODULE_DECLSPEC extern orte_odls_base_component_t mca_odls_default_component;
/* dedicated debug output flag */
ORTE_MODULE_DECLSPEC extern bool orte_odls_default_report_bindings;
END_C_DECLS
#endif /* ORTE_ODLS_H */


@ -35,6 +35,9 @@
#include "orte/mca/odls/base/odls_private.h"
#include "orte/mca/odls/default/odls_default.h"
/* instantiate a module-global variable */
bool orte_odls_default_report_bindings;
/*
* Instantiate the public struct with all of our public information
* and pointers to our public functions in it
@ -66,7 +69,6 @@ orte_odls_base_component_t mca_odls_default_component = {
int orte_odls_default_component_open(void)
{
/* nothing to do */
return ORTE_SUCCESS;
}


@ -176,8 +176,7 @@ int orte_odls_default_kill_local_procs(opal_pointer_array_t *procs, bool set_sta
static int odls_default_fork_local_proc(orte_app_context_t* context,
orte_odls_child_t *child,
char **environ_copy,
orte_job_controls_t controls,
orte_vpid_t stdin_target)
orte_odls_job_t *jobdat)
{
orte_iof_base_io_conf_t opts;
int rc;
@ -185,7 +184,13 @@ static int odls_default_fork_local_proc(orte_app_context_t* context,
int i, p[2];
pid_t pid;
bool paffinity_enabled = false;
opal_paffinity_base_cpu_set_t mask;
orte_node_rank_t nrank;
int16_t n;
orte_local_rank_t lrank;
int target_socket, npersocket;
int logical_cpu, phys_core, phys_cpu;
if (NULL != child) {
/* should pull this information from MPIRUN instead of going with
default */
@ -193,7 +198,7 @@ static int odls_default_fork_local_proc(orte_app_context_t* context,
/* do we want to setup stdin? */
if (NULL != child &&
(stdin_target == ORTE_VPID_WILDCARD || child->name->vpid == stdin_target)) {
(jobdat->stdin_target == ORTE_VPID_WILDCARD || child->name->vpid == jobdat->stdin_target)) {
opts.connect_stdin = true;
} else {
opts.connect_stdin = false;
@ -265,7 +270,7 @@ static int odls_default_fork_local_proc(orte_app_context_t* context,
write(p[1], &i, sizeof(int));
exit(1);
}
/* Setup process affinity. First check to see if a slot list was
* specified. If so, use it. If no slot list was specified,
* that's not an error -- just fall through and try the next
@ -291,39 +296,144 @@ static int odls_default_fork_local_proc(orte_app_context_t* context,
exit(1);
}
}
/* Otherwise, if opal_paffinity_alone was set, use that scheme */
else if (opal_paffinity_alone) {
opal_paffinity_base_cpu_set_t mask;
int phys_cpu;
orte_node_rank_t nrank;
OPAL_OUTPUT_VERBOSE((2, orte_odls_globals.output,
"%s odls:default:fork setting paffinity for child %s",
/* Otherwise, if opal_paffinity_alone was set and a binding is specified, use that scheme */
else if (opal_paffinity_alone && !(ORTE_BIND_TO_NONE & jobdat->policy)) {
OPAL_OUTPUT_VERBOSE((5, orte_odls_globals.output,
"%s odls:default:fork setting paffinity for child %s using policy %04x",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
ORTE_NAME_PRINT(child->name)));
if (ORTE_NODE_RANK_INVALID == (nrank = orte_ess.get_node_rank(child->name))) {
orte_show_help("help-odls-default.txt",
"odls-default:invalid-node-rank", true);
rc = ORTE_ERR_FATAL;
write(p[1], &rc, sizeof(int));
exit(1);
ORTE_NAME_PRINT(child->name), jobdat->policy));
if (ORTE_BIND_TO_CORE & jobdat->policy) {
/* we want to bind this proc to a specific core, or multiple cores
* if the cpus_per_rank is > 0
*/
OPAL_OUTPUT_VERBOSE((5, orte_odls_globals.output,
"%s odls:default:fork binding child %s to core(s) cpus/rank %d stride %d",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
ORTE_NAME_PRINT(child->name),
(int)jobdat->cpus_per_rank, (int)jobdat->stride));
if (ORTE_NODE_RANK_INVALID == (nrank = orte_ess.get_node_rank(child->name))) {
orte_show_help("help-odls-default.txt",
"odls-default:invalid-node-rank", true);
rc = ORTE_ERR_FATAL;
write(p[1], &rc, sizeof(int));
exit(1);
}
OPAL_PAFFINITY_CPU_ZERO(mask);
/* my starting core has to be offset by cpus_per_rank */
logical_cpu = nrank * jobdat->cpus_per_rank;
for (n=0; n < jobdat->cpus_per_rank; n++) {
phys_cpu = opal_paffinity_base_get_physical_processor_id(logical_cpu);
if (0 > phys_cpu) {
orte_show_help("help-odls-default.txt",
"odls-default:invalid-phys-cpu", true);
rc = ORTE_ERR_FATAL;
write(p[1], &rc, sizeof(int));
exit(1);
}
OPAL_PAFFINITY_CPU_SET(phys_cpu, mask);
logical_cpu += jobdat->stride;
}
if (orte_odls_globals.report_bindings) {
opal_output(0, "%s odls:default:fork binding child %s to cpus %04lx",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
ORTE_NAME_PRINT(child->name), mask.bitmask[0]);
}
if (OPAL_SUCCESS != (rc = opal_paffinity_base_set(mask))) {
orte_show_help("help-odls-default.txt",
"odls-default:failed-set-paff", true);
write(p[1], &rc, sizeof(int));
exit(1);
}
paffinity_enabled = true;
} else if (ORTE_BIND_TO_SOCKET & jobdat->policy) {
/* bind this proc to a socket */
OPAL_OUTPUT_VERBOSE((5, orte_odls_globals.output,
"%s odls:default:fork binding child %s to socket",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
ORTE_NAME_PRINT(child->name)));
/* layout this process across the sockets based on
* the provided mapping policy
*/
if (ORTE_LOCAL_RANK_INVALID == (lrank = orte_ess.get_local_rank(child->name))) {
orte_show_help("help-odls-default.txt",
"odls-default:invalid-local-rank", true);
rc = ORTE_ERR_FATAL;
write(p[1], &rc, sizeof(int));
exit(1);
}
if (ORTE_MAPPING_NPERXXX & jobdat->policy) {
/* we need to balance the children from this job across the sockets */
npersocket = jobdat->num_local_procs / orte_default_num_sockets_per_board;
if (ORTE_MAPPING_BYSOCKET & jobdat->policy) {
target_socket = opal_paffinity_base_get_physical_socket_id(lrank % npersocket);
} else {
target_socket = opal_paffinity_base_get_physical_socket_id(lrank / npersocket);
}
OPAL_OUTPUT_VERBOSE((5, orte_odls_globals.output,
"%s odls:default:fork npersocket %d target socket %d",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
npersocket, target_socket));
} else if (ORTE_MAPPING_BYSOCKET & jobdat->policy) {
/* this corresponds to a mapping policy where
* local rank 0 goes on socket 0, and local
* rank 1 goes on socket 1, etc. - round robin
* until all ranks are mapped
*
* NOTE: we already know our number of sockets
* from when we initialized
*/
OPAL_OUTPUT_VERBOSE((5, orte_odls_globals.output,
"bysocket lrank %d numsocks %d logical socket %d", (int)lrank,
(int)orte_default_num_sockets_per_board,
(int)(lrank % orte_default_num_sockets_per_board)));
target_socket = opal_paffinity_base_get_physical_socket_id(lrank % orte_default_num_sockets_per_board);
} else {
/* use a byslot-like policy where local rank 0 goes on
* socket 0, and local rank 1 goes on socket 0, etc.
* following round-robin until all ranks are mapped
*/
OPAL_OUTPUT_VERBOSE((5, orte_odls_globals.output,
"byslot lrank %d numsocks %d logical socket %d", (int)lrank,
(int)orte_default_num_sockets_per_board,
(int)(lrank / orte_default_num_cores_per_socket)));
target_socket = opal_paffinity_base_get_physical_socket_id(lrank / orte_default_num_cores_per_socket);
}
OPAL_PAFFINITY_CPU_ZERO(mask);
for (n=0; n < orte_default_num_cores_per_socket; n++) {
phys_core = opal_paffinity_base_get_physical_core_id(target_socket, n);
if (0 > phys_core) {
orte_show_help("help-odls-default.txt",
"odls-default:invalid-phys-cpu", true);
rc = ORTE_ERR_FATAL;
write(p[1], &rc, sizeof(int));
exit(1);
}
if (ORTE_SUCCESS != opal_paffinity_base_get_map_to_processor_id(target_socket, phys_core, &phys_cpu)) {
orte_show_help("help-odls-default.txt",
"odls-default:invalid-phys-cpu", true);
rc = ORTE_ERR_FATAL;
write(p[1], &rc, sizeof(int));
exit(1);
}
OPAL_OUTPUT_VERBOSE((5, orte_odls_globals.output,
"%s odls:default:fork mapping phys socket %d core %d to phys_cpu %d",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
target_socket, n, phys_cpu));
OPAL_PAFFINITY_CPU_SET(phys_cpu, mask);
}
if (orte_odls_globals.report_bindings) {
opal_output(0, "%s odls:default:fork binding child %s to socket %d cpus %04lx",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
ORTE_NAME_PRINT(child->name), target_socket, mask.bitmask[0]);
}
if (OPAL_SUCCESS != (rc = opal_paffinity_base_set(mask))) {
orte_show_help("help-odls-default.txt",
"odls-default:failed-set-paff", true);
write(p[1], &rc, sizeof(int));
exit(1);
}
paffinity_enabled = true;
}
OPAL_PAFFINITY_CPU_ZERO(mask);
phys_cpu = opal_paffinity_base_get_physical_processor_id(nrank);
if (0 > phys_cpu) {
orte_show_help("help-odls-default.txt",
"odls-default:invalid-phys-cpu", true);
rc = ORTE_ERR_FATAL;
write(p[1], &rc, sizeof(int));
exit(1);
}
OPAL_PAFFINITY_CPU_SET(phys_cpu, mask);
if (OPAL_SUCCESS != (rc = opal_paffinity_base_set(mask))) {
orte_show_help("help-odls-default.txt",
"odls-default:failed-set-paff", true);
write(p[1], &rc, sizeof(int));
exit(1);
}
paffinity_enabled = true;
}
/* If we were able to set processor affinity, try setting up
* memory affinity
@ -335,15 +445,15 @@ static int odls_default_fork_local_proc(orte_app_context_t* context,
}
}
} else if (!(ORTE_JOB_CONTROL_FORWARD_OUTPUT & controls)) {
/* tie stdin/out/err/internal to /dev/null */
int fdnull;
for (i=0; i < 3; i++) {
fdnull = open("/dev/null", O_RDONLY, 0);
if(fdnull > i) {
dup2(fdnull, i);
}
close(fdnull);
} else if (!(ORTE_JOB_CONTROL_FORWARD_OUTPUT & jobdat->controls)) {
/* tie stdin/out/err/internal to /dev/null */
int fdnull;
for (i=0; i < 3; i++) {
fdnull = open("/dev/null", O_RDONLY, 0);
if(fdnull > i) {
dup2(fdnull, i);
}
close(fdnull);
}
fdnull = open("/dev/null", O_RDONLY, 0);
if(fdnull > opts.p_internal[1]) {
@ -396,7 +506,7 @@ static int odls_default_fork_local_proc(orte_app_context_t* context,
exit(1);
} else {
if (NULL != child && (ORTE_JOB_CONTROL_FORWARD_OUTPUT & controls)) {
if (NULL != child && (ORTE_JOB_CONTROL_FORWARD_OUTPUT & jobdat->controls)) {
/* connect endpoints IOF */
rc = orte_iof_base_setup_parent(child->name, &opts);
if(ORTE_SUCCESS != rc) {
@ -447,7 +557,7 @@ static int odls_default_fork_local_proc(orte_app_context_t* context,
"%s odls:default:fork got code %d back from child",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), i));
close(p[0]);
return i;
return ORTE_ERR_FAILED_TO_START;
}
}


@ -101,27 +101,30 @@ ORTE_DECLSPEC OBJ_CLASS_DECLARATION(orte_odls_child_t);
* List object to locally store job related info
*/
typedef struct orte_odls_job_t {
opal_list_item_t super; /* required to place this on a list */
orte_job_state_t state; /* state of the job */
orte_jobid_t jobid; /* jobid for this data */
bool launch_msg_processed; /* launch msg has been fully processed */
orte_app_context_t **apps; /* app_contexts for this job */
orte_std_cntr_t num_apps; /* number of app_contexts */
orte_job_controls_t controls; /* control flags for job */
orte_vpid_t stdin_target; /* where stdin is to go */
orte_std_cntr_t total_slots_alloc;
orte_std_cntr_t num_nodes; /* number of nodes involved in the job */
orte_vpid_t num_procs;
int32_t num_local_procs;
char *regexp; /* the regular expression describing the job */
opal_byte_object_t *pmap; /* local copy of pidmap byte object */
opal_buffer_t collection_bucket;
opal_buffer_t local_collection;
orte_grpcomm_coll_t collective_type;
int32_t num_contributors;
int num_participating;
int num_collected;
struct timeval launch_msg_recvd; /* when the launch msg for this job was recvd - for timing purposes only */
opal_list_item_t super; /* required to place this on a list */
orte_job_state_t state; /* state of the job */
orte_jobid_t jobid; /* jobid for this data */
bool launch_msg_processed; /* launch msg has been fully processed */
orte_app_context_t **apps; /* app_contexts for this job */
orte_std_cntr_t num_apps; /* number of app_contexts */
orte_mapping_policy_t policy; /* mapping policy */
int16_t cpus_per_rank; /* number of cpus/rank */
int16_t stride; /* step size between cores of multi-core/rank procs */
orte_job_controls_t controls; /* control flags for job */
orte_vpid_t stdin_target; /* where stdin is to go */
orte_std_cntr_t total_slots_alloc;
orte_std_cntr_t num_nodes; /* number of nodes involved in the job */
orte_vpid_t num_procs;
int32_t num_local_procs;
char *regexp; /* the regular expression describing the job */
opal_byte_object_t *pmap; /* local copy of pidmap byte object */
opal_buffer_t collection_bucket;
opal_buffer_t local_collection;
orte_grpcomm_coll_t collective_type;
int32_t num_contributors;
int num_participating;
int num_collected;
struct timeval launch_msg_recvd; /* when the launch msg for this job was recvd - for timing purposes only */
} orte_odls_job_t;
ORTE_DECLSPEC OBJ_CLASS_DECLARATION(orte_odls_job_t);


@ -95,8 +95,7 @@ static int odls_process_kill_local_procs(opal_pointer_array_t *procs, bool set_s
static int odls_process_fork_local_proc(orte_app_context_t* context,
orte_odls_child_t *child,
char **environ_copy,
orte_job_controls_t controls,
orte_vpid_t stdin_target)
orte_odls_job_t *jobdat)
{
pid_t pid;
orte_iof_base_io_conf_t opts;
@ -124,7 +123,7 @@ static int odls_process_fork_local_proc(orte_app_context_t* context,
opts.usepty = OPAL_ENABLE_PTY_SUPPORT;
/* do we want to setup stdin? */
if (stdin_target == ORTE_VPID_WILDCARD || child->name->vpid == stdin_target) {
if (jobdat->stdin_target == ORTE_VPID_WILDCARD || child->name->vpid == jobdat->stdin_target) {
opts.connect_stdin = true;
} else {
opts.connect_stdin = false;


@ -50,6 +50,7 @@
#include "orte/mca/rml/rml.h"
#include "orte/mca/rml/rml_types.h"
#include "orte/mca/errmgr/errmgr.h"
#include "orte/mca/rmaps/rmaps_types.h"
#include "orte/runtime/orte_globals.h"
#include "orte/util/show_help.h"
#include "orte/runtime/orte_wait.h"


@ -31,6 +31,8 @@
#include "opal/class/opal_list.h"
#include "opal/mca/mca.h"
#include "orte/runtime/orte_globals.h"
#include "orte/mca/rmaps/rmaps.h"
BEGIN_C_DECLS
@ -56,14 +58,18 @@ typedef struct {
opal_list_t available_components;
/** selected module */
orte_rmaps_base_module_t *active_module;
/* user specified mapping policy */
uint8_t policy;
/** whether or not we allow oversubscription of nodes */
bool oversubscribe;
/** do we want one ppn if num_procs not specified */
bool pernode;
/** number of ppn for n_per_node mode */
int npernode;
/* number of procs/board */
int nperboard;
/* number of procs/socket */
int npersocket;
/* cpus per rank */
int cpus_per_rank;
/* stride */
int stride;
/* do not allow use of the localhost */
bool no_use_local;
/* display the map after it is computed */


@ -123,15 +123,14 @@ opal_list_item_t* orte_rmaps_base_get_starting_point(opal_list_t *node_list, ort
*/
int orte_rmaps_base_map_byslot(orte_job_t *jdata, orte_app_context_t *app,
opal_list_t *node_list, orte_vpid_t num_procs,
orte_vpid_t vpid_start, opal_list_item_t *cur_node_item,
orte_vpid_t ppn)
opal_list_item_t *cur_node_item)
{
int rc=ORTE_SUCCESS;
int i;
orte_node_t *node;
opal_list_item_t *next;
orte_vpid_t num_alloc = 0;
int num_slots_to_take;
int num_procs_to_assign, num_possible_procs;
/* This loop continues until all procs have been mapped or we run
out of resources. We determine that we have "run out of
@ -185,21 +184,37 @@ int orte_rmaps_base_map_byslot(orte_job_t *jdata, orte_app_context_t *app,
* to do so after oversubscribing).
*/
if (node->slots_inuse >= node->slots_alloc || 0 == node->slots_inuse) {
num_slots_to_take = (node->slots_alloc == 0) ? 1 : node->slots_alloc;
if (0 == node->slots_alloc) {
num_procs_to_assign = 1;
} else {
num_possible_procs = node->slots_alloc / jdata->map->cpus_per_rank;
if (0 == num_possible_procs) {
num_procs_to_assign = 1;
} else {
num_procs_to_assign = num_possible_procs;
}
}
} else {
num_slots_to_take = node->slots_alloc - node->slots_inuse;
num_possible_procs = (node->slots_alloc - node->slots_inuse) / jdata->map->cpus_per_rank;
if (0 == num_possible_procs) {
num_procs_to_assign = 1;
} else {
num_procs_to_assign = num_possible_procs;
}
}
/* check if we are in npernode mode - if so, then set the num_slots_to_take
* to the num_per_node
*/
if (jdata->map->pernode) {
num_slots_to_take = jdata->map->npernode;
if (0 < jdata->map->npernode) {
num_procs_to_assign = jdata->map->npernode;
}
for( i = 0; i < num_slots_to_take; ++i) {
if (ORTE_SUCCESS != (rc = orte_rmaps_base_claim_slot(jdata, node, vpid_start + num_alloc, NULL, app->idx,
node_list, jdata->map->oversubscribe, true))) {
for( i = 0; i < num_procs_to_assign; ++i) {
if (ORTE_SUCCESS != (rc = orte_rmaps_base_claim_slot(jdata, node,
jdata->map->cpus_per_rank, app->idx,
node_list, jdata->map->oversubscribe,
true, NULL))) {
/** if the code is ORTE_ERR_NODE_FULLY_USED, then we know this
* really isn't an error - we just need to break from the loop
* since the node is fully used up. For now, just don't report
@ -220,8 +235,7 @@ int orte_rmaps_base_map_byslot(orte_job_t *jdata, orte_app_context_t *app,
}
/* if we have fully used up this node, then break from the loop */
if (ORTE_ERR_NODE_FULLY_USED == rc ||
(orte_rmaps_base.loadbalance && node->num_procs >= ppn)) {
if (ORTE_ERR_NODE_FULLY_USED == rc) {
break;
}
}
@ -231,17 +245,13 @@ int orte_rmaps_base_map_byslot(orte_job_t *jdata, orte_app_context_t *app,
* node is NOT max'd out
*
*/
if (i < (num_slots_to_take-1) && ORTE_ERR_NODE_FULLY_USED != rc &&
(orte_rmaps_base.loadbalance && node->num_procs < ppn)) {
if (i < (num_procs_to_assign-1) && ORTE_ERR_NODE_FULLY_USED != rc) {
continue;
}
cur_node_item = next;
}
complete:
/* update the starting vpid */
vpid_start += num_procs;
complete:
/* save the bookmark */
jdata->bookmark = (orte_node_t*)cur_node_item;
@ -250,7 +260,7 @@ complete:
int orte_rmaps_base_map_bynode(orte_job_t *jdata, orte_app_context_t *app,
opal_list_t *node_list, orte_vpid_t num_procs,
orte_vpid_t vpid_start, opal_list_item_t *cur_node_item)
opal_list_item_t *cur_node_item)
{
int rc = ORTE_SUCCESS;
opal_list_item_t *next;
@ -297,8 +307,8 @@ int orte_rmaps_base_map_bynode(orte_job_t *jdata, orte_app_context_t *app,
/* Allocate a slot on this node */
node = (orte_node_t*) cur_node_item;
if (ORTE_SUCCESS != (rc = orte_rmaps_base_claim_slot(jdata, node, vpid_start + num_alloc, NULL, app->idx,
node_list, jdata->map->oversubscribe, true))) {
if (ORTE_SUCCESS != (rc = orte_rmaps_base_claim_slot(jdata, node, jdata->map->cpus_per_rank, app->idx,
node_list, jdata->map->oversubscribe, true, NULL))) {
/** if the code is ORTE_ERR_NODE_FULLY_USED, then we know this
* really isn't an error - we just need to break from the loop
* since the node is fully used up. For now, just don't report


@ -67,9 +67,12 @@ int orte_rmaps_base_map_job(orte_job_t *jdata)
return ORTE_ERR_OUT_OF_RESOURCE;
}
/* load it with the system defaults */
map->policy = orte_rmaps_base.policy;
map->pernode = orte_rmaps_base.pernode;
map->policy = orte_default_mapping_policy;
map->npernode = orte_rmaps_base.npernode;
map->nperboard = orte_rmaps_base.nperboard;
map->npersocket = orte_rmaps_base.npersocket;
map->cpus_per_rank = orte_rmaps_base.cpus_per_rank;
map->stride = orte_rmaps_base.stride;
map->oversubscribe = orte_rmaps_base.oversubscribe;
map->display_map = orte_rmaps_base.display_map;
/* assign the map object to this job */

Просмотреть файл

@ -30,7 +30,9 @@
#include "opal/util/output.h"
#include "opal/mca/base/base.h"
#include "opal/mca/base/mca_base_param.h"
#include "opal/mca/paffinity/paffinity.h"
#include "orte/runtime/orte_globals.h"
#include "orte/mca/rmaps/base/rmaps_private.h"
@ -92,39 +94,74 @@ int orte_rmaps_base_open(void)
/* Are we scheduling by node or by slot? */
param = mca_base_param_reg_string_name("rmaps", "base_schedule_policy",
"Scheduling Policy for RMAPS. [slot | node]",
"Scheduling Policy for RMAPS. [slot (default) | socket | board | node]",
false, false, "unspec", &policy);
if (0 == strcmp(policy, "unspec")) {
orte_rmaps_base.policy = ORTE_RMAPS_BYSLOT; /* default to byslot */
if (0 == strcmp(policy, "socket")) {
ORTE_SET_MAPPING_POLICY(ORTE_MAPPING_BYSOCKET);
} else if (0 == strcmp(policy, "board")) {
ORTE_SET_MAPPING_POLICY(ORTE_MAPPING_BYBOARD);
} else if (0 == strcmp(policy, "node")) {
orte_rmaps_base.policy = ORTE_RMAPS_BYNODE;
} else {
orte_rmaps_base.policy = ORTE_RMAPS_BYSLOT; /* default to byslot */
ORTE_SET_MAPPING_POLICY(ORTE_MAPPING_BYNODE);
}
/* if nothing was specified, leave it alone - we already set it
* in orterun
*/
/* Do we want one ppn if num_procs not specified */
/* check for procs/xxx directives */
param = mca_base_param_reg_int_name("rmaps", "base_pernode",
"Launch one ppn as directed",
false, false, (int)false, &value);
orte_rmaps_base.pernode = OPAL_INT_TO_BOOL(value);
/* if pernode is set, we do not allow npernode to also be set - instead
* we default the npernode value to 1
*/
if (orte_rmaps_base.pernode) {
if (value) {
orte_rmaps_base.npernode = 1;
} else {
/* Do we want n ppn */
param = mca_base_param_reg_int_name("rmaps", "base_n_pernode",
"Launch n procs/node",
false, false, 0, &value);
orte_rmaps_base.npernode = value;
if (0 < orte_rmaps_base.npernode) {
orte_rmaps_base.pernode = true;
}
}
/* #procs/node */
param = mca_base_param_reg_int_name("rmaps", "base_n_pernode",
"Launch n procs/node",
false, false, -1, &value);
if (0 < value) {
orte_rmaps_base.npernode = value;
}
/* #procs/board */
param = mca_base_param_reg_int_name("rmaps", "base_n_perboard",
"Launch n procs/board",
false, false, -1, &orte_rmaps_base.nperboard);
if (0 < orte_rmaps_base.nperboard) {
ORTE_ADD_MAPPING_POLICY(ORTE_MAPPING_NPERXXX);
}
/* #procs/socket */
param = mca_base_param_reg_int_name("rmaps", "base_n_persocket",
"Launch n procs/socket",
false, false, -1, &orte_rmaps_base.npersocket);
if (0 < orte_rmaps_base.npersocket) {
ORTE_ADD_MAPPING_POLICY(ORTE_MAPPING_NPERXXX);
}
/* Do we want to loadbalance the job */
param = mca_base_param_reg_int_name("rmaps", "base_loadbalance",
"Balance total number of procs across all allocated nodes",
false, false, (int)false, &value);
orte_rmaps_base.loadbalance = OPAL_INT_TO_BOOL(value);
/* #cpus/rank to use */
param = mca_base_param_reg_int_name("rmaps", "base_cpus_per_rank",
"Number of cpus to use for each rank [1-2**15 (default=1)]",
false, false, 1, &value);
orte_rmaps_base.cpus_per_rank = value;
/* if the cpus/rank > 1, then we have to bind to cores */
if (1 < orte_rmaps_base.cpus_per_rank) {
ORTE_SET_BINDING_POLICY(ORTE_BIND_TO_CORE);
}
/* stride to use */
param = mca_base_param_reg_int_name("rmaps", "base_stride",
"When binding multiple cores to a rank, the step size to use between cores [1-2**15 (default: 1)]",
false, false, 1, &value);
orte_rmaps_base.stride = value;
/* did the user provide a slot list? */
param = mca_base_param_reg_string_name("rmaps", "base_slot_list",
"List of processor IDs to bind MPI processes to (e.g., used in conjunction with rank files) [default=NULL]",
@ -136,7 +173,7 @@ int orte_rmaps_base_open(void)
"If false, allow scheduling MPI applications on the same node as mpirun (default). If true, do not schedule any MPI applications on the same node as mpirun",
false, false, (int)false, &value);
if (value) {
orte_rmaps_base.policy |= ORTE_RMAPS_NO_USE_LOCAL;
orte_default_mapping_policy |= ORTE_MAPPING_NO_USE_LOCAL;
}
/* Should we oversubscribe or not? */
@ -150,16 +187,6 @@ int orte_rmaps_base_open(void)
orte_rmaps_base.oversubscribe = true;
}
/* Do we want to loadbalance the job */
param = mca_base_param_reg_int_name("rmaps", "base_loadbalance",
"Balance total number of procs across all allocated nodes",
false, false, (int)false, &value);
orte_rmaps_base.loadbalance = OPAL_INT_TO_BOOL(value);
/* if we are doing npernode or pernode, then we cannot loadbalance */
if (orte_rmaps_base.pernode) {
orte_rmaps_base.loadbalance = false;
}
/* should we display the map after determining it? */
mca_base_param_reg_int_name("rmaps", "base_display_map",
"Whether to display the process map after it is computed",

Просмотреть файл

@ -41,7 +41,7 @@
* Query the registry for all nodes allocated to a specified app_context
*/
int orte_rmaps_base_get_target_nodes(opal_list_t *allocated_nodes, orte_std_cntr_t *total_num_slots,
orte_app_context_t *app, uint8_t policy)
orte_app_context_t *app, orte_mapping_policy_t policy)
{
opal_list_item_t *item, *next;
orte_node_t *node;
@ -169,7 +169,7 @@ int orte_rmaps_base_get_target_nodes(opal_list_t *allocated_nodes, orte_std_cntr
/* If the "no local" option was set, then remove the local node
* from the list
*/
if (policy & ORTE_RMAPS_NO_USE_LOCAL) {
if (policy & ORTE_MAPPING_NO_USE_LOCAL) {
/* we don't need to check through the entire list as
* the head node - if it is on the list at all - will
* always be in the first position
@ -267,9 +267,9 @@ PROCESS:
* in the mapper
*/
OPAL_OUTPUT_VERBOSE((5, orte_rmaps_base.rmaps_output,
"%s rmaps:base: mapping proc %s to node %s",
"%s rmaps:base: mapping proc for job %s to node %s",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
ORTE_NAME_PRINT(&proc->name),
ORTE_JOBID_PRINT(proc->name.jobid),
(NULL == node->name) ? "NULL" : node->name));
if (0 > (rc = opal_pointer_array_add(node->procs, (void*)proc))) {
@ -289,88 +289,56 @@ PROCESS:
*/
int orte_rmaps_base_claim_slot(orte_job_t *jdata,
orte_node_t *current_node,
orte_vpid_t vpid,
char *slot_list,
int32_t cpus_per_rank,
orte_std_cntr_t app_idx,
opal_list_t *nodes,
bool oversubscribe,
bool remove_from_list)
bool remove_from_list,
orte_proc_t **returnproc)
{
orte_proc_t *proc, *proc_from_job;
orte_proc_t *proc;
bool oversub;
int rc;
int n;
OPAL_OUTPUT_VERBOSE((5, orte_rmaps_base.rmaps_output,
"%s rmaps:base:claim_slot: checking for existence of vpid %s",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
ORTE_VPID_PRINT(vpid)));
/* does this proc already exist within the job? */
proc = NULL;
for (n=0; n < jdata->procs->size; n++) {
if (NULL == (proc_from_job = (orte_proc_t*)opal_pointer_array_get_item(jdata->procs, n))) {
continue;
}
if (proc_from_job->name.vpid == vpid) {
/* already have it! */
proc = proc_from_job;
OPAL_OUTPUT_VERBOSE((5, orte_rmaps_base.rmaps_output,
"%s rmaps:base:claim_slot: found existing proc %s",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
ORTE_NAME_PRINT(&proc->name)));
if (NULL != proc->slot_list) {
/* clean out stale info */
free(proc->slot_list);
}
break;
}
}
if (NULL == proc) {
/* need to create mapped_proc object */
/* if we were given a proc, just use it */
if (NULL != returnproc && NULL != *returnproc) {
proc = *returnproc;
} else {
/* create mapped_proc object */
proc = OBJ_NEW(orte_proc_t);
if (NULL == proc) {
ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE);
return ORTE_ERR_OUT_OF_RESOURCE;
}
/* create the process name */
/* set the jobid */
proc->name.jobid = jdata->jobid;
proc->name.vpid = vpid;
/* we do not set the vpid here - this will be done
* during a second phase
*/
proc->app_idx = app_idx;
OPAL_OUTPUT_VERBOSE((5, orte_rmaps_base.rmaps_output,
"%s rmaps:base:claim_slot: created new proc %s",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
ORTE_NAME_PRINT(&proc->name)));
/* add this proc to the job's data - we don't have to worry here
* about keeping the array left-justified as all vpids
* from 0 to num_procs will be filled
*/
if (ORTE_SUCCESS != (rc = opal_pointer_array_set_item(jdata->procs,
(int)vpid,
(void*)proc))) {
ORTE_ERROR_LOG(rc);
OBJ_RELEASE(proc);
return rc;
/* provide returned proc, if requested */
if (NULL != returnproc) {
*returnproc = proc;
}
}
OBJ_RETAIN(current_node); /* maintain accounting on object */
if ( NULL != slot_list) {
proc->slot_list = strdup(slot_list);
}
proc->node = current_node;
proc->nodename = current_node->name;
OPAL_OUTPUT_VERBOSE((5, orte_rmaps_base.rmaps_output,
"%s rmaps:base:claim_slot mapping rank %d in job %s to node %s",
"%s rmaps:base:claim_slot mapping proc in job %s to node %s",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
vpid, ORTE_JOBID_PRINT(jdata->jobid), current_node->name));
ORTE_JOBID_PRINT(jdata->jobid), current_node->name));
/* Be sure to demarcate this slot as claimed for the node */
current_node->slots_inuse++;
/* Be sure to demarcate the slots for this proc as claimed from the node */
current_node->slots_inuse += cpus_per_rank;
/* see if this node is oversubscribed now */
if (current_node->slots_inuse > current_node->slots) {
@ -415,8 +383,68 @@ int orte_rmaps_base_claim_slot(orte_job_t *jdata,
return ORTE_SUCCESS;
}
int orte_rmaps_base_compute_vpids(orte_job_t *jdata)
{
orte_job_map_t *map;
orte_vpid_t vpid;
int i, j;
orte_node_t *node;
orte_proc_t *proc;
int rc;
map = jdata->map;
if (ORTE_MAPPING_BYSLOT & map->policy ||
ORTE_MAPPING_BYSOCKET & map->policy ||
ORTE_MAPPING_BYBOARD & map->policy) {
/* assign the ranks sequentially */
vpid = 0;
for (i=0; i < map->nodes->size; i++) {
if (NULL == (node = (orte_node_t*)opal_pointer_array_get_item(map->nodes, i))) {
continue;
}
for (j=0; j < node->procs->size; j++) {
if (NULL == (proc = (orte_proc_t*)opal_pointer_array_get_item(node->procs, j))) {
continue;
}
proc->name.vpid = vpid++;
if (ORTE_SUCCESS != (rc = opal_pointer_array_set_item(jdata->procs,
proc->name.vpid, proc))) {
ORTE_ERROR_LOG(rc);
return rc;
}
}
}
return ORTE_SUCCESS;
}
if (ORTE_MAPPING_BYNODE & map->policy) {
/* assign the ranks round-robin across nodes */
for (i=0; i < map->nodes->size; i++) {
if (NULL == (node = (orte_node_t*)opal_pointer_array_get_item(map->nodes, i))) {
continue;
}
vpid = i;
for (j=0; j < node->procs->size; j++) {
if (NULL == (proc = (orte_proc_t*)opal_pointer_array_get_item(node->procs, j))) {
continue;
}
proc->name.vpid = vpid;
vpid += map->num_nodes;
if (ORTE_SUCCESS != (rc = opal_pointer_array_set_item(jdata->procs,
proc->name.vpid, proc))) {
ORTE_ERROR_LOG(rc);
return rc;
}
}
}
return ORTE_SUCCESS;
}
int orte_rmaps_base_compute_usage(orte_job_t *jdata)
return ORTE_ERR_NOT_IMPLEMENTED;
}
int orte_rmaps_base_compute_local_ranks(orte_job_t *jdata)
{
orte_std_cntr_t i;
int j, k;
@ -501,8 +529,8 @@ int orte_rmaps_base_compute_usage(orte_job_t *jdata)
* we don't, then it would be possible for procs to conflict
* when opening static ports, should that be enabled.
*/
void orte_rmaps_base_update_usage(orte_job_t *jdata, orte_node_t *oldnode,
orte_node_t *newnode, orte_proc_t *newproc)
void orte_rmaps_base_update_local_ranks(orte_job_t *jdata, orte_node_t *oldnode,
orte_node_t *newnode, orte_proc_t *newproc)
{
int k;
orte_node_rank_t node_rank;

Просмотреть файл

@ -61,7 +61,7 @@ int orte_rmaps_base_add_proc_to_map(orte_job_map_t *map, orte_node_t *node,
ORTE_DECLSPEC int orte_rmaps_base_get_target_nodes(opal_list_t* node_list,
orte_std_cntr_t *total_num_slots,
orte_app_context_t *app,
uint8_t policy);
orte_mapping_policy_t policy);
ORTE_DECLSPEC int orte_rmaps_base_get_target_procs(opal_list_t *procs);
ORTE_DECLSPEC int orte_rmaps_base_update_node_usage(opal_list_t *nodes);
@ -72,17 +72,19 @@ ORTE_DECLSPEC int orte_rmaps_base_get_mapped_targets(opal_list_t *mapped_node_li
ORTE_DECLSPEC int orte_rmaps_base_claim_slot(orte_job_t *jdata,
orte_node_t *current_node,
orte_vpid_t vpid,
char *slot_list,
int32_t stride,
orte_std_cntr_t app_idx,
opal_list_t *nodes,
bool oversubscribe,
bool remove_from_list);
bool remove_from_list,
orte_proc_t **returnproc);
ORTE_DECLSPEC int orte_rmaps_base_compute_usage(orte_job_t *jdata);
ORTE_DECLSPEC int orte_rmaps_base_compute_vpids(orte_job_t *jdata);
ORTE_DECLSPEC void orte_rmaps_base_update_usage(orte_job_t *jdata, orte_node_t *oldnode,
orte_node_t *newnode, orte_proc_t *newproc);
ORTE_DECLSPEC int orte_rmaps_base_compute_local_ranks(orte_job_t *jdata);
ORTE_DECLSPEC void orte_rmaps_base_update_local_ranks(orte_job_t *jdata, orte_node_t *oldnode,
orte_node_t *newnode, orte_proc_t *newproc);
ORTE_DECLSPEC int orte_rmaps_base_rearrange_map(orte_app_context_t *app, orte_job_map_t *map, opal_list_t *procs);
@ -93,12 +95,11 @@ ORTE_DECLSPEC opal_list_item_t* orte_rmaps_base_get_starting_point(opal_list_t *
ORTE_DECLSPEC int orte_rmaps_base_map_byslot(orte_job_t *jdata, orte_app_context_t *app,
opal_list_t *node_list, orte_vpid_t num_procs,
orte_vpid_t vpid_start, opal_list_item_t *cur_node_item,
orte_vpid_t ppn);
opal_list_item_t *cur_node_item);
ORTE_DECLSPEC int orte_rmaps_base_map_bynode(orte_job_t *jdata, orte_app_context_t *app,
opal_list_t *node_list, orte_vpid_t num_procs,
orte_vpid_t vpid_start, opal_list_item_t *cur_node_item);
opal_list_item_t *cur_node_item);
END_C_DECLS

orte/mca/rmaps/load_balance/Makefile.am (new file, 45 additions)

@ -0,0 +1,45 @@
#
# Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
# University Research and Technology
# Corporation. All rights reserved.
# Copyright (c) 2004-2005 The University of Tennessee and The University
# of Tennessee Research Foundation. All rights
# reserved.
# Copyright (c) 2004-2009 High Performance Computing Center Stuttgart,
# University of Stuttgart. All rights reserved.
# Copyright (c) 2004-2005 The Regents of the University of California.
# All rights reserved.
# $COPYRIGHT$
#
# Additional copyrights may follow
#
# $HEADER$
#
dist_pkgdata_DATA = help-orte-rmaps-lb.txt
sources = \
rmaps_lb.c \
rmaps_lb.h \
rmaps_lb_component.c
# Make the output library in this directory, and name it either
# mca_<type>_<name>.la (for DSO builds) or libmca_<type>_<name>.la
# (for static builds).
if OMPI_BUILD_rmaps_load_balance_DSO
component_noinst =
component_install = mca_rmaps_load_balance.la
else
component_noinst = libmca_rmaps_load_balance.la
component_install =
endif
mcacomponentdir = $(pkglibdir)
mcacomponent_LTLIBRARIES = $(component_install)
mca_rmaps_load_balance_la_SOURCES = $(sources)
mca_rmaps_load_balance_la_LDFLAGS = -module -avoid-version
noinst_LTLIBRARIES = $(component_noinst)
libmca_rmaps_load_balance_la_SOURCES =$(sources)
libmca_rmaps_load_balance_la_LDFLAGS = -module -avoid-version


@ -0,0 +1,24 @@
# -*- shell-script -*-
#
# Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
# University Research and Technology
# Corporation. All rights reserved.
# Copyright (c) 2004-2005 The University of Tennessee and The University
# of Tennessee Research Foundation. All rights
# reserved.
# Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
# University of Stuttgart. All rights reserved.
# Copyright (c) 2004-2005 The Regents of the University of California.
# All rights reserved.
# Copyright (c) 2007 Los Alamos National Security, LLC. All rights
# reserved.
# $COPYRIGHT$
#
# Additional copyrights may follow
#
# $HEADER$
#
# Specific to this module
PARAM_CONFIG_FILES="Makefile"


@ -0,0 +1,53 @@
# -*- text -*-
#
# Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
# University Research and Technology
# Corporation. All rights reserved.
# Copyright (c) 2004-2005 The University of Tennessee and The University
# of Tennessee Research Foundation. All rights
# reserved.
# Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
# University of Stuttgart. All rights reserved.
# Copyright (c) 2004-2005 The Regents of the University of California.
# All rights reserved.
# $COPYRIGHT$
#
# Additional copyrights may follow
#
# $HEADER$
#
# This is the US/English general help file for Open RTE's orterun.
#
[orte-rmaps-rr:alloc-error]
There are not enough slots available in the system to satisfy the %d slots
that were requested by the application:
%s
Either request fewer slots for your application, or make more slots available
for use.
[orte-rmaps-rr:multi-apps-and-zero-np]
RMAPS found multiple applications to be launched, with
at least one that failed to specify the number of processes to execute.
When specifying multiple applications, you must specify how many processes
of each to launch via the -np argument.
[orte-rmaps-rr:per-node-and-too-many-procs]
There are not enough nodes in your allocation to satisfy your request to launch
%d processes on a per-node basis - only %d nodes were available.
Either request fewer processes, or obtain a larger allocation.
[orte-rmaps-rr:n-per-node-and-too-many-procs]
There are not enough nodes in your allocation to satisfy your request to launch
%d processes on a %d per-node basis - only %d nodes with a total of %d slots were available.
Either request fewer processes, or obtain a larger allocation.
[orte-rmaps-rr:n-per-node-and-not-enough-slots]
There are not enough slots on the nodes in your allocation to satisfy your request to launch on a %d process-per-node basis - only %d slots/node were available.
Either request fewer processes/node, or obtain a larger allocation.
[orte-rmaps-rr:no-np-and-user-map]
You have specified a rank-to-node/slot mapping, but failed to provide
the number of processes to be executed. For some reason, this information
could not be obtained from the mapping you provided, so we cannot continue
with executing the specified application.

orte/mca/rmaps/load_balance/rmaps_lb.c (new file, 430 additions)

@ -0,0 +1,430 @@
/*
* Copyright (c) 2004-2007 The Trustees of Indiana University and Indiana
* University Research and Technology
* Corporation. All rights reserved.
* Copyright (c) 2004-2006 The University of Tennessee and The University
* of Tennessee Research Foundation. All rights
* reserved.
* Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
* University of Stuttgart. All rights reserved.
* Copyright (c) 2004-2005 The Regents of the University of California.
* All rights reserved.
* Copyright (c) 2006 Cisco Systems, Inc. All rights reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
*
* $HEADER$
*/
#include "orte_config.h"
#include "orte/constants.h"
#include "orte/types.h"
#include <errno.h>
#ifdef HAVE_UNISTD_H
#include <unistd.h>
#endif /* HAVE_UNISTD_H */
#ifdef HAVE_STRING_H
#include <string.h>
#endif /* HAVE_STRING_H */
#include "opal/mca/base/mca_base_param.h"
#include "orte/util/show_help.h"
#include "orte/mca/errmgr/errmgr.h"
#include "orte/mca/rmaps/base/rmaps_private.h"
#include "orte/mca/rmaps/base/base.h"
#include "rmaps_lb.h"
static int switchyard(orte_job_t *jdata);
orte_rmaps_base_module_t orte_rmaps_load_balance_module = {
switchyard
};
/* Local functions */
static int npernode(orte_job_t *jdata);
static int nperboard(orte_job_t *jdata);
static int npersocket(orte_job_t *jdata);
static int loadbalance(orte_job_t *jdata);
static int switchyard(orte_job_t *jdata)
{
int rc;
if (0 < orte_rmaps_base.npernode) {
rc = npernode(jdata);
} else if (0 < orte_rmaps_base.nperboard) {
rc = nperboard(jdata);
} else if (0 < orte_rmaps_base.npersocket) {
rc = npersocket(jdata);
} else {
rc = loadbalance(jdata);
}
if (ORTE_SUCCESS != rc) {
return rc;
}
/* compute vpids and add proc objects to the job */
if (ORTE_SUCCESS != (rc = orte_rmaps_base_compute_vpids(jdata))) {
ORTE_ERROR_LOG(rc);
return rc;
}
/* compute and save local ranks */
if (ORTE_SUCCESS != (rc = orte_rmaps_base_compute_local_ranks(jdata))) {
ORTE_ERROR_LOG(rc);
return rc;
}
/* define the daemons that we will use for this job */
if (ORTE_SUCCESS != (rc = orte_rmaps_base_define_daemons(jdata->map))) {
ORTE_ERROR_LOG(rc);
return rc;
}
return ORTE_SUCCESS;
}
/* place specified #procs on each node, up to the specified total
* number of procs (if one was given).
*/
static int npernode(orte_job_t *jdata)
{
orte_app_context_t *app;
int i, j, rc=ORTE_SUCCESS;
opal_list_t node_list;
opal_list_item_t *item;
orte_std_cntr_t num_slots;
orte_node_t *node;
int total_procs, np;
/* setup the node list */
OBJ_CONSTRUCT(&node_list, opal_list_t);
/* loop through the app_contexts */
for(i=0; i < jdata->apps->size; i++) {
if (NULL == (app = (orte_app_context_t*)opal_pointer_array_get_item(jdata->apps, i))) {
continue;
}
/* use the number of procs if one was given */
if (0 < app->num_procs) {
np = app->num_procs;
} else {
np = INT_MAX;
}
total_procs = 0;
/* for each app_context, we have to get the list of nodes that it can
* use since that can now be modified with a hostfile and/or -host
* option
*/
if(ORTE_SUCCESS != (rc = orte_rmaps_base_get_target_nodes(&node_list, &num_slots, app,
jdata->map->policy))) {
ORTE_ERROR_LOG(rc);
goto error;
}
/* loop through the list of nodes */
while (NULL != (item = opal_list_remove_first(&node_list))) {
node = (orte_node_t*)item;
/* put the specified number of procs on each node */
for (j=0; j < orte_rmaps_base.npernode && total_procs < np; j++) {
if (ORTE_SUCCESS != (rc = orte_rmaps_base_claim_slot(jdata, node,
jdata->map->cpus_per_rank, app->idx,
&node_list, jdata->map->oversubscribe,
false, NULL))) {
/** if the code is ORTE_ERR_NODE_FULLY_USED, and we still have
* more procs to place, then that is an error
*/
if (ORTE_ERR_NODE_FULLY_USED != rc ||
j < orte_rmaps_base.npernode-1) {
ORTE_ERROR_LOG(rc);
OBJ_RELEASE(node);
goto error;
}
}
total_procs++;
}
OBJ_RELEASE(node);
}
}
jdata->num_procs = total_procs;
error:
while (NULL != (item = opal_list_remove_first(&node_list))) {
OBJ_RELEASE(item);
}
OBJ_DESTRUCT(&node_list);
return rc;
}
static int nperboard(orte_job_t *jdata)
{
orte_app_context_t *app;
int i, j, k, rc=ORTE_SUCCESS;
opal_list_t node_list;
opal_list_item_t *item;
orte_std_cntr_t num_slots;
orte_node_t *node;
int total_procs, np;
/* setup the node list */
OBJ_CONSTRUCT(&node_list, opal_list_t);
/* loop through the app_contexts */
for(i=0; i < jdata->apps->size; i++) {
if (NULL == (app = (orte_app_context_t*)opal_pointer_array_get_item(jdata->apps, i))) {
continue;
}
/* use the number of procs if one was given */
if (0 < app->num_procs) {
np = app->num_procs;
} else {
np = INT_MAX;
}
total_procs = 0;
/* for each app_context, we have to get the list of nodes that it can
* use since that can now be modified with a hostfile and/or -host
* option
*/
if(ORTE_SUCCESS != (rc = orte_rmaps_base_get_target_nodes(&node_list, &num_slots, app,
jdata->map->policy))) {
ORTE_ERROR_LOG(rc);
goto error;
}
/* loop through the list of nodes */
while (NULL != (item = opal_list_remove_first(&node_list))) {
node = (orte_node_t*)item;
/* loop through the number of boards in this node */
for (k=0; k < node->boards && total_procs < np; k++) {
/* put the specified number of procs on each board */
for (j=0; j < orte_rmaps_base.nperboard && total_procs < np; j++) {
if (ORTE_SUCCESS != (rc = orte_rmaps_base_claim_slot(jdata, node,
jdata->map->cpus_per_rank, app->idx,
&node_list, jdata->map->oversubscribe,
false, NULL))) {
/** if the code is ORTE_ERR_NODE_FULLY_USED, and we still have
* more procs to place, then that is an error
*/
if (ORTE_ERR_NODE_FULLY_USED != rc ||
j < orte_rmaps_base.nperboard-1) {
ORTE_ERROR_LOG(rc);
OBJ_RELEASE(node);
goto error;
}
}
total_procs++;
}
}
OBJ_RELEASE(node);
}
}
jdata->num_procs = total_procs;
error:
while (NULL != (item = opal_list_remove_first(&node_list))) {
OBJ_RELEASE(item);
}
OBJ_DESTRUCT(&node_list);
return rc;
}
static int npersocket(orte_job_t *jdata)
{
orte_app_context_t *app;
int i, j, k, n, rc=ORTE_SUCCESS;
opal_list_t node_list;
opal_list_item_t *item;
orte_std_cntr_t num_slots;
orte_node_t *node;
int total_procs, np;
/* setup the node list */
OBJ_CONSTRUCT(&node_list, opal_list_t);
/* loop through the app_contexts */
for(i=0; i < jdata->apps->size; i++) {
if (NULL == (app = (orte_app_context_t*)opal_pointer_array_get_item(jdata->apps, i))) {
continue;
}
/* use the number of procs if one was given */
if (0 < app->num_procs) {
np = app->num_procs;
} else {
np = INT_MAX;
}
total_procs = 0;
/* for each app_context, we have to get the list of nodes that it can
* use since that can now be modified with a hostfile and/or -host
* option
*/
if(ORTE_SUCCESS != (rc = orte_rmaps_base_get_target_nodes(&node_list, &num_slots, app,
jdata->map->policy))) {
ORTE_ERROR_LOG(rc);
goto error;
}
/* loop through the list of nodes */
while (NULL != (item = opal_list_remove_first(&node_list))) {
node = (orte_node_t*)item;
/* loop through the number of boards in this node */
for (k=0; k < node->boards && total_procs < np; k++) {
/* loop through the number of sockets/board */
for (n=0; n < node->sockets_per_board && total_procs < np; n++) {
/* put the specified number of procs on each socket */
for (j=0; j < orte_rmaps_base.npersocket && total_procs < np; j++) {
if (ORTE_SUCCESS != (rc = orte_rmaps_base_claim_slot(jdata, node,
jdata->map->cpus_per_rank, app->idx,
&node_list, jdata->map->oversubscribe,
false, NULL))) {
/** if the code is ORTE_ERR_NODE_FULLY_USED, and we still have
* more procs to place, then that is an error
*/
if (ORTE_ERR_NODE_FULLY_USED != rc ||
j < orte_rmaps_base.npersocket-1) {
ORTE_ERROR_LOG(rc);
OBJ_RELEASE(node);
goto error;
}
}
/* track the number of procs */
total_procs++;
}
}
}
OBJ_RELEASE(node);
}
}
jdata->num_procs = total_procs;
error:
while (NULL != (item = opal_list_remove_first(&node_list))) {
OBJ_RELEASE(item);
}
OBJ_DESTRUCT(&node_list);
return rc;
}
/*
* Create a load balanced mapping for the job by assigning a constant #procs/node, with
* leftovers being spread one/node starting from the first node.
*/
static int loadbalance(orte_job_t *jdata)
{
orte_app_context_t *app;
int i, j;
opal_list_t node_list;
orte_std_cntr_t num_nodes, num_slots;
int rc=ORTE_SUCCESS, total_procs;
int ppn = 0;
opal_list_item_t *item, *start;
orte_node_t *node;
/* setup */
OBJ_CONSTRUCT(&node_list, opal_list_t);
/* compute total #procs we are going to add and the total number of nodes available */
for(i=0; i < jdata->apps->size; i++) {
if (NULL == (app = (orte_app_context_t*)opal_pointer_array_get_item(jdata->apps, i))) {
continue;
}
/* get the nodes and #slots available for this app_context */
if(ORTE_SUCCESS != (rc = orte_rmaps_base_get_target_nodes(&node_list, &num_slots, app,
jdata->map->policy))) {
ORTE_ERROR_LOG(rc);
goto error;
}
if (0 == app->num_procs) {
/* set the num_procs to the #slots */
app->num_procs = num_slots;
}
num_nodes = opal_list_get_size(&node_list);
/* compute the base ppn */
ppn = app->num_procs / num_nodes;
/* if a bookmark exists from some prior mapping, set us to start there */
start = orte_rmaps_base_get_starting_point(&node_list, jdata);
/* loop through the list of nodes until we either assign all the procs
* or return to the starting point
*/
total_procs = 0;
item = start;
do {
node = (orte_node_t*)item;
/* put the specified number of procs on each node */
for (j=0; j < ppn; j++) {
if (ORTE_SUCCESS != (rc = orte_rmaps_base_claim_slot(jdata, node,
jdata->map->cpus_per_rank, app->idx,
&node_list, jdata->map->oversubscribe,
false, NULL))) {
/** if the code is ORTE_ERR_NODE_FULLY_USED, and we still have
* more procs to place, then that is an error
*/
if (ORTE_ERR_NODE_FULLY_USED != rc ||
j < ppn-1) {
ORTE_ERROR_LOG(rc);
goto error;
}
}
total_procs++;
}
/* move to next node */
if (opal_list_get_end(&node_list) == opal_list_get_next(item)) {
item = opal_list_get_first(&node_list);
}
else {
item = opal_list_get_next(item);
}
} while (item != start);
/* save the bookmark */
jdata->bookmark = node;
/* if we haven't assigned all the procs, then loop through the list
* again, assigning 1 per node until all are assigned
*/
item = start;
while (total_procs < app->num_procs) {
node = (orte_node_t*)item;
if (ORTE_SUCCESS != (rc = orte_rmaps_base_claim_slot(jdata, node,
jdata->map->cpus_per_rank, app->idx,
&node_list, jdata->map->oversubscribe,
false, NULL))) {
/* if the code is not ORTE_ERR_NODE_FULLY_USED, then that is an error */
if (ORTE_ERR_NODE_FULLY_USED != rc) {
ORTE_ERROR_LOG(rc);
goto error;
}
}
total_procs++;
/* move to next node */
if (opal_list_get_end(&node_list) == opal_list_get_next(item)) {
item = opal_list_get_first(&node_list);
}
else {
item = opal_list_get_next(item);
}
}
/* save the bookmark */
jdata->bookmark = node;
/* cleanup */
while (NULL != (item = opal_list_remove_first(&node_list))) {
OBJ_RELEASE(item);
}
}
/* record the number of procs */
jdata->num_procs = total_procs;
error:
while(NULL != (item = opal_list_remove_first(&node_list))) {
OBJ_RELEASE(item);
}
OBJ_DESTRUCT(&node_list);
return rc;
}

orte/mca/rmaps/load_balance/rmaps_lb.h (new file, 37 additions)

@ -0,0 +1,37 @@
/*
* Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
* University Research and Technology
* Corporation. All rights reserved.
* Copyright (c) 2004-2006 The University of Tennessee and The University
* of Tennessee Research Foundation. All rights
* reserved.
* Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
* University of Stuttgart. All rights reserved.
* Copyright (c) 2004-2005 The Regents of the University of California.
* All rights reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
*
* $HEADER$
*/
/**
* @file
*
* Resource Mapping
*/
#ifndef ORTE_RMAPS_LB_H
#define ORTE_RMAPS_LB_H
#include "orte_config.h"
#include "orte/mca/rmaps/rmaps.h"
BEGIN_C_DECLS
ORTE_MODULE_DECLSPEC extern orte_rmaps_base_component_t mca_rmaps_load_balance_component;
extern orte_rmaps_base_module_t orte_rmaps_load_balance_module;
END_C_DECLS
#endif


@ -0,0 +1,96 @@
/*
* Copyright (c) 2004-2008 The Trustees of Indiana University and Indiana
* University Research and Technology
* Corporation. All rights reserved.
* Copyright (c) 2004-2005 The University of Tennessee and The University
* of Tennessee Research Foundation. All rights
* reserved.
* Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
* University of Stuttgart. All rights reserved.
* Copyright (c) 2004-2005 The Regents of the University of California.
* All rights reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
*
* $HEADER$
*/
#include "orte_config.h"
#include "orte/constants.h"
#include "opal/mca/base/base.h"
#include "opal/mca/base/mca_base_param.h"
#include "orte/mca/rmaps/base/base.h"
#include "rmaps_lb.h"
/*
* Local functions
*/
static int orte_rmaps_lb_open(void);
static int orte_rmaps_lb_close(void);
static int orte_rmaps_lb_query(mca_base_module_t **module, int *priority);
orte_rmaps_base_component_t mca_rmaps_load_balance_component = {
{
ORTE_RMAPS_BASE_VERSION_2_0_0,
"load_balance", /* MCA component name */
ORTE_MAJOR_VERSION, /* MCA component major version */
ORTE_MINOR_VERSION, /* MCA component minor version */
ORTE_RELEASE_VERSION, /* MCA component release version */
orte_rmaps_lb_open, /* component open */
orte_rmaps_lb_close, /* component close */
orte_rmaps_lb_query /* component query */
},
{
/* The component is checkpoint ready */
MCA_BASE_METADATA_PARAM_CHECKPOINT
}
};
/**
* component open/close/init function
*/
static int orte_rmaps_lb_open(void)
{
return ORTE_SUCCESS;
}
static int orte_rmaps_lb_query(mca_base_module_t **module, int *priority)
{
/* the RMAPS framework is -only- opened on HNP's,
* so no need to check for that here
*/
/* if load balancing, or any nperxxx, was requested, then we must be selected */
if (orte_rmaps_base.loadbalance ||
0 < orte_rmaps_base.npernode ||
0 < orte_rmaps_base.nperboard ||
0 < orte_rmaps_base.npersocket) {
*priority = 1000; /* must be selected */
*module = (mca_base_module_t *)&orte_rmaps_load_balance_module;
return ORTE_SUCCESS;
}
/* otherwise, ignore us */
*priority = 0;
*module = NULL;
return ORTE_ERROR;
}
/**
* Close all subsystems.
*/
static int orte_rmaps_lb_close(void)
{
return ORTE_SUCCESS;
}
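The query function above follows the usual MCA selection contract: every component reports a priority and the framework base keeps the module with the highest one. A minimal sketch of that pattern in isolation (the type and function names below are invented for illustration, not the actual ORTE base code):

    #include <stddef.h>

    typedef struct {
        const char *name;
        int (*query)(void **module, int *priority); /* returns 0 on success */
    } component_t;

    /* keep the module whose component reports the highest priority */
    static void *select_highest(const component_t *comps, size_t n)
    {
        void *best_module = NULL;
        int best_pri = -1;
        for (size_t i = 0; i < n; i++) {
            void *module;
            int pri;
            if (0 == comps[i].query(&module, &pri) && pri > best_pri) {
                best_pri = pri;
                best_module = module;
            }
        }
        return best_module;
    }

With the priority of 1000 returned above, the load_balance component wins whenever loadbalancing or any npernode/nperboard/npersocket option is active.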
View file
@ -72,6 +72,7 @@ static int map_app_by_node(orte_app_context_t* app,
opal_list_item_t *next;
orte_node_t *node;
orte_std_cntr_t num_alloc = 0;
orte_proc_t *proc;
/* This loop continues until all procs have been mapped or we run
out of resources. We determine that we have "run out of
@ -118,8 +119,8 @@ static int map_app_by_node(orte_app_context_t* app,
/* Allocate a slot on this node */
node = (orte_node_t*) cur_node_item;
/* pass the base slot list in case it was provided */
if (ORTE_SUCCESS != (rc = orte_rmaps_base_claim_slot(jdata, node, vpid_start+num_alloc, orte_rmaps_base.slot_list, app->idx,
nodes, jdata->map->oversubscribe, true))) {
if (ORTE_SUCCESS != (rc = orte_rmaps_base_claim_slot(jdata, node, 1, app->idx,
nodes, jdata->map->oversubscribe, true, &proc))) {
/** if the code is ORTE_ERR_NODE_FULLY_USED, then we know this
* really isn't an error - we just need to break from the loop
* since the node is fully used up. For now, just don't report
@ -130,6 +131,9 @@ static int map_app_by_node(orte_app_context_t* app,
return rc;
}
}
if (NULL != orte_rmaps_base.slot_list) {
proc->slot_list = strdup(orte_rmaps_base.slot_list);
}
++num_alloc;
cur_node_item = next;
}
@ -150,6 +154,7 @@ static int map_app_by_slot(orte_app_context_t* app,
orte_std_cntr_t i, num_slots_to_take, num_alloc = 0;
orte_node_t *node;
opal_list_item_t *next;
orte_proc_t *proc;
/* This loop continues until all procs have been mapped or we run
out of resources. We determine that we have "run out of
@ -211,7 +216,7 @@ static int map_app_by_slot(orte_app_context_t* app,
/* check if we are in npernode mode - if so, then set the num_slots_to_take
* to the num_per_node
*/
if (jdata->map->pernode) {
if (0 < jdata->map->npernode) {
num_slots_to_take = jdata->map->npernode;
}
@ -223,8 +228,8 @@ static int map_app_by_slot(orte_app_context_t* app,
continue;
}
/* pass the base slot list in case it was provided */
if (ORTE_SUCCESS != (rc = orte_rmaps_base_claim_slot(jdata, node, vpid_start+num_alloc, orte_rmaps_base.slot_list, app->idx,
nodes, jdata->map->oversubscribe, true))) {
if (ORTE_SUCCESS != (rc = orte_rmaps_base_claim_slot(jdata, node, 1, app->idx,
nodes, jdata->map->oversubscribe, true, &proc))) {
/** if the code is ORTE_ERR_NODE_FULLY_USED, then we know this
* really isn't an error - we just need to break from the loop
* since the node is fully used up. For now, just don't report
@ -235,6 +240,9 @@ static int map_app_by_slot(orte_app_context_t* app,
return rc;
}
}
if (NULL != orte_rmaps_base.slot_list) {
proc->slot_list = strdup(orte_rmaps_base.slot_list);
}
/* Update the rank */
++num_alloc;
/* track #slots taken */
@ -279,6 +287,7 @@ static int orte_rmaps_rf_map(orte_job_t *jdata)
orte_rmaps_rank_file_map_t *rfmap;
orte_std_cntr_t slots_per_node, relative_index, tmp_cnt;
int rc;
orte_proc_t *proc;
/* convenience def */
map = jdata->map;
@ -303,7 +312,7 @@ static int orte_rmaps_rf_map(orte_job_t *jdata)
}
/* likewise, we only support pernode options for a single app_context */
if (map->pernode && 1 < jdata->num_apps) {
if (0 < map->npernode && 1 < jdata->num_apps) {
orte_show_help("help-rmaps_rank_file.txt", "orte-rmaps-rf:multi-apps-and-zero-np",
true, jdata->num_apps, NULL);
rc = ORTE_ERR_SILENT;
@ -349,7 +358,7 @@ static int orte_rmaps_rf_map(orte_job_t *jdata)
num_nodes = (orte_std_cntr_t)opal_list_get_size(&node_list);
/* we already checked for sanity, so these are okay to just do here */
if (map->pernode && map->npernode == 1) {
if (map->npernode == 1) {
/* there are three use-cases that we need to deal with:
* (a) if -np was not provided, then we just use the number of nodes
* (b) if -np was provided AND #procs > #nodes, then error out
@ -365,7 +374,7 @@ static int orte_rmaps_rf_map(orte_job_t *jdata)
rc = ORTE_ERR_SILENT;
goto error;
}
} else if (map->pernode && map->npernode > 1) {
} else if (map->npernode > 1) {
/* first, let's check to see if there are enough slots/node to
* meet the request - error out if not
*/
@ -447,8 +456,9 @@ static int orte_rmaps_rf_map(orte_job_t *jdata)
orte_show_help("help-rmaps_rank_file.txt","no-slot-list", true, rank, rfmap->node_name);
return ORTE_ERR_SILENT;
}
if (ORTE_SUCCESS != (rc = orte_rmaps_base_claim_slot(jdata, node, rank, rfmap->slot_list,
app->idx, &node_list, jdata->map->oversubscribe, true))) {
proc = NULL;
if (ORTE_SUCCESS != (rc = orte_rmaps_base_claim_slot(jdata, node, 1, app->idx,
&node_list, jdata->map->oversubscribe, true, &proc))) {
if (ORTE_ERR_NODE_FULLY_USED != rc) {
/* if this is a true error and not the node just being
* full, then report the error and abort
@ -457,6 +467,7 @@ static int orte_rmaps_rf_map(orte_job_t *jdata)
return rc;
}
}
proc->slot_list = strdup(rfmap->slot_list);
jdata->num_procs++;
}
/* update the starting point */
@ -517,7 +528,7 @@ static int orte_rmaps_rf_map(orte_job_t *jdata)
/* if no bookmark, then just start at the beginning of the list */
cur_node_item = opal_list_get_first(&node_list);
}
if (map->policy & ORTE_RMAPS_BYNODE) {
if (map->policy & ORTE_MAPPING_BYNODE) {
rc = map_app_by_node(app, jdata, vpid_start, &node_list);
} else {
rc = map_app_by_slot(app, jdata, vpid_start, &node_list);
@ -542,8 +553,14 @@ static int orte_rmaps_rf_map(orte_job_t *jdata)
/* update the job's number of procs */
jdata->num_procs = total_procs;
/* compute vpids and add proc objects to the job */
if (ORTE_SUCCESS != (rc = orte_rmaps_base_compute_vpids(jdata))) {
ORTE_ERROR_LOG(rc);
return rc;
}
/* compute and save convenience values */
if (ORTE_SUCCESS != (rc = orte_rmaps_base_compute_usage(jdata))) {
if (ORTE_SUCCESS != (rc = orte_rmaps_base_compute_local_ranks(jdata))) {
ORTE_ERROR_LOG(rc);
return rc;
}
View file
@ -37,7 +37,6 @@
* Local variable
*/
static opal_list_item_t *cur_node_item = NULL;
static orte_vpid_t vpid_start = 0;
static char *orte_getline(FILE *fp);
@ -51,24 +50,22 @@ static int rr_map_default(orte_job_t *jdata, orte_app_context_t *app,
cur_node_item = orte_rmaps_base_get_starting_point(node_list, jdata);
/* now perform the mapping */
if (ORTE_RMAPS_BYNODE & jdata->map->policy) {
if (ORTE_MAPPING_BYNODE & jdata->map->policy) {
if (ORTE_SUCCESS != (rc = orte_rmaps_base_map_bynode(jdata, app, node_list,
num_procs, vpid_start,
cur_node_item))) {
num_procs, cur_node_item))) {
ORTE_ERROR_LOG(rc);
return rc;
}
} else {
if (ORTE_SUCCESS != (rc = orte_rmaps_base_map_byslot(jdata, app, node_list,
num_procs, vpid_start,
cur_node_item, 0))) {
num_procs, cur_node_item))) {
ORTE_ERROR_LOG(rc);
return rc;
}
}
/* update the starting vpid */
vpid_start += num_procs;
/* update number of procs */
jdata->num_procs += num_procs;
return ORTE_SUCCESS;
}
@ -123,7 +120,7 @@ static int orte_rmaps_resilient_map(orte_job_t *jdata)
float avgload, minload;
orte_node_t *node, *nd=NULL, *oldnode;
orte_rmaps_res_ftgrp_t *ftgrp, *target;
orte_vpid_t totprocs, lowprocs;
orte_vpid_t totprocs, lowprocs, num_assigned;
FILE *fp;
char *ftinput;
int grp;
@ -275,8 +272,8 @@ static int orte_rmaps_resilient_map(orte_job_t *jdata)
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
nd->name));
/* put proc on the found node */
if (ORTE_SUCCESS != (rc = orte_rmaps_base_claim_slot(jdata, nd, proc->name.vpid, NULL, proc->app_idx,
NULL, jdata->map->oversubscribe, false))) {
if (ORTE_SUCCESS != (rc = orte_rmaps_base_claim_slot(jdata, nd, jdata->map->cpus_per_rank, proc->app_idx,
NULL, jdata->map->oversubscribe, false, &proc))) {
/** if the code is ORTE_ERR_NODE_FULLY_USED, then we know this
* really isn't an error
*/
@ -290,7 +287,7 @@ static int orte_rmaps_resilient_map(orte_job_t *jdata)
/* update the node and local ranks so static ports can
* be properly selected if active
*/
orte_rmaps_base_update_usage(jdata, oldnode, nd, proc);
orte_rmaps_base_update_local_ranks(jdata, oldnode, nd, proc);
continue;
}
/* if we did find a target, re-map the proc to the lightest loaded
@ -313,8 +310,8 @@ static int orte_rmaps_resilient_map(orte_job_t *jdata)
ORTE_NAME_PRINT(&proc->name), target->ftgrp, nd->name));
OBJ_RELEASE(proc->node); /* required to maintain bookkeeping */
/* put proc on the found node */
if (ORTE_SUCCESS != (rc = orte_rmaps_base_claim_slot(jdata, nd, proc->name.vpid, NULL, proc->app_idx,
NULL, jdata->map->oversubscribe, false))) {
if (ORTE_SUCCESS != (rc = orte_rmaps_base_claim_slot(jdata, nd, jdata->map->cpus_per_rank, proc->app_idx,
NULL, jdata->map->oversubscribe, false, &proc))) {
/** if the code is ORTE_ERR_NODE_FULLY_USED, then we know this
* really isn't an error
*/
@ -328,7 +325,7 @@ static int orte_rmaps_resilient_map(orte_job_t *jdata)
/* update the node and local ranks so static ports can
* be properly selected if active
*/
orte_rmaps_base_update_usage(jdata, oldnode, nd, proc);
orte_rmaps_base_update_local_ranks(jdata, oldnode, nd, proc);
}
/* define the daemons that we will use for this job */
if (ORTE_SUCCESS != (rc = orte_rmaps_base_define_daemons(jdata->map))) {
@ -354,7 +351,6 @@ static int orte_rmaps_resilient_map(orte_job_t *jdata)
ORTE_JOBID_PRINT(jdata->jobid)));
/* start at the beginning... */
vpid_start = 0;
jdata->num_procs = 0;
map = jdata->map;
@ -363,6 +359,7 @@ static int orte_rmaps_resilient_map(orte_job_t *jdata)
if (NULL == (app = (orte_app_context_t*)opal_pointer_array_get_item(jdata->apps, i))) {
continue;
}
num_assigned = 0;
/* for each app_context, we have to get the list of nodes that it can
* use since that can now be modified with a hostfile and/or -host
* option
@ -434,7 +431,7 @@ static int orte_rmaps_resilient_map(orte_job_t *jdata)
OPAL_OUTPUT_VERBOSE((2, orte_rmaps_base.rmaps_output,
"%s rmaps:resilient: no available fault group - mapping rr",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
if (ORTE_SUCCESS != (rc = rr_map_default(jdata, app, &node_list, app->num_procs-vpid_start))) {
if (ORTE_SUCCESS != (rc = rr_map_default(jdata, app, &node_list, app->num_procs-num_assigned))) {
goto error;
}
goto cleanup;
@ -455,8 +452,8 @@ static int orte_rmaps_resilient_map(orte_job_t *jdata)
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
target->ftgrp, nd->name));
/* put proc on that node */
if (ORTE_SUCCESS != (rc = orte_rmaps_base_claim_slot(jdata, nd, vpid_start, NULL, app->idx,
&node_list, jdata->map->oversubscribe, false))) {
if (ORTE_SUCCESS != (rc = orte_rmaps_base_claim_slot(jdata, nd, jdata->map->cpus_per_rank, app->idx,
&node_list, jdata->map->oversubscribe, false, NULL))) {
/** if the code is ORTE_ERR_NODE_FULLY_USED, then we know this
* really isn't an error
*/
@ -466,7 +463,7 @@ static int orte_rmaps_resilient_map(orte_job_t *jdata)
}
}
/* track number of procs mapped */
vpid_start++;
num_assigned++;
/* flag this fault group as used */
target->used = true;
@ -484,6 +481,8 @@ static int orte_rmaps_resilient_map(orte_job_t *jdata)
}
cleanup:
/* track number of procs */
jdata->num_procs += app->num_procs;
/* cleanup the node list - it can differ from one app_context
* to another, so we have to get it every time
*/
@ -493,11 +492,14 @@ static int orte_rmaps_resilient_map(orte_job_t *jdata)
OBJ_DESTRUCT(&node_list);
}
/* update the number of procs in the job */
jdata->num_procs = vpid_start;
/* compute vpids and add proc objects to the job */
if (ORTE_SUCCESS != (rc = orte_rmaps_base_compute_vpids(jdata))) {
ORTE_ERROR_LOG(rc);
return rc;
}
/* compute and save convenience values */
if (ORTE_SUCCESS != (rc = orte_rmaps_base_compute_usage(jdata))) {
/* compute and save local ranks */
if (ORTE_SUCCESS != (rc = orte_rmaps_base_compute_local_ranks(jdata))) {
ORTE_ERROR_LOG(rc);
return rc;
}
View file
@ -25,32 +25,27 @@
#include "opal/class/opal_pointer_array.h"
#include "orte/runtime/orte_globals.h"
/*
* General MAP types - instanced in runtime/orte_globals_class_instances.h
*/
BEGIN_C_DECLS
/*
* Define flags indicating the policy used to perform the map
*/
#define ORTE_RMAPS_NOPOL 0x00
#define ORTE_RMAPS_BYNODE 0x01
#define ORTE_RMAPS_BYSLOT 0x02
#define ORTE_RMAPS_BYUSER 0x04
#define ORTE_RMAPS_NO_USE_LOCAL 0x08
/*
* Structure that represents the mapping of a job to an
* allocated set of resources.
*/
struct orte_job_map_t {
opal_object_t super;
/* save the mapping configuration */
uint8_t policy;
bool pernode;
orte_std_cntr_t npernode;
/* user-specified mapping params */
orte_mapping_policy_t policy;
int npernode;
int nperboard;
int npersocket;
int16_t cpus_per_rank;
int16_t stride;
bool oversubscribe;
bool display_map;
bool cpu_lists;
View file
@ -48,56 +48,13 @@ static int orte_rmaps_rr_map(orte_job_t *jdata)
int i;
opal_list_t node_list;
opal_list_item_t *item;
orte_vpid_t vpid_start;
orte_std_cntr_t num_nodes, num_slots;
int rc;
orte_std_cntr_t slots_per_node;
int ppn = 0;
opal_list_item_t *cur_node_item;
/* start at the beginning... */
vpid_start = 0;
jdata->num_procs = 0;
/* if loadbalancing is requested, then we need to compute
* the #procs/node - note that this cannot be done
* if we are doing pernode or if #procs was not given
*/
if (orte_rmaps_base.loadbalance && !jdata->map->pernode) {
float res;
/* compute total #procs we are going to add */
for(i=0; i < jdata->apps->size; i++) {
if (NULL == (app = (orte_app_context_t*)opal_pointer_array_get_item(jdata->apps, i))) {
continue;
}
if (0 == app->num_procs) {
/* can't do it - tell user and quit */
orte_show_help("help-orte-rmaps-rr.txt",
"orte-rmaps-rr:loadbalance-and-zero-np",
true);
rc = ORTE_ERR_SILENT;
goto error;
}
ppn += app->num_procs;
}
/* get the total avail nodes and the number
* of procs already using them
*/
num_nodes=0;
for (i=0; i < orte_node_pool->size; i++) {
if (NULL == opal_pointer_array_get_item(orte_node_pool, i)) {
continue;
}
num_nodes++;
}
/* compute the balance */
res = ((float)ppn / num_nodes);
ppn = ppn / num_nodes;
if (0 < (res-ppn)) {
ppn++;
}
}
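The float arithmetic above simply rounds ppn/num_nodes up to the next whole number. An equivalent integer-only form, shown as a sketch rather than a proposed change:

    /* ceil(total / nodes) without floating point; assumes nodes > 0 */
    static int procs_per_node(int total, int nodes)
    {
        return (total + nodes - 1) / nodes;
    }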
/* cycle through the app_contexts, mapping them sequentially */
for(i=0; i < jdata->apps->size; i++) {
if (NULL == (app = (orte_app_context_t*)opal_pointer_array_get_item(jdata->apps, i))) {
@ -130,83 +87,22 @@ static int orte_rmaps_rr_map(orte_job_t *jdata)
/* if a bookmark exists from some prior mapping, set us to start there */
cur_node_item = orte_rmaps_base_get_starting_point(&node_list, jdata);
if (jdata->map->pernode && jdata->map->npernode == 1) {
/* there are three use-cases that we need to deal with:
* (a) if -np was not provided, then we just use the number of nodes
* (b) if -np was provided AND #procs > #nodes, then error out
* (c) if -np was provided AND #procs <= #nodes, then launch
* the specified #procs one/node. In this case, we just
* leave app->num_procs alone
*/
if (0 == app->num_procs) {
app->num_procs = num_nodes;
} else if (app->num_procs > num_nodes) {
orte_show_help("help-orte-rmaps-rr.txt", "orte-rmaps-rr:per-node-and-too-many-procs",
true, app->num_procs, num_nodes, NULL);
rc = ORTE_ERR_SILENT;
goto error;
}
} else if (jdata->map->pernode && jdata->map->npernode > 1) {
/* first, let's check to see if there are enough slots/node to
* meet the request - error out if not
*/
slots_per_node = num_slots / num_nodes;
if (jdata->map->npernode > slots_per_node) {
orte_show_help("help-orte-rmaps-rr.txt", "orte-rmaps-rr:n-per-node-and-not-enough-slots",
true, jdata->map->npernode, slots_per_node, NULL);
rc = ORTE_ERR_SILENT;
goto error;
}
/* there are three use-cases that we need to deal with:
* (a) if -np was not provided, then we just use the n/node * #nodes
* (b) if -np was provided AND #procs > (n/node * #nodes), then error out
* (c) if -np was provided AND #procs <= (n/node * #nodes), then launch
* the specified #procs n/node. In this case, we just
* leave app->num_procs alone
*/
if (0 == app->num_procs) {
/* set the num_procs to equal the specified num/node * the number of nodes */
app->num_procs = jdata->map->npernode * num_nodes;
} else if (app->num_procs > (jdata->map->npernode * num_nodes)) {
orte_show_help("help-orte-rmaps-rr.txt", "orte-rmaps-rr:n-per-node-and-too-many-procs",
true, app->num_procs, jdata->map->npernode, num_nodes, num_slots, NULL);
rc = ORTE_ERR_SILENT;
goto error;
}
} else if (0 == app->num_procs) {
if (jdata->map->policy & ORTE_RMAPS_BYUSER) {
/* we can't handle this - it should have been set when we got
* the map info. If it wasn't, then we can only error out
*/
orte_show_help("help-orte-rmaps-rr.txt", "orte-rmaps-rr:no-np-and-user-map",
true, app->num_procs, jdata->map->npernode, num_nodes, num_slots, NULL);
rc = ORTE_ERR_SILENT;
goto error;
}
/** set the num_procs to equal the number of slots on these mapped nodes */
if (0 == app->num_procs) {
/* set the num_procs to equal the number of slots on these mapped nodes */
app->num_procs = num_slots;
}
/** track the total number of processes we mapped */
/* track the total number of processes we mapped */
jdata->num_procs += app->num_procs;
/* Make assignments */
if (jdata->map->policy & ORTE_RMAPS_BYUSER) {
rc = ORTE_ERR_NOT_IMPLEMENTED;
goto error;
} else if (jdata->map->policy & ORTE_RMAPS_BYNODE) {
if (jdata->map->policy & ORTE_MAPPING_BYNODE) {
rc = orte_rmaps_base_map_bynode(jdata, app, &node_list,
app->num_procs, vpid_start,
cur_node_item);
app->num_procs, cur_node_item);
} else {
rc = orte_rmaps_base_map_byslot(jdata, app, &node_list,
app->num_procs, vpid_start,
cur_node_item, ppn);
app->num_procs, cur_node_item);
}
/* update the starting vpid for the next app_context */
vpid_start += app->num_procs;
if (ORTE_SUCCESS != rc) {
ORTE_ERROR_LOG(rc);
goto error;
@ -221,8 +117,14 @@ static int orte_rmaps_rr_map(orte_job_t *jdata)
OBJ_DESTRUCT(&node_list);
}
/* compute and save convenience values */
if (ORTE_SUCCESS != (rc = orte_rmaps_base_compute_usage(jdata))) {
/* compute vpids and add proc objects to the job */
if (ORTE_SUCCESS != (rc = orte_rmaps_base_compute_vpids(jdata))) {
ORTE_ERROR_LOG(rc);
return rc;
}
/* compute and save local ranks */
if (ORTE_SUCCESS != (rc = orte_rmaps_base_compute_local_ranks(jdata))) {
ORTE_ERROR_LOG(rc);
return rc;
}
View file
@ -59,14 +59,15 @@ static int orte_rmaps_seq_map(orte_job_t *jdata)
orte_job_map_t *map;
orte_app_context_t *app;
orte_std_cntr_t i, j;
opal_list_item_t *item, *next, *cur_node_item;
orte_node_t *node, *nd;
opal_list_item_t *item;
orte_node_t *node, *nd, *save;
orte_vpid_t vpid;
orte_std_cntr_t num_nodes;
int rc;
opal_list_t *default_node_list=NULL;
opal_list_t *node_list=NULL;
orte_proc_t *proc;
OPAL_OUTPUT_VERBOSE((1, orte_rmaps_base.rmaps_output,
"%s rmaps:seq mapping job %s",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
@ -87,6 +88,9 @@ static int orte_rmaps_seq_map(orte_job_t *jdata)
/* start at the beginning... */
vpid = 0;
jdata->num_procs = 0;
if (NULL != default_node_list) {
save = (orte_node_t*)opal_list_get_first(default_node_list);
}
/* cycle through the app_contexts, mapping them sequentially */
for(i=0; i < jdata->num_apps; i++) {
@ -103,12 +107,14 @@ static int orte_rmaps_seq_map(orte_job_t *jdata)
ORTE_ERROR_LOG(rc);
goto error;
}
nd = (orte_node_t*)opal_list_get_first(node_list);
} else {
node_list = default_node_list;
nd = save;
}
/* check for nolocal and remove the head node, if required */
if (map->policy & ORTE_RMAPS_NO_USE_LOCAL) {
if (map->policy & ORTE_MAPPING_NO_USE_LOCAL) {
for (item = opal_list_get_first(node_list);
item != opal_list_get_end(node_list);
item = opal_list_get_next(item) ) {
@ -132,43 +138,17 @@ static int orte_rmaps_seq_map(orte_job_t *jdata)
return ORTE_ERR_SILENT;
}
/* if a bookmark exists from some prior mapping, set us to start there */
cur_node_item = orte_rmaps_base_get_starting_point(node_list, jdata);
/* if num_procs wasn't specified, set it now */
if (0 == app->num_procs) {
app->num_procs = num_nodes;
}
for (i=0; i < app->num_procs; i++) {
/* see if any nodes remain unused and available. We need to do this check
* each time since we may remove nodes from the list (as they become fully
* used) as we cycle through the loop
*/
if(0 >= opal_list_get_size(node_list) ) {
/* Everything is at max usage! :( */
orte_show_help("help-orte-rmaps-seq.txt", "orte-rmaps-seq:alloc-error",
true, app->num_procs, app->app);
return ORTE_ERR_SILENT;
}
/* Save the next node we can use before claiming slots, since
* we may need to prune the nodes list removing overused nodes.
* Wrap around to beginning if we are at the end of the list
*/
if (opal_list_get_end(node_list) == opal_list_get_next(cur_node_item)) {
next = opal_list_get_first(node_list);
}
else {
next = opal_list_get_next(cur_node_item);
}
/* find this node on the global array - this is necessary so
* that our mapping gets saved on that array as the objects
* returned by the hostfile function are -not- on the array
*/
node = NULL;
nd = (orte_node_t*)cur_node_item;
for (j=0; j < orte_node_pool->size; j++) {
if (NULL == (node = (orte_node_t*)opal_pointer_array_get_item(orte_node_pool, j))) {
continue;
@ -186,42 +166,46 @@ static int orte_rmaps_seq_map(orte_job_t *jdata)
goto error;
}
/* assign next vpid to this node - do NOT allow claim_slot to remove
/* assign proc to this node - do NOT allow claim_slot to remove
* an oversubscribed node from the list!
*/
if (ORTE_SUCCESS != (rc = orte_rmaps_base_claim_slot(jdata, node,
vpid, NULL, app->idx,
jdata->map->cpus_per_rank, app->idx,
node_list,
jdata->map->oversubscribe,
false))) {
false, &proc))) {
if (ORTE_ERR_NODE_FULLY_USED != rc) {
ORTE_ERROR_LOG(rc);
goto error;
}
}
/* increment the vpid */
vpid++;
/* assign the vpid */
proc->name.vpid = vpid++;
/* add to the jdata proc array */
if (ORTE_SUCCESS != (rc = opal_pointer_array_set_item(jdata->procs, proc->name.vpid, proc))) {
ORTE_ERROR_LOG(rc);
goto error;
}
/* move to next node */
cur_node_item = next;
nd = (orte_node_t*)opal_list_get_next((opal_list_item_t*)nd);
}
/** track the total number of processes we mapped */
jdata->num_procs += app->num_procs;
/* update the bookmark */
jdata->bookmark = (orte_node_t*)cur_node_item;
/* cleanup the node list if it came from this app_context */
if (node_list != default_node_list) {
while(NULL != (item = opal_list_remove_first(node_list))) {
while (NULL != (item = opal_list_remove_first(node_list))) {
OBJ_RELEASE(item);
}
OBJ_RELEASE(node_list);
} else {
save = nd;
}
}
/* compute and save convenience values */
if (ORTE_SUCCESS != (rc = orte_rmaps_base_compute_usage(jdata))) {
/* compute and save local ranks */
if (ORTE_SUCCESS != (rc = orte_rmaps_base_compute_local_ranks(jdata))) {
ORTE_ERROR_LOG(rc);
return rc;
}
View file
@ -110,8 +110,8 @@ static int map_app_by_node(
/* Allocate a slot on this node */
node = (orte_node_t*) cur_node_item;
if (ORTE_SUCCESS != (rc = orte_rmaps_base_claim_slot(jdata, node, vpid_start + num_alloc, NULL, app->idx,
nodes, jdata->map->oversubscribe, true))) {
if (ORTE_SUCCESS != (rc = orte_rmaps_base_claim_slot(jdata, node, 1, app->idx,
nodes, jdata->map->oversubscribe, true, NULL))) {
/** if the code is ORTE_ERR_NODE_FULLY_USED, then we know this
* really isn't an error - we just need to break from the loop
* since the node is fully used up. For now, just don't report
@ -212,13 +212,13 @@ static int map_app_by_slot(
/* check if we are in npernode mode - if so, then set the num_slots_to_take
* to the num_per_node
*/
if (jdata->map->pernode) {
if (0 < jdata->map->npernode) {
num_slots_to_take = jdata->map->npernode;
}
for( i = 0; i < num_slots_to_take; ++i) {
if (ORTE_SUCCESS != (rc = orte_rmaps_base_claim_slot(jdata, node, vpid_start + num_alloc, NULL, app->idx,
nodes, jdata->map->oversubscribe, true))) {
if (ORTE_SUCCESS != (rc = orte_rmaps_base_claim_slot(jdata, node, 1, app->idx,
nodes, jdata->map->oversubscribe, true, NULL))) {
/** if the code is ORTE_ERR_NODE_FULLY_USED, then we know this
* really isn't an error - we just need to break from the loop
* since the node is fully used up. For now, just don't report
@ -426,7 +426,7 @@ static int topo_map(orte_job_t *jdata)
}
proceed:
if (map->pernode && map->npernode == 1) {
if (map->npernode == 1) {
/* there are three use-cases that we need to deal with:
* (a) if -np was not provided, then we just use the number of nodes
* (b) if -np was provided AND #procs > #nodes, then error out
@ -442,7 +442,7 @@ static int topo_map(orte_job_t *jdata)
rc = ORTE_ERR_SILENT;
goto error;
}
} else if (map->pernode && map->npernode > 1) {
} else if (map->npernode > 1) {
/* first, let's check to see if there are enough slots/node to
* meet the request - error out if not
*/
@ -473,11 +473,11 @@ static int topo_map(orte_job_t *jdata)
/** set the num_procs to equal the number of slots on these mapped nodes - if
user has specified "-bynode", then set it to the number of nodes
*/
if (map->policy & ORTE_RMAPS_BYNODE) {
if (map->policy & ORTE_MAPPING_BYNODE) {
app->num_procs = num_nodes;
} else if (map->policy & ORTE_RMAPS_BYSLOT) {
} else if (map->policy & ORTE_MAPPING_BYSLOT) {
app->num_procs = num_slots;
} else if (map->policy & ORTE_RMAPS_BYUSER) {
} else {
/* we can't handle this - it should have been set when we got
* the map info. If it wasn't, then we can only error out
*/
@ -492,10 +492,7 @@ static int topo_map(orte_job_t *jdata)
jdata->num_procs += app->num_procs;
/* Make assignments */
if (map->policy == ORTE_RMAPS_BYUSER) {
rc = ORTE_ERR_NOT_IMPLEMENTED;
goto error;
} else if (map->policy == ORTE_RMAPS_BYNODE) {
if (map->policy == ORTE_MAPPING_BYNODE) {
rc = map_app_by_node(app, jdata, vpid_start, &node_list);
} else {
rc = map_app_by_slot(app, jdata, vpid_start, &node_list);
@ -522,7 +519,7 @@ static int topo_map(orte_job_t *jdata)
}
/* compute and save convenience values */
if (ORTE_SUCCESS != (rc = orte_rmaps_base_compute_usage(jdata))) {
if (ORTE_SUCCESS != (rc = orte_rmaps_base_compute_local_ranks(jdata))) {
ORTE_ERROR_LOG(rc);
return rc;
}
View file
@ -280,7 +280,6 @@ int orte_dt_copy_map(orte_job_map_t **dest, orte_job_map_t *src, opal_data_type_
/* copy data into it */
(*dest)->policy = src->policy;
(*dest)->pernode = src->pernode;
(*dest)->npernode = src->npernode;
(*dest)->oversubscribe = src->oversubscribe;
(*dest)->display_map = src->display_map;
View file
@ -407,6 +407,15 @@ int orte_dt_pack_node(opal_buffer_t *buffer, const void *src,
return rc;
}
/* do not pack the local board, socket, and core info */
/* pack the cpu set info */
if (ORTE_SUCCESS != (rc = opal_dss_pack_buffer(buffer,
(void*)(&(nodes[i]->cpu_set)), 1, OPAL_STRING))) {
ORTE_ERROR_LOG(rc);
return rc;
}
/* do not pack the username */
}
return ORTE_SUCCESS;
@ -814,13 +823,7 @@ int orte_dt_pack_map(opal_buffer_t *buffer, const void *src,
for (i=0; i < num_vals; i++) {
/* pack the policy used to generate it */
if (ORTE_SUCCESS != (rc = opal_dss_pack_buffer(buffer, &(maps[i]->policy), 1, OPAL_UINT8))) {
ORTE_ERROR_LOG(rc);
return rc;
}
/* pack the pernode flag */
if (ORTE_SUCCESS != (rc = opal_dss_pack_buffer(buffer, &(maps[i]->pernode), 1, OPAL_BOOL))) {
if (ORTE_SUCCESS != (rc = opal_dss_pack_buffer(buffer, &(maps[i]->policy), 1, ORTE_MAPPING_POLICY))) {
ORTE_ERROR_LOG(rc);
return rc;
}
View file
@ -362,6 +362,11 @@ int orte_dt_print_node(char **output, char *prefix, orte_node_t *src, opal_data_
}
}
asprintf(&tmp2, "%s\n%s\tNum boards: %ld\tNum sockets/board: %ld\tNum cores/socket: %ld", tmp, pfx2,
(long)src->boards, (long)src->sockets_per_board, (long)src->cores_per_socket);
free(tmp);
tmp = tmp2;
if (NULL == src->daemon) {
asprintf(&tmp2, "%s\n%s\tDaemon: %s\tDaemon launched: %s", tmp, pfx2,
"Not defined", src->daemon_launched ? "True" : "False");
@ -377,8 +382,9 @@ int orte_dt_print_node(char **output, char *prefix, orte_node_t *src, opal_data_
free(tmp);
tmp = tmp2;
asprintf(&tmp2, "%s\n%s\tNum slots allocated: %ld\tMax slots: %ld", tmp, pfx2,
(long)src->slots_alloc, (long)src->slots_max);
asprintf(&tmp2, "%s\n%s\tNum slots allocated: %ld\tMax slots: %ld:\tCpu set: %s", tmp, pfx2,
(long)src->slots_alloc, (long)src->slots_max,
(NULL == src->cpu_set) ? "NULL" : src->cpu_set);
free(tmp);
tmp = tmp2;
@ -644,9 +650,8 @@ int orte_dt_print_map(char **output, char *prefix, orte_job_map_t *src, opal_dat
asprintf(&pfx, "%s\t", pfx2);
if (orte_devel_level_output) {
asprintf(&tmp, "\n%sMap generated by mapping policy: %x\n%s\tPernode: %s\tNpernode: %ld\tOversubscribe allowed: %s\tCPU Lists: %s",
pfx2, src->policy, pfx2,
(src->pernode) ? "TRUE" : "FALSE", (long)src->npernode,
asprintf(&tmp, "\n%sMap generated by mapping policy: %04x\n%s\tNpernode: %ld\tOversubscribe allowed: %s\tCPU Lists: %s",
pfx2, src->policy, pfx2, (long)src->npernode,
(src->oversubscribe) ? "TRUE" : "FALSE",
(src->cpu_lists) ? "TRUE" : "FALSE");
View file
@ -422,6 +422,16 @@ int orte_dt_unpack_node(opal_buffer_t *buffer, void *dest,
return rc;
}
/* do not unpack the board, socket, and core info */
/* unpack the cpu set */
n = 1;
if (ORTE_SUCCESS != (rc = opal_dss_unpack_buffer(buffer,
&(nodes[i]->cpu_set), &n, OPAL_STRING))) {
ORTE_ERROR_LOG(rc);
return rc;
}
/* do not unpack the username */
}
return ORTE_SUCCESS;
@ -883,15 +893,7 @@ int orte_dt_unpack_map(opal_buffer_t *buffer, void *dest,
/* unpack the policy */
n = 1;
if (ORTE_SUCCESS != (rc = opal_dss_unpack_buffer(buffer,
&(maps[i]->policy), &n, OPAL_UINT8))) {
ORTE_ERROR_LOG(rc);
return rc;
}
/* unpack the pernode flag */
n = 1;
if (ORTE_SUCCESS != (rc = opal_dss_unpack_buffer(buffer,
&(maps[i]->pernode), &n, OPAL_BOOL))) {
&(maps[i]->policy), &n, ORTE_MAPPING_POLICY))) {
ORTE_ERROR_LOG(rc);
return rc;
}
View file
@ -27,6 +27,7 @@
#endif
#include "opal/mca/base/mca_base_param.h"
#include "opal/mca/paffinity/paffinity.h"
#include "opal/util/argv.h"
#include "opal/util/output.h"
#include "opal/class/opal_pointer_array.h"
@ -132,6 +133,17 @@ bool orte_orted_exit_with_barrier = true;
/* report launch progress */
bool orte_report_launch_progress = false;
/* cluster hardware info */
uint8_t orte_default_num_boards;
uint8_t orte_default_num_sockets_per_board;
uint8_t orte_default_num_cores_per_socket;
/* allocation specification */
char *orte_default_cpu_set;
/* default rank assignment and binding policy */
orte_mapping_policy_t orte_default_mapping_policy = 0;
#endif /* !ORTE_DISABLE_FULL_RTE */
int orte_debug_output = -1;
@ -670,6 +682,16 @@ static void orte_node_construct(orte_node_t* node)
node->slots_inuse = 0;
node->slots_alloc = 0;
node->slots_max = 0;
node->boards = orte_default_num_boards;
node->sockets_per_board = orte_default_num_sockets_per_board;
node->cores_per_socket = orte_default_num_cores_per_socket;
if (NULL != orte_default_cpu_set) {
node->cpu_set = strdup(orte_default_cpu_set);
} else {
node->cpu_set = NULL;
}
node->username = NULL;
}
@ -702,6 +724,10 @@ static void orte_node_destruct(orte_node_t* node)
}
OBJ_RELEASE(node->procs);
if (NULL != node->cpu_set) {
free(node->cpu_set);
node->cpu_set = NULL;
}
if (NULL != node->username) {
free(node->username);
node->username = NULL;
@ -871,9 +897,12 @@ OBJ_CLASS_INSTANCE(orte_jmap_t,
static void orte_job_map_construct(orte_job_map_t* map)
{
map->policy = ORTE_RMAPS_BYSLOT; /* default to byslot mapping as per orterun options */
map->pernode = false;
map->policy = 0;
map->npernode = 0;
map->nperboard = 0;
map->npersocket = 0;
map->cpus_per_rank = 1;
map->stride = 1;
map->oversubscribe = true; /* default to allowing oversubscribe */
map->display_map = false;
map->cpu_lists = false;
View file
@ -38,7 +38,6 @@
#include "opal/class/opal_value_array.h"
#include "orte/mca/plm/plm_types.h"
#include "orte/mca/rmaps/rmaps_types.h"
#include "orte/util/proc_info.h"
#include "orte/util/name_fns.h"
#include "orte/runtime/runtime.h"
@ -141,6 +140,7 @@ typedef struct orte_job_t orte_job_t;
* defining it - resolves potential circular definition
*/
struct orte_proc_t;
struct orte_job_map_t;
/************/
/**
@ -241,6 +241,14 @@ typedef struct {
specified limit. For example, if we have two processors, we
may want to allow up to four processes but no more. */
orte_std_cntr_t slots_max;
/* number of physical boards in the node - defaults to 1 */
uint8_t boards;
/* number of sockets on each board - defaults to 1 */
uint8_t sockets_per_board;
/* number of cores per socket - defaults to 1 */
uint8_t cores_per_socket;
/* cpus on this node that are assigned for our use */
char *cpu_set;
/** Username on this node, if specified */
char *username;
} orte_node_t;
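The per-node options (npernode, nperboard, npersocket) implicitly rely on these three counts multiplying out to the node totals. A trivial sketch of that arithmetic (hypothetical helper, not part of this commit):

    #include <stdint.h>

    /* total cores a node exposes under the new topology fields */
    static int node_total_cores(uint8_t boards,
                                uint8_t sockets_per_board,
                                uint8_t cores_per_socket)
    {
        return (int)boards * sockets_per_board * cores_per_socket;
    }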
@ -258,6 +266,31 @@ typedef uint8_t orte_job_controls_t;
#define ORTE_JOB_CONTROL_FORWARD_COMM 0x20
#define ORTE_JOB_CONTROL_CONTINUOUS_OP 0x40
typedef uint16_t orte_mapping_policy_t;
#define ORTE_MAPPING_POLICY OPAL_UINT16
/* put the rank assignment method in the upper 8 bits */
#define ORTE_MAPPING_NOPOL 0x0100
#define ORTE_MAPPING_BYNODE 0x0200
#define ORTE_MAPPING_BYSLOT 0x0400
#define ORTE_MAPPING_BYSOCKET 0x0800
#define ORTE_MAPPING_BYBOARD 0x1000
#define ORTE_MAPPING_NO_USE_LOCAL 0x2000
#define ORTE_MAPPING_NPERXXX 0x4000
/* nice macro for setting these */
#define ORTE_SET_MAPPING_POLICY(pol) \
orte_default_mapping_policy = (orte_default_mapping_policy & 0x00ff) | (pol);
#define ORTE_ADD_MAPPING_POLICY(pol) \
orte_default_mapping_policy |= (pol);
/* put the binding policy in the lower 8 bits, using the paffinity values */
#define ORTE_BIND_TO_NONE (uint16_t)OPAL_PAFFINITY_DO_NOT_BIND
#define ORTE_BIND_TO_CORE (uint16_t)OPAL_PAFFINITY_BIND_TO_CORE
#define ORTE_BIND_TO_SOCKET (uint16_t)OPAL_PAFFINITY_BIND_TO_SOCKET
#define ORTE_BIND_TO_BOARD (uint16_t)OPAL_PAFFINITY_BIND_TO_BOARD
/* nice macro for setting these */
#define ORTE_SET_BINDING_POLICY(pol) \
orte_default_mapping_policy = (orte_default_mapping_policy & 0xff00) | (pol);
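To make the byte layout concrete, a small self-contained illustration (the hex values are inlined from the defines above; the program itself is just a demonstration):

    #include <stdint.h>
    #include <stdio.h>

    int main(void)
    {
        uint16_t policy = 0;
        /* ORTE_SET_MAPPING_POLICY(ORTE_MAPPING_BYSOCKET): upper byte */
        policy = (uint16_t)((policy & 0x00ff) | 0x0800);
        /* ORTE_SET_BINDING_POLICY(ORTE_BIND_TO_SOCKET): lower byte
         * (OPAL_PAFFINITY_BIND_TO_SOCKET == 0x04) */
        policy = (uint16_t)((policy & 0xff00) | 0x0004);
        printf("%04x\n", policy); /* prints 0804: by-socket mapping, socket binding */
        return 0;
    }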
/* error manager callback function */
typedef void (*orte_err_cb_fn_t)(orte_process_name_t *proc, orte_proc_state_t state, void *cbdata);
@ -285,7 +318,7 @@ typedef struct {
/* array of pointers to procs in this job */
opal_pointer_array_t *procs;
/* map of the job */
orte_job_map_t *map;
struct orte_job_map_t *map;
/* bookmark for where we are in mapping - this
* indicates the node where we stopped
*/
@ -531,6 +564,17 @@ ORTE_DECLSPEC extern bool orte_orted_exit_with_barrier;
/* whether or not to report launch progress */
ORTE_DECLSPEC extern bool orte_report_launch_progress;
/* cluster hardware info */
ORTE_DECLSPEC extern uint8_t orte_default_num_boards;
ORTE_DECLSPEC extern uint8_t orte_default_num_sockets_per_board;
ORTE_DECLSPEC extern uint8_t orte_default_num_cores_per_socket;
/* allocation specification */
ORTE_DECLSPEC extern char *orte_default_cpu_set;
/* default rank assignment and binding policy */
ORTE_DECLSPEC extern orte_mapping_policy_t orte_default_mapping_policy;
#endif /* ORTE_DISABLE_FULL_SUPPORT */
END_C_DECLS
View file
@ -28,6 +28,7 @@
#include <stdio.h>
#include "opal/mca/base/mca_base_param.h"
#include "opal/mca/paffinity/base/base.h"
#include "opal/util/output.h"
#include "orte/util/proc_info.h"
@ -38,6 +39,7 @@
int orte_register_params(void)
{
int value, tmp;
char *strval;
mca_base_param_reg_int_name("orte", "base_help_aggregate",
"If orte_base_help_aggregate is true, duplicate help messages will be aggregated rather than displayed individually. This can be helpful for parallel jobs that experience multiple identical failures; rather than print out the same help/failure message N times, display it once with a count of how many processes sent the same message.",
@ -297,6 +299,48 @@ int orte_register_params(void)
orte_startup_timeout = 2000; /* default to 2 seconds */
}
}
/* cluster hardware info */
mca_base_param_reg_int_name("orte", "num_boards",
"Number of processor boards/node (1-256) [default: 1]",
false, false, 1, &value);
orte_default_num_boards = (uint8_t)value;
if (OPAL_SUCCESS != opal_paffinity_base_get_socket_info(&value)) {
value = 1;
}
mca_base_param_reg_int_name("orte", "num_sockets",
"Number of sockets/board (1-256) [default: auto-sensed by mpirun or 1]",
false, false, value, &value);
orte_default_num_sockets_per_board = (uint8_t)value;
if (OPAL_SUCCESS != opal_paffinity_base_get_core_info(0, &value)) {
value = 1;
}
mca_base_param_reg_int_name("orte", "num_cores",
"Number of cores/socket (1-256) [default: auto-sensed by mpirun or 1]",
false, false, value, &value);
orte_default_num_cores_per_socket = (uint8_t)value;
/* cpu allocation specification */
mca_base_param_reg_string_name("orte", "cpu_set",
"Comma-separated list of ranges specifying logical cpus allocated to this job [default: none]",
false, false, NULL, &orte_default_cpu_set);
/* binding specification - this will be overridden by any cmd line directive, and
* ignored unless opal_paffinity_alone is set
*/
mca_base_param_reg_string_name("orte", "process_binding",
"Policy for binding processes [core | socket | board (default: none)]",
false, false, NULL, &strval);
if (NULL != strval) {
if (0 == strcmp(strval, "socket")) {
ORTE_SET_BINDING_POLICY(ORTE_BIND_TO_SOCKET);
} else if (0 == strcmp(strval, "board")) {
ORTE_SET_BINDING_POLICY(ORTE_BIND_TO_BOARD);
} else if (0 == strcmp(strval, "core")) {
ORTE_SET_BINDING_POLICY(ORTE_BIND_TO_CORE);
}
}
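Because the value comes from an MCA parameter, the same binding can be requested without any command-line flag, e.g. "mpirun -mca orte_process_binding socket ./app"; per the comment above, the setting is ignored unless paffinity_alone is also enabled.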
#endif /* ORTE_DISABLE_FULL_SUPPORT */
return ORTE_SUCCESS;
View file
@ -120,6 +120,7 @@
#include "orte/mca/rml/rml_types.h"
#include "orte/mca/plm/plm.h"
#include "orte/mca/plm/base/plm_private.h"
#include "orte/mca/rmaps/rmaps_types.h"
#include "orte/runtime/orte_globals.h"
#include "orte/runtime/orte_wait.h"
#include "orte/util/show_help.h"
@ -512,7 +513,6 @@ static void check_debugger(int fd, short event, void *arg)
* one debugger daemon on each node
*/
jdata->map = OBJ_NEW(orte_job_map_t);
jdata->map->pernode = true;
jdata->map->npernode = 1;
/* add it to the global job pool */
ljob = ORTE_LOCAL_JOBID(jdata->jobid);
View file
@ -50,6 +50,7 @@
#include "opal/event/event.h"
#include "opal/mca/installdirs/installdirs.h"
#include "opal/mca/base/base.h"
#include "opal/mca/paffinity/base/base.h"
#include "opal/util/argv.h"
#include "opal/util/output.h"
#include "opal/util/basename.h"
@ -255,10 +256,16 @@ static opal_cmd_line_init_t cmd_line_init[] = {
/* Mapping options */
{ NULL, NULL, NULL, '\0', "bynode", "bynode", 0,
&orterun_globals.by_node, OPAL_CMD_LINE_TYPE_BOOL,
"Whether to allocate/map processes round-robin by node" },
"Whether to assign processes round-robin by node" },
{ NULL, NULL, NULL, '\0', "byslot", "byslot", 0,
&orterun_globals.by_slot, OPAL_CMD_LINE_TYPE_BOOL,
"Whether to allocate/map processes round-robin by slot (the default)" },
"Whether to assign processes round-robin by slot (the default)" },
{ NULL, NULL, NULL, '\0', "bysocket", "bysocket", 0,
&orterun_globals.by_socket, OPAL_CMD_LINE_TYPE_BOOL,
"Whether to assign processes round-robin by socket" },
{ NULL, NULL, NULL, '\0', "byboard", "byboard", 0,
&orterun_globals.by_board, OPAL_CMD_LINE_TYPE_BOOL,
"Whether to assign processes round-robin by board (equivalent to bynode if only 1 board/node)" },
{ "rmaps", "base", "pernode", '\0', "pernode", "pernode", 0,
NULL, OPAL_CMD_LINE_TYPE_BOOL,
"Launch one process per available node on the specified number of nodes [no -np => use all allocated nodes]" },
@ -286,7 +293,30 @@ static opal_cmd_line_init_t cmd_line_init[] = {
{ "rmaps", "base", "no_schedule_local", '\0', "nolocal", "nolocal", 0,
NULL, OPAL_CMD_LINE_TYPE_BOOL,
"Do not run any MPI applications on the local node" },
{ "rmaps", "base", "cpus_per_rank", '\0', "cpus-per-rank", "cpus-per-rank", 1,
NULL, OPAL_CMD_LINE_TYPE_INT,
"Number of cpus to use for each rank [default=1]" },
{ "rmaps", "base", "n_perboard", '\0', "nperboard", "nperboard", 1,
NULL, OPAL_CMD_LINE_TYPE_INT,
"Launch n processes per board on all allocated nodes" },
{ "rmaps", "base", "n_persocket", '\0', "npersocket", "npersocket", 1,
NULL, OPAL_CMD_LINE_TYPE_INT,
"Launch n processes per socket on all allocated nodes" },
/* binding options */
{ NULL, NULL, NULL, '\0', "bind-to-core", "bind-to-core", 0,
&orterun_globals.bind_to_core, OPAL_CMD_LINE_TYPE_BOOL,
"Whether to bind processes to specific cores (the default)" },
{ NULL, NULL, NULL, '\0', "bind-to-board", "bind-to-board", 0,
&orterun_globals.bind_to_board, OPAL_CMD_LINE_TYPE_BOOL,
"Whether to bind processes to specific boards (meaningless on 1 board/node)" },
{ NULL, NULL, NULL, '\0', "bind-to-socket", "bind-to-socket", 0,
&orterun_globals.bind_to_socket, OPAL_CMD_LINE_TYPE_BOOL,
"Whether to bind processes to sockets" },
{ "rmaps", "base", "stride", '\0', "stride", "stride", 1,
NULL, OPAL_CMD_LINE_TYPE_INT,
"When binding multiple cores to a rank, the step size to use between cores [default: 1]" },
/* Allocation options */
{ "ras", "base", "display_alloc", '\0', "display-allocation", "display-allocation", 0,
NULL, OPAL_CMD_LINE_TYPE_BOOL,
@ -294,6 +324,20 @@ static opal_cmd_line_init_t cmd_line_init[] = {
{ "ras", "base", "display_devel_alloc", '\0', "display-devel-allocation", "display-devel-allocation", 0,
NULL, OPAL_CMD_LINE_TYPE_BOOL,
"Display a detailed list (mostly intended for developers) of the allocation being used by this job"},
{ "orte", "cpu", "set", '\0', "cpu-set", "cpu-set", 1,
NULL, OPAL_CMD_LINE_TYPE_STRING,
"Comma-separated list of ranges specifying logical cpus allocated to this job [default: none]"},
/* cluster hardware info */
{ "orte", "num", "boards", '\0', "num-boards", "num-boards", 1,
NULL, OPAL_CMD_LINE_TYPE_INT,
"Number of processor boards/node (1-256) [default: 1]"},
{ "orte", "num", "sockets", '\0', "num-sockets", "num-sockets", 1,
NULL, OPAL_CMD_LINE_TYPE_INT,
"Number of sockets/board (1-256) [default: 1]"},
{ "orte", "num", "cores", '\0', "num-cores", "num-cores", 1,
NULL, OPAL_CMD_LINE_TYPE_INT,
"Number of cores/socket (1-256) [default: 1]"},
/* mpiexec-like arguments */
{ NULL, NULL, NULL, '\0', "wdir", "wdir", 1,
@ -468,6 +512,7 @@ int orterun(int argc, char *argv[])
ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE);
return ORTE_ERR_OUT_OF_RESOURCE;
}
/* check what user wants us to do with stdin */
if (0 == strcmp(orterun_globals.stdin_target, "all")) {
jdata->stdin_target = ORTE_VPID_WILDCARD;
@ -1144,6 +1189,11 @@ static int init_globals(void)
orterun_globals.quiet = false;
orterun_globals.by_node = false;
orterun_globals.by_slot = false;
orterun_globals.by_board = false;
orterun_globals.by_socket = false;
orterun_globals.bind_to_core = false;
orterun_globals.bind_to_board = false;
orterun_globals.bind_to_socket = false;
orterun_globals.debugger = false;
orterun_globals.num_procs = 0;
if( NULL != orterun_globals.env_val )
@ -1171,8 +1221,6 @@ static int init_globals(void)
static int parse_globals(int argc, char* argv[], opal_cmd_line_t *cmd_line)
{
int id;
/* print version if requested. Do this before check for help so
that --version --help works as one might expect. */
if (orterun_globals.version &&
@ -1237,31 +1285,30 @@ static int parse_globals(int argc, char* argv[], opal_cmd_line_t *cmd_line)
orte_run_debugger(orterun_basename, cmd_line, argc, argv, orterun_globals.num_procs);
}
/* Allocate and map by node or by slot? Shortcut for setting an
MCA param. */
/* Don't initialize the MCA parameter here unless we have to,
* since it really should be initialized in rmaps_base_open */
if (orterun_globals.by_node || orterun_globals.by_slot) {
char *policy = NULL;
id = mca_base_param_reg_string_name("rmaps", "base_schedule_policy",
"Scheduling policy for RMAPS. [slot | node]",
false, false, "slot", &policy);
if (orterun_globals.by_node) {
orterun_globals.by_slot = false;
mca_base_param_set_string(id, "node");
} else {
orterun_globals.by_slot = true;
mca_base_param_set_string(id, "slot");
}
free(policy);
/* extract any rank assignment policy directives */
if (orterun_globals.by_node) {
ORTE_SET_MAPPING_POLICY(ORTE_MAPPING_BYNODE);
} else if (orterun_globals.by_board) {
ORTE_SET_MAPPING_POLICY(ORTE_MAPPING_BYBOARD);
} else if (orterun_globals.by_socket) {
ORTE_SET_MAPPING_POLICY(ORTE_MAPPING_BYSOCKET);
} else {
/* byslot is the default */
ORTE_SET_MAPPING_POLICY(ORTE_MAPPING_BYSLOT);
}
else {
/* Default */
orterun_globals.by_slot = true;
/* extract any binding policy directives - they will
* be ignored unless paffinity_alone is set
*/
if (orterun_globals.bind_to_socket) {
ORTE_SET_BINDING_POLICY(ORTE_BIND_TO_SOCKET);
} else if (orterun_globals.bind_to_board) {
ORTE_SET_BINDING_POLICY(ORTE_BIND_TO_BOARD);
} else {
/* default to by-core */
ORTE_SET_BINDING_POLICY(ORTE_BIND_TO_CORE);
}
return ORTE_SUCCESS;
}
View file
@ -43,6 +43,11 @@ struct orterun_globals_t {
bool exit;
bool by_node;
bool by_slot;
bool by_board;
bool by_socket;
bool bind_to_core;
bool bind_to_board;
bool bind_to_socket;
bool debugger;
int num_procs;
char *env_val;
View file
@ -93,3 +93,19 @@ The requested number of empty hosts was not available - the system was short by
Please recheck your allocation - further information is available on the
orte_hosts man page.
[boards]
Open RTE detected a bad parameter in the hostfile:
%s
The boards parameter is less than 0:
boards=%d
[sockets]
Open RTE detected a bad parameter in the hostfile:
%s
The sockets parameter is less than 0:
sockets=%d
[cores]
Open RTE detected a bad parameter in the hostfile:
%s
The cores parameter is less than 0:
cores=%d
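Each of the three blocks above is emitted by the hostfile parser shown below: when hostfile_parse_int() returns a negative value for boards, sockets, or cores, the corresponding message is displayed with the hostfile name and the offending value substituted for %s and %d.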
View file
@ -261,6 +261,49 @@ static int hostfile_parse_line(int token, opal_list_t* updates, opal_list_t* exc
node->username = hostfile_parse_string();
break;
case ORTE_HOSTFILE_BOARDS:
rc = hostfile_parse_int();
if (rc < 0) {
orte_show_help("help-hostfile.txt", "boards",
true,
cur_hostfile_name, rc);
OBJ_RELEASE(node);
return ORTE_ERROR;
}
node->boards = rc;
break;
case ORTE_HOSTFILE_SOCKETS_PER_BOARD:
rc = hostfile_parse_int();
if (rc < 0) {
orte_show_help("help-hostfile.txt", "sockets",
true,
cur_hostfile_name, rc);
OBJ_RELEASE(node);
return ORTE_ERROR;
}
node->sockets_per_board = rc;
break;
case ORTE_HOSTFILE_CORES_PER_SOCKET:
rc = hostfile_parse_int();
if (rc < 0) {
orte_show_help("help-hostfile.txt", "cores",
true,
cur_hostfile_name, rc);
OBJ_RELEASE(node);
return ORTE_ERROR;
}
node->cores_per_socket = rc;
break;
case ORTE_HOSTFILE_CPU_SET:
if (NULL != node->cpu_set) {
free(node->cpu_set);
}
node->cpu_set = hostfile_parse_string();
break;
case ORTE_HOSTFILE_COUNT:
case ORTE_HOSTFILE_CPU:
case ORTE_HOSTFILE_SLOTS:
View file
@ -55,22 +55,26 @@ extern orte_hostfile_value_t orte_util_hostfile_value;
#define YY_NO_UNPUT 1
#define YY_SKIP_YYWRAP 1
#define ORTE_HOSTFILE_DONE 0
#define ORTE_HOSTFILE_ERROR 1
#define ORTE_HOSTFILE_QUOTED_STRING 2
#define ORTE_HOSTFILE_EQUAL 3
#define ORTE_HOSTFILE_INT 4
#define ORTE_HOSTFILE_STRING 5
#define ORTE_HOSTFILE_CPU 6
#define ORTE_HOSTFILE_COUNT 7
#define ORTE_HOSTFILE_SLOTS 8
#define ORTE_HOSTFILE_SLOTS_MAX 9
#define ORTE_HOSTFILE_USERNAME 10
#define ORTE_HOSTFILE_IPV4 11
#define ORTE_HOSTFILE_HOSTNAME 12
#define ORTE_HOSTFILE_NEWLINE 13
#define ORTE_HOSTFILE_IPV6 14
#define ORTE_HOSTFILE_SLOT 15
#define ORTE_HOSTFILE_RELATIVE 16
#define ORTE_HOSTFILE_DONE 0
#define ORTE_HOSTFILE_ERROR 1
#define ORTE_HOSTFILE_QUOTED_STRING 2
#define ORTE_HOSTFILE_EQUAL 3
#define ORTE_HOSTFILE_INT 4
#define ORTE_HOSTFILE_STRING 5
#define ORTE_HOSTFILE_CPU 6
#define ORTE_HOSTFILE_COUNT 7
#define ORTE_HOSTFILE_SLOTS 8
#define ORTE_HOSTFILE_SLOTS_MAX 9
#define ORTE_HOSTFILE_USERNAME 10
#define ORTE_HOSTFILE_IPV4 11
#define ORTE_HOSTFILE_HOSTNAME 12
#define ORTE_HOSTFILE_NEWLINE 13
#define ORTE_HOSTFILE_IPV6 14
#define ORTE_HOSTFILE_SLOT 15
#define ORTE_HOSTFILE_RELATIVE 16
#define ORTE_HOSTFILE_BOARDS 17
#define ORTE_HOSTFILE_SOCKETS_PER_BOARD 18
#define ORTE_HOSTFILE_CORES_PER_SOCKET 19
#define ORTE_HOSTFILE_CPU_SET 20
#endif
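An illustrative hostfile line exercising the new keywords (spellings follow the tokens above and the lexer rules below; the values are invented):

    node01 slots=8 boards=1 sockets_per_board=2 cores_per_socket=4 cpu_set=0-7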
View file
@ -120,6 +120,33 @@ username { orte_util_hostfile_value.sval = yytext;
"user_name" { orte_util_hostfile_value.sval = yytext;
return ORTE_HOSTFILE_USERNAME; }
boards { orte_util_hostfile_value.sval = yytext;
return ORTE_HOSTFILE_BOARDS; }
sockets { orte_util_hostfile_value.sval = yytext;
return ORTE_HOSTFILE_SOCKETS_PER_BOARD; }
sockets_per_board { orte_util_hostfile_value.sval = yytext;
return ORTE_HOSTFILE_SOCKETS_PER_BOARD; }
"sockets-per-board" { orte_util_hostfile_value.sval = yytext;
return ORTE_HOSTFILE_SOCKETS_PER_BOARD; }
cores { orte_util_hostfile_value.sval = yytext;
return ORTE_HOSTFILE_CORES_PER_SOCKET; }
cores_per_socket { orte_util_hostfile_value.sval = yytext;
return ORTE_HOSTFILE_CORES_PER_SOCKET; }
"cores-per-socket" { orte_util_hostfile_value.sval = yytext;
return ORTE_HOSTFILE_CORES_PER_SOCKET; }
cpu_set { orte_util_hostfile_value.sval = yytext;
return ORTE_HOSTFILE_CPU_SET; }
"cpu-set" { orte_util_hostfile_value.sval = yytext;
return ORTE_HOSTFILE_CPU_SET; }
\+n[0-9]+ { orte_util_hostfile_value.sval = yytext;
return ORTE_HOSTFILE_RELATIVE; }
\+[eE][\:][0-9]+ { orte_util_hostfile_value.sval = yytext;
View file
@ -47,6 +47,7 @@
#include "orte/mca/errmgr/errmgr.h"
#include "orte/mca/odls/odls_types.h"
#include "orte/mca/rml/base/rml_contact.h"
#include "orte/mca/rmaps/rmaps_types.h"
#include "orte/util/show_help.h"
#include "orte/util/name_fns.h"
#include "orte/util/nidmap.h"
@ -472,20 +473,14 @@ char* orte_regex_encode_maps(orte_job_t *jdata)
char suffix, sfx;
orte_app_context_t *app;
/* this is only supported with regular maps - i.e., when
* the mapping is byslot or bynode. Irregular maps cannot
* be expressed in a regular expression
*
* Also only supported for one app_context
*/
if (jdata->map->policy & ORTE_RMAPS_BYUSER ||
jdata->num_apps > 1) {
/* this is only for one app_context */
if (jdata->num_apps > 1) {
return NULL;
}
/* determine the mapping policy */
byslot = true;
if (jdata->map->policy & ORTE_RMAPS_BYNODE) {
if (jdata->map->policy & ORTE_MAPPING_BYNODE) {
byslot = false;
}