Extend the mpirun options to support "--npernode N". This option tells the system to spawn N procs/node across all nodes in the allocation. If N is greater than the number of allocated slots, then the usual oversubscription logic will apply (i.e., the system will error out if oversubscription is not allowed; otherwise, it will run with sched_yield set to non-aggressive behavior).
When "--npernode" is in effect, the "-np" command line parameter is ignored. This commit was SVN r12826.
Этот коммит содержится в:
родитель
cf196ce420
Коммит
28ce8e5e5e
@ -61,6 +61,8 @@ extern "C" {
|
||||
bool oversubscribe;
|
||||
/** do we want one ppn if num_procs not specified */
|
||||
bool per_node;
|
||||
/** number of ppn for n_per_node mode */
|
||||
int n_per_node;
|
||||
/* do we not allow use of the localhost */
|
||||
bool no_use_local;
|
||||
/* display the map after it is computed */
|
||||
|
@ -53,6 +53,7 @@ int orte_rmaps_base_map_job(orte_jobid_t job, opal_list_t *attributes)
|
||||
opal_list_item_t *item;
|
||||
orte_jobid_t *jptr, parent_job=ORTE_JOBID_INVALID;
|
||||
orte_job_map_t *map;
|
||||
orte_std_cntr_t scntr;
|
||||
int rc;
|
||||
|
||||
/* if we are not on the head node, use the proxy component */
|
||||
@ -132,6 +133,19 @@ int orte_rmaps_base_map_job(orte_jobid_t job, opal_list_t *attributes)
|
||||
}
|
||||
}
|
||||
|
||||
/* check n_pernode - add it if it was set by the environment. Note that this
|
||||
* attribute does convey a value as well
|
||||
*/
|
||||
if (0 < orte_rmaps_base.n_per_node) {
|
||||
scntr = (orte_std_cntr_t)orte_rmaps_base.n_per_node;
|
||||
if (ORTE_SUCCESS != (rc = orte_rmgr.add_attribute(attributes, ORTE_RMAPS_N_PERNODE,
|
||||
ORTE_STD_CNTR, &scntr,
|
||||
ORTE_RMGR_ATTR_NO_OVERRIDE))) {
|
||||
ORTE_ERROR_LOG(rc);
|
||||
return rc;
|
||||
}
|
||||
}
|
||||
|
||||
/* check no_local - add it if it was set by the environment. Note that this
|
||||
* attribute only cares if it exists - its value is irrelevant and hence
|
||||
* not provided
|
||||
|
@ -99,8 +99,8 @@ int orte_rmaps_base_open(void)
|
||||
|
||||
/* Do we want one ppn if num_procs not specified */
|
||||
param = mca_base_param_reg_int_name("rmaps", "base_pernode",
|
||||
"Request one ppn if num procs not specified",
|
||||
false, false, 0, &value);
|
||||
"Launch one ppn as directed",
|
||||
false, false, (int)false, &value);
|
||||
|
||||
if ((int)true == value) {
|
||||
orte_rmaps_base.per_node = true;
|
||||
@ -108,6 +108,12 @@ int orte_rmaps_base_open(void)
|
||||
orte_rmaps_base.per_node = false;
|
||||
}
|
||||
|
||||
/* Do we want n ppn */
|
||||
param = mca_base_param_reg_int_name("rmaps", "base_n_pernode",
|
||||
"Launch n procs/node",
|
||||
false, false, -1, &value);
|
||||
orte_rmaps_base.n_per_node = value;
|
||||
|
||||
/* Should we schedule on the local node or not? */
|
||||
|
||||
mca_base_param_reg_int_name("rmaps", "base_schedule_local",
|
||||
|
@ -36,6 +36,7 @@ extern "C" {
|
||||
/**** RMAPS ATTRIBUTES ***/
|
||||
#define ORTE_RMAPS_MAP_POLICY "orte-map-policy"
|
||||
#define ORTE_RMAPS_PERNODE "orte-map-pernode"
|
||||
#define ORTE_RMAPS_N_PERNODE "orte-map-n-pernode"
|
||||
#define ORTE_RMAPS_NO_USE_LOCAL "orte-map-no-use-local"
|
||||
#define ORTE_RMAPS_NO_OVERSUB "orte-map-no-oversubscribe"
|
||||
#define ORTE_RMAPS_DESIRED_MAPPER "orte-map-desired"
|
||||
|
@ -51,6 +51,7 @@
|
||||
*/
|
||||
static opal_list_item_t *cur_node_item = NULL;
|
||||
static opal_list_t fully_used_nodes;
|
||||
static orte_std_cntr_t num_per_node;
|
||||
|
||||
|
||||
/*
|
||||
@ -267,6 +268,7 @@ static int orte_rmaps_rr_process_attrs(opal_list_t *attributes)
|
||||
int rc;
|
||||
char *policy;
|
||||
orte_attribute_t *attr;
|
||||
orte_std_cntr_t *scptr;
|
||||
|
||||
mca_rmaps_round_robin_component.bynode = false; /* set default mapping policy */
|
||||
if (NULL != (attr = orte_rmgr.find_attribute(attributes, ORTE_RMAPS_MAP_POLICY))) {
|
||||
@ -288,6 +290,20 @@ static int orte_rmaps_rr_process_attrs(opal_list_t *attributes)
|
||||
mca_rmaps_round_robin_component.bynode = true;
|
||||
}
|
||||
|
||||
mca_rmaps_round_robin_component.n_per_node = false;
|
||||
if (NULL != (attr = orte_rmgr.find_attribute(attributes, ORTE_RMAPS_N_PERNODE))) {
|
||||
/* was provided - set boolean accordingly */
|
||||
mca_rmaps_round_robin_component.n_per_node = true;
|
||||
/* get the number of procs per node to launch */
|
||||
if (ORTE_SUCCESS != (rc = orte_dss.get((void**)&scptr, attr->value, ORTE_STD_CNTR))) {
|
||||
ORTE_ERROR_LOG(rc);
|
||||
return rc;
|
||||
}
|
||||
num_per_node = *scptr;
|
||||
/* indicate that we are going to map this job bynode */
|
||||
mca_rmaps_round_robin_component.bynode = true;
|
||||
}
|
||||
|
||||
mca_rmaps_round_robin_component.no_use_local = false;
|
||||
if (NULL != (attr = orte_rmgr.find_attribute(attributes, ORTE_RMAPS_NO_USE_LOCAL))) {
|
||||
/* was provided - set boolean accordingly */
|
||||
@ -468,6 +484,10 @@ static int orte_rmaps_rr_map(orte_jobid_t jobid, opal_list_t *attributes)
|
||||
true, app->num_procs, num_nodes, NULL);
|
||||
return ORTE_ERR_SILENT;
|
||||
}
|
||||
} else if (mca_rmaps_round_robin_component.n_per_node) {
|
||||
/* set the num_procs to equal the specified num/node * the number of nodes */
|
||||
app->num_procs = num_per_node * num_nodes;
|
||||
modify_app_context = true;
|
||||
} else if (0 == app->num_procs) {
|
||||
/** set the num_procs to equal the number of slots on these mapped nodes - if
|
||||
user has specified "-bynode", then set it to the number of nodes
|
||||
|
@ -38,6 +38,7 @@ struct orte_rmaps_round_robin_component_t {
|
||||
int priority;
|
||||
bool bynode;
|
||||
bool per_node;
|
||||
bool n_per_node;
|
||||
bool no_use_local;
|
||||
bool oversubscribe;
|
||||
};
|
||||
|
@ -115,6 +115,7 @@ struct globals_t {
|
||||
bool no_local_schedule;
|
||||
bool reuse_daemons;
|
||||
int num_procs;
|
||||
int n_per_node;
|
||||
int exit_status;
|
||||
char *hostfile;
|
||||
char *env_val;
|
||||
@ -195,7 +196,10 @@ opal_cmd_line_init_t cmd_line_init[] = {
|
||||
"Whether to allocate/map processes round-robin by slot (the default)" },
|
||||
{ NULL, NULL, NULL, '\0', "pernode", "pernode", 0,
|
||||
&orterun_globals.per_node, OPAL_CMD_LINE_TYPE_BOOL,
|
||||
"If no number of process is specified, this will cause one process per available node to be executed" },
|
||||
"Launch one process per available node on the specified number of nodes [no -np => use all allocated nodes]" },
|
||||
{ NULL, NULL, NULL, '\0', "npernode", "npernode", 1,
|
||||
&orterun_globals.n_per_node, OPAL_CMD_LINE_TYPE_INT,
|
||||
"Launch n processes per node on all allocated nodes" },
|
||||
{ NULL, NULL, NULL, '\0', "nooversubscribe", "nooversubscribe", 0,
|
||||
&orterun_globals.no_oversubscribe, OPAL_CMD_LINE_TYPE_BOOL,
|
||||
"Nodes are not to be oversubscribed, even if the system supports such operation"},
|
||||
@ -858,8 +862,9 @@ static int init_globals(void)
|
||||
orterun_globals.no_oversubscribe = false;
|
||||
orterun_globals.debugger = false;
|
||||
orterun_globals.no_local_schedule = false;
|
||||
orterun_globals.num_procs = 0;
|
||||
orterun_globals.exit_status = 0;
|
||||
orterun_globals.num_procs = 0;
|
||||
orterun_globals.n_per_node = -1;
|
||||
orterun_globals.exit_status = 0;
|
||||
if( NULL != orterun_globals.hostfile )
|
||||
free( orterun_globals.hostfile );
|
||||
orterun_globals.hostfile = NULL;
|
||||
@ -972,8 +977,8 @@ static int parse_globals(int argc, char* argv[])
|
||||
*/
|
||||
if (orterun_globals.per_node) {
|
||||
id = mca_base_param_reg_int_name("rmaps", "base_pernode",
|
||||
"Request one ppn if num procs not specified",
|
||||
false, false, 0, &ret);
|
||||
"Launch one ppn as directed",
|
||||
false, false, (int)false, &ret);
|
||||
|
||||
if (orterun_globals.per_node) {
|
||||
mca_base_param_set_int(id, (int)true);
|
||||
@ -982,6 +987,13 @@ static int parse_globals(int argc, char* argv[])
|
||||
}
|
||||
}
|
||||
|
||||
/* did the user request "npernode", indicating we are to spawn N ppn */
|
||||
id = mca_base_param_reg_int_name("rmaps", "base_n_pernode",
|
||||
"Launch n procs/node",
|
||||
false, false, -1, &ret);
|
||||
|
||||
mca_base_param_set_int(id, orterun_globals.n_per_node);
|
||||
|
||||
/** Do we want to disallow oversubscription of nodes? */
|
||||
id = mca_base_param_reg_int_name("rmaps", "base_no_oversubscribe",
|
||||
"If nonzero, do not allow oversubscription of processes on nodes. If zero (default), oversubscription is allowed.",
|
||||
|
Загрузка…
x
Ссылка в новой задаче
Block a user