Modify the pernode option so it can accept a request for the number of processes to be launched. We now check three use-cases for pernode:
1. no -np provided - put one proc/node across all allocated nodes
2. -np N provided, N > #nodes - we print a pretty error message and exit
3. -np N provided, N <= #nodes - put one proc/node across N nodes

I also added a new orte constant (ORTE_ERR_SILENT) that allows us to pass up the chain that an error was encountered, but NOT print ORTE_ERROR_LOG messages. This is intended to be used for cases where the error we encounter is NOT an orte error, but rather is one associated with incorrect user input (e.g., the preceding case 2). In such cases, there is no point in printing an ORTE_ERROR_LOG chain of messages, as it isn't an orte error.

This commit was SVN r12821.
Этот коммит содержится в:
родитель
0a5d41857a
Коммит
8314e8dbb9
@ -92,7 +92,8 @@ enum {
|
||||
ORTE_ERR_PROC_EXIT_STATUS_MISSING = (ORTE_ERR_BASE - 24),
|
||||
ORTE_ERR_INDETERMINATE_STATE_INFO = (ORTE_ERR_BASE - 25),
|
||||
ORTE_ERR_NODE_FULLY_USED = (ORTE_ERR_BASE - 26),
|
||||
ORTE_ERR_INVALID_NUM_PROCS = (ORTE_ERR_BASE - 27)
|
||||
ORTE_ERR_INVALID_NUM_PROCS = (ORTE_ERR_BASE - 27),
|
||||
ORTE_ERR_SILENT = (ORTE_ERR_BASE - 28)
|
||||
};
|
||||
|
||||
#define ORTE_ERR_MAX (ORTE_ERR_BASE - 100)
|
||||
|
@ -37,6 +37,11 @@ void orte_errmgr_base_log(int error_code, char *filename, int line)
|
||||
{
|
||||
OPAL_TRACE(1);
|
||||
|
||||
if (ORTE_ERR_SILENT == error_code) {
|
||||
/* if the error is silent, say nothing */
|
||||
return;
|
||||
}
|
||||
|
||||
if (NULL == orte_process_info.my_name) {
|
||||
opal_output(0, "[NO-NAME] ORTE_ERROR_LOG: %s in file %s at line %d",
|
||||
ORTE_ERROR_NAME(error_code), filename, line);
|
||||
|
@ -29,4 +29,10 @@ for use.
|
||||
RMAPS found multiple applications to be launched, with
|
||||
at least one that failed to specify the number of processes to execute.
|
||||
When specifying multiple applications, you must specify how many processes
|
||||
of each to launch via the -np argument.
|
||||
of each to launch via the -np argument.
|
||||
|
||||
[orte-rmaps-rr:per-node-and-too-many-procs]
|
||||
There are not enough nodes in your allocation to satisfy your request to launch
|
||||
%d processes on a per-node basis - only %d nodes were available.
|
||||
|
||||
Either request fewer processes, or obtain a larger allocation.
|
||||
|
@ -284,6 +284,8 @@ static int orte_rmaps_rr_process_attrs(opal_list_t *attributes)
|
||||
if (NULL != (attr = orte_rmgr.find_attribute(attributes, ORTE_RMAPS_PERNODE))) {
|
||||
/* was provided - set boolean accordingly */
|
||||
mca_rmaps_round_robin_component.per_node = true;
|
||||
/* indicate that we are going to map this job bynode */
|
||||
mca_rmaps_round_robin_component.bynode = true;
|
||||
}
|
||||
|
||||
mca_rmaps_round_robin_component.no_use_local = false;
|
||||
@ -314,10 +316,9 @@ static int orte_rmaps_rr_map(orte_jobid_t jobid, opal_list_t *attributes)
|
||||
orte_ras_node_t *node, *node2;
|
||||
char *save_bookmark;
|
||||
orte_vpid_t vpid_start, job_vpid_start=0;
|
||||
orte_std_cntr_t num_procs = 0, total_num_slots, mapped_num_slots;
|
||||
orte_std_cntr_t num_procs = 0, total_num_slots, mapped_num_slots, num_nodes, num_slots;
|
||||
int rc;
|
||||
bool modify_app_context = false;
|
||||
bool nprocs_not_specified;
|
||||
char *sptr;
|
||||
orte_attribute_t *attr;
|
||||
|
||||
@ -436,20 +437,8 @@ static int orte_rmaps_rr_map(orte_jobid_t jobid, opal_list_t *attributes)
|
||||
/* Set cur_node_item to point to the first node in the specified list to be used */
|
||||
cur_node_item = opal_list_get_first(working_node_list);
|
||||
|
||||
if (0 == app->num_procs) {
|
||||
nprocs_not_specified = true;
|
||||
/** set the num_procs to equal the number of slots on these mapped nodes - if
|
||||
user has specified "-pernode", then set it to the number of nodes
|
||||
*/
|
||||
if (mca_rmaps_round_robin_component.per_node) {
|
||||
app->num_procs = (orte_std_cntr_t)opal_list_get_size(&mapped_node_list);
|
||||
} else {
|
||||
app->num_procs = (orte_std_cntr_t)mapped_num_slots;
|
||||
}
|
||||
modify_app_context = true;
|
||||
} else {
|
||||
nprocs_not_specified = false;
|
||||
}
|
||||
num_nodes = (orte_std_cntr_t)opal_list_get_size(&mapped_node_list);
|
||||
num_slots = (orte_std_cntr_t)mapped_num_slots;
|
||||
}
|
||||
else {
|
||||
/** no mapping was specified, so we are going to just use everything that was
|
||||
@ -459,20 +448,36 @@ static int orte_rmaps_rr_map(orte_jobid_t jobid, opal_list_t *attributes)
|
||||
*/
|
||||
working_node_list = &master_node_list;
|
||||
|
||||
num_nodes = (orte_std_cntr_t)opal_list_get_size(&master_node_list);
|
||||
num_slots = total_num_slots;
|
||||
}
|
||||
|
||||
if (mca_rmaps_round_robin_component.per_node) {
|
||||
/* there are three use-cases that we need to deal with:
|
||||
* (a) if -np was not provided, then we just use the number of nodes
|
||||
* (b) if -np was provided AND #procs > #nodes, then error out
|
||||
* (c) if -np was provided AND #procs <= #nodes, then launch
|
||||
* the specified #procs one/node. In this case, we just
|
||||
* leave app->num_procs alone
|
||||
*/
|
||||
if (0 == app->num_procs) {
|
||||
nprocs_not_specified = true;
|
||||
/** set the num_procs to equal the number of slots on these mapped nodes - if
|
||||
user has specified "-pernode", then set it to the number of nodes
|
||||
*/
|
||||
if (mca_rmaps_round_robin_component.per_node) {
|
||||
app->num_procs = (orte_std_cntr_t)opal_list_get_size(&master_node_list);
|
||||
} else {
|
||||
app->num_procs = total_num_slots;
|
||||
}
|
||||
app->num_procs = num_nodes;
|
||||
modify_app_context = true;
|
||||
} else {
|
||||
nprocs_not_specified = false;
|
||||
} else if (app->num_procs > num_nodes) {
|
||||
opal_show_help("help-orte-rmaps-rr.txt", "orte-rmaps-rr:per-node-and-too-many-procs",
|
||||
true, app->num_procs, num_nodes, NULL);
|
||||
return ORTE_ERR_SILENT;
|
||||
}
|
||||
} else if (0 == app->num_procs) {
|
||||
/** set the num_procs to equal the number of slots on these mapped nodes - if
|
||||
user has specified "-bynode", then set it to the number of nodes
|
||||
*/
|
||||
if (mca_rmaps_round_robin_component.bynode) {
|
||||
app->num_procs = num_nodes;
|
||||
} else {
|
||||
app->num_procs = num_slots;
|
||||
}
|
||||
modify_app_context = true;
|
||||
}
|
||||
|
||||
/* allocate a vpid range for this app within the job */
|
||||
@ -491,10 +496,7 @@ static int orte_rmaps_rr_map(orte_jobid_t jobid, opal_list_t *attributes)
|
||||
num_procs += app->num_procs;
|
||||
|
||||
/* Make assignments */
|
||||
/* if the number of procs was not specified, and we want to map pernode,
|
||||
* then we need to do the bynode mapping */
|
||||
if (mca_rmaps_round_robin_component.bynode ||
|
||||
(nprocs_not_specified && mca_rmaps_round_robin_component.per_node)) {
|
||||
if (mca_rmaps_round_robin_component.bynode) {
|
||||
rc = map_app_by_node(app, map, jobid, vpid_start, working_node_list, &max_used_nodes);
|
||||
} else {
|
||||
rc = map_app_by_slot(app, map, jobid, vpid_start, working_node_list, &max_used_nodes);
|
||||
|
@ -109,6 +109,9 @@ orte_err2str(int errnum)
|
||||
case ORTE_ERR_INVALID_NUM_PROCS:
|
||||
retval = "Multiple applications were specified, but at least one failed to specify the number of processes to run";
|
||||
break;
|
||||
case ORTE_ERR_SILENT:
|
||||
retval = NULL;
|
||||
break;
|
||||
default:
|
||||
retval = NULL;
|
||||
}
|
||||
|
Загрузка…
x
Ссылка в новой задаче
Block a user