1
1

Issue an error message and abort if the user requests a number of processes that conflicts with nperxxx directives when evaluated against available resources

This commit was SVN r21949.
Этот коммит содержится в:
Ralph Castain 2009-09-07 03:36:10 +00:00
родитель ca09e8f604
Коммит 142036f2c0
2 изменённых файлов: 99 добавлений и 20 удалений

Просмотреть файл

@ -60,4 +60,18 @@ to spawn too many daemons and will be aborted.
This may be resolved by increasing the number of available ranks by
re-configuring with the --enable-jumbo-apps option, and then
re-building the application.
#
[rmaps:too-many-procs]
Your job has requested a conflicting number of processes for the
application:
App: %s
number of procs: %d
This is more processes than we can launch under the following
additional directives and conditions:
%s: %d
%s: %d
Please resolve the conflict and try again.

Просмотреть файл

@ -101,11 +101,13 @@ static int npernode(orte_job_t *jdata)
opal_list_item_t *item;
orte_std_cntr_t num_slots;
orte_node_t *node;
int total_procs=0, np;
int total_procs=0, np, nprocs;
int num_nodes;
/* setup the node list */
OBJ_CONSTRUCT(&node_list, opal_list_t);
total_procs = 0;
/* loop through the app_contexts */
for(i=0; i < jdata->apps->size; i++) {
if (NULL == (app = (orte_app_context_t*)opal_pointer_array_get_item(jdata->apps, i))) {
@ -117,7 +119,6 @@ static int npernode(orte_job_t *jdata)
} else {
np = INT_MAX;
}
total_procs = 0;
/* for each app_context, we have to get the list of nodes that it can
* use since that can now be modified with a hostfile and/or -host
* option
@ -128,10 +129,12 @@ static int npernode(orte_job_t *jdata)
goto error;
}
/* loop through the list of nodes */
num_nodes = opal_list_get_size(&node_list);
nprocs = 0;
while (NULL != (item = opal_list_remove_first(&node_list))) {
node = (orte_node_t*)item;
/* put the specified number of procs on each node */
for (j=0; j < orte_rmaps_base.npernode && total_procs < np; j++) {
for (j=0; j < orte_rmaps_base.npernode && nprocs < np; j++) {
if (ORTE_SUCCESS != (rc = orte_rmaps_base_claim_slot(jdata, node,
jdata->map->cpus_per_rank, app->idx,
&node_list, jdata->map->oversubscribe,
@ -147,9 +150,22 @@ static int npernode(orte_job_t *jdata)
}
}
total_procs++;
nprocs++;
}
OBJ_RELEASE(node);
}
/* if the user requested a specific number of procs for this
* app and the number of procs we were able to assign to it
* is less than the number requested, then we have a
* problem - report the conflict and return an error
*/
if (0 < app->num_procs && nprocs < app->num_procs) {
orte_show_help("help-orte-rmaps-base.txt", "rmaps:too-many-procs", true,
app->app, app->num_procs,
"number of nodes", num_nodes,
"npernode", orte_rmaps_base.npernode);
return ORTE_ERR_SILENT;
}
}
jdata->num_procs = total_procs;
@ -169,10 +185,12 @@ static int nperboard(orte_job_t *jdata)
opal_list_item_t *item;
orte_std_cntr_t num_slots;
orte_node_t *node;
int total_procs=0, np;
int total_procs=0, np, nprocs;
int num_boards;
/* setup the node list */
OBJ_CONSTRUCT(&node_list, opal_list_t);
total_procs = 0;
/* loop through the app_contexts */
for(i=0; i < jdata->apps->size; i++) {
@ -185,7 +203,6 @@ static int nperboard(orte_job_t *jdata)
} else {
np = INT_MAX;
}
total_procs = 0;
/* for each app_context, we have to get the list of nodes that it can
* use since that can now be modified with a hostfile and/or -host
* option
@ -196,12 +213,14 @@ static int nperboard(orte_job_t *jdata)
goto error;
}
/* loop through the list of nodes */
nprocs = 0;
while (NULL != (item = opal_list_remove_first(&node_list))) {
node = (orte_node_t*)item;
num_boards = node->boards;
/* loop through the number of boards in this node */
for (k=0; k < node->boards && total_procs < np; k++) {
for (k=0; k < node->boards && nprocs < np; k++) {
/* put the specified number of procs on each board */
for (j=0; j < orte_rmaps_base.nperboard && total_procs < np; j++) {
for (j=0; j < orte_rmaps_base.nperboard && nprocs < np; j++) {
if (ORTE_SUCCESS != (rc = orte_rmaps_base_claim_slot(jdata, node,
jdata->map->cpus_per_rank, app->idx,
&node_list, jdata->map->oversubscribe,
@ -217,10 +236,23 @@ static int nperboard(orte_job_t *jdata)
}
}
total_procs++;
nprocs++;
}
}
OBJ_RELEASE(node);
}
/* if the user requested a specific number of procs for this
* app and the number of procs we were able to assign to it
* is less than the number requested, then we have a
* problem - report the conflict and return an error
*/
if (0 < app->num_procs && nprocs < app->num_procs) {
orte_show_help("help-orte-rmaps-base.txt", "rmaps:too-many-procs", true,
app->app, app->num_procs,
"number of boards", num_boards,
"nperboard", orte_rmaps_base.nperboard);
return ORTE_ERR_SILENT;
}
}
jdata->num_procs = total_procs;
@ -241,11 +273,13 @@ static int npersocket(orte_job_t *jdata)
opal_list_item_t *item;
orte_std_cntr_t num_slots;
orte_node_t *node;
int total_procs=0, np;
int total_procs=0, np, nprocs;
int num_sockets;
/* setup the node list */
OBJ_CONSTRUCT(&node_list, opal_list_t);
total_procs = 0;
/* loop through the app_contexts */
for(i=0; i < jdata->apps->size; i++) {
if (NULL == (app = (orte_app_context_t*)opal_pointer_array_get_item(jdata->apps, i))) {
@ -257,7 +291,6 @@ static int npersocket(orte_job_t *jdata)
} else {
np = INT_MAX;
}
total_procs = 0;
/* for each app_context, we have to get the list of nodes that it can
* use since that can now be modified with a hostfile and/or -host
* option
@ -268,12 +301,14 @@ static int npersocket(orte_job_t *jdata)
goto error;
}
/* loop through the list of nodes */
nprocs = 0;
while (NULL != (item = opal_list_remove_first(&node_list))) {
node = (orte_node_t*)item;
num_sockets = node->sockets_per_board;
/* loop through the number of boards in this node */
for (k=0; k < node->boards && total_procs < np; k++) {
for (k=0; k < node->boards && nprocs < np; k++) {
/* loop through the number of sockets/board */
for (n=0; n < node->sockets_per_board && total_procs < np; n++) {
for (n=0; n < node->sockets_per_board && nprocs < np; n++) {
/* put the specified number of procs on each socket */
for (j=0; j < orte_rmaps_base.npersocket && total_procs < np; j++) {
if (ORTE_SUCCESS != (rc = orte_rmaps_base_claim_slot(jdata, node,
@ -292,11 +327,24 @@ static int npersocket(orte_job_t *jdata)
}
/* track the number of procs */
total_procs++;
nprocs++;
}
}
}
OBJ_RELEASE(node);
}
/* if the user requested a specific number of procs for this
* app and the number of procs we were able to assign to it
* is less than the number requested, then we have a
* problem - report the conflict and return an error
*/
if (0 < app->num_procs && nprocs < app->num_procs) {
orte_show_help("help-orte-rmaps-base.txt", "rmaps:too-many-procs", true,
app->app, app->num_procs,
"number of sockets", num_sockets,
"npersocket", orte_rmaps_base.npersocket);
return ORTE_ERR_SILENT;
}
}
jdata->num_procs = total_procs;
@ -319,13 +367,14 @@ static int loadbalance(orte_job_t *jdata)
int i, j;
opal_list_t node_list;
orte_std_cntr_t num_nodes, num_slots;
int rc=ORTE_SUCCESS, total_procs=0;
int rc=ORTE_SUCCESS, total_procs, np, nprocs;
int ppn = 0;
opal_list_item_t *item, *start;
orte_node_t *node;
/* setup */
OBJ_CONSTRUCT(&node_list, opal_list_t);
total_procs = 0;
/* compute total #procs we are going to add and the total number of nodes available */
for(i=0; i < jdata->apps->size; i++) {
@ -338,20 +387,22 @@ static int loadbalance(orte_job_t *jdata)
ORTE_ERROR_LOG(rc);
goto error;
}
if (0 == app->num_procs) {
if (0 < app->num_procs) {
np = app->num_procs;
} else {
/* set the num_procs to the #slots */
app->num_procs = num_slots;
np = num_slots;
}
num_nodes = opal_list_get_size(&node_list);
/* compute the base ppn */
ppn = app->num_procs / num_nodes;
ppn = np / num_nodes;
/* if a bookmark exists from some prior mapping, set us to start there */
start = orte_rmaps_base_get_starting_point(&node_list, jdata);
/* loop through the list of nodes until we either assign all the procs
* or return to the starting point
*/
total_procs = 0;
item = start;
nprocs = 0;
do {
node = (orte_node_t*)item;
/* put the specified number of procs on each node */
@ -370,6 +421,7 @@ static int loadbalance(orte_job_t *jdata)
}
}
total_procs++;
nprocs++;
}
/* move to next node */
if (opal_list_get_end(&node_list) == opal_list_get_next(item)) {
@ -378,7 +430,7 @@ static int loadbalance(orte_job_t *jdata)
else {
item = opal_list_get_next(item);
}
} while (item != start);
} while (item != start && nprocs < np);
/* save the bookmark */
jdata->bookmark = node;
@ -387,7 +439,7 @@ static int loadbalance(orte_job_t *jdata)
* again, assigning 1 per node until all are assigned
*/
item = start;
while (total_procs < app->num_procs) {
while (nprocs < np) {
node = (orte_node_t*)item;
if (ORTE_SUCCESS != (rc = orte_rmaps_base_claim_slot(jdata, node,
jdata->map->cpus_per_rank, app->idx,
@ -400,6 +452,7 @@ static int loadbalance(orte_job_t *jdata)
}
}
total_procs++;
nprocs++;
/* move to next node */
if (opal_list_get_end(&node_list) == opal_list_get_next(item)) {
item = opal_list_get_first(&node_list);
@ -415,6 +468,18 @@ static int loadbalance(orte_job_t *jdata)
while (NULL != (item = opal_list_remove_first(&node_list))) {
OBJ_RELEASE(item);
}
/* if the user requested a specific number of procs for this
* app and the number of procs we were able to assign to it
* is less than the number requested, then we have a
* problem - report the conflict and return an error
*/
if (0 < app->num_procs && nprocs < app->num_procs) {
orte_show_help("help-orte-rmaps-base.txt", "rmaps:too-many-procs", true,
app->app, app->num_procs,
"number of slots", nprocs,
"number of nodes", num_nodes);
return ORTE_ERR_SILENT;
}
}
/* record the number of procs */
jdata->num_procs = total_procs;