1
1

Correctly handle the -host and -hostfile options. Ensure that the initial VM launch constrains itself to the union of the specified hosts when those options are given, and set the oversubscribe flag correctly for that case.

This commit was SVN r25648.
Этот коммит содержится в:
Ralph Castain 2011-12-14 20:01:15 +00:00
родитель f7d3234f33
Коммит f531b09a8d
16 изменённых файлов: 264 добавлений и 247 удалений

Просмотреть файл

@ -181,7 +181,7 @@ static int plm_alps_launch_job(orte_job_t *jdata)
/* start by setting up the virtual machine */
daemons = orte_get_job_data_object(ORTE_PROC_MY_NAME->jobid);
if (ORTE_SUCCESS != (rc = orte_plm_base_setup_virtual_machine(daemons))) {
if (ORTE_SUCCESS != (rc = orte_plm_base_setup_virtual_machine(jdata))) {
ORTE_ERROR_LOG(rc);
goto cleanup;
}

Просмотреть файл

@ -47,6 +47,7 @@
#include "orte/mca/iof/iof.h"
#include "orte/mca/ras/ras.h"
#include "orte/mca/rmaps/rmaps.h"
#include "orte/mca/rmaps/base/base.h"
#include "orte/mca/rml/rml.h"
#include "orte/mca/rml/rml_types.h"
#include "orte/mca/routed/routed.h"
@ -66,6 +67,7 @@
#include "orte/util/nidmap.h"
#include "orte/util/proc_info.h"
#include "orte/util/regex.h"
#include "orte/util/hostfile/hostfile.h"
#include "orte/mca/odls/odls_types.h"
@ -965,17 +967,26 @@ int orte_plm_base_orted_append_basic_args(int *argc, char ***argv,
return ORTE_SUCCESS;
}
int orte_plm_base_setup_virtual_machine(orte_job_t *daemons)
int orte_plm_base_setup_virtual_machine(orte_job_t *jdata)
{
orte_node_t *node;
orte_proc_t *proc;
orte_job_map_t *map=NULL;
int rc, i;
orte_job_t *daemons;
opal_list_t nodes;
opal_list_item_t *item, *next;
orte_app_context_t *app;
OPAL_OUTPUT_VERBOSE((5, orte_plm_globals.output,
"%s plm:base:setup_vm",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
if (NULL == (daemons = orte_get_job_data_object(ORTE_PROC_MY_NAME->jobid))) {
ORTE_ERROR_LOG(ORTE_ERR_NOT_FOUND);
return ORTE_ERR_NOT_FOUND;
}
if (NULL == daemons->map) {
OPAL_OUTPUT_VERBOSE((5, orte_plm_globals.output,
"%s plm:base:setup_vm creating map",
@ -994,6 +1005,85 @@ int orte_plm_base_setup_virtual_machine(orte_job_t *daemons)
}
map = daemons->map;
/* run the allocator on the application job - this allows us to
* pickup any host or hostfile arguments so we get the full
* array of nodes in our allocation
*/
if (ORTE_SUCCESS != (rc = orte_ras.allocate(jdata))) {
ORTE_ERROR_LOG(rc);
return rc;
}
/* construct a list of available nodes - don't need ours as
* we already exist
*/
OBJ_CONSTRUCT(&nodes, opal_list_t);
for (i=1; i < orte_node_pool->size; i++) {
if (NULL != (node = (orte_node_t*)opal_pointer_array_get_item(orte_node_pool, i))) {
/* ignore nodes that are marked as do-not-use for this mapping */
if (ORTE_NODE_STATE_DO_NOT_USE == node->state) {
/* reset the state so it can be used another time */
node->state = ORTE_NODE_STATE_UP;
continue;
}
if (ORTE_NODE_STATE_DOWN == node->state) {
continue;
}
if (ORTE_NODE_STATE_NOT_INCLUDED == node->state) {
/* not to be used */
continue;
}
/* retain a copy for our use in case the item gets
* destructed along the way
*/
OBJ_RETAIN(node);
opal_list_append(&nodes, &node->super);
/* by default, mark these as not to be included
* so the filtering logic works correctly
*/
node->mapped = false;
}
}
/* is there a default hostfile? */
if (NULL != orte_default_hostfile) {
/* yes - filter the node list through the file, marking
* any nodes not in the file -or- excluded via ^
*/
if (ORTE_SUCCESS != (rc = orte_util_filter_hostfile_nodes(&nodes, orte_default_hostfile, false))) {
ORTE_ERROR_LOG(rc);
return rc;
}
}
/* filter across the union of all app_context specs */
for (i=0; i < jdata->apps->size; i++) {
if (NULL == (app = (orte_app_context_t*)opal_pointer_array_get_item(jdata->apps, i))) {
continue;
}
if (ORTE_SUCCESS != (rc = orte_rmaps_base_filter_nodes(app, &nodes, false)) &&
rc != ORTE_ERR_TAKE_NEXT_OPTION) {
ORTE_ERROR_LOG(rc);
return rc;
}
}
if (ORTE_ERR_TAKE_NEXT_OPTION != rc) {
/* at least one filtering option was executed, so
* remove all nodes that were not mapped
*/
item = opal_list_get_first(&nodes);
while (item != opal_list_get_end(&nodes)) {
next = opal_list_get_next(item);
node = (orte_node_t*)item;
if (!node->mapped) {
opal_list_remove_item(&nodes, item);
OBJ_RELEASE(item);
}
item = next;
}
}
/* zero-out the number of new daemons as we will compute this
* each time we are called
*/
@ -1003,19 +1093,16 @@ int orte_plm_base_setup_virtual_machine(orte_job_t *daemons)
* have a daemon on them - no need to include our own as we are
* obviously already here!
*/
for (i=1; i < orte_node_pool->size; i++) {
if (NULL == (node = (orte_node_t*)opal_pointer_array_get_item(orte_node_pool, i))) {
continue;
}
while (NULL != (item = opal_list_remove_first(&nodes))) {
node = (orte_node_t*)item;
/* if this node is already in the map, skip it */
if (NULL != node->daemon) {
OBJ_RELEASE(node);
continue;
}
/* add the node to the map */
opal_pointer_array_add(map->nodes, (void*)node);
++(map->num_nodes);
/* maintain accounting */
OBJ_RETAIN(node);
/* create a new daemon object for this node */
proc = OBJ_NEW(orte_proc_t);
if (NULL == proc) {

Просмотреть файл

@ -177,7 +177,7 @@ static int plm_ccp_launch_job(orte_job_t *jdata)
/* start by launching the virtual machine */
daemons = orte_get_job_data_object(ORTE_PROC_MY_NAME->jobid);
if (ORTE_SUCCESS != (rc = orte_plm_base_setup_virtual_machine(daemons))) {
if (ORTE_SUCCESS != (rc = orte_plm_base_setup_virtual_machine(jdata))) {
ORTE_ERROR_LOG(rc);
goto cleanup;
}

Просмотреть файл

@ -165,7 +165,7 @@ static int plm_lsf_launch_job(orte_job_t *jdata)
/* start by setting up the virtual machine */
daemons = orte_get_job_data_object(ORTE_PROC_MY_NAME->jobid);
if (ORTE_SUCCESS != (rc = orte_plm_base_setup_virtual_machine(daemons))) {
if (ORTE_SUCCESS != (rc = orte_plm_base_setup_virtual_machine(jdata))) {
ORTE_ERROR_LOG(rc);
goto cleanup;
}

Просмотреть файл

@ -996,7 +996,7 @@ static int orte_plm_process_launch(orte_job_t *jdata)
/* start by launching the virtual machine */
daemons = orte_get_job_data_object(ORTE_PROC_MY_NAME->jobid);
if (ORTE_SUCCESS != (rc = orte_plm_base_setup_virtual_machine(daemons))) {
if (ORTE_SUCCESS != (rc = orte_plm_base_setup_virtual_machine(jdata))) {
ORTE_ERROR_LOG(rc);
goto cleanup;
}

Просмотреть файл

@ -879,7 +879,7 @@ static int rsh_launch(orte_job_t *jdata)
/* setup the virtual machine */
daemons = orte_get_job_data_object(ORTE_PROC_MY_NAME->jobid);
if (ORTE_SUCCESS != (rc = orte_plm_base_setup_virtual_machine(daemons))) {
if (ORTE_SUCCESS != (rc = orte_plm_base_setup_virtual_machine(jdata))) {
ORTE_ERROR_LOG(rc);
goto cleanup;
}

Просмотреть файл

@ -197,7 +197,7 @@ static int plm_slurm_launch_job(orte_job_t *jdata)
/* start by setting up the virtual machine */
daemons = orte_get_job_data_object(ORTE_PROC_MY_NAME->jobid);
if (ORTE_SUCCESS != (rc = orte_plm_base_setup_virtual_machine(daemons))) {
if (ORTE_SUCCESS != (rc = orte_plm_base_setup_virtual_machine(jdata))) {
ORTE_ERROR_LOG(rc);
goto cleanup;
}

Просмотреть файл

@ -169,7 +169,7 @@ static int plm_tm_launch_job(orte_job_t *jdata)
/* start by launching the virtual machine */
daemons = orte_get_job_data_object(ORTE_PROC_MY_NAME->jobid);
if (ORTE_SUCCESS != (rc = orte_plm_base_setup_virtual_machine(daemons))) {
if (ORTE_SUCCESS != (rc = orte_plm_base_setup_virtual_machine(jdata))) {
ORTE_ERROR_LOG(rc);
goto cleanup;
}

Просмотреть файл

@ -112,6 +112,10 @@ ORTE_DECLSPEC int orte_rmaps_base_close(void);
ORTE_DECLSPEC int orte_rmaps_base_prep_topology(hwloc_topology_t topo);
#endif
/**
 * Filter a node list through the hostfile, add-hostfile, -host and
 * add-host specifications of a single app_context.
 *
 * @param app     app_context whose specifications are applied
 * @param nodes   list of nodes to be filtered
 * @param remove  passed through unchanged to the underlying
 *                hostfile / dash-host filter routines
 *
 * @return ORTE_ERR_TAKE_NEXT_OPTION when the app_context carries none
 *         of the four specifications, ORTE_SUCCESS when every filter
 *         that was present succeeded, ORTE_ERR_SILENT when the list is
 *         empty after a filter, or the error code of a failing filter.
 */
ORTE_DECLSPEC int orte_rmaps_base_filter_nodes(orte_app_context_t *app,
opal_list_t *nodes,
bool remove);
#endif /* ORTE_DISABLE_FULL_SUPPORT */
END_C_DECLS

Просмотреть файл

@ -49,6 +49,74 @@
#include "orte/mca/rmaps/base/rmaps_private.h"
#include "orte/mca/rmaps/base/base.h"
/*
 * Filter a node list through the host-selection options of one app_context.
 *
 * Each of the app's hostfile, add-hostfile, -host and add-host
 * specifications that is present is applied in that order. The
 * "remove" flag is forwarded unchanged to the underlying
 * orte_util_filter_hostfile_nodes() / orte_util_filter_dash_host_nodes()
 * routines.
 *
 * @param app     app_context whose specifications are applied; a NULL
 *                app is treated as "no specification given"
 * @param nodes   list of nodes to be filtered
 * @param remove  forwarded to the filter routines
 *
 * @return ORTE_ERR_TAKE_NEXT_OPTION if no specification was present
 *         (no filter routine was called);
 *         ORTE_SUCCESS if every specification present was applied;
 *         ORTE_ERR_SILENT if the list is empty after a filter (a help
 *         message has already been shown);
 *         otherwise the error code returned by the failing filter.
 */
int orte_rmaps_base_filter_nodes(orte_app_context_t *app,
                                 opal_list_t *nodes, bool remove)
{
    /* stays at TAKE_NEXT_OPTION unless at least one filter is executed,
     * which lets the caller detect that no option was specified
     */
    int rc = ORTE_ERR_TAKE_NEXT_OPTION;

    /* nothing to filter against without an app_context */
    if (NULL == app) {
        return rc;
    }

    /* did the app_context contain a hostfile? */
    if (NULL != app->hostfile) {
        /* yes - filter the node list through the file */
        if (ORTE_SUCCESS != (rc = orte_util_filter_hostfile_nodes(nodes, app->hostfile, remove))) {
            ORTE_ERROR_LOG(rc);
            return rc;
        }
        /** check that anything is here */
        if (0 == opal_list_get_size(nodes)) {
            orte_show_help("help-orte-rmaps-base.txt", "orte-rmaps-base:no-mapped-node",
                           true, app->app, app->hostfile);
            return ORTE_ERR_SILENT;
        }
    }

    /* did the app_context contain an add-hostfile? */
    if (NULL != app->add_hostfile) {
        /* yes - filter the node list through the file */
        if (ORTE_SUCCESS != (rc = orte_util_filter_hostfile_nodes(nodes, app->add_hostfile, remove))) {
            ORTE_ERROR_LOG(rc);
            return rc;
        }
        /** check that anything is here */
        if (0 == opal_list_get_size(nodes)) {
            /* report the add-hostfile that was actually applied;
             * app->hostfile may be NULL in this branch
             */
            orte_show_help("help-orte-rmaps-base.txt", "orte-rmaps-base:no-mapped-node",
                           true, app->app, app->add_hostfile);
            return ORTE_ERR_SILENT;
        }
    }

    /* now filter the list through any -host specification */
    if (NULL != app->dash_host) {
        if (ORTE_SUCCESS != (rc = orte_util_filter_dash_host_nodes(nodes, app->dash_host, remove))) {
            ORTE_ERROR_LOG(rc);
            return rc;
        }
        /** check that anything is left! */
        if (0 == opal_list_get_size(nodes)) {
            orte_show_help("help-orte-rmaps-base.txt", "orte-rmaps-base:no-mapped-node",
                           true, app->app, "");
            return ORTE_ERR_SILENT;
        }
    }

    /* now filter the list through any add-host specification */
    if (NULL != app->add_host) {
        if (ORTE_SUCCESS != (rc = orte_util_filter_dash_host_nodes(nodes, app->add_host, remove))) {
            ORTE_ERROR_LOG(rc);
            return rc;
        }
        /** check that anything is left! */
        if (0 == opal_list_get_size(nodes)) {
            orte_show_help("help-orte-rmaps-base.txt", "orte-rmaps-base:no-mapped-node",
                           true, app->app, "");
            return ORTE_ERR_SILENT;
        }
    }

    return rc;
}
/*
* Query the registry for all nodes allocated to a specified app_context
*/
@ -117,6 +185,10 @@ int orte_rmaps_base_get_target_nodes(opal_list_t *allocated_nodes, orte_std_cntr
/* not to be used */
continue;
}
/* if this node wasn't included in the vm (e.g., by -host), ignore it */
if (NULL == node->daemon) {
continue;
}
/* retain a copy for our use in case the item gets
* destructed along the way
*/
@ -128,13 +200,6 @@ int orte_rmaps_base_get_target_nodes(opal_list_t *allocated_nodes, orte_std_cntr
*/
node->mapped = false;
}
/* quick sanity check */
if (NULL == node->daemon) {
orte_show_help("help-orte-rmaps-base.txt",
"orte-rmaps-base:missing-daemon",
true, node->name);
return ORTE_ERR_SILENT;
}
if (NULL == nd || nd->daemon->name.vpid < node->daemon->name.vpid) {
/* just append to end */
opal_list_append(allocated_nodes, &node->super);
@ -167,7 +232,8 @@ int orte_rmaps_base_get_target_nodes(opal_list_t *allocated_nodes, orte_std_cntr
* any nodes not in the file -or- excluded via ^
*/
if (ORTE_SUCCESS != (rc = orte_util_filter_hostfile_nodes(allocated_nodes,
orte_default_hostfile))) {
orte_default_hostfile,
true))) {
ORTE_ERROR_LOG(rc);
return rc;
}
@ -179,180 +245,12 @@ int orte_rmaps_base_get_target_nodes(opal_list_t *allocated_nodes, orte_std_cntr
return ORTE_ERR_SILENT;
}
}
/* did the app_context contain a hostfile? */
if (NULL != app && NULL != app->hostfile) {
/* yes - filter the node list through the file, removing
* any nodes not found in the file
*/
if (ORTE_SUCCESS != (rc = orte_util_filter_hostfile_nodes(allocated_nodes,
app->hostfile))) {
ORTE_ERROR_LOG(rc);
return rc;
}
/** check that anything is here */
if (0 == opal_list_get_size(allocated_nodes)) {
orte_show_help("help-orte-rmaps-base.txt", "orte-rmaps-base:no-mapped-node",
true, app->app, app->hostfile);
return ORTE_ERR_SILENT;
}
}
/* did the app_context contain an add-hostfile? */
if (NULL != app && NULL != app->add_hostfile) {
/* yes - filter the node list through the file, removing
* any nodes not found in the file
*/
if (ORTE_SUCCESS != (rc = orte_util_filter_hostfile_nodes(allocated_nodes,
app->add_hostfile))) {
ORTE_ERROR_LOG(rc);
return rc;
}
/** check that anything is here */
if (0 == opal_list_get_size(allocated_nodes)) {
orte_show_help("help-orte-rmaps-base.txt", "orte-rmaps-base:no-mapped-node",
true, app->app, app->hostfile);
return ORTE_ERR_SILENT;
}
}
/* now filter the list through any -host specification */
if (NULL != app && NULL != app->dash_host) {
if (ORTE_SUCCESS != (rc = orte_util_filter_dash_host_nodes(allocated_nodes,
app->dash_host))) {
ORTE_ERROR_LOG(rc);
return rc;
}
/** check that anything is left! */
if (0 == opal_list_get_size(allocated_nodes)) {
orte_show_help("help-orte-rmaps-base.txt", "orte-rmaps-base:no-mapped-node",
true, app->app, "");
return ORTE_ERR_SILENT;
}
}
/* now filter the list through any add-host specification */
if (NULL != app && NULL != app->add_host) {
if (ORTE_SUCCESS != (rc = orte_util_filter_dash_host_nodes(allocated_nodes,
app->add_host))) {
ORTE_ERROR_LOG(rc);
return rc;
}
/** check that anything is left! */
if (0 == opal_list_get_size(allocated_nodes)) {
orte_show_help("help-orte-rmaps-base.txt", "orte-rmaps-base:no-mapped-node",
true, app->app, "");
return ORTE_ERR_SILENT;
}
}
/* finally, filter thru any resource constraints */
#if 0
for (item = opal_list_get_first(&app->resource_constraints);
item != opal_list_get_end(&app->resource_constraints);
item = opal_list_get_next(item)) {
req_res = (opal_sysinfo_value_t*)item;
OPAL_OUTPUT_VERBOSE((5, orte_rmaps_base.rmaps_output,
"%s CHECKING CONSTRAINT %s FOR APP %s",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
req_res->key, app->app));
/* check against node values */
item2 = opal_list_get_first(allocated_nodes);
while (item2 != opal_list_get_end(allocated_nodes)) {
next = opal_list_get_next(item2);
node = (orte_node_t*)item2;
found = false;
for (item3 = opal_list_get_first(&node->resources);
item3 != opal_list_get_end(&node->resources);
item3 = opal_list_get_next(item3)) {
ninfo = (opal_sysinfo_value_t*)item3;
OPAL_OUTPUT_VERBOSE((5, orte_rmaps_base.rmaps_output,
"%s COMPARING CONSTRAINT %s WITH RESOURCE %s",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
req_res->key, ninfo->key));
if (0 == strcmp(req_res->key, ninfo->key)) {
if (OPAL_STRING == req_res->type) {
OPAL_OUTPUT_VERBOSE((5, orte_rmaps_base.rmaps_output,
"%s CHECKING RESOURCE %s:%s ON NODE %s:%s",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
ninfo->key, req_res->data.str,
node->name, ninfo->data.str));
/* there could be multiple hosts or host-types here */
vals = opal_argv_split(req_res->data.str, ',');
for (i=0; NULL != vals[i]; i++) {
if (0 == strncasecmp(vals[i], ninfo->data.str,
strlen(vals[i]))) {
found = true;
break;
}
}
opal_argv_free(vals);
} else {
OPAL_OUTPUT_VERBOSE((5, orte_rmaps_base.rmaps_output,
"%s CHECKING RESOURCE %s:%ld ON NODE %s:%ld",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
ninfo->key, (long)req_res->data.i64,
node->name, (long)ninfo->data.i64));
if (req_res->data.i64 <= ninfo->data.i64) {
found = true;
}
}
break;
}
}
OPAL_OUTPUT_VERBOSE((5, orte_rmaps_base.rmaps_output,
"%s CONSTRAINT RESULTED IN %s NODE %s",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
found ? "RETAINING" : "REMOVING",
node->name));
if (!found) {
opal_list_remove_item(allocated_nodes, item2);
OBJ_RELEASE(item2);
}
item2 = next;
}
}
#endif
/* if the app is NULL, then we are mapping daemons - so remove
* all nodes that already have a daemon on them
*
* NOTE: it is okay if the final list is empty. It just means
* that there are no new daemons to be launched for the
* virtual machine
*/
if (NULL == app) {
item = opal_list_get_first(allocated_nodes);
while (item != opal_list_get_end(allocated_nodes)) {
/** save the next pointer in case we remove this node */
next = opal_list_get_next(item);
/** already have a daemon? */
node = (orte_node_t*)item;
if (NULL != node->daemon) {
/* if this is the local node, keep it if requested */
if (node->daemon->name.vpid == ORTE_PROC_MY_HNP->vpid &&
!(policy & ORTE_MAPPING_NO_USE_LOCAL)) {
item = next;
continue;
}
opal_list_remove_item(allocated_nodes, item);
OBJ_RELEASE(item); /* "un-retain" it */
}
/** go on to next item */
item = next;
}
*total_num_slots = 0;
return ORTE_SUCCESS;
/* filter the nodes thru any hostfile and dash-host options */
if (ORTE_SUCCESS != (rc = orte_rmaps_base_filter_nodes(app, allocated_nodes, true))
&& ORTE_ERR_TAKE_NEXT_OPTION != rc) {
ORTE_ERROR_LOG(rc);
return rc;
}
/* remove all nodes that are already at max usage, and
@ -364,7 +262,6 @@ int orte_rmaps_base_get_target_nodes(opal_list_t *allocated_nodes, orte_std_cntr
while (item != opal_list_get_end(allocated_nodes)) {
/** save the next pointer in case we remove this node */
next = opal_list_get_next(item);
/** check to see if this node is fully used - remove if so */
node = (orte_node_t*)item;
if (0 != node->slots_max && node->slots_inuse > node->slots_max) {
@ -374,8 +271,8 @@ int orte_rmaps_base_get_target_nodes(opal_list_t *allocated_nodes, orte_std_cntr
if (0 == node->slots_alloc) {
/* always allocate at least one */
num_slots++;
} else {
num_slots += node->slots_alloc;
} else if (node->slots_alloc > node->slots_inuse) {
num_slots += node->slots_alloc - node->slots_inuse;
}
}

Просмотреть файл

@ -57,7 +57,7 @@ int orte_rmaps_rr_byslot(orte_job_t *jdata,
ORTE_JOBID_PRINT(jdata->jobid), (int)num_slots, (unsigned long)num_procs);
/* check to see if we can map all the procs */
if (num_slots < (int)(jdata->num_procs + app->num_procs)) {
if (num_slots < (int)app->num_procs) {
if (ORTE_MAPPING_NO_OVERSUBSCRIBE & ORTE_GET_MAPPING_DIRECTIVE(jdata->map->mapping)) {
orte_show_help("help-orte-rmaps-base.txt", "orte-rmaps-base:alloc-error",
true, app->num_procs, app->app);
@ -85,7 +85,7 @@ int orte_rmaps_rr_byslot(orte_job_t *jdata,
obj = hwloc_get_root_obj(node->topology);
}
#endif
if (node->slots_alloc == node->slots_inuse) {
if (node->slots_alloc <= node->slots_inuse) {
opal_output_verbose(2, orte_rmaps_base.rmaps_output,
"mca:rmaps:rr:slot working node %s is full - skipping",
node->name);
@ -223,7 +223,7 @@ int orte_rmaps_rr_bynode(orte_job_t *jdata,
(int)num_slots, (unsigned long)num_procs);
/* quick check to see if we can map all the procs */
if (num_slots < (int)(jdata->num_procs + app->num_procs)) {
if (num_slots < (int)app->num_procs) {
if (ORTE_MAPPING_NO_OVERSUBSCRIBE & ORTE_GET_MAPPING_DIRECTIVE(jdata->map->mapping)) {
orte_show_help("help-orte-rmaps-base.txt", "orte-rmaps-base:alloc-error",
true, app->num_procs, app->app);
@ -308,7 +308,7 @@ int orte_rmaps_rr_bynode(orte_job_t *jdata,
* have to track how many procs to "shift" elsewhere
* to make up the difference
*/
if (node->slots_alloc == node->slots_inuse) {
if (node->slots_alloc <= node->slots_inuse) {
/* if there are no extras to take, then we can
* ignore this node
*/
@ -428,7 +428,7 @@ int orte_rmaps_rr_byobj(orte_job_t *jdata,
* do more because we don't know how many total objects exist
* across all the nodes
*/
if (num_slots < (int)(jdata->num_procs + app->num_procs)) {
if (num_slots < (int)app->num_procs) {
if (ORTE_MAPPING_NO_OVERSUBSCRIBE & ORTE_GET_MAPPING_DIRECTIVE(jdata->map->mapping)) {
orte_show_help("help-orte-rmaps-base.txt", "orte-rmaps-base:alloc-error",
true, app->num_procs, app->app);
@ -486,7 +486,7 @@ int orte_rmaps_rr_byobj(orte_job_t *jdata,
--nxtra_nodes;
}
}
if (node->slots_alloc == node->slots_inuse) {
if (node->slots_alloc <= node->slots_inuse) {
/* everybody takes at least the extras */
num_procs_to_assign = extra_procs_to_assign;
} else {
@ -576,7 +576,7 @@ static int byobj_span(orte_job_t *jdata,
* do more because we don't know how many total objects exist
* across all the nodes
*/
if (num_slots < (int)(jdata->num_procs + app->num_procs)) {
if (num_slots < (int)app->num_procs) {
if (ORTE_MAPPING_NO_OVERSUBSCRIBE & ORTE_GET_MAPPING_DIRECTIVE(jdata->map->mapping)) {
orte_show_help("help-orte-rmaps-base.txt", "orte-rmaps-base:alloc-error",
true, app->num_procs, app->app);
@ -661,7 +661,7 @@ static int byobj_span(orte_job_t *jdata,
* have to track how many procs to "shift" elsewhere
* to make up the difference
*/
if (0 == (node->slots_alloc - node->slots_inuse)) {
if (node->slots_alloc <= node->slots_inuse) {
/* if there are no extras to take, then we can
* safely remove this node as we don't need it
*/

Просмотреть файл

@ -845,16 +845,8 @@ int orterun(int argc, char *argv[])
*/
ljob = ORTE_LOCAL_JOBID(jdata->jobid);
opal_pointer_array_set_item(orte_job_data, ljob, jdata);
/* run the allocator on the application job - this allows us to
* pickup any host or hostfile arguments so we get the full
* array of nodes in our allocation
*/
if (ORTE_SUCCESS != (rc = orte_ras.allocate(jdata))) {
goto DONE;
}
/* Spawn the job */
/* spawn the job and its daemons */
rc = orte_plm.spawn(jdata);
/* now wait until the termination event fires */

Просмотреть файл

@ -231,7 +231,8 @@ cleanup:
}
int orte_util_filter_dash_host_nodes(opal_list_t *nodes,
char** host_argv)
char** host_argv,
bool remove)
{
opal_list_item_t* item;
bool found;
@ -260,11 +261,7 @@ int orte_util_filter_dash_host_nodes(opal_list_t *nodes,
return ORTE_SUCCESS;
}
/* we found some info - filter what is on the list...
* i.e., go through the list and remove any nodes that
* were -not- included on the -host list.
*
* NOTE: The following logic is based on knowing that
/* NOTE: The following logic is based on knowing that
* any node can only be included on the incoming
* nodes list ONCE.
*/
@ -308,10 +305,15 @@ int orte_util_filter_dash_host_nodes(opal_list_t *nodes,
goto skipnode;
}
}
/* remove item from list */
opal_list_remove_item(nodes, item);
/* xfer to keep list */
opal_list_append(&keep, item);
if (remove) {
/* remove item from list */
opal_list_remove_item(nodes, item);
/* xfer to keep list */
opal_list_append(&keep, item);
} else {
/* mark the node as found */
node->mapped = true;
}
--num_empty;
}
skipnode:
@ -334,10 +336,15 @@ int orte_util_filter_dash_host_nodes(opal_list_t *nodes,
if ((0 == strcmp(node->name, mapped_nodes[i]) ||
(0 == strcmp(node->name, orte_process_info.nodename) &&
(0 == strcmp(mapped_nodes[i], "localhost") || opal_ifislocal(mapped_nodes[i]))))) {
/* remove item from list */
opal_list_remove_item(nodes, item);
/* xfer to keep list */
opal_list_append(&keep, item);
if (remove) {
/* remove item from list */
opal_list_remove_item(nodes, item);
/* xfer to keep list */
opal_list_append(&keep, item);
} else {
/* mark the node as found */
node->mapped = true;
}
break;
}
item = next;
@ -358,6 +365,12 @@ int orte_util_filter_dash_host_nodes(opal_list_t *nodes,
}
}
if (!remove) {
/* all done */
rc = ORTE_SUCCESS;
goto cleanup;
}
/* clear the rest of the nodes list */
while (NULL != (item = opal_list_remove_first(nodes))) {
OBJ_RELEASE(item);

Просмотреть файл

@ -33,7 +33,8 @@ ORTE_DECLSPEC int orte_util_add_dash_host_nodes(opal_list_t *nodes,
char ** host_argv);
ORTE_DECLSPEC int orte_util_filter_dash_host_nodes(opal_list_t *nodes,
char ** host_argv);
char ** host_argv,
bool remove);
ORTE_DECLSPEC int orte_util_get_ordered_dash_host_list(opal_list_t *nodes,
char ** host_argv);

Просмотреть файл

@ -535,7 +535,8 @@ cleanup:
* are not found in the hostfile
*/
int orte_util_filter_hostfile_nodes(opal_list_t *nodes,
char *hostfile)
char *hostfile,
bool remove)
{
opal_list_t newnodes, exclude;
opal_list_item_t *item1, *item2, *next, *item3;
@ -577,7 +578,7 @@ int orte_util_filter_hostfile_nodes(opal_list_t *nodes,
OBJ_RELEASE(item1);
}
/* now check our nodes and keep those that match. We can
/* now check our nodes and keep or mark those that match. We can
* destruct our hostfile list as we go since this won't be needed
*/
OBJ_CONSTRUCT(&keep, opal_list_t);
@ -623,10 +624,15 @@ int orte_util_filter_hostfile_nodes(opal_list_t *nodes,
goto skipnode;
}
}
/* remove item from list */
opal_list_remove_item(nodes, item1);
/* xfer to keep list */
opal_list_append(&keep, item1);
if (remove) {
/* remove item from list */
opal_list_remove_item(nodes, item1);
/* xfer to keep list */
opal_list_append(&keep, item1);
} else {
/* mark as included */
node_from_list->mapped = true;
}
--num_empty;
}
skipnode:
@ -658,10 +664,15 @@ int orte_util_filter_hostfile_nodes(opal_list_t *nodes,
item1 = opal_list_get_next(nodes)) {
node_from_list = (orte_node_t*)item1;
if (0 == strcmp(node_from_list->name, node_from_pool->name)) {
/* match - remove item from list */
opal_list_remove_item(nodes, item1);
/* xfer to keep list */
opal_list_append(&keep, item1);
if (remove) {
/* match - remove item from list */
opal_list_remove_item(nodes, item1);
/* xfer to keep list */
opal_list_append(&keep, item1);
} else {
/* mark as included */
node_from_list->mapped = true;
}
break;
}
}
@ -696,10 +707,15 @@ int orte_util_filter_hostfile_nodes(opal_list_t *nodes,
if (node_from_file->slots < node_from_list->slots) {
node_from_list->slots_alloc = node_from_file->slots;
}
/* remove the node from the list */
opal_list_remove_item(nodes, item1);
/* xfer it to keep list */
opal_list_append(&keep, item1);
if (remove) {
/* remove the node from the list */
opal_list_remove_item(nodes, item1);
/* xfer it to keep list */
opal_list_append(&keep, item1);
} else {
/* mark as included */
node_from_list->mapped = true;
}
break;
}
}
@ -722,6 +738,12 @@ int orte_util_filter_hostfile_nodes(opal_list_t *nodes,
return ORTE_ERR_SILENT;
}
if (!remove) {
/* all done */
OBJ_DESTRUCT(&newnodes);
return ORTE_SUCCESS;
}
/* clear the rest of the nodes list */
while (NULL != (item1 = opal_list_remove_first(nodes))) {
OBJ_RELEASE(item1);

Просмотреть файл

@ -34,7 +34,8 @@ ORTE_DECLSPEC int orte_util_add_hostfile_nodes(opal_list_t *nodes,
char *hostfile);
ORTE_DECLSPEC int orte_util_filter_hostfile_nodes(opal_list_t *nodes,
char *hostfile);
char *hostfile,
bool remove);
ORTE_DECLSPEC int orte_util_get_ordered_host_list(opal_list_t *nodes,
char *hostfile);