1
1

Implement the seq rmaps module that sequentially maps process ranks to a list of hosts in a hostfile.

Restore the "do-not-launch" functionality so users can test a mapping without launching it.

Add a "do-not-resolve" command-line flag to mpirun so the opal/util/if.c code does not attempt to resolve network addresses, thus enabling a user to test a hostfile mapping without hanging on network resolve requests.

Add a function to hostfile to generate an ordered list of host names from a hostfile

This commit was SVN r18190.
Этот коммит содержится в:
Ralph Castain 2008-04-17 13:50:59 +00:00
родитель eb94fa48ce
Коммит e7487ad533
13 изменённых файлов: 226 добавлений и 247 удалений

Просмотреть файл

@ -72,6 +72,7 @@
#include "opal/util/output.h"
#include "opal/util/strncpy.h"
#include "opal/constants.h"
#include "opal/mca/base/mca_base_param.h"
#ifdef HAVE_STRUCT_SOCKADDR_IN
@ -117,6 +118,7 @@ typedef struct opal_if_t opal_if_t;
static opal_list_t opal_if_list;
static bool already_done = false;
static bool do_not_resolve = false;
#define DEFAULT_NUMBER_INTERFACES 10
#define MAX_IFCONF_SIZE 10 * 1024 * 1024
@ -160,6 +162,11 @@ static int opal_ifinit(void)
}
already_done = true;
mca_base_param_reg_int_name("opal", "if_do_not_resolve",
"If nonzero, do not attempt to resolve interfaces",
false, false, (int)false, &sd);
do_not_resolve = OPAL_INT_TO_BOOL(sd);
/* create the internet socket to test off */
/*
Change AF_INET to AF_UNSPEC (or AF_INET6) and everything will fail.
@ -934,6 +941,14 @@ int opal_ifaddrtoname(const char* if_addr, char* if_name, int length)
return rc;
}
/* if the user asked us not to resolve interfaces, then just return */
if (do_not_resolve) {
/* return not found so ifislocal will declare
* the node to be non-local
*/
return OPAL_ERR_NOT_FOUND;
}
#if OPAL_WANT_IPV6
memset(&hints, 0, sizeof(hints));
hints.ai_family = PF_UNSPEC;

Просмотреть файл

@ -79,6 +79,12 @@ int orte_plm_base_setup_job(orte_job_t *jdata)
return rc;
}
/* if we don't want to launch, now is the time to leave */
if (orte_do_not_launch) {
orte_finalize();
exit(0);
}
/*
* setup I/O forwarding
*/

Просмотреть файл

@ -58,7 +58,10 @@ int orte_rmaps_base_get_target_nodes(opal_list_t *allocated_nodes, orte_std_cntr
/* create a working list of nodes */
nodes = (orte_node_t**)orte_node_pool->addr;
for (i=0; i < orte_node_pool->size; i++) {
if (NULL != nodes[i] && nodes[i]->allocate) {
if (NULL == nodes[i]) {
break; /* nodes are left aligned, so stop when we hit a null */
}
if (nodes[i]->allocate) {
/* retain a copy for our use in case the item gets
* destructed along the way
*/
@ -256,7 +259,8 @@ int orte_rmaps_base_claim_slot(orte_job_t *jdata,
orte_vpid_t vpid,
orte_std_cntr_t app_idx,
opal_list_t *nodes,
bool oversubscribe)
bool oversubscribe,
bool remove_from_list)
{
orte_proc_t *proc;
bool oversub;
@ -324,11 +328,13 @@ int orte_rmaps_base_claim_slot(orte_job_t *jdata,
if ((0 != current_node->slots_max &&
current_node->slots_inuse >= current_node->slots_max) ||
(!oversubscribe && current_node->slots_inuse >= current_node->slots)) {
opal_list_remove_item(nodes, (opal_list_item_t*)current_node);
/* release it - it was retained when we started, so this
* just ensures the instance counter is correctly updated
*/
OBJ_RELEASE(current_node);
if (remove_from_list) {
opal_list_remove_item(nodes, (opal_list_item_t*)current_node);
/* release it - it was retained when we started, so this
* just ensures the instance counter is correctly updated
*/
OBJ_RELEASE(current_node);
}
/** now return the proper code so the caller knows we removed the node! */
return ORTE_ERR_NODE_FULLY_USED;
}

Просмотреть файл

@ -76,7 +76,8 @@ ORTE_DECLSPEC int orte_rmaps_base_claim_slot(orte_job_t *jdata,
orte_vpid_t vpid,
orte_std_cntr_t app_idx,
opal_list_t *nodes,
bool oversubscribe);
bool oversubscribe,
bool remove_from_list);
ORTE_DECLSPEC int orte_rmaps_base_compute_usage(orte_job_t *jdata);

Просмотреть файл

@ -115,7 +115,7 @@ static int map_app_by_user_map(
rankmap[orte_rmaps_rank_file_num_alloc + vpid_start].node_name,rankmap[orte_rmaps_rank_file_num_alloc+vpid_start].rank, node->slot_list);
}
if (ORTE_SUCCESS != (rc = orte_rmaps_base_claim_slot(jdata, node, rankmap[orte_rmaps_rank_file_num_alloc+vpid_start].rank, app->idx,
nodes, jdata->map->oversubscribe))) {
nodes, jdata->map->oversubscribe, true))) {
/** if the code is ORTE_ERR_NODE_FULLY_USED, then we know this
* really isn't an error - we just need to break from the loop
* since the node is fully used up. For now, just don't report
@ -200,7 +200,7 @@ static int map_app_by_node(
}
}
if (ORTE_SUCCESS != (rc = orte_rmaps_base_claim_slot(jdata, node, vpid_start + orte_rmaps_rank_file_num_alloc, app->idx,
nodes, jdata->map->oversubscribe))) {
nodes, jdata->map->oversubscribe, true))) {
/** if the code is ORTE_ERR_NODE_FULLY_USED, then we know this
* really isn't an error - we just need to break from the loop
* since the node is fully used up. For now, just don't report
@ -310,7 +310,7 @@ static int map_app_by_slot(
}
}
if (ORTE_SUCCESS != (rc = orte_rmaps_base_claim_slot(jdata, node, vpid_start + orte_rmaps_rank_file_num_alloc, app->idx,
nodes, jdata->map->oversubscribe))) {
nodes, jdata->map->oversubscribe, true))) {
/** if the code is ORTE_ERR_NODE_FULLY_USED, then we know this
* really isn't an error - we just need to break from the loop
* since the node is fully used up. For now, just don't report

Просмотреть файл

@ -47,56 +47,6 @@
*/
static opal_list_item_t *cur_node_item = NULL;
/*
 * Map an app_context according to a user-supplied rank-to-node map.
 *
 * NOTE(review): the entire body is compiled out via #if 0, so this
 * function currently does nothing except return ORTE_SUCCESS. The
 * disabled code also references identifiers (`map`, `jobid`) that are
 * not declared in this function and calls orte_rmaps_base_claim_slot
 * with a stale signature — it would not compile if re-enabled as-is.
 *
 * @param app        app_context whose procs are to be mapped (unused)
 * @param jdata      job data object (unused)
 * @param vpid_start starting vpid for this app_context (unused)
 * @param nodes      list of available orte_node_t objects (unused)
 * @param procs      list of orte_ras_proc_t rank/node assignments (unused)
 * @return           ORTE_SUCCESS always (body disabled)
 */
static int map_app_by_user_map(
orte_app_context_t* app,
orte_job_t* jdata,
orte_vpid_t vpid_start,
opal_list_t* nodes,
opal_list_t* procs)
{
#if 0
int rc = ORTE_SUCCESS;
orte_ras_proc_t *proc;
opal_list_item_t *proc_item, *node_item;
orte_node_t *node;
/* walk every user-specified proc assignment */
for (proc_item = opal_list_get_first(procs);
proc_item != opal_list_get_end(procs);
proc_item = opal_list_get_next(proc_item)) {
proc = (orte_ras_proc_t *)proc_item;
/* only handle ranks belonging to this app_context's vpid range */
if(proc->rank >= vpid_start && proc->rank < (vpid_start + app->num_procs)){
/* find the node named in the assignment on the available list */
for (node_item = opal_list_get_first(nodes);
node_item != opal_list_get_end(nodes);
node_item = opal_list_get_next(node_item)) {
node = (orte_node_t *)node_item;
if(0 == strcmp(node->name, proc->node_name)){
if (ORTE_SUCCESS != (rc = orte_rmaps_base_claim_slot(map, node, jobid, proc->rank, app->idx,
nodes, false))) {
/** if the code is ORTE_ERR_NODE_FULLY_USED, then we know this
* really isn't an error - we just need to break from the loop
* since the node is fully used up. For now, just don't report
* an error
*/
if (ORTE_ERR_NODE_FULLY_USED != rc) {
ORTE_ERROR_LOG(rc);
return rc;
}
}
break;
}
}
}
}
#endif
return ORTE_SUCCESS;
}
/*
* Create a default mapping for the application, scheduling round
* robin by node.
@ -156,7 +106,7 @@ static int map_app_by_node(
/* Allocate a slot on this node */
node = (orte_node_t*) cur_node_item;
if (ORTE_SUCCESS != (rc = orte_rmaps_base_claim_slot(jdata, node, vpid_start + num_alloc, app->idx,
nodes, jdata->map->oversubscribe))) {
nodes, jdata->map->oversubscribe, true))) {
/** if the code is ORTE_ERR_NODE_FULLY_USED, then we know this
* really isn't an error - we just need to break from the loop
* since the node is fully used up. For now, just don't report
@ -263,7 +213,7 @@ static int map_app_by_slot(
for( i = 0; i < num_slots_to_take; ++i) {
if (ORTE_SUCCESS != (rc = orte_rmaps_base_claim_slot(jdata, node, vpid_start + num_alloc, app->idx,
nodes, jdata->map->oversubscribe))) {
nodes, jdata->map->oversubscribe, true))) {
/** if the code is ORTE_ERR_NODE_FULLY_USED, then we know this
* really isn't an error - we just need to break from the loop
* since the node is fully used up. For now, just don't report
@ -309,7 +259,7 @@ static int orte_rmaps_rr_map(orte_job_t *jdata)
orte_job_map_t *map;
orte_app_context_t *app, **apps;
orte_std_cntr_t i;
opal_list_t node_list, procs;
opal_list_t node_list;
opal_list_item_t *item;
orte_node_t *node;
orte_vpid_t vpid_start;
@ -443,7 +393,8 @@ static int orte_rmaps_rr_map(orte_job_t *jdata)
/* Make assignments */
if (map->policy == ORTE_RMAPS_BYUSER) {
rc = map_app_by_user_map(app, jdata, vpid_start, &node_list, &procs);
rc = ORTE_ERR_NOT_IMPLEMENTED;
goto error;
} else if (map->policy == ORTE_RMAPS_BYNODE) {
rc = map_app_by_node(app, jdata, vpid_start, &node_list);
} else {

Просмотреть файл

@ -36,11 +36,20 @@
#include "opal/util/argv.h"
#include "orte/mca/errmgr/errmgr.h"
#include "orte/util/hostfile/hostfile.h"
#include "orte/util/name_fns.h"
#include "orte/runtime/orte_globals.h"
#include "orte/mca/rmaps/base/rmaps_private.h"
#include "orte/mca/rmaps/base/base.h"
#include "rmaps_seq.h"
static int orte_rmaps_seq_map(orte_job_t *jdata);
/* define the module */
orte_rmaps_base_module_t orte_rmaps_seq_module = {
orte_rmaps_seq_map
};
/*
@ -49,58 +58,68 @@
*/
static int orte_rmaps_seq_map(orte_job_t *jdata)
{
#if 0
orte_job_map_t *map;
orte_app_context_t *app, **apps;
orte_std_cntr_t i;
opal_list_t node_list, procs;
opal_list_item_t *item;
orte_node_t *node;
orte_vpid_t vpid_start;
orte_std_cntr_t num_nodes, num_slots;
orte_std_cntr_t i, j;
opal_list_item_t *item, *next, *cur_node_item;
orte_node_t *node, *nd, **nodes;
orte_vpid_t vpid;
orte_std_cntr_t num_nodes;
int rc;
orte_std_cntr_t slots_per_node;
opal_list_t *default_node_list=NULL;
opal_list_t *node_list=NULL;
OPAL_OUTPUT_VERBOSE((1, orte_rmaps_base.rmaps_output,
"%s rmaps:seq mapping job %s",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
ORTE_JOBID_PRINT(jdata->jobid)));
OPAL_TRACE(1);
/* conveniece def */
map = jdata->map;
apps = (orte_app_context_t**)jdata->apps->addr;
nodes = (orte_node_t**)orte_node_pool->addr;
/* if there is a default hostfile, go and get its ordered list of nodes */
if (NULL != orte_default_hostfile) {
default_node_list = OBJ_NEW(opal_list_t);
if (ORTE_SUCCESS != (rc = orte_util_get_ordered_host_list(default_node_list, orte_default_hostfile))) {
ORTE_ERROR_LOG(rc);
goto error;
}
}
/* start at the beginning... */
vpid_start = 0;
vpid = 0;
/* cycle through the app_contexts, mapping them sequentially */
for(i=0; i < jdata->num_apps; i++) {
app = apps[i];
/* if the number of processes wasn't specified, then we know there can be only
* one app_context allowed in the launch, and that we are to launch it across
* all available slots. We'll double-check the single app_context rule first
/* for each app_context, if a hostfile was specified, then we let it
* override what we may have obtained from the default hostfile
*/
if (0 == app->num_procs && 1 < jdata->num_apps) {
opal_show_help("help-orte-rmaps-rr.txt", "orte-rmaps-rr:multi-apps-and-zero-np",
true, jdata->num_apps, NULL);
rc = ORTE_ERR_SILENT;
goto error;
if (NULL != app->hostfile) {
node_list = OBJ_NEW(opal_list_t);
if (ORTE_SUCCESS != (rc = orte_util_get_ordered_host_list(node_list, app->hostfile))) {
ORTE_ERROR_LOG(rc);
goto error;
}
} else {
node_list = default_node_list;
}
/* for each app_context, we have to get an ordered list of nodes
* from the specified hostfile
*/
OBJ_CONSTRUCT(&node_list, opal_list_t);
if(ORTE_SUCCESS != (rc = orte_rmaps_base_get_target_nodes(&node_list, &num_slots, app,
map->policy))) {
ORTE_ERROR_LOG(rc);
goto error;
if (NULL == node_list || 0 == (num_nodes = (orte_std_cntr_t)opal_list_get_size(node_list))) {
opal_show_help("help-orte-rmaps-base.txt",
"orte-rmaps-base:no-available-resources",
true);
return ORTE_ERR_SILENT;
}
num_nodes = (orte_std_cntr_t)opal_list_get_size(&node_list);
/* if a bookmark exists from some prior mapping, set us to start there */
if (NULL != jdata->bookmark) {
cur_node_item = NULL;
/* find this node on the list */
for (item = opal_list_get_first(&node_list);
item != opal_list_get_end(&node_list);
for (item = opal_list_get_first(node_list);
item != opal_list_get_end(node_list);
item = opal_list_get_next(item)) {
node = (orte_node_t*)item;
@ -111,105 +130,97 @@ static int orte_rmaps_seq_map(orte_job_t *jdata)
}
/* see if we found it - if not, just start at the beginning */
if (NULL == cur_node_item) {
cur_node_item = opal_list_get_first(&node_list);
cur_node_item = opal_list_get_first(node_list);
}
} else {
/* if no bookmark, then just start at the beginning of the list */
cur_node_item = opal_list_get_first(&node_list);
cur_node_item = opal_list_get_first(node_list);
}
if (map->pernode && map->npernode == 1) {
/* there are three use-cases that we need to deal with:
* (a) if -np was not provided, then we just use the number of nodes
* (b) if -np was provided AND #procs > #nodes, then error out
* (c) if -np was provided AND #procs <= #nodes, then launch
* the specified #procs one/node. In this case, we just
* leave app->num_procs alone
*/
if (0 == app->num_procs) {
app->num_procs = num_nodes;
} else if (app->num_procs > num_nodes) {
opal_show_help("help-orte-rmaps-rr.txt", "orte-rmaps-rr:per-node-and-too-many-procs",
true, app->num_procs, num_nodes, NULL);
rc = ORTE_ERR_SILENT;
goto error;
}
} else if (map->pernode && map->npernode > 1) {
/* first, let's check to see if there are enough slots/node to
* meet the request - error out if not
/* if num_procs wasn't specified, set it now */
if (0 == app->num_procs) {
app->num_procs = num_nodes;
}
for (i=0; i < app->num_procs; i++) {
/* see if any nodes remain unused and available. We need to do this check
* each time since we may remove nodes from the list (as they become fully
* used) as we cycle through the loop
*/
slots_per_node = num_slots / num_nodes;
if (map->npernode > slots_per_node) {
opal_show_help("help-orte-rmaps-rr.txt", "orte-rmaps-rr:n-per-node-and-not-enough-slots",
true, map->npernode, slots_per_node, NULL);
if(0 >= opal_list_get_size(node_list) ) {
/* Everything is at max usage! :( */
opal_show_help("help-orte-rmaps-seq.txt", "orte-rmaps-seq:alloc-error",
true, app->num_procs, app->app);
return ORTE_ERR_SILENT;
}
/* Save the next node we can use before claiming slots, since
* we may need to prune the nodes list removing overused nodes.
* Wrap around to beginning if we are at the end of the list
*/
if (opal_list_get_end(node_list) == opal_list_get_next(cur_node_item)) {
next = opal_list_get_first(node_list);
}
else {
next = opal_list_get_next(cur_node_item);
}
/* find this node on the global array - this is necessary so
* that our mapping gets saved on that array as the objects
* returned by the hostfile function are -not- on the array
*/
node = NULL;
nd = (orte_node_t*)cur_node_item;
for (j=0; j < orte_node_pool->size; j++) {
if (NULL == nodes[j]) {
break; /* nodes are left aligned, so stop when we hit a null */
}
if (nodes[j]->allocate && 0 == strcmp(nd->name, nodes[j]->name)) {
node = nodes[j];
break;
}
}
if (NULL == node) {
/* wasn't found - that is an error */
opal_show_help("help-orte-rmaps-seq.txt",
"orte-rmaps-seq:resource-not-found",
true);
rc = ORTE_ERR_SILENT;
goto error;
}
/* there are three use-cases that we need to deal with:
* (a) if -np was not provided, then we just use the n/node * #nodes
* (b) if -np was provided AND #procs > (n/node * #nodes), then error out
* (c) if -np was provided AND #procs <= (n/node * #nodes), then launch
* the specified #procs n/node. In this case, we just
* leave app->num_procs alone
*/
if (0 == app->num_procs) {
/* set the num_procs to equal the specified num/node * the number of nodes */
app->num_procs = map->npernode * num_nodes;
} else if (app->num_procs > (map->npernode * num_nodes)) {
opal_show_help("help-orte-rmaps-rr.txt", "orte-rmaps-rr:n-per-node-and-too-many-procs",
true, app->num_procs, map->npernode, num_nodes, num_slots, NULL);
rc = ORTE_ERR_SILENT;
goto error;
}
} else if (0 == app->num_procs) {
/** set the num_procs to equal the number of slots on these mapped nodes - if
user has specified "-bynode", then set it to the number of nodes
*/
if (map->policy == ORTE_RMAPS_BYNODE) {
app->num_procs = num_nodes;
} else if (map->policy == ORTE_RMAPS_BYSLOT) {
app->num_procs = num_slots;
} else if (map->policy == ORTE_RMAPS_BYUSER) {
/* we can't handle this - it should have been set when we got
* the map info. If it wasn't, then we can only error out
*/
opal_show_help("help-orte-rmaps-rr.txt", "orte-rmaps-rr:no-np-and-user-map",
true, app->num_procs, map->npernode, num_nodes, num_slots, NULL);
rc = ORTE_ERR_SILENT;
goto error;
/* assign next vpid to this node - do NOT allow claim_slot to remove
* an oversubscribed node from the list!
*/
if (ORTE_SUCCESS != (rc = orte_rmaps_base_claim_slot(jdata, node,
vpid, app->idx,
node_list,
jdata->map->oversubscribe,
false))) {
if (ORTE_ERR_NODE_FULLY_USED != rc) {
ORTE_ERROR_LOG(rc);
goto error;
}
}
/* increment the vpid */
vpid++;
/* move to next node */
cur_node_item = next;
}
/** track the total number of processes we mapped */
jdata->num_procs += app->num_procs;
/* Make assignments */
if (map->policy == ORTE_RMAPS_BYUSER) {
rc = map_app_by_user_map(app, jdata, vpid_start, &node_list, &procs);
} else if (map->policy == ORTE_RMAPS_BYNODE) {
rc = map_app_by_node(app, jdata, vpid_start, &node_list);
} else {
rc = map_app_by_slot(app, jdata, vpid_start, &node_list);
}
/* update the starting vpid for the next app_context */
vpid_start += app->num_procs;
if (ORTE_SUCCESS != rc) {
ORTE_ERROR_LOG(rc);
goto error;
}
/* save the bookmark */
/* update the bookmark */
jdata->bookmark = (orte_node_t*)cur_node_item;
/* cleanup the node list - it can differ from one app_context
* to another, so we have to get it every time
*/
while(NULL != (item = opal_list_remove_first(&node_list))) {
OBJ_RELEASE(item);
/* cleanup the node list if it came from this app_context */
if (node_list != default_node_list) {
while(NULL != (item = opal_list_remove_first(node_list))) {
OBJ_RELEASE(item);
}
OBJ_RELEASE(node_list);
}
OBJ_DESTRUCT(&node_list);
}
/* compute and save convenience values */
@ -226,27 +237,20 @@ static int orte_rmaps_seq_map(orte_job_t *jdata)
return ORTE_SUCCESS;
#if 0
if(!opal_list_is_empty(&procs)){
if(ORTE_SUCCESS != (rc = orte_rmaps_base_rearrange_map(app, map, &procs))) {
ORTE_ERROR_LOG(rc);
goto cleanup;
}
}
#endif
error:
while(NULL != (item = opal_list_remove_first(&node_list))) {
OBJ_RELEASE(item);
if (NULL != default_node_list) {
while (NULL != (item = opal_list_remove_first(default_node_list))) {
OBJ_RELEASE(item);
}
OBJ_RELEASE(default_node_list);
}
OBJ_DESTRUCT(&node_list);
if (NULL != node_list) {
while (NULL != (item = opal_list_remove_first(node_list))) {
OBJ_RELEASE(item);
}
OBJ_RELEASE(node_list);
}
return rc;
#endif
return ORTE_SUCCESS;
}
orte_rmaps_base_module_t orte_rmaps_seq_module = {
orte_rmaps_seq_map
};

Просмотреть файл

@ -47,6 +47,7 @@ bool orte_timing;
bool orte_debug_flag = false;
bool orte_debug_daemons_flag = false;
bool orte_debug_daemons_file_flag = false;
bool orte_do_not_launch = false;
bool orted_spin_flag = false;
bool orte_static_ports = false;
bool orte_keep_fqdn_hostnames = false;
@ -127,6 +128,11 @@ int orte_register_params(void)
}
}
mca_base_param_reg_int_name("orte", "do_not_launch",
"Perform all necessary operations to prepare to launch the application, but do not actually launch it",
false, false, (int)false, &value);
orte_do_not_launch = OPAL_INT_TO_BOOL(value);
mca_base_param_reg_int_name("orted", "spin",
"Have any orteds spin until we can connect a debugger to them",
false, false, (int)false, &value);

Просмотреть файл

@ -285,6 +285,7 @@ ORTE_DECLSPEC extern orte_process_name_t orte_globals_name_invalid; /** instant
/* global variables used by RTE - instanced in orte_globals.c */
ORTE_DECLSPEC extern bool orte_debug_flag, orte_reuse_daemons, orte_timing;
ORTE_DECLSPEC extern bool orte_debug_daemons_flag, orte_debug_daemons_file_flag;
ORTE_DECLSPEC extern bool orte_do_not_launch;
ORTE_DECLSPEC extern bool orted_spin_flag;
ORTE_DECLSPEC extern bool orte_static_ports;
ORTE_DECLSPEC extern int orte_debug_output;

Просмотреть файл

@ -171,9 +171,12 @@ static opal_cmd_line_init_t cmd_line_init[] = {
{ NULL, NULL, NULL, '\0', "machinefile", "machinefile", 1,
NULL, OPAL_CMD_LINE_TYPE_STRING,
"Provide a hostfile" },
{ "default", "hostfile", NULL, '\0', "default-hostfile", "default-hostfile", 1,
{ "orte", "default", "hostfile", '\0', "default-hostfile", "default-hostfile", 1,
NULL, OPAL_CMD_LINE_TYPE_STRING,
"Provide a default hostfile" },
{ "opal", "if", "do_not_resolve", '\0', "do-not-resolve", "do-not-resolve", 0,
NULL, OPAL_CMD_LINE_TYPE_BOOL,
"Do not attempt to resolve interfaces" },
/* uri of Open MPI server, or at least where to get it */
{ NULL, NULL, NULL, '\0', "ompi-server", "ompi-server", 1,
@ -266,17 +269,13 @@ static opal_cmd_line_init_t cmd_line_init[] = {
NULL, OPAL_CMD_LINE_TYPE_BOOL,
"Enable debugging of any OpenRTE daemons used by this application, storing output in files" },
{ "universe", NULL, NULL, '\0', NULL, "universe", 1,
NULL, OPAL_CMD_LINE_TYPE_STRING,
"Set the universe name as username@hostname:universe_name for this application" },
{ NULL, NULL, NULL, '\0', NULL, "tmpdir", 1,
&orte_process_info.tmpdir_base, OPAL_CMD_LINE_TYPE_STRING,
"Set the root for the session directory tree for orterun ONLY" },
{ NULL, NULL, NULL, '\0', NULL, "do-not-launch", 0,
&orterun_globals.do_not_launch, OPAL_CMD_LINE_TYPE_BOOL,
"Perform all necessary operations to prepare to launch the application, but do not actually launch it" },
{ "orte", "do_not", "launch", '\0', NULL, "do-not-launch", 0,
NULL, OPAL_CMD_LINE_TYPE_BOOL,
"Perform all necessary operations to prepare to launch the application, but do not actually launch it" },
{ NULL, NULL, NULL, '\0', NULL, "prefix", 1,
NULL, OPAL_CMD_LINE_TYPE_STRING,

Просмотреть файл

@ -45,7 +45,6 @@ struct orterun_globals_t {
bool exit;
bool by_node;
bool by_slot;
bool do_not_launch;
bool debugger;
int num_procs;
char *env_val;

Просмотреть файл

@ -126,11 +126,10 @@ static orte_node_t* hostfile_lookup(opal_list_t* nodes, const char* name)
return NULL;
}
static int hostfile_parse_line(int token, opal_list_t* updates, opal_list_t* exclude)
static int hostfile_parse_line(int token, opal_list_t* updates, opal_list_t* exclude, bool keep_all)
{
int rc;
orte_node_t* node;
bool update = false;
bool got_count = false;
bool got_max = false;
char* value;
@ -177,6 +176,11 @@ static int hostfile_parse_line(int token, opal_list_t* updates, opal_list_t* exc
node_name[i-1] = node_name[i];
}
node_name[len-1] = '\0'; /* truncate */
OPAL_OUTPUT_VERBOSE((2, orte_debug_output,
"%s hostfile: node %s is being excluded",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), node_name));
/* convert this into something globally unique */
if (strcmp(node_name, "localhost") == 0 || opal_ifislocal(node_name)) {
/* Nodename has been allocated, that is for sure */
@ -205,19 +209,20 @@ static int hostfile_parse_line(int token, opal_list_t* updates, opal_list_t* exc
node_name = strdup(orte_process_info.nodename);
}
OPAL_OUTPUT_VERBOSE((2, orte_debug_output,
"%s hostfile: node %s is being included - keep all is %s",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), node_name,
keep_all ? "TRUE" : "FALSE"));
/* Do we need to make a new node object? First check to see
if it's already in the updates list */
if (NULL == (node = hostfile_lookup(updates, node_name))) {
* if we are keeping everything or if it's already in the updates
* list. Because we check keep_all first, if that is set we will
* not do the hostfile_lookup call, and thus won't remove the
* pre-existing node from the updates list
*/
if (keep_all || NULL == (node = hostfile_lookup(updates, node_name))) {
node = OBJ_NEW(orte_node_t);
node->name = node_name;
/* Note that we need to set update to true regardless of
whether the node was found on the updates list or not.
If it was found, we just removed it (in hostfile_lookup()),
so the update puts it back
(potentially after updating it, of course). If it was
not found, then we have a new node instance that needs
to be added to the updates list. */
update = true;
}
} else {
hostfile_parse_error(token);
@ -251,7 +256,6 @@ static int hostfile_parse_line(int token, opal_list_t* updates, opal_list_t* exc
return ORTE_ERROR;
}
node->slots += rc;
update = true;
got_count = true;
/* Ensure that slots_max >= slots */
@ -273,7 +277,6 @@ static int hostfile_parse_line(int token, opal_list_t* updates, opal_list_t* exc
if (rc >= node->slots) {
if (node->slots_max != rc) {
node->slots_max = rc;
update = true;
got_max = true;
}
} else {
@ -299,18 +302,15 @@ static int hostfile_parse_line(int token, opal_list_t* updates, opal_list_t* exc
}
done:
if (update) {
if (!got_count) {
if (got_max) {
node->slots = node->slots_max;
} else {
++node->slots;
}
if (!got_count) {
if (got_max) {
node->slots = node->slots_max;
} else {
++node->slots;
}
opal_list_append(updates, &node->super);
} else {
OBJ_RELEASE(node);
}
opal_list_append(updates, &node->super);
return ORTE_SUCCESS;
}
@ -319,7 +319,7 @@ done:
* Parse the specified file into a node list.
*/
static int hostfile_parse(const char *hostfile, opal_list_t* updates, opal_list_t* exclude)
static int hostfile_parse(const char *hostfile, opal_list_t* updates, opal_list_t* exclude, bool keep_all)
{
int token;
int rc = ORTE_SUCCESS;
@ -358,7 +358,7 @@ static int hostfile_parse(const char *hostfile, opal_list_t* updates, opal_list_
case ORTE_HOSTFILE_HOSTNAME:
case ORTE_HOSTFILE_IPV4:
case ORTE_HOSTFILE_IPV6:
rc = hostfile_parse_line(token, updates, exclude);
rc = hostfile_parse_line(token, updates, exclude, keep_all);
if (ORTE_SUCCESS != rc) {
goto unlock;
}
@ -400,7 +400,7 @@ int orte_util_add_hostfile_nodes(opal_list_t *nodes,
OBJ_CONSTRUCT(&exclude, opal_list_t);
/* parse the hostfile and add the contents to the list */
if (ORTE_SUCCESS != (rc = hostfile_parse(hostfile, nodes, &exclude))) {
if (ORTE_SUCCESS != (rc = hostfile_parse(hostfile, nodes, &exclude, false))) {
goto cleanup;
}
@ -457,7 +457,7 @@ int orte_util_filter_hostfile_nodes(opal_list_t *nodes,
/* parse the hostfile and create local list of findings */
OBJ_CONSTRUCT(&newnodes, opal_list_t);
OBJ_CONSTRUCT(&exclude, opal_list_t);
if (ORTE_SUCCESS != (rc = hostfile_parse(hostfile, &newnodes, &exclude))) {
if (ORTE_SUCCESS != (rc = hostfile_parse(hostfile, &newnodes, &exclude, false))) {
OBJ_DESTRUCT(&newnodes);
return rc;
}
@ -533,7 +533,6 @@ int orte_util_filter_hostfile_nodes(opal_list_t *nodes,
}
int orte_util_get_ordered_host_list(opal_list_t *nodes,
bool *override_oversubscribed,
char *hostfile)
{
opal_list_t exclude;
@ -546,8 +545,8 @@ int orte_util_get_ordered_host_list(opal_list_t *nodes,
OBJ_CONSTRUCT(&exclude, opal_list_t);
/* parse the hostfile and add the contents to the list */
if (ORTE_SUCCESS != (rc = hostfile_parse(hostfile, nodes, &exclude))) {
/* parse the hostfile and add the contents to the list, keeping duplicates */
if (ORTE_SUCCESS != (rc = hostfile_parse(hostfile, nodes, &exclude, true))) {
goto cleanup;
}
@ -563,21 +562,14 @@ int orte_util_get_ordered_host_list(opal_list_t *nodes,
/* match - remove it */
opal_list_remove_item(nodes, itm);
OBJ_RELEASE(itm);
break;
/* have to cycle through the entire list as we could
* have duplicates
*/
}
}
OBJ_RELEASE(item);
}
/* indicate that ORTE should override any oversubscribed conditions
* based on local hardware limits since the user (a) might not have
* provided us any info on the #slots for a node, and (b) the user
* might have been wrong! If we don't check the number of local physical
* processors, then we could be too aggressive on our sched_yield setting
* and cause performance problems.
*/
*override_oversubscribed = true;
cleanup:
OBJ_DESTRUCT(&exclude);

Просмотреть файл

@ -38,7 +38,6 @@ ORTE_DECLSPEC int orte_util_filter_hostfile_nodes(opal_list_t *nodes,
char *hostfile);
ORTE_DECLSPEC int orte_util_get_ordered_host_list(opal_list_t *nodes,
bool *override_oversubscribed,
char *hostfile);
END_C_DECLS