
Final merge of changes from the /tmp/tm-stuff tree (merged through
/tmp/tm-merge). Validated by RHC. Summary:

- Add --nolocal (and -nolocal) options to orterun
- Make some scalability improvements to the tm pls

This commit was SVN r10651.
This commit is contained in:
Jeff Squyres 2006-07-04 20:12:35 +00:00
parent d2bf3844e9
commit 538965aeb0
8 changed files with 240 additions and 171 deletions
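
Before the per-file diffs, here is a minimal sketch of how the new --nolocal option is plumbed through, based only on the changes shown below: orterun turns --nolocal into the MCA parameter "rmaps base_schedule_local" set to 0, and the round-robin mapper then drops the local node from the allocation before mapping. The helper names set_schedule_local() and maybe_drop_local_node() are invented for illustration, and the header providing orte_ras_node_t is assumed; the MCA parameter calls, opal_ifislocal(), and orte_system_info.nodename are all taken from the diffs.

#include <stdbool.h>
#include <string.h>
#include "opal/class/opal_list.h"
#include "opal/mca/base/mca_base_param.h"
#include "opal/util/if.h"
#include "orte/util/sys_info.h"
/* orte_ras_node_t (with its node_name field) comes from the RAS
   framework headers, which are not reproduced in this sketch. */

/* orterun side: register the parameter with a default of 1 and set it
   to 0 when --nolocal was given on the command line. */
static void set_schedule_local(bool no_local_requested)
{
    int value;
    int id = mca_base_param_reg_int_name("rmaps", "base_schedule_local",
                 "If nonzero, allow scheduling MPI applications on the "
                 "same node as mpirun",
                 false, false, 1, &value);
    mca_base_param_set_int(id, no_local_requested ? 0 : 1);
}

/* mapper side: if the parameter is 0, remove the local node from the
   allocated node list before doing the round-robin mapping. */
static void maybe_drop_local_node(opal_list_t *nodes)
{
    int id, value;
    opal_list_item_t *item;

    id = mca_base_param_find("rmaps", NULL, "base_schedule_local");
    mca_base_param_lookup_int(id, &value);
    if (0 != value) {
        return;                     /* local scheduling is allowed */
    }
    for (item = opal_list_get_first(nodes);
         item != opal_list_get_end(nodes);
         item = opal_list_get_next(item)) {
        orte_ras_node_t *node = (orte_ras_node_t *) item;
        if (0 == strcmp(node->node_name, orte_system_info.nodename) ||
            opal_ifislocal(node->node_name)) {
            opal_list_remove_item(nodes, item);
            break;
        }
    }
}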

View file

@ -9,6 +9,7 @@
* University of Stuttgart. All rights reserved.
* Copyright (c) 2004-2005 The Regents of the University of California.
* All rights reserved.
* Copyright (c) 2006 Cisco Systems, Inc. All rights reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
@ -32,6 +33,7 @@ extern "C" {
orte_pls_base_component_t super;
int priority;
int debug;
int verbose;
bool want_path_check;
char *orted;
char **checked_paths;

View file

@ -9,6 +9,7 @@
* University of Stuttgart. All rights reserved.
* Copyright (c) 2004-2005 The Regents of the University of California.
* All rights reserved.
* Copyright (c) 2006 Cisco Systems, Inc. All rights reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
@ -92,8 +93,10 @@ static int pls_tm_open(void)
int tmp;
mca_base_component_t *comp = &mca_pls_tm_component.super.pls_version;
mca_base_param_reg_int(comp, "debug", "Enable debugging of TM pls",
mca_base_param_reg_int(comp, "debug", "Enable debugging of the TM pls",
false, false, 0, &mca_pls_tm_component.debug);
mca_base_param_reg_int(comp, "verbose", "Enable verbose output of the TM pls",
false, false, 0, &mca_pls_tm_component.verbose);
mca_base_param_reg_int(comp, "priority", "Default selection priority",
false, false, 75, &mca_pls_tm_component.priority);

View file

@ -38,6 +38,7 @@
#include <tm.h>
#include "opal/install_dirs.h"
#include "opal/event/event.h"
#include "opal/util/argv.h"
#include "opal/util/output.h"
#include "opal/util/opal_environ.h"
@ -64,6 +65,7 @@
#include "pls_tm.h"
/*
* Local functions
*/
@ -98,8 +100,8 @@ extern char **environ;
static int
pls_tm_launch(orte_jobid_t jobid)
{
opal_list_t nodes, mapping_list;
opal_list_item_t *item, *item2;
opal_list_t mapping;
opal_list_item_t *m_item, *n_item;
size_t num_nodes;
orte_vpid_t vpid;
int node_name_index;
@ -110,27 +112,31 @@ pls_tm_launch(orte_jobid_t jobid)
int argc;
int rc;
bool connected = false;
opal_list_t map;
char *cur_prefix;
int launched = 0, i;
char *bin_base = NULL, *lib_base = NULL;
/* Query the list of nodes allocated and mapped to this job.
/* Query the list of nodes allocated and mapped to this job.
* We need the entire mapping for a couple of reasons:
* - need the prefix to start with.
* - need to know if we are launching on a subset of the allocated nodes
* All other mapping responsibilities fall to orted in the fork PLS
*/
OBJ_CONSTRUCT(&nodes, opal_list_t);
OBJ_CONSTRUCT(&mapping_list, opal_list_t);
rc = orte_rmaps_base_mapped_node_query(&mapping_list, &nodes, jobid);
OBJ_CONSTRUCT(&mapping, opal_list_t);
rc = orte_rmaps_base_get_map(jobid, &mapping);
if (ORTE_SUCCESS != rc) {
goto cleanup;
}
num_nodes = 0;
for(m_item = opal_list_get_first(&mapping);
m_item != opal_list_get_end(&mapping);
m_item = opal_list_get_next(m_item)) {
orte_rmaps_base_map_t* map = (orte_rmaps_base_map_t*)m_item;
num_nodes += opal_list_get_size(&map->nodes);
}
/*
* Allocate a range of vpids for the daemons.
*/
num_nodes = opal_list_get_size(&nodes);
if (num_nodes == 0) {
return ORTE_ERR_BAD_PARAM;
}
@ -226,100 +232,23 @@ pls_tm_launch(orte_jobid_t jobid)
bin_base = opal_basename(OPAL_BINDIR);
/*
* Iterate through each of the nodes and spin
* up a daemon.
* iterate through each of the contexts
*/
for(item = opal_list_get_first(&nodes);
item != opal_list_get_end(&nodes);
item = opal_list_get_next(item)) {
orte_ras_node_t* node = (orte_ras_node_t*)item;
orte_process_name_t* name;
char* name_string;
for (m_item = opal_list_get_first(&mapping);
m_item != opal_list_get_end(&mapping);
m_item = opal_list_get_next(m_item)) {
orte_rmaps_base_map_t* map = (orte_rmaps_base_map_t*)m_item;
char** env;
char* var;
size_t num_processes;
OBJ_CONSTRUCT(&map, opal_list_t);
/* Get the mapping of this very node */
rc = orte_rmaps_base_get_node_map(orte_process_info.my_name->cellid,
jobid,
node->node_name,
&map);
if (ORTE_SUCCESS != rc) {
ORTE_ERROR_LOG(rc);
goto cleanup;
}
/* Copy the prefix-directory specified within the
corresponding app_context. If there are multiple,
different prefix's for this node, complain */
cur_prefix = NULL;
num_processes = 0;
for (item2 = opal_list_get_first(&map);
item2 != opal_list_get_end(&map);
item2 = opal_list_get_next(item2)) {
orte_rmaps_base_map_t* map = (orte_rmaps_base_map_t*) item2;
char * app_prefix_dir = map->app->prefix_dir;
/* Increment the number of processes allocated to this node
* This allows us to accurately test for oversubscription */
num_processes += map->num_procs;
/* Check for already set cur_prefix -- if different,
complain */
if (NULL != app_prefix_dir) {
if (NULL != cur_prefix &&
0 != strcmp (cur_prefix, app_prefix_dir)) {
opal_show_help("help-pls-tm.txt", "multiple-prefixes",
true, node->node_name,
cur_prefix, app_prefix_dir);
return ORTE_ERR_FATAL;
}
/* If not yet set, copy it; iff set, then it's the
same anyway */
if (NULL == cur_prefix) {
cur_prefix = strdup(map->app->prefix_dir);
if (mca_pls_tm_component.debug) {
opal_output (0, "pls:tm: Set prefix:%s",
cur_prefix);
}
}
}
}
/* setup node name */
argv[node_name_index] = node->node_name;
/* initialize daemons process name */
rc = orte_ns.create_process_name(&name, node->node_cellid, 0, vpid);
if (ORTE_SUCCESS != rc) {
ORTE_ERROR_LOG(rc);
goto cleanup;
}
/* setup per-node options */
if (mca_pls_tm_component.debug) {
opal_output(0, "pls:tm: launching on node %s",
node->node_name);
}
/* setup process name */
rc = orte_ns.get_proc_name_string(&name_string, name);
if (ORTE_SUCCESS != rc) {
opal_output(0, "pls:tm: unable to create process name");
return rc;
}
argv[proc_name_index] = name_string;
/* setup environment */
env = opal_argv_copy(environ);
var = mca_base_param_environ_variable("seed",NULL,NULL);
opal_setenv(var, "0", true, &env);
/* If we have a prefix, then modify the PATH and
LD_LIBRARY_PATH environment variables. */
if (NULL != cur_prefix) {
if (NULL != map->app->prefix_dir) {
int i;
char *newenv;
@ -327,7 +256,7 @@ pls_tm_launch(orte_jobid_t jobid)
/* Reset PATH */
if (0 == strncmp("PATH=", env[i], 5)) {
asprintf(&newenv, "%s/%s:%s",
cur_prefix, bin_base, env[i] + 5);
map->app->prefix_dir, bin_base, env[i] + 5);
if (mca_pls_tm_component.debug) {
opal_output(0, "pls:tm: resetting PATH: %s",
newenv);
@ -335,11 +264,11 @@ pls_tm_launch(orte_jobid_t jobid)
opal_setenv("PATH", newenv, true, &env);
free(newenv);
}
/* Reset LD_LIBRARY_PATH */
else if (0 == strncmp("LD_LIBRARY_PATH=", env[i], 16)) {
asprintf(&newenv, "%s/%s:%s",
cur_prefix, lib_base, env[i] + 16);
map->app->prefix_dir, lib_base, env[i] + 16);
if (mca_pls_tm_component.debug) {
opal_output(0, "pls:tm: resetting LD_LIBRARY_PATH: %s",
newenv);
@ -348,9 +277,8 @@ pls_tm_launch(orte_jobid_t jobid)
free(newenv);
}
}
free(cur_prefix);
}
/* Do a quick sanity check to ensure that we can find the
orted in the PATH */
@ -361,68 +289,122 @@ pls_tm_launch(orte_jobid_t jobid)
true, argv[0]);
goto cleanup;
}
/* set the progress engine schedule for this node.
* if node_slots is set to zero, then we default to
* NOT being oversubscribed
/* Iterate through each of the nodes and spin
* up a daemon.
*/
if (node->node_slots > 0 &&
num_processes > node->node_slots) {
for (n_item = opal_list_get_first(&map->nodes);
n_item != opal_list_get_end(&map->nodes);
n_item = opal_list_get_next(n_item)) {
orte_rmaps_base_node_t* rmaps_node = (orte_rmaps_base_node_t*)n_item;
orte_ras_node_t* node = rmaps_node->node;
orte_process_name_t* name;
char* name_string;
size_t num_processes = 0;
/* already launched on this node */
if (0 != node->node_launched++) {
continue;
}
/* setup node name */
argv[node_name_index] = node->node_name;
/* initialize daemons process name */
rc = orte_ns.create_process_name(&name, node->node_cellid, 0, vpid);
if (ORTE_SUCCESS != rc) {
ORTE_ERROR_LOG(rc);
goto cleanup;
}
/* setup per-node options */
if (mca_pls_tm_component.debug ||
mca_pls_tm_component.verbose) {
opal_output(0, "pls:tm: launching on node %s",
node->node_name);
}
/* setup process name */
rc = orte_ns.get_proc_name_string(&name_string, name);
if (ORTE_SUCCESS != rc) {
opal_output(0, "pls:tm: unable to create process name");
return rc;
}
argv[proc_name_index] = name_string;
/* set the progress engine schedule for this node.
* if node_slots is set to zero, then we default to
* NOT being oversubscribed
*/
if (node->node_slots > 0 &&
opal_list_get_size(&rmaps_node->node_procs) > node->node_slots) {
if (mca_pls_tm_component.debug) {
opal_output(0, "pls:tm: oversubscribed -- setting mpi_yield_when_idle to 1 (%d %d)",
node->node_slots,
opal_list_get_size(&rmaps_node->node_procs));
}
var = mca_base_param_environ_variable("mpi", NULL, "yield_when_idle");
opal_setenv(var, "1", true, &env);
} else {
if (mca_pls_tm_component.debug) {
opal_output(0, "pls:tm: not oversubscribed -- setting mpi_yield_when_idle to 0");
}
var = mca_base_param_environ_variable("mpi", NULL, "yield_when_idle");
opal_setenv(var, "0", true, &env);
}
free(var);
/* save the daemons name on the node */
if (ORTE_SUCCESS != (rc = orte_pls_base_proxy_set_node_name(node,jobid,name))) {
ORTE_ERROR_LOG(rc);
goto cleanup;
}
/* exec the daemon */
if (mca_pls_tm_component.debug) {
opal_output(0, "pls:tm: oversubscribed -- setting mpi_yield_when_idle to 1 (%d %d)",
node->node_slots, num_processes);
param = opal_argv_join(argv, ' ');
if (NULL != param) {
opal_output(0, "pls:tm: executing: %s", param);
free(param);
}
}
var = mca_base_param_environ_variable("mpi", NULL, "yield_when_idle");
opal_setenv(var, "1", true, &env);
} else {
if (mca_pls_tm_component.debug) {
opal_output(0, "pls:tm: not oversubscribed -- setting mpi_yield_when_idle to 0");
rc = pls_tm_start_proc(node->node_name, argc, argv, env);
if (ORTE_SUCCESS != rc) {
opal_output(0, "pls:tm: start_procs returned error %d", rc);
goto cleanup;
}
var = mca_base_param_environ_variable("mpi", NULL, "yield_when_idle");
opal_setenv(var, "0", true, &env);
launched++;
vpid++;
free(name);
opal_event_loop(OPAL_EVLOOP_NONBLOCK);
}
free(var);
/* save the daemons name on the node */
if (ORTE_SUCCESS != (rc = orte_pls_base_proxy_set_node_name(node,jobid,name))) {
ORTE_ERROR_LOG(rc);
goto cleanup;
}
/* exec the daemon */
if (mca_pls_tm_component.debug) {
param = opal_argv_join(argv, ' ');
if (NULL != param) {
opal_output(0, "pls:tm: executing: %s", param);
free(param);
}
}
rc = pls_tm_start_proc(node->node_name, argc, argv, env);
if (ORTE_SUCCESS != rc) {
opal_output(0, "pls:tm: start_procs returned error %d", rc);
goto cleanup;
}
vpid++;
free(name);
}
cleanup:
/* loop through all those that are launched and poll for
completion status */
for(i = 0; i < launched; i++){
int ret, local_err;
tm_event_t event;
ret = tm_poll(TM_NULL_EVENT, &event, 1, &local_err);
if (TM_SUCCESS != ret) {
errno = local_err;
opal_output(0, "pls:tm: failed to start a proc error %d", ret);
goto cleanup;
}
}
cleanup:
if (connected) {
pls_tm_disconnect();
}
while (NULL != (item = opal_list_remove_first(&nodes))) {
OBJ_RELEASE(item);
while (NULL != (m_item = opal_list_remove_first(&mapping))) {
OBJ_RELEASE(m_item);
}
OBJ_DESTRUCT(&nodes);
while (NULL != (item = opal_list_remove_first(&mapping_list))) {
OBJ_RELEASE(item);
}
OBJ_DESTRUCT(&mapping_list);
OBJ_DESTRUCT(&mapping);
if (NULL != lib_base) {
free(lib_base);
}
@ -520,6 +502,8 @@ static tm_node_id *tm_node_ids = NULL;
static int num_tm_hostnames, num_node_ids;
/* we don't call this anymore */
/*
* For a given TM node ID, get the string hostname corresponding to
* it.
@ -527,10 +511,10 @@ static int num_tm_hostnames, num_node_ids;
static char*
get_tm_hostname(tm_node_id node)
{
int ret, local_errno;
char *hostname;
tm_event_t event;
char buffer[256];
int ret, local_errno;
tm_event_t event;
char **argv;
/* Get the info string corresponding to this TM node ID */
@ -565,6 +549,7 @@ get_tm_hostname(tm_node_id node)
}
/* we don't call this anymore!*/
static int
query_tm_hostnames(void)
{
@ -598,7 +583,7 @@ query_tm_hostnames(void)
return ORTE_SUCCESS;
}
/* we don't call this anymore! */
static int
do_tm_resolve(char *hostname, tm_node_id *tnodeid)
{
@ -637,7 +622,7 @@ do_tm_resolve(char *hostname, tm_node_id *tnodeid)
static int
pls_tm_start_proc(char *nodename, int argc, char **argv, char **env)
{
int ret, local_err;
int ret;
tm_node_id node_id;
tm_task_id task_id;
tm_event_t event;
@ -649,12 +634,6 @@ pls_tm_start_proc(char *nodename, int argc, char **argv, char **env)
ret = tm_spawn(argc, argv, env, node_id, &task_id, &event);
if (TM_SUCCESS != ret) return ORTE_ERROR;
ret = tm_poll(TM_NULL_EVENT, &event, 1, &local_err);
if (TM_SUCCESS != ret) {
errno = local_err;
return ORTE_ERR_IN_ERRNO;
}
return ORTE_SUCCESS;
}
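
The main scalability change in this file is visible above: pls_tm_start_proc() no longer blocks in tm_poll() after every tm_spawn(); the launcher instead counts how many daemons it has started and reaps all of the spawn events in a single loop before cleanup. A condensed sketch of that pattern follows; the helper name spawn_then_poll() and the node_ids array are illustrative only, and the ORTE bookkeeping and error reporting of the real loop are omitted.

/* Spawn one daemon per TM node without waiting, then poll once per
   successful spawn.  This trades per-node round trips for a single
   drain loop, which is where the scalability win comes from. */
#include <tm.h>

static int spawn_then_poll(int num_nodes, tm_node_id *node_ids,
                           int argc, char **argv, char **env)
{
    int i, launched = 0;
    tm_task_id task_id;
    tm_event_t event;

    /* Phase 1: fire off every tm_spawn() without blocking on it. */
    for (i = 0; i < num_nodes; ++i) {
        if (TM_SUCCESS != tm_spawn(argc, argv, env, node_ids[i],
                                   &task_id, &event)) {
            return -1;
        }
        ++launched;
    }

    /* Phase 2: reap one completion event per spawn that was issued. */
    for (i = 0; i < launched; ++i) {
        int local_err;
        if (TM_SUCCESS != tm_poll(TM_NULL_EVENT, &event, 1, &local_err)) {
            return -1;
        }
    }
    return 0;
}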

View file

@ -9,6 +9,7 @@
* University of Stuttgart. All rights reserved.
* Copyright (c) 2004-2005 The Regents of the University of California.
* All rights reserved.
* Copyright (c) 2006 Cisco Systems, Inc. All rights reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
@ -173,7 +174,10 @@ static int discover(opal_list_t* nodelist)
return ORTE_ERR_OUT_OF_RESOURCE;
}
/* Iterate through all the nodes and make an entry for each */
/* Iterate through all the nodes and make an entry for each. TM
node IDs will never be duplicated, but they may end up
resolving to the same hostname (i.e., vcpus on a single
host). */
OBJ_CONSTRUCT(&new_nodes, opal_list_t);
for (i = 0; i < num_node_ids; ++i) {
@ -296,6 +300,6 @@ static int get_tm_hostname(tm_node_id node, char **hostname, char **arch)
/* All done */
opal_output(orte_ras_base.ras_output,
"ras:tm:hostname: got hostname %s", hostname);
"ras:tm:hostname: got hostname %s", *hostname);
return ORTE_SUCCESS;
}
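
The comment added above ("TM node IDs will never be duplicated, but they may end up resolving to the same hostname") is why discover() has to merge entries: with one TM node ID per vcpu, a multi-processor host shows up several times. A plausible way to fold such duplicates, shown purely as a sketch and not as the component's actual code (the helper add_or_bump_slot() is invented, and the orte_ras_node_t header is assumed), is to keep one node entry per hostname and count every extra TM node ID as an additional slot:

#include <stdbool.h>
#include <string.h>
#include "opal/class/opal_list.h"
/* orte_ras_node_t (node_name, node_slots) comes from the RAS framework
   headers, which are not reproduced here. */

/* Return true if hostname was already in the list (its slot count is
   bumped); return false so the caller knows to create a new entry. */
static bool add_or_bump_slot(opal_list_t *nodes, const char *hostname)
{
    opal_list_item_t *item;

    for (item = opal_list_get_first(nodes);
         item != opal_list_get_end(nodes);
         item = opal_list_get_next(item)) {
        orte_ras_node_t *node = (orte_ras_node_t *) item;
        if (0 == strcmp(node->node_name, hostname)) {
            ++node->node_slots;   /* another vcpu on the same host */
            return true;
        }
    }
    return false;
}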

View file

@ -9,6 +9,7 @@
* University of Stuttgart. All rights reserved.
* Copyright (c) 2004-2005 The Regents of the University of California.
* All rights reserved.
* Copyright (c) 2006 Cisco Systems, Inc. All rights reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
@ -67,7 +68,7 @@ int orte_rmaps_base_open(void)
/* Debugging / verbose output */
param = mca_base_param_reg_int_name("rmaps_base", "verbose",
param = mca_base_param_reg_int_name("rmaps", "base_verbose",
"Verbosity level for the rmaps framework",
false, false, 0, &value);
if (value != 0) {
@ -78,13 +79,19 @@ int orte_rmaps_base_open(void)
/* Are we scheduling by node or by slot? */
param = mca_base_param_reg_string_name("rmaps_base", "schedule_policy",
param = mca_base_param_reg_string_name("rmaps", "base_schedule_policy",
"Scheduling Policy for RMAPS. [slot | node]",
false, false, "slot", &policy);
if (0 == strcmp(policy, "node")) {
mca_base_param_set_string(param, "node");
}
/* Should we schedule on the local node or not? */
mca_base_param_reg_int_name("rmaps", "base_schedule_local",
"If nonzero, allow scheduling MPI applications on the same node as mpirun (default). If zero, do not schedule any MPI applications on the same node as mpirun",
false, false, 1, &value);
/* Open up all the components that we can find */
if (ORTE_SUCCESS !=

View file

@ -9,12 +9,14 @@
* University of Stuttgart. All rights reserved.
* Copyright (c) 2004-2005 The Regents of the University of California.
* All rights reserved.
* Copyright (c) 2006 Cisco Systems, Inc. All rights reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
*
* $HEADER$
*/
#include "orte_config.h"
#include <errno.h>
#ifdef HAVE_UNISTD_H
@ -24,11 +26,14 @@
#include <string.h>
#endif /* HAVE_STRING_H */
#include "orte/orte_constants.h"
#include "orte/orte_types.h"
#include "opal/mca/base/mca_base_param.h"
#include "opal/util/output.h"
#include "opal/util/show_help.h"
#include "opal/util/argv.h"
#include "opal/util/if.h"
#include "orte/orte_constants.h"
#include "orte/orte_types.h"
#include "orte/util/sys_info.h"
#include "orte/mca/ns/ns.h"
#include "orte/mca/gpr/gpr.h"
#include "orte/mca/rmaps/base/base.h"
@ -416,7 +421,8 @@ static int orte_rmaps_rr_map(orte_jobid_t jobid)
int rc = ORTE_SUCCESS;
bool bynode = true;
char **mapped_nodes = NULL;
int num_mapped_nodes = 0;
int num_mapped_nodes = 0;
int id, value;
/* query for the application context and allocated nodes */
if(ORTE_SUCCESS != (rc = orte_rmgr_base_get_app_context(jobid, &context, &num_context))) {
@ -437,6 +443,24 @@ static int orte_rmaps_rr_map(orte_jobid_t jobid)
return rc;
}
/* If the "no local" option was set, then remove the local node
from the list */
id = mca_base_param_find("rmaps", NULL, "base_schedule_local");
mca_base_param_lookup_int(id, &value);
if (0 == value) {
for (item = opal_list_get_first(&nodes);
item != opal_list_get_end(&nodes);
item = opal_list_get_next(item) ) {
if (0 == strcmp(((orte_ras_node_t *) item)->node_name,
orte_system_info.nodename) ||
opal_ifislocal(((orte_ras_node_t *) item)->node_name)) {
opal_list_remove_item(&nodes, item);
break;
}
}
}
/* Sanity check to make sure we have been allocated nodes */
if (0 == opal_list_get_size(&nodes)) {
OBJ_DESTRUCT(&nodes);
@ -489,7 +513,8 @@ static int orte_rmaps_rr_map(orte_jobid_t jobid)
end, bounce back to the front (as would happen in the loop
below)
But do a bozo check to ensure that we don't have a empty node list.*/
But do a bozo check to ensure that we don't have an empty
node list.*/
if (0 == opal_list_get_size(&nodes)) {
rc = ORTE_ERR_TEMP_OUT_OF_RESOURCE;
goto cleanup;

View file

@ -181,6 +181,13 @@ Synonym for \fI-np\fP.
.
.
.TP
.B -nolocal\fR,\fP --nolocal
Do not run any copies of the launched application on the same node as
orterun is running. This option will override listing the localhost
with \fB--host\fR or any other host-specifying mechanism.
.
.
.TP
.B -np \fR<#>\fP
Run this many copies of the program on the given nodes. This option
indicates that the specified file is an executable program and not an
@ -409,6 +416,30 @@ on hosts b and c.
.
.
.
.SS No Local Launch
.
Using the \fB--nolocal\fR option to orterun tells the system not to
launch any of the application processes on the same node where orterun
is running. While orterun typically blocks and consumes few system
resources, this option can be helpful for launching very large jobs
where orterun may actually need to use noticeable amounts of memory
and/or processing time. \fB--nolocal\fR allows orterun to run without
sharing the local node with the launched applications, and likewise
allows the launched applications to run unhindered by orterun's system
usage.
.PP
Note that \fB--nolocal\fR will override any other specification to
launch the application on the local node. It will disqualify the
localhost from running any processes in the application.
.
.
.TP
shell$ mpirun -np 1 --host localhost --nolocal hostname
This example will result in an error because orterun will not find
anywhere to launch the application.
.
.
.SS Application Context or Executable Program?
.
To distinguish the two different forms, \fImpirun\fP

View file

@ -105,6 +105,7 @@ struct globals_t {
bool by_node;
bool by_slot;
bool debugger;
bool no_local_schedule;
size_t num_procs;
int exit_status;
char *hostfile;
@ -209,6 +210,11 @@ opal_cmd_line_init_t cmd_line_init[] = {
NULL, OPAL_CMD_LINE_TYPE_STRING,
"List of hosts to invoke processes on" },
/* OSC mpiexec-like arguments */
{ NULL, NULL, NULL, '\0', "nolocal", "nolocal", 0,
&orterun_globals.no_local_schedule, OPAL_CMD_LINE_TYPE_BOOL,
"Do not run any MPI applications on the local node" },
/* User-level debugger arguments */
{ NULL, NULL, NULL, '\0', "tv", "tv", 0,
&orterun_globals.debugger, OPAL_CMD_LINE_TYPE_BOOL,
@ -761,6 +767,7 @@ static int init_globals(void)
false,
false,
false,
false,
0,
0,
NULL,
@ -855,7 +862,7 @@ static int parse_globals(int argc, char* argv[])
* since it really should be initialized in rmaps_base_open */
if (orterun_globals.by_node || orterun_globals.by_slot) {
char *policy = NULL;
id = mca_base_param_reg_string_name("rmaps_base", "schedule_policy",
id = mca_base_param_reg_string_name("rmaps", "base_schedule_policy",
"Scheduling policy for RMAPS. [slot | node]",
false, false, "slot", &policy);
@ -873,6 +880,17 @@ static int parse_globals(int argc, char* argv[])
orterun_globals.by_slot = true;
}
/* Do we want to allow MPI applications on the same node as
mpirun? */
id = mca_base_param_reg_int_name("rmaps", "base_schedule_local",
"If nonzero, allow scheduling MPI applications on the same node as mpirun (default). If zero, do not schedule any MPI applications on the same node as mpirun",
false, false, 1, &ret);
if (orterun_globals.no_local_schedule) {
mca_base_param_set_int(id, 0);
} else {
mca_base_param_set_int(id, 1);
}
/* If we don't want to wait, we don't want to wait */
if (orterun_globals.no_wait_for_job_completion) {