1
1

Add a new option to orterun: "--do-not-launch" directs the system to do the allocation, map, job setup, etc., but don't actually launch the job. This lets us test all the setup portions of the code.

Also, take the first step in updating how we handle mca params in ORTE - bring it closer to how it is done in the other two layers. Much more work to be done here.

This commit was SVN r12838.
This commit is contained in:
Ralph Castain 2006-12-13 04:51:38 +00:00
parent 80bc0c8868
commit 82946cb220
2 changed files with 40 additions and 145 deletions

View file

@ -25,6 +25,22 @@
int orte_register_params(bool infrastructure)
{
mca_base_param_reg_int_name("orte", "debug",
"Top-level ORTE debug switch",
false, false, (int)false, NULL);
mca_base_param_reg_int_name("orte_debug", "daemons_file",
"Whether want stdout/stderr of daemons to go to a file or not",
false, false, (int)false, NULL);
mca_base_param_reg_int_name("orte", "no_daemonize",
"Whether to properly daemonize the ORTE daemons or not",
false, false, (int)false, NULL);
mca_base_param_reg_int_name("orte_debug", "daemons",
"Whether to debug the ORTE daemons or not",
false, false, (int)false, NULL);
mca_base_param_reg_int_name("orte", "infrastructure",
"Whether we are ORTE infrastructure or an ORTE application",
true, true, (int)infrastructure, NULL);

View file

@ -109,13 +109,9 @@ struct globals_t {
bool no_wait_for_job_completion;
bool by_node;
bool by_slot;
bool per_node;
bool no_oversubscribe;
bool do_not_launch;
bool debugger;
bool no_local_schedule;
bool reuse_daemons;
int num_procs;
int n_per_node;
int exit_status;
char *hostfile;
char *env_val;
@ -195,13 +191,13 @@ opal_cmd_line_init_t cmd_line_init[] = {
&orterun_globals.by_slot, OPAL_CMD_LINE_TYPE_BOOL,
"Whether to allocate/map processes round-robin by slot (the default)" },
{ NULL, NULL, NULL, '\0', "pernode", "pernode", 0,
&orterun_globals.per_node, OPAL_CMD_LINE_TYPE_BOOL,
NULL, OPAL_CMD_LINE_TYPE_BOOL,
"Launch one process per available node on the specified number of nodes [no -np => use all allocated nodes]" },
{ NULL, NULL, NULL, '\0', "npernode", "npernode", 1,
&orterun_globals.n_per_node, OPAL_CMD_LINE_TYPE_INT,
NULL, OPAL_CMD_LINE_TYPE_INT,
"Launch n processes per node on all allocated nodes" },
{ NULL, NULL, NULL, '\0', "nooversubscribe", "nooversubscribe", 0,
&orterun_globals.no_oversubscribe, OPAL_CMD_LINE_TYPE_BOOL,
NULL, OPAL_CMD_LINE_TYPE_BOOL,
"Nodes are not to be oversubscribed, even if the system supports such operation"},
{ "rmaps", "base", "display_map", '\0', "display-map-at-launch", "display-map-at-launch", 0,
NULL, OPAL_CMD_LINE_TYPE_BOOL,
@ -227,7 +223,7 @@ opal_cmd_line_init_t cmd_line_init[] = {
/* OSC mpiexec-like arguments */
{ NULL, NULL, NULL, '\0', "nolocal", "nolocal", 0,
&orterun_globals.no_local_schedule, OPAL_CMD_LINE_TYPE_BOOL,
NULL, OPAL_CMD_LINE_TYPE_BOOL,
"Do not run any MPI applications on the local node" },
/* User-level debugger arguments */
@ -266,8 +262,12 @@ opal_cmd_line_init_t cmd_line_init[] = {
&orte_process_info.tmpdir_base, OPAL_CMD_LINE_TYPE_STRING,
"Set the root for the session directory tree for orterun ONLY" },
{ NULL, NULL, NULL, '\0', NULL, "do-not-launch", 0,
&orterun_globals.do_not_launch, OPAL_CMD_LINE_TYPE_BOOL,
"Perform all necessary operations to prepare to launch the application, but do not actually launch it" },
{ NULL, NULL, NULL, '\0', "reuse-daemons", "reuse-daemons", 0,
&orterun_globals.reuse_daemons, OPAL_CMD_LINE_TYPE_BOOL,
NULL, OPAL_CMD_LINE_TYPE_BOOL,
"If set, reuse daemons to launch dynamically spawned processes"},
{ NULL, NULL, NULL, '\0', NULL, "prefix", 1,
@ -305,11 +305,11 @@ int orterun(int argc, char *argv[])
{
orte_app_context_t **apps;
int rc, ret, i, num_apps, array_size;
int id, iparam;
orte_proc_state_t cb_states;
orte_job_state_t exit_state;
opal_list_t attributes;
opal_list_item_t *item;
uint8_t flow;
/* Setup MCA params */
@ -365,77 +365,6 @@ int orterun(int argc, char *argv[])
return rc;
}
/* check for daemon flags and push them into the environment
* since this isn't being automatically done
*/
id = mca_base_param_reg_int_name("orte_debug", "daemons",
"Whether to debug the ORTE daemons or not",
false, false, (int)false, &iparam);
if (iparam) {
char *tmp = mca_base_param_environ_variable("orte", "debug", "daemons");
if (ORTE_SUCCESS != (rc = opal_setenv(tmp, "1", true, &environ))) {
opal_show_help("help-orterun.txt", "orterun:environ", false,
orterun_basename, tmp, "1", rc);
free(tmp);
return rc;
}
free(tmp);
}
id = mca_base_param_reg_int_name("orte", "debug",
"Top-level ORTE debug switch",
false, false, 0, &iparam);
if (iparam) {
char *tmp = mca_base_param_environ_variable("orte", NULL, "debug");
if (ORTE_SUCCESS != (rc = opal_setenv(tmp, "1", true, &environ))) {
opal_show_help("help-orterun.txt", "orterun:environ", false,
orterun_basename, tmp, "1", rc);
free(tmp);
return rc;
}
free(tmp);
}
id = mca_base_param_reg_int_name("orte_debug", "daemons_file",
"Whether want stdout/stderr of daemons to go to a file or not",
false, false, 0, &iparam);
if (iparam) {
char *tmp = mca_base_param_environ_variable("orte", "debug",
"daemons_file");
if (ORTE_SUCCESS != (rc = opal_setenv(tmp, "1", true, &environ))) {
opal_show_help("help-orterun.txt", "orterun:environ", false,
orterun_basename, tmp, "1", rc);
free(tmp);
return rc;
}
free(tmp);
}
id = mca_base_param_reg_int_name("orte", "no_daemonize",
"Whether to properly daemonize the ORTE daemons or not",
false, false, 0, &iparam);
if (iparam) {
char *tmp = mca_base_param_environ_variable("orte", "no_daemonize", NULL);
if (ORTE_SUCCESS != (rc = opal_setenv(tmp, "1", true, &environ))) {
opal_show_help("help-orterun.txt", "orterun:environ", false,
orterun_basename, tmp, "1", rc);
free(tmp);
return rc;
}
free(tmp);
}
id = mca_base_param_reg_int_name("rmaps_base", "display_map",
"Whether to display the process map after it is computed",
false, false, (int)false, &iparam);
if (iparam) {
char *tmp = mca_base_param_environ_variable("rmaps", "base", "display_map");
if (ORTE_SUCCESS != (rc = opal_setenv(tmp, "1", true, &environ))) {
opal_show_help("help-orterun.txt", "orterun:environ", false,
orterun_basename, tmp, "1", rc);
free(tmp);
return rc;
}
free(tmp);
}
/* pre-condition any network transports that require it */
if (ORTE_SUCCESS != (rc = orte_pre_condition_transports(apps, num_apps))) {
ORTE_ERROR_LOG(rc);
@ -448,6 +377,11 @@ int orterun(int argc, char *argv[])
/* Prep to start the application */
/* construct the list of attributes */
OBJ_CONSTRUCT(&attributes, opal_list_t);
if (orterun_globals.do_not_launch) {
flow = ORTE_RMGR_SETUP | ORTE_RMGR_RES_DISC | ORTE_RMGR_ALLOC | ORTE_RMGR_MAP | ORTE_RMGR_SETUP_TRIGS;
orte_rmgr.add_attribute(&attributes, ORTE_RMGR_SPAWN_FLOW, ORTE_UINT8, &flow, ORTE_RMGR_ATTR_OVERRIDE);
}
/** setup callbacks for abort signals */
opal_signal_set(&term_handler, SIGTERM,
@ -480,6 +414,11 @@ int orterun(int argc, char *argv[])
opal_output(0, "%s: spawn failed with errno=%d\n", orterun_basename, rc);
} else {
if (orterun_globals.do_not_launch) {
/* we are done! */
goto DONE;
}
/* Wait for the app to complete */
if (wait_for_job_completion) {
@ -538,9 +477,7 @@ int orterun(int argc, char *argv[])
}
}
/* All done */
OBJ_DESTRUCT(&attributes);
DONE:
for (i = 0; i < num_apps; ++i) {
OBJ_RELEASE(apps[i]);
}
@ -858,12 +795,9 @@ static int init_globals(void)
orterun_globals.no_wait_for_job_completion = false;
orterun_globals.by_node = false;
orterun_globals.by_slot = false;
orterun_globals.per_node = false;
orterun_globals.no_oversubscribe = false;
orterun_globals.debugger = false;
orterun_globals.no_local_schedule = false;
orterun_globals.do_not_launch = false;
orterun_globals.num_procs = 0;
orterun_globals.n_per_node = -1;
orterun_globals.exit_status = 0;
if( NULL != orterun_globals.hostfile )
free( orterun_globals.hostfile );
@ -972,62 +906,7 @@ static int parse_globals(int argc, char* argv[])
orterun_globals.by_slot = true;
}
/* did the user request "pernode", indicating we are to spawn one ppn
* if no np is provided
*/
if (orterun_globals.per_node) {
id = mca_base_param_reg_int_name("rmaps", "base_pernode",
"Launch one ppn as directed",
false, false, (int)false, &ret);
if (orterun_globals.per_node) {
mca_base_param_set_int(id, (int)true);
} else {
mca_base_param_set_int(id, (int)false);
}
}
/* did the user request "npernode", indicating we are to spawn N ppn */
id = mca_base_param_reg_int_name("rmaps", "base_n_pernode",
"Launch n procs/node",
false, false, -1, &ret);
mca_base_param_set_int(id, orterun_globals.n_per_node);
/** Do we want to disallow oversubscription of nodes? */
id = mca_base_param_reg_int_name("rmaps", "base_no_oversubscribe",
"If nonzero, do not allow oversubscription of processes on nodes. If zero (default), oversubscription is allowed.",
false, false, 0, &ret);
if (orterun_globals.no_oversubscribe) {
mca_base_param_set_int(id, (int)true);
} else {
mca_base_param_set_int(id, (int)false);
}
/* Do we want to allow MPI applications on the same node as
mpirun? */
id = mca_base_param_reg_int_name("rmaps", "base_schedule_local",
"If nonzero, allow scheduling MPI applications on the same node as mpirun (default). If zero, do not schedule any MPI applications on the same node as mpirun",
false, false, 1, &ret);
if (orterun_globals.no_local_schedule) {
mca_base_param_set_int(id, 0);
} else {
mca_base_param_set_int(id, 1);
}
if (orterun_globals.reuse_daemons) {
id = mca_base_param_reg_int_name("pls", "base_reuse_daemons",
"If nonzero, reuse daemons to launch dynamically spawned processes. If zero, do not reuse daemons (default)",
false, false, 0, &ret);
if (orterun_globals.reuse_daemons) {
mca_base_param_set_int(id, (int)true);
} else {
mca_base_param_set_int(id, (int)false);
}
}
/* If we don't want to wait, we don't want to wait */
/* If we don't want to wait, we don't want to wait */
if (orterun_globals.no_wait_for_job_completion) {
wait_for_job_completion = false;