Correctly handle co-spawning of daemons when attaching to a running job. We cannot use the general process mappers as we only want debugger daemons spawned on nodes where application procs already exist. So custom build the map for the debugger daemon job, and have the plm just launch that job without doing its usual vm-spawn step.
This commit was SVN r25736.
Этот коммит содержится в:
родитель
11a37d3978
Коммит
6235a355de
orte
mca/plm
tools/orterun
@ -167,6 +167,20 @@ static int plm_alps_launch_job(orte_job_t *jdata)
|
||||
orte_job_state_t job_state = ORTE_JOB_STATE_NEVER_LAUNCHED;
|
||||
orte_job_t *daemons;
|
||||
|
||||
/* if we are launching debugger daemons, then just go
|
||||
* do it - no new daemons will be launched
|
||||
*/
|
||||
if (ORTE_JOB_CONTROL_DEBUGGER_DAEMON & jdata->controls) {
|
||||
failed_job = jdata->jobid;
|
||||
if (ORTE_SUCCESS != (rc = orte_plm_base_launch_apps(jdata->jobid))) {
|
||||
ORTE_ERROR_LOG(rc);
|
||||
failed_launch = true;
|
||||
} else {
|
||||
failed_launch = false;
|
||||
}
|
||||
goto cleanup;
|
||||
}
|
||||
|
||||
/* default to declaring the daemon launch failed */
|
||||
failed_job = ORTE_PROC_MY_NAME->jobid;
|
||||
|
||||
|
@ -877,6 +877,30 @@ static int rsh_launch(orte_job_t *jdata)
|
||||
orte_plm_globals.spawn_status = ORTE_ERR_FATAL;
|
||||
OPAL_THREAD_UNLOCK(&orte_plm_globals.spawn_lock);
|
||||
|
||||
/* if we are launching debugger daemons, then just go
|
||||
* do it - no new daemons will be launched
|
||||
*/
|
||||
if (ORTE_JOB_CONTROL_DEBUGGER_DAEMON & jdata->controls) {
|
||||
failed_job = jdata->jobid;
|
||||
if (ORTE_SUCCESS != (rc = orte_plm_base_launch_apps(jdata->jobid))) {
|
||||
ORTE_ERROR_LOG(rc);
|
||||
failed_launch = true;
|
||||
goto cleanup;
|
||||
}
|
||||
/* wait for the launch to complete */
|
||||
OPAL_THREAD_LOCK(&orte_plm_globals.spawn_lock);
|
||||
while (!orte_plm_globals.spawn_complete) {
|
||||
opal_condition_wait(&orte_plm_globals.spawn_cond, &orte_plm_globals.spawn_lock);
|
||||
}
|
||||
OPAL_OUTPUT_VERBOSE((1, orte_plm_globals.output,
|
||||
"completed spawn for job %s", ORTE_JOBID_PRINT(jdata->jobid)));
|
||||
orte_plm_globals.spawn_in_progress = false;
|
||||
opal_condition_broadcast(&orte_plm_globals.spawn_in_progress_cond);
|
||||
OPAL_THREAD_UNLOCK(&orte_plm_globals.spawn_lock);
|
||||
failed_launch = false;
|
||||
goto cleanup;
|
||||
}
|
||||
|
||||
/* default to declaring the daemon launch as having failed */
|
||||
failed_job = ORTE_PROC_MY_NAME->jobid;
|
||||
|
||||
|
@ -171,6 +171,20 @@ static int plm_slurm_launch_job(orte_job_t *jdata)
|
||||
bool failed_launch=true;
|
||||
orte_job_t *daemons;
|
||||
|
||||
/* if we are launching debugger daemons, then just go
|
||||
* do it - no new daemons will be launched
|
||||
*/
|
||||
if (ORTE_JOB_CONTROL_DEBUGGER_DAEMON & jdata->controls) {
|
||||
failed_job = jdata->jobid;
|
||||
if (ORTE_SUCCESS != (rc = orte_plm_base_launch_apps(jdata->jobid))) {
|
||||
ORTE_ERROR_LOG(rc);
|
||||
failed_launch = true;
|
||||
} else {
|
||||
failed_launch = false;
|
||||
}
|
||||
goto cleanup;
|
||||
}
|
||||
|
||||
/* if we are timing, record the start time */
|
||||
if (orte_timing) {
|
||||
gettimeofday(&orte_plm_globals.daemonlaunchstart, NULL);
|
||||
|
@ -159,6 +159,20 @@ static int plm_tm_launch_job(orte_job_t *jdata)
|
||||
char* vpid_string;
|
||||
orte_job_t *daemons;
|
||||
|
||||
/* if we are launching debugger daemons, then just go
|
||||
* do it - no new daemons will be launched
|
||||
*/
|
||||
if (ORTE_JOB_CONTROL_DEBUGGER_DAEMON & jdata->controls) {
|
||||
failed_job = jdata->jobid;
|
||||
if (ORTE_SUCCESS != (rc = orte_plm_base_launch_apps(jdata->jobid))) {
|
||||
ORTE_ERROR_LOG(rc);
|
||||
failed_launch = true;
|
||||
} else {
|
||||
failed_launch = false;
|
||||
}
|
||||
goto cleanup;
|
||||
}
|
||||
|
||||
/* if we are timing, record the start time */
|
||||
if (orte_timing) {
|
||||
gettimeofday(&orte_plm_globals.daemonlaunchstart, NULL);
|
||||
|
@ -144,7 +144,6 @@ void* MPIR_Breakpoint(void)
|
||||
/*
|
||||
* Globals
|
||||
*/
|
||||
static orte_job_t *jdata=NULL;
|
||||
static char **global_mca_env = NULL;
|
||||
static orte_std_cntr_t total_num_apps = 0;
|
||||
static bool want_prefix_by_default = (bool) ORTE_WANT_ORTERUN_PREFIX_BY_DEFAULT;
|
||||
@ -522,12 +521,14 @@ static opal_cmd_line_init_t cmd_line_init[] = {
|
||||
/*
|
||||
* Local functions
|
||||
*/
|
||||
static int create_app(int argc, char* argv[], orte_app_context_t **app,
|
||||
static int create_app(int argc, char* argv[],
|
||||
orte_job_t *jdata,
|
||||
orte_app_context_t **app,
|
||||
bool *made_app, char ***app_env);
|
||||
static int init_globals(void);
|
||||
static int parse_globals(int argc, char* argv[], opal_cmd_line_t *cmd_line);
|
||||
static int parse_locals(int argc, char* argv[]);
|
||||
static int parse_appfile(char *filename, char ***env);
|
||||
static int parse_locals(orte_job_t *jdata, int argc, char* argv[]);
|
||||
static int parse_appfile(orte_job_t *jdata, char *filename, char ***env);
|
||||
static void run_debugger(char *basename, opal_cmd_line_t *cmd_line,
|
||||
int argc, char *argv[], int num_procs) __opal_attribute_noreturn__;
|
||||
|
||||
@ -539,6 +540,7 @@ int orterun(int argc, char *argv[])
|
||||
orte_job_t *daemons;
|
||||
int32_t ljob;
|
||||
orte_app_context_t *app, *dapp;
|
||||
orte_job_t *jdata=NULL;
|
||||
|
||||
/* find our basename (the name of the executable) so that we can
|
||||
use it in pretty-print error messages */
|
||||
@ -617,7 +619,7 @@ int orterun(int argc, char *argv[])
|
||||
}
|
||||
|
||||
/* Parse each app, adding it to the job object */
|
||||
parse_locals(argc, argv);
|
||||
parse_locals(jdata, argc, argv);
|
||||
|
||||
if (0 == jdata->num_apps) {
|
||||
/* This should never happen -- this case should be caught in
|
||||
@ -1000,7 +1002,7 @@ static int parse_globals(int argc, char* argv[], opal_cmd_line_t *cmd_line)
|
||||
}
|
||||
|
||||
|
||||
static int parse_locals(int argc, char* argv[])
|
||||
static int parse_locals(orte_job_t *jdata, int argc, char* argv[])
|
||||
{
|
||||
int i, rc, app_num;
|
||||
int temp_argc;
|
||||
@ -1153,7 +1155,7 @@ static int parse_locals(int argc, char* argv[])
|
||||
env = NULL;
|
||||
}
|
||||
app = NULL;
|
||||
rc = create_app(temp_argc, temp_argv, &app, &made_app, &env);
|
||||
rc = create_app(temp_argc, temp_argv, jdata, &app, &made_app, &env);
|
||||
/** keep track of the number of apps - point this app_context to that index */
|
||||
if (ORTE_SUCCESS != rc) {
|
||||
/* Assume that the error message has already been
|
||||
@ -1181,7 +1183,7 @@ static int parse_locals(int argc, char* argv[])
|
||||
|
||||
if (opal_argv_count(temp_argv) > 1) {
|
||||
app = NULL;
|
||||
rc = create_app(temp_argc, temp_argv, &app, &made_app, &env);
|
||||
rc = create_app(temp_argc, temp_argv, jdata, &app, &made_app, &env);
|
||||
if (ORTE_SUCCESS != rc) {
|
||||
/* Assume that the error message has already been printed;
|
||||
no need to cleanup -- we can just exit */
|
||||
@ -1374,7 +1376,9 @@ static int capture_cmd_line_params(int argc, int start, char **argv)
|
||||
* with a NULL value for app_env, meaning that there is no "base"
|
||||
* environment that the app needs to be created from.
|
||||
*/
|
||||
static int create_app(int argc, char* argv[], orte_app_context_t **app_ptr,
|
||||
static int create_app(int argc, char* argv[],
|
||||
orte_job_t *jdata,
|
||||
orte_app_context_t **app_ptr,
|
||||
bool *made_app, char ***app_env)
|
||||
{
|
||||
opal_cmd_line_t cmd_line;
|
||||
@ -1418,7 +1422,7 @@ static int create_app(int argc, char* argv[], orte_app_context_t **app_ptr,
|
||||
|
||||
if (NULL != orterun_globals.appfile) {
|
||||
OBJ_DESTRUCT(&cmd_line);
|
||||
return parse_appfile(strdup(orterun_globals.appfile), app_env);
|
||||
return parse_appfile(jdata, strdup(orterun_globals.appfile), app_env);
|
||||
}
|
||||
|
||||
/* Setup application context */
|
||||
@ -1724,7 +1728,7 @@ static int create_app(int argc, char* argv[], orte_app_context_t **app_ptr,
|
||||
}
|
||||
|
||||
|
||||
static int parse_appfile(char *filename, char ***env)
|
||||
static int parse_appfile(orte_job_t *jdata, char *filename, char ***env)
|
||||
{
|
||||
size_t i, len;
|
||||
FILE *fp;
|
||||
@ -1836,7 +1840,7 @@ static int parse_appfile(char *filename, char ***env)
|
||||
tmp_env = NULL;
|
||||
}
|
||||
|
||||
rc = create_app(argc, argv, &app, &made_app, &tmp_env);
|
||||
rc = create_app(argc, argv, jdata, &app, &made_app, &tmp_env);
|
||||
if (ORTE_SUCCESS != rc) {
|
||||
/* Assume that the error message has already been
|
||||
printed; no need to cleanup -- we can just exit */
|
||||
@ -2482,6 +2486,10 @@ static void attach_debugger(int fd, short event, void *arg)
|
||||
int rc;
|
||||
int32_t ljob;
|
||||
orte_job_t *jdata;
|
||||
int i;
|
||||
orte_node_t *node;
|
||||
orte_proc_t *proc;
|
||||
orte_vpid_t vpid=0;
|
||||
|
||||
/* read the file descriptor to clear that event, if necessary */
|
||||
if (fifo_active) {
|
||||
@ -2541,6 +2549,8 @@ static void attach_debugger(int fd, short event, void *arg)
|
||||
if (!MPIR_forward_output) {
|
||||
jdata->controls &= ~ORTE_JOB_CONTROL_FORWARD_OUTPUT;
|
||||
}
|
||||
/* dont push stdin */
|
||||
jdata->stdin_target = ORTE_VPID_INVALID;
|
||||
/* add it to the global job pool */
|
||||
ljob = ORTE_LOCAL_JOBID(jdata->jobid);
|
||||
opal_pointer_array_set_item(orte_job_data, ljob, jdata);
|
||||
@ -2558,12 +2568,52 @@ static void attach_debugger(int fd, short event, void *arg)
|
||||
build_debugger_args(app);
|
||||
opal_pointer_array_add(jdata->apps, app);
|
||||
jdata->num_apps = 1;
|
||||
/* setup the mapping policy to pernode so we get one
|
||||
* daemon on each node
|
||||
*/
|
||||
/* create a job map */
|
||||
jdata->map = OBJ_NEW(orte_job_map_t);
|
||||
jdata->map->mapping = ORTE_MAPPING_PPR;
|
||||
jdata->map->ppr = strdup("1:n");
|
||||
/* in building the map, we want to launch one debugger daemon
|
||||
* on each node that *already has an application process on it*.
|
||||
* We cannot just launch one debugger daemon on EVERY node because
|
||||
* the original job may not have placed procs on every node. So
|
||||
* we construct the map here by cycling across all nodes, adding
|
||||
* only those nodes where num_procs > 0.
|
||||
*/
|
||||
for (i=0; i < orte_node_pool->size; i++) {
|
||||
if (NULL == (node = (orte_node_t*)opal_pointer_array_get_item(orte_node_pool, i))) {
|
||||
continue;
|
||||
}
|
||||
/* if this node wasn't included in the vm, ignore it */
|
||||
if (NULL == node->daemon) {
|
||||
continue;
|
||||
}
|
||||
/* if the node doesn't have any app procs on it, ignore it */
|
||||
if (node->num_procs < 1) {
|
||||
continue;
|
||||
}
|
||||
/* this node has at least one proc, so add it to our map */
|
||||
OBJ_RETAIN(node);
|
||||
opal_pointer_array_add(jdata->map->nodes, node);
|
||||
jdata->map->num_nodes++;
|
||||
/* add a debugger daemon to the node - note that the
|
||||
* debugger daemon does NOT count against our subscribed slots
|
||||
*/
|
||||
proc = OBJ_NEW(orte_proc_t);
|
||||
proc->name.jobid = jdata->jobid;
|
||||
proc->name.vpid = vpid++;
|
||||
ORTE_EPOCH_SET(proc->name.epoch,ORTE_EPOCH_MIN);
|
||||
/* flag the proc as ready for launch */
|
||||
proc->state = ORTE_PROC_STATE_INIT;
|
||||
proc->app_idx = 0;
|
||||
|
||||
OBJ_RETAIN(node); /* maintain accounting on object */
|
||||
proc->node = node;
|
||||
proc->nodename = node->name;
|
||||
opal_pointer_array_set_item(jdata->procs, proc->name.vpid, proc);
|
||||
|
||||
/* add the proc to the node's array */
|
||||
OBJ_RETAIN(proc);
|
||||
opal_pointer_array_add(node->procs, (void*)proc);
|
||||
node->num_procs++;
|
||||
}
|
||||
/* now go ahead and spawn this job */
|
||||
if (ORTE_SUCCESS != (rc = orte_plm.spawn(jdata))) {
|
||||
ORTE_ERROR_LOG(rc);
|
||||
|
Загрузка…
x
Ссылка в новой задаче
Block a user