1
1

Enable co-location of debugger daemons during initial launch and when debugging a running job.

Provide support for four MPIR extensions that allow specification of the debugger daemon executable, the argv for the debugger daemon, whether or not to forward debugger daemon IO, and whether or not the debugger daemon will piggy-back on the ORTE OOB network. The last of these is not yet implemented.

No change in behavior or operation occurs unless (a) the debugger specifically utilizes the extensions and (b), for co-location while running, the user specifically enables the capability via an MCA param. Two of the MPIR extensions supported here are used in a widely-used debugger for a large-scale installation. The other two extensions are new and are being utilized in prototype work by several debuggers for possible future release.

This commit was SVN r19275.
Этот коммит содержится в:
Ralph Castain 2008-08-13 17:47:24 +00:00
родитель 89ec513524
Коммит 30f37f762d
11 изменённых файлов: 429 добавлений и 72 удалений

Просмотреть файл

@ -86,6 +86,7 @@ int orte_odls_base_default_get_add_procs_data(opal_buffer_t *data,
opal_buffer_t *wireup;
opal_byte_object_t bo, *boptr;
int32_t numbytes;
int8_t flag;
/* get the job data pointer */
if (NULL == (jdata = orte_get_job_data_object(job))) {
@ -148,6 +149,41 @@ int orte_odls_base_default_get_add_procs_data(opal_buffer_t *data,
}
OBJ_RELEASE(wireup);
/* are we co-locating debugger daemons? */
if (NULL != orte_debugger_daemon) {
orte_app_context_t **apps;
/* flag that we are */
flag = 1;
if (ORTE_SUCCESS != (rc = opal_dss.pack(data, &flag, 1, OPAL_INT8))) {
ORTE_ERROR_LOG(rc);
return rc;
}
/* pack the jobid for the debugger daemons */
if (ORTE_SUCCESS != (rc = opal_dss.pack(data, &orte_debugger_daemon->jobid, 1, ORTE_JOBID))) {
ORTE_ERROR_LOG(rc);
return rc;
}
/* pack the executable name */
apps = (orte_app_context_t**)orte_debugger_daemon->apps->addr;
if (ORTE_SUCCESS != (rc = opal_dss.pack(data, apps, 1, ORTE_APP_CONTEXT))) {
ORTE_ERROR_LOG(rc);
return rc;
}
/* pack the control flags */
if (ORTE_SUCCESS != (rc = opal_dss.pack(data, &orte_debugger_daemon->controls, 1, OPAL_UINT16))) {
ORTE_ERROR_LOG(rc);
return rc;
}
} else {
/* flag that we are NOT */
flag = 0;
if (ORTE_SUCCESS != (rc = opal_dss.pack(data, &flag, 1, OPAL_INT8))) {
ORTE_ERROR_LOG(rc);
return rc;
}
}
/* pack the jobid so it can be extracted later */
if (ORTE_SUCCESS != (rc = opal_dss.pack(data, &job, 1, ORTE_JOBID))) {
ORTE_ERROR_LOG(rc);
@ -160,6 +196,12 @@ int orte_odls_base_default_get_add_procs_data(opal_buffer_t *data,
return rc;
}
/* pack the control flags for this job */
if (ORTE_SUCCESS != (rc = opal_dss.pack(data, &jdata->controls, 1, OPAL_UINT16))) {
ORTE_ERROR_LOG(rc);
return rc;
}
/* pack the number of app_contexts for this job */
if (ORTE_SUCCESS != (rc = opal_dss.pack(data, &jdata->num_apps, 1, ORTE_STD_CNTR))) {
ORTE_ERROR_LOG(rc);
@ -210,6 +252,8 @@ int orte_odls_base_default_construct_child_list(opal_buffer_t *data,
opal_list_item_t *item;
orte_namelist_t *nm;
opal_list_t daemon_tree;
int8_t flag;
orte_jobid_t debugger;
OPAL_OUTPUT_VERBOSE((5, orte_odls_globals.output,
"%s odls:constructing child list",
@ -273,6 +317,49 @@ int orte_odls_base_default_construct_child_list(opal_buffer_t *data,
OBJ_DESTRUCT(&wireup);
}
/* unpack the flag - are we co-locating debugger daemons? */
cnt=1;
if (ORTE_SUCCESS != (rc = opal_dss.unpack(data, &flag, &cnt, OPAL_INT8))) {
ORTE_ERROR_LOG(rc);
goto REPORT_ERROR;
}
if (0 != flag) {
OPAL_OUTPUT_VERBOSE((5, orte_odls_globals.output,
"%s odls:construct_child_list unpacking debugger daemon",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
/* yep - create a jobdat object for it. In this case, we don't have to
* worry about race conditions as the debugger daemons do not use
* the daemon collective system
*/
orte_odls_globals.debugger = OBJ_NEW(orte_odls_job_t);
/* get the debugger daemon jobid */
cnt=1;
if (ORTE_SUCCESS != (rc = opal_dss.unpack(data, &debugger, &cnt, ORTE_JOBID))) {
ORTE_ERROR_LOG(rc);
goto REPORT_ERROR;
}
orte_odls_globals.debugger->jobid = debugger;
orte_odls_globals.debugger->num_apps = 1;
orte_odls_globals.debugger->num_local_procs = 1;
opal_list_append(&orte_odls_globals.jobs, &(orte_odls_globals.debugger)->super);
/* retrieve the info */
orte_odls_globals.debugger->apps = (orte_app_context_t**)malloc(sizeof(orte_app_context_t*));
if (NULL == orte_odls_globals.debugger->apps) {
ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE);
goto REPORT_ERROR;
}
if (ORTE_SUCCESS != (rc = opal_dss.unpack(data, orte_odls_globals.debugger->apps,
&(orte_odls_globals.debugger->num_apps), ORTE_APP_CONTEXT))) {
ORTE_ERROR_LOG(rc);
goto REPORT_ERROR;
}
cnt=1;
if (ORTE_SUCCESS != (rc = opal_dss.unpack(data, &(orte_odls_globals.debugger->controls), &cnt, OPAL_UINT16))) {
ORTE_ERROR_LOG(rc);
goto REPORT_ERROR;
}
}
/* unpack the jobid we are to launch */
cnt=1;
if (ORTE_SUCCESS != (rc = opal_dss.unpack(data, job, &cnt, ORTE_JOBID))) {
@ -321,6 +408,12 @@ int orte_odls_base_default_construct_child_list(opal_buffer_t *data,
ORTE_ERROR_LOG(rc);
goto REPORT_ERROR;
}
/* unpack the control flags for the job */
cnt=1;
if (ORTE_SUCCESS != (rc = opal_dss.unpack(data, &jobdat->controls, &cnt, OPAL_UINT16))) {
ORTE_ERROR_LOG(rc);
goto REPORT_ERROR;
}
/* unpack the number of app_contexts for this job */
cnt=1;
if (ORTE_SUCCESS != (rc = opal_dss.unpack(data, &jobdat->num_apps, &cnt, ORTE_STD_CNTR))) {
@ -1022,7 +1115,7 @@ int orte_odls_base_default_launch_local(orte_jobid_t job,
}
}
rc = fork_local(app, child, app->env);
rc = fork_local(app, child, app->env, ORTE_JOB_CONTROL_FORWARD_OUTPUT & jobdat->controls);
/* reaquire lock so we don't double unlock... */
OPAL_THREAD_LOCK(&orte_odls_globals.mutex);
if (ORTE_SUCCESS != rc) {
@ -1051,6 +1144,27 @@ CLEANUP:
ORTE_ERROR_LOG(ret);
}
if (!launch_failed) {
/* if the launch succeeded, check to see if we need to
* co-locate any debugger daemons so that they get launched
* before we report anything to the HNP. This ensures that
* the debugger daemons are ready-to-go before mpirun returns
* from the plm.spawn command
*/
if (NULL != orte_odls_globals.debugger &&
!orte_odls_globals.debugger_launched) {
OPAL_OUTPUT_VERBOSE((5, orte_odls_globals.output,
"%s odls:launch forking debugger with %s",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
(ORTE_JOB_CONTROL_FORWARD_OUTPUT & orte_odls_globals.debugger->controls) ? "output forwarded" : "no output"));
fork_local(orte_odls_globals.debugger->apps[0], NULL, NULL,
ORTE_JOB_CONTROL_FORWARD_OUTPUT & orte_odls_globals.debugger->controls);
orte_odls_globals.debugger_launched = true;
}
}
/* if we are the HNP, then we would rather not send this to ourselves -
* instead, we queue it up for local processing
*/
@ -1084,6 +1198,7 @@ CLEANUP:
OPAL_THREAD_LOCK(&orte_odls_globals.mutex);
}
}
}
opal_condition_signal(&orte_odls_globals.cond);

Просмотреть файл

@ -101,6 +101,7 @@ static void orte_odls_job_constructor(orte_odls_job_t *ptr)
ptr->jobid = ORTE_JOBID_INVALID;
ptr->apps = NULL;
ptr->num_apps = 0;
ptr->controls = 0;
ptr->total_slots_alloc = 0;
ptr->num_procs = 0;
ptr->num_local_procs = 0;
@ -168,6 +169,8 @@ int orte_odls_base_open(void)
OBJ_CONSTRUCT(&orte_odls_globals.children, opal_list_t);
OBJ_CONSTRUCT(&orte_odls_globals.jobs, opal_list_t);
orte_odls_globals.dmap = NULL;
orte_odls_globals.debugger = NULL;
orte_odls_globals.debugger_launched = false;
/* initialize and setup the daemonmap */
OBJ_CONSTRUCT(&orte_daemonmap, opal_pointer_array_t);

Просмотреть файл

@ -75,6 +75,7 @@ typedef struct orte_odls_job_t {
orte_jobid_t jobid; /* jobid for this data */
orte_app_context_t **apps; /* app_contexts for this job */
orte_std_cntr_t num_apps; /* number of app_contexts */
uint16_t controls; /* control flags for job */
orte_std_cntr_t total_slots_alloc;
orte_vpid_t num_procs;
int32_t num_local_procs;
@ -106,6 +107,10 @@ typedef struct {
opal_list_t jobs;
/* byte object to store daemon map for later xmit to procs */
opal_byte_object_t *dmap;
/* any co-spawned debugger daemon */
orte_odls_job_t *debugger;
/* debugger launched */
bool debugger_launched;
} orte_odls_globals_t;
ORTE_DECLSPEC extern orte_odls_globals_t orte_odls_globals;
@ -127,7 +132,8 @@ orte_odls_base_default_construct_child_list(opal_buffer_t *data,
/* define a function that will fork a local proc */
typedef int (*orte_odls_base_fork_local_proc_fn_t)(orte_app_context_t *context,
orte_odls_child_t *child,
char **environ_copy);
char **environ_copy,
bool forward_output);
ORTE_DECLSPEC int
orte_odls_base_default_launch_local(orte_jobid_t job,

Просмотреть файл

@ -165,16 +165,18 @@ int orte_odls_default_kill_local_procs(orte_jobid_t job, bool set_state)
* Fork/exec the specified processes
*/
static int odls_default_fork_local_proc(
orte_app_context_t* context,
static int odls_default_fork_local_proc(orte_app_context_t* context,
orte_odls_child_t *child,
char **environ_copy)
char **environ_copy,
bool forward_output)
{
orte_iof_base_io_conf_t opts;
int rc;
sigset_t sigs;
int i, p[2];
pid_t pid;
if (NULL != child) {
/* should pull this information from MPIRUN instead of going with
default */
opts.usepty = OMPI_ENABLE_PTY_SUPPORT;
@ -183,7 +185,7 @@ static int odls_default_fork_local_proc(
part of the app_context. Do not change this without also
changing the reverse of this in
odls_default_wait_local_proc(). */
if (child->name->vpid == 0) {
if (NULL != child && child->name->vpid == 0) {
opts.connect_stdin = true;
} else {
opts.connect_stdin = false;
@ -191,10 +193,13 @@ static int odls_default_fork_local_proc(
if (ORTE_SUCCESS != (rc = orte_iof_base_setup_prefork(&opts))) {
ORTE_ERROR_LOG(rc);
if (NULL != child) {
child->state = ORTE_PROC_STATE_FAILED_TO_START;
child->exit_code = rc;
}
return rc;
}
}
/* A pipe is used to communicate between the parent and child to
indicate whether the exec ultimately succeeded or failed. The
@ -206,27 +211,36 @@ static int odls_default_fork_local_proc(
the pipe, then the child was letting us know that it failed. */
if (pipe(p) < 0) {
ORTE_ERROR_LOG(ORTE_ERR_SYS_LIMITS_PIPES);
if (NULL != child) {
child->state = ORTE_PROC_STATE_FAILED_TO_START;
child->exit_code = ORTE_ERR_SYS_LIMITS_PIPES;
}
return ORTE_ERR_SYS_LIMITS_PIPES;
}
/* Fork off the child */
child->pid = fork();
if(child->pid < 0) {
pid = fork();
if (NULL != child) {
child->pid = pid;
}
if(pid < 0) {
ORTE_ERROR_LOG(ORTE_ERR_SYS_LIMITS_CHILDREN);
if (NULL != child) {
child->state = ORTE_PROC_STATE_FAILED_TO_START;
child->exit_code = ORTE_ERR_SYS_LIMITS_CHILDREN;
}
return ORTE_ERR_SYS_LIMITS_CHILDREN;
}
if (child->pid == 0) {
if (pid == 0) {
long fd, fdmax = sysconf(_SC_OPEN_MAX);
/* Setup the pipe to be close-on-exec */
close(p[0]);
fcntl(p[1], F_SETFD, FD_CLOEXEC);
if (NULL != child) {
/* setup stdout/stderr so that any error messages that we may
print out will get displayed back at orterun.
@ -245,9 +259,27 @@ static int odls_default_fork_local_proc(
}
} else if (!forward_output) {
/* tie stdin/out/err/internal to /dev/null */
int fdnull;
for (i=0; i < 3; i++) {
fdnull = open("/dev/null", O_RDONLY, 0);
if(fdnull > i) {
dup2(fdnull, i);
close(fdnull);
}
}
fdnull = open("/dev/null", O_RDONLY, 0);
if(fdnull > opts.p_internal[1]) {
dup2(fdnull, opts.p_internal[1]);
close(fdnull);
}
}
/* close all file descriptors w/ exception of
stdin/stdout/stderr and the pipe used for the IOF INTERNAL
messages */
* stdin/stdout/stderr and the pipe used for the IOF INTERNAL
* messages
*/
for(fd=3; fd<fdmax; fd++) {
if (fd != opts.p_internal[1]) {
close(fd);
@ -288,12 +320,14 @@ static int odls_default_fork_local_proc(
exit(1);
} else {
if (NULL != child && forward_output) {
/* connect endpoints IOF */
rc = orte_iof_base_setup_parent(child->name, &opts);
if(ORTE_SUCCESS != rc) {
ORTE_ERROR_LOG(rc);
return rc;
}
}
/* Wait to read something from the pipe or close */
close(p[1]);
@ -304,9 +338,12 @@ static int odls_default_fork_local_proc(
if (errno == EINTR) {
continue;
}
/* Other errno's are bad */
if (NULL != child) {
child->state = ORTE_PROC_STATE_FAILED_TO_START;
child->exit_code = ORTE_ERR_PIPE_READ_FAILURE;
}
OPAL_OUTPUT_VERBOSE((2, orte_odls_globals.output,
"%s odls:default:fork got code %d back from child",
@ -326,8 +363,10 @@ static int odls_default_fork_local_proc(
failure to launch this process through the SMR or else
everyone else will hang.
*/
if (NULL != child) {
child->state = ORTE_PROC_STATE_FAILED_TO_START;
child->exit_code = i;
}
OPAL_OUTPUT_VERBOSE((2, orte_odls_globals.output,
"%s odls:default:fork got code %d back from child",
@ -337,10 +376,12 @@ static int odls_default_fork_local_proc(
}
}
if (NULL != child) {
/* set the proc state to LAUNCHED */
child->state = ORTE_PROC_STATE_LAUNCHED;
child->alive = true;
}
}
return ORTE_SUCCESS;
}

Просмотреть файл

@ -913,6 +913,11 @@ void orte_plm_base_check_job_completed(orte_job_t *jdata)
goto CHECK_ALL_JOBS;
}
/* if this job is not to be monitored, then ignore it */
if (ORTE_JOB_CONTROL_DO_NOT_MONITOR & jdata->controls) {
return;
}
OPAL_OUTPUT_VERBOSE((5, orte_plm_globals.output,
"%s plm:base:check_job_completed for job %s - num_terminated %lu num_procs %lu",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
@ -1048,6 +1053,10 @@ CHECK_ALL_JOBS:
*/
break;
}
/* if the job is flagged to not be monitored, skip it */
if (ORTE_JOB_CONTROL_DO_NOT_MONITOR & jobs[j]->controls) {
continue;
}
/* when checking for job termination, we must be sure to NOT check
* our own job as it - rather obviously - has NOT terminated!
*/

Просмотреть файл

@ -69,6 +69,10 @@ bool orte_allocation_required;
char *orte_launch_agent;
char **orted_cmd_line=NULL;
orte_job_t *orte_debugger_daemon=NULL;
bool orte_enable_debug_cospawn_while_running;
int orte_debugger_check_rate;
orte_trigger_event_t orte_exit, orteds_exit;
int orte_exit_status = 0;
bool orte_abnormal_term_ordered = false;

Просмотреть файл

@ -212,6 +212,10 @@ ORTE_DECLSPEC OBJ_CLASS_DECLARATION(orte_node_t);
/* define a set of flags to control the launch of a job. Each flag is a
 * distinct bit so that several can be OR'd together in a job's uint16_t
 * "controls" field
 */
#define ORTE_JOB_CONTROL_LOCAL_SPAWN (uint16_t) 0x01
#define ORTE_JOB_CONTROL_NON_ORTE_JOB (uint16_t) 0x02
/* the job consists of debugger daemons */
#define ORTE_JOB_CONTROL_DEBUGGER_DAEMON (uint16_t) 0x04
/* output from the job's processes is to be forwarded */
#define ORTE_JOB_CONTROL_FORWARD_OUTPUT (uint16_t) 0x08
/* the job is to be ignored when checking for job completion */
#define ORTE_JOB_CONTROL_DO_NOT_MONITOR (uint16_t) 0x10
/* NOTE(review): presumably requests that debugger daemon communication be
 * carried over the ORTE OOB network; no code visible here sets or tests
 * this flag - confirm before relying on it
 */
#define ORTE_JOB_CONTROL_FORWARD_COMM (uint16_t) 0x20
typedef struct {
/** Base object so this can be put on a list */
@ -385,6 +389,10 @@ ORTE_DECLSPEC extern bool orte_allocation_required;
ORTE_DECLSPEC extern char *orte_launch_agent;
ORTE_DECLSPEC extern char **orted_cmd_line;
ORTE_DECLSPEC extern orte_job_t *orte_debugger_daemon;
ORTE_DECLSPEC extern bool orte_enable_debug_cospawn_while_running;
ORTE_DECLSPEC extern int orte_debugger_check_rate;
/* exit triggers and flags */
ORTE_DECLSPEC extern orte_trigger_event_t orte_exit, orteds_exit;
ORTE_DECLSPEC extern int orte_exit_status;

Просмотреть файл

@ -85,6 +85,19 @@ int orte_register_params(void)
true, false, 0, &value);
orte_in_parallel_debugger = OPAL_INT_TO_BOOL(value);
mca_base_param_reg_int_name("orte",
"enable_debug_cospawn_while_running",
"Whether a debugger can attach to the job "
"while it is running and request it co-locate debugger daemons (default: false)",
false, false, (int)false, &value);
orte_enable_debug_cospawn_while_running = OPAL_INT_TO_BOOL(value);
mca_base_param_reg_int_name("orte",
"debugger_check_rate",
"How often (in seconds) to check if a debugger "
"has attached to a running job and requested cospawn support (default: 2 sec)",
false, false, 2, &orte_debugger_check_rate);
mca_base_param_reg_int_name("orte", "do_not_launch",
"Perform all necessary operations to prepare to launch the application, but do not actually launch it",
false, false, (int)false, &value);

Просмотреть файл

@ -107,18 +107,26 @@
#include "opal/util/argv.h"
#include "opal/util/path.h"
#include "opal/util/os_path.h"
#include "opal/class/opal_pointer_array.h"
#include "opal/mca/base/mca_base_param.h"
#include "opal/util/opal_getcwd.h"
#include "orte/mca/errmgr/errmgr.h"
#include "orte/mca/rml/rml.h"
#include "orte/mca/plm/plm.h"
#include "orte/mca/plm/base/plm_private.h"
#include "orte/runtime/orte_globals.h"
#include "orte/runtime/orte_wait.h"
#include "orte/util/show_help.h"
#include "orte/util/name_fns.h"
#include "debuggers.h"
/* +++ begin MPICH/TotalView std debugger interface definitions */
#define MPIR_MAX_PATH_LENGTH 256
#define MPIR_MAX_ARG_LENGTH 1024
struct MPIR_PROCDESC {
char *host_name; /* something that can be passed to inet_addr */
char *executable_name; /* name of binary */
@ -131,6 +139,10 @@ int MPIR_being_debugged = 0;
volatile int MPIR_debug_state = 0;
volatile int MPIR_i_am_starter = 0;
volatile int MPIR_partial_attach_ok = 1;
volatile char MPIR_executable_path[MPIR_MAX_PATH_LENGTH];
volatile char MPIR_server_arguments[MPIR_MAX_ARG_LENGTH];
volatile int MPIR_forward_output = 0;
volatile int MPIR_forward_comm = 0;
/* --- end MPICH/TotalView std debugger interface definitions */
@ -145,6 +157,7 @@ static void dump(void)
DUMP_INT(MPIR_debug_state);
DUMP_INT(MPIR_partial_attach_ok);
DUMP_INT(MPIR_i_am_starter);
DUMP_INT(MPIR_forward_output);
DUMP_INT(MPIR_proctable_size);
fprintf(stderr, " MPIR_proctable:\n");
for (i = 0; i < MPIR_proctable_size; i++) {
@ -155,8 +168,13 @@ static void dump(void)
MPIR_proctable[i].executable_name,
MPIR_proctable[i].pid);
}
fprintf(stderr, "MPIR_executable_path: %s\n",
('\0' == MPIR_executable_path[0]) ? "NULL" : MPIR_executable_path);
fprintf(stderr, "MPIR_server_arguments: %s\n",
('\0' == MPIR_server_arguments[0]) ? "NULL" : MPIR_server_arguments);
}
/*
* Process one line from the orte_base_user_debugger MCA param and
* look for that debugger in the path. If we find it, fill in
@ -397,6 +415,11 @@ void orte_run_debugger(char *basename, opal_cmd_line_t *cmd_line,
opal_argv_free(lines);
/* We found one */
/* cleanup the MPIR arrays in case the debugger doesn't set them */
memset((char*)MPIR_executable_path, 0, MPIR_MAX_PATH_LENGTH);
memset((char*)MPIR_server_arguments, 0, MPIR_MAX_ARG_LENGTH);
/* Set an MCA param so that everyone knows that they are being
launched under a debugger; not all debuggers are consistent
about setting MPIR_being_debugged in both the launcher and the
@ -419,6 +442,106 @@ void orte_run_debugger(char *basename, opal_cmd_line_t *cmd_line,
}
/*
 * Append the debugger-supplied server arguments to the argv of the given
 * debugger daemon app_context.
 *
 * MPIR_server_arguments is scanned across its full MPIR_MAX_ARG_LENGTH
 * bytes as a sequence of NUL-separated strings. Each non-empty string
 * becomes one argv entry; runs of consecutive NULs are skipped, so no
 * empty arguments are produced. Nothing is appended when the buffer
 * begins with a NUL.
 *
 * @param debugger  app_context whose argv receives the parsed arguments
 */
static void build_debugger_args(orte_app_context_t *debugger)
{
    int i, j;
    /* one spare byte guarantees room for the terminator even when a
     * single argument occupies the entire MPIR buffer
     */
    char mpir_arg[MPIR_MAX_ARG_LENGTH + 1];

    if ('\0' == MPIR_server_arguments[0]) {
        /* the debugger did not provide any arguments */
        return;
    }

    j = 0;
    for (i = 0; i < MPIR_MAX_ARG_LENGTH; i++) {
        if ('\0' != MPIR_server_arguments[i]) {
            /* still inside an argument - keep accumulating */
            mpir_arg[j] = MPIR_server_arguments[i];
            j++;
            continue;
        }
        if (0 < j) {
            /* reached the separator that ends the current argument */
            mpir_arg[j] = '\0';
            opal_argv_append_nosize(&debugger->argv, mpir_arg);
            j = 0;
        }
    }

    /* an argument that runs up to the last byte of the buffer has no
     * trailing separator - add it rather than silently dropping it
     */
    if (0 < j) {
        mpir_arg[j] = '\0';
        opal_argv_append_nosize(&debugger->argv, mpir_arg);
    }
}
/*
 * Timer callback that checks whether a debugger has attached to the
 * running job and, if so, co-spawns any debugger daemons it requested.
 *
 * While MPIR_being_debugged is not set, the timer event passed in arg is
 * re-added so this check runs again after orte_debugger_check_rate
 * seconds. Once it is set, and the debugger has filled in
 * MPIR_executable_path, a new job holding a single app_context for that
 * executable is created with a "pernode" map and handed to
 * orte_plm.spawn(). MPIR_Breakpoint() is then called whether or not any
 * daemons were spawned.
 *
 * @param fd     unused
 * @param event  unused
 * @param arg    the opal_event_t timer event to re-add when no debugger
 *               has attached yet
 */
static void check_debugger(int fd, short event, void *arg)
{
    struct timeval now;
    opal_event_t *tmp = (opal_event_t*)arg;
    orte_job_t *jdata;
    orte_app_context_t *app;
    char cwd[OMPI_PATH_MAX];
    int rc;

    if (MPIR_being_debugged) {
        if (orte_debug_flag) {
            opal_output(0, "%s Launching debugger %s", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                        MPIR_executable_path);
        }

        /* a debugger has attached! All the MPIR_Proctable
         * data is already available, so we only need to
         * check to see if we should spawn any daemons
         */
        if ('\0' != MPIR_executable_path[0]) {
            /* get the cwd BEFORE creating or registering any objects so
             * that a failure here leaves nothing to clean up - in
             * particular, no released job is left in the global job pool
             * and no app_context is leaked
             */
            if (OPAL_SUCCESS != (rc = opal_getcwd(cwd, sizeof(cwd)))) {
                orte_show_help("help-orterun.txt", "orterun:init-failure",
                               true, "get the cwd", rc);
                goto RELEASE;
            }

            /* this will be launched just like a regular job,
             * so we do not use the global orte_debugger_daemon
             * as this is reserved for co-location upon startup
             */
            jdata = OBJ_NEW(orte_job_t);
            /* create a jobid for these daemons - this is done solely
             * to avoid confusing the rest of the system's bookkeeping
             */
            orte_plm_base_create_jobid(&jdata->jobid);
            /* flag the job as being debugger daemons */
            jdata->controls = ORTE_JOB_CONTROL_DEBUGGER_DAEMON;
            /* if directed, we forward output */
            if (MPIR_forward_output) {
                jdata->controls |= ORTE_JOB_CONTROL_FORWARD_OUTPUT;
            }
            /* set the mapping policy to "pernode" so we only get
             * one debugger daemon on each node
             */
            jdata->map = OBJ_NEW(orte_job_map_t);
            jdata->map->pernode = true;
            jdata->map->npernode = 1;
            /* add it to the global job pool */
            opal_pointer_array_add(orte_job_data, &jdata->super);

            /* create an app_context for the debugger daemon */
            app = OBJ_NEW(orte_app_context_t);
            app->app = strdup((char*)MPIR_executable_path);
            app->cwd = strdup(cwd);
            app->user_specified_cwd = false;
            /* argv[0] is the executable itself, followed by whatever
             * arguments the debugger supplied
             */
            opal_argv_append_nosize(&app->argv, app->app);
            build_debugger_args(app);
            opal_pointer_array_add(jdata->apps, &app->super);
            jdata->num_apps = 1;

            /* now go ahead and spawn this job */
            if (ORTE_SUCCESS != (rc = orte_plm.spawn(jdata))) {
                ORTE_ERROR_LOG(rc);
            }
        }

    RELEASE:
        /* notify the debugger that all is ready */
        MPIR_Breakpoint();
    } else {
        /* reissue the timer to wake us up again */
        now.tv_sec = orte_debugger_check_rate;
        now.tv_usec = 0;
        opal_evtimer_add(tmp, &now);
    }
}
/**
* Initialization of data structures for running under a debugger
* using the MPICH/TotalView parallel debugger interface. Before the
@ -428,11 +551,19 @@ void orte_run_debugger(char *basename, opal_cmd_line_t *cmd_line,
void orte_debugger_init_before_spawn(orte_job_t *jdata)
{
char *env_name;
orte_app_context_t **apps;
orte_app_context_t **apps, *app;
orte_std_cntr_t i;
if (!MPIR_being_debugged && !orte_in_parallel_debugger) {
/* not being debugged */
/* not being debugged - check if we want to enable
* later attachment by debugger
*/
if (orte_enable_debug_cospawn_while_running) {
/* setup a timer to wake us up periodically
* to check for debugger attach
*/
ORTE_TIMER_EVENT(orte_debugger_check_rate, check_debugger);
}
return;
}
@ -449,6 +580,31 @@ void orte_debugger_init_before_spawn(orte_job_t *jdata)
opal_setenv(env_name, "1", true, &apps[i]->env);
}
free(env_name);
/* check if we need to co-spawn the debugger daemons */
if ('\0' != MPIR_executable_path[0]) {
/* add debugger info to launch message */
orte_debugger_daemon = OBJ_NEW(orte_job_t);
/* create a jobid for these daemons - this is done solely
* to avoid confusing the rest of the system's bookkeeping
*/
orte_plm_base_create_jobid(&orte_debugger_daemon->jobid);
/* flag the job as being debugger daemons */
orte_debugger_daemon->controls = ORTE_JOB_CONTROL_DEBUGGER_DAEMON;
/* if directed, we forward output */
if (MPIR_forward_output) {
orte_debugger_daemon->controls |= ORTE_JOB_CONTROL_FORWARD_OUTPUT;
}
/* add it to the global job pool */
opal_pointer_array_add(orte_job_data, &orte_debugger_daemon->super);
/* create an app_context for the debugger daemon */
app = OBJ_NEW(orte_app_context_t);
app->app = strdup((char*)MPIR_executable_path);
opal_argv_append_nosize(&app->argv, app->app);
build_debugger_args(app);
opal_pointer_array_add(orte_debugger_daemon->apps, &app->super);
orte_debugger_daemon->num_apps = 1;
}
}

Просмотреть файл

@ -387,6 +387,8 @@ int orterun(int argc, char *argv[])
ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE);
return ORTE_ERR_OUT_OF_RESOURCE;
}
/* always forward output from user apps */
jdata->controls = ORTE_JOB_CONTROL_FORWARD_OUTPUT;
/* Parse each app, adding it to the job object */
parse_locals(argc, argv);