Clean up the debugger daemon co-launch code and add the ability to test it. Implement the ability to co-launch debugger daemons upon attaching to a running job, for jobs launched under the rsh, slurm, and tm environments (others can easily be added if desired).
Add new MCA params for testing:
  - orte_debugger_test_daemon: Name of the executable to be used to simulate a debugger co-launch
  - orte_debugger_test_attach: Test debugger co-launch after debugger attachment
To test co-launch at job start, just set the orte_debugger_test_daemon param.
To test co-launch upon attach:
  - set orte_debugger_test_daemon
  - set orte_debugger_test_attach=1
  - set orte_enable_debug_cospawn_while_running=1
  - set orte_debugger_check_rate=<N> (defines the number of seconds to wait before "checking" for a debugger attaching)
Added a "debugger" program to orte/test/mpi that just spins to simulate a debugger daemon.
This commit was SVN r23144.
Этот коммит содержится в:
родитель
e2ab4f2baf
Коммит
88f5217a12
@ -97,6 +97,11 @@ int orte_odls_base_default_get_add_procs_data(opal_buffer_t *data,
|
|||||||
int j;
|
int j;
|
||||||
orte_daemon_cmd_flag_t command;
|
orte_daemon_cmd_flag_t command;
|
||||||
|
|
||||||
|
if (NULL != orte_debugger_daemon && ORTE_JOBID_INVALID == job) {
|
||||||
|
/* all we are doing is launching debugger daemons */
|
||||||
|
goto nodemap;
|
||||||
|
}
|
||||||
|
|
||||||
/* get the job data pointer */
|
/* get the job data pointer */
|
||||||
if (NULL == (jdata = orte_get_job_data_object(job))) {
|
if (NULL == (jdata = orte_get_job_data_object(job))) {
|
||||||
ORTE_ERROR_LOG(ORTE_ERR_BAD_PARAM);
|
ORTE_ERROR_LOG(ORTE_ERR_BAD_PARAM);
|
||||||
@ -191,6 +196,7 @@ int orte_odls_base_default_get_add_procs_data(opal_buffer_t *data,
|
|||||||
return ORTE_SUCCESS;
|
return ORTE_SUCCESS;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
nodemap:
|
||||||
/* if we are not passing a regexp, then pass the nodemap */
|
/* if we are not passing a regexp, then pass the nodemap */
|
||||||
flag = 0;
|
flag = 0;
|
||||||
opal_dss.pack(data, &flag, 1, OPAL_INT8);
|
opal_dss.pack(data, &flag, 1, OPAL_INT8);
|
||||||
@ -308,6 +314,11 @@ pack_add_procs:
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
if (NULL != orte_debugger_daemon && ORTE_JOBID_INVALID == job) {
|
||||||
|
/* all we are doing is launching debugger daemons, so we are done */
|
||||||
|
return ORTE_SUCCESS;
|
||||||
|
}
|
||||||
|
|
||||||
/* pack the jobid so it can be extracted later */
|
/* pack the jobid so it can be extracted later */
|
||||||
if (ORTE_SUCCESS != (rc = opal_dss.pack(data, &job, 1, ORTE_JOBID))) {
|
if (ORTE_SUCCESS != (rc = opal_dss.pack(data, &job, 1, ORTE_JOBID))) {
|
||||||
ORTE_ERROR_LOG(rc);
|
ORTE_ERROR_LOG(rc);
|
||||||
@ -724,6 +735,12 @@ int orte_odls_base_default_construct_child_list(opal_buffer_t *data,
|
|||||||
/* unpack the jobid we are to launch */
|
/* unpack the jobid we are to launch */
|
||||||
cnt=1;
|
cnt=1;
|
||||||
if (ORTE_SUCCESS != (rc = opal_dss.unpack(data, job, &cnt, ORTE_JOBID))) {
|
if (ORTE_SUCCESS != (rc = opal_dss.unpack(data, job, &cnt, ORTE_JOBID))) {
|
||||||
|
/* if the buffer was empty, then we know that all we are doing is
|
||||||
|
* launching debugger daemons
|
||||||
|
*/
|
||||||
|
if (ORTE_ERR_UNPACK_READ_PAST_END_OF_BUFFER == rc) {
|
||||||
|
goto done;
|
||||||
|
}
|
||||||
*job = ORTE_JOBID_INVALID;
|
*job = ORTE_JOBID_INVALID;
|
||||||
ORTE_ERROR_LOG(rc);
|
ORTE_ERROR_LOG(rc);
|
||||||
goto REPORT_ERROR;
|
goto REPORT_ERROR;
|
||||||
@ -1013,6 +1030,7 @@ find_my_procs:
|
|||||||
opal_condition_broadcast(&jobdat->cond);
|
opal_condition_broadcast(&jobdat->cond);
|
||||||
OPAL_THREAD_UNLOCK(&jobdat->lock);
|
OPAL_THREAD_UNLOCK(&jobdat->lock);
|
||||||
|
|
||||||
|
done:
|
||||||
if (NULL != app_idx) {
|
if (NULL != app_idx) {
|
||||||
free(app_idx);
|
free(app_idx);
|
||||||
app_idx = NULL;
|
app_idx = NULL;
|
||||||
|
@ -304,13 +304,17 @@ int orte_plm_base_launch_apps(orte_jobid_t job)
|
|||||||
gettimeofday(&app_launch_start, NULL);
|
gettimeofday(&app_launch_start, NULL);
|
||||||
}
|
}
|
||||||
|
|
||||||
/* find the job's data record */
|
if (ORTE_JOBID_INVALID == job) {
|
||||||
|
/* we are only launching debugger daemons */
|
||||||
|
jdata = orte_debugger_daemon;
|
||||||
|
} else {
|
||||||
if (NULL == (jdata = orte_get_job_data_object(job))) {
|
if (NULL == (jdata = orte_get_job_data_object(job))) {
|
||||||
/* bad jobid */
|
/* bad jobid */
|
||||||
ORTE_ERROR_LOG(ORTE_ERR_BAD_PARAM);
|
ORTE_ERROR_LOG(ORTE_ERR_BAD_PARAM);
|
||||||
rc = ORTE_ERR_BAD_PARAM;
|
rc = ORTE_ERR_BAD_PARAM;
|
||||||
goto WAKEUP;
|
goto WAKEUP;
|
||||||
}
|
}
|
||||||
|
}
|
||||||
|
|
||||||
/* setup the buffer */
|
/* setup the buffer */
|
||||||
buffer = OBJ_NEW(opal_buffer_t);
|
buffer = OBJ_NEW(opal_buffer_t);
|
||||||
|
@ -1074,6 +1074,12 @@ int orte_plm_rsh_launch(orte_job_t *jdata)
|
|||||||
orte_plm_globals.spawn_status = ORTE_ERR_FATAL;
|
orte_plm_globals.spawn_status = ORTE_ERR_FATAL;
|
||||||
OPAL_THREAD_UNLOCK(&orte_plm_globals.spawn_lock);
|
OPAL_THREAD_UNLOCK(&orte_plm_globals.spawn_lock);
|
||||||
|
|
||||||
|
if (NULL == jdata) {
|
||||||
|
/* just launching debugger daemons */
|
||||||
|
active_job = ORTE_JOBID_INVALID;
|
||||||
|
goto launch_apps;
|
||||||
|
}
|
||||||
|
|
||||||
if (jdata->controls & ORTE_JOB_CONTROL_LOCAL_SLAVE) {
|
if (jdata->controls & ORTE_JOB_CONTROL_LOCAL_SLAVE) {
|
||||||
/* if this is a request to launch a local slave,
|
/* if this is a request to launch a local slave,
|
||||||
* then we will not be launching an orted - we will
|
* then we will not be launching an orted - we will
|
||||||
|
@ -165,6 +165,12 @@ static int plm_slurm_launch_job(orte_job_t *jdata)
|
|||||||
bool failed_launch=true;
|
bool failed_launch=true;
|
||||||
bool using_regexp=false;
|
bool using_regexp=false;
|
||||||
|
|
||||||
|
if (NULL == jdata) {
|
||||||
|
/* just launching debugger daemons */
|
||||||
|
active_job = ORTE_JOBID_INVALID;
|
||||||
|
goto launch_apps;
|
||||||
|
}
|
||||||
|
|
||||||
if (jdata->controls & ORTE_JOB_CONTROL_LOCAL_SLAVE) {
|
if (jdata->controls & ORTE_JOB_CONTROL_LOCAL_SLAVE) {
|
||||||
/* if this is a request to launch a local slave,
|
/* if this is a request to launch a local slave,
|
||||||
* then we will not be launching an orted - we will
|
* then we will not be launching an orted - we will
|
||||||
|
@ -156,9 +156,16 @@ static int plm_tm_launch_job(orte_job_t *jdata)
|
|||||||
tm_event_t event;
|
tm_event_t event;
|
||||||
bool failed_launch = true;
|
bool failed_launch = true;
|
||||||
mode_t current_umask;
|
mode_t current_umask;
|
||||||
orte_jobid_t failed_job;
|
orte_jobid_t failed_job, active_job;
|
||||||
char *nodelist;
|
char *nodelist;
|
||||||
|
|
||||||
|
if (NULL == jdata) {
|
||||||
|
/* just launching debugger daemons */
|
||||||
|
active_job = ORTE_JOBID_INVALID;
|
||||||
|
goto launch_apps;
|
||||||
|
}
|
||||||
|
active_job = jdata->jobid;
|
||||||
|
|
||||||
if (jdata->controls & ORTE_JOB_CONTROL_LOCAL_SLAVE) {
|
if (jdata->controls & ORTE_JOB_CONTROL_LOCAL_SLAVE) {
|
||||||
/* if this is a request to launch a local slave,
|
/* if this is a request to launch a local slave,
|
||||||
* then we will not be launching an orted - we will
|
* then we will not be launching an orted - we will
|
||||||
@ -420,12 +427,12 @@ launch_apps:
|
|||||||
/* since the daemons have launched, any failures now will be for the
|
/* since the daemons have launched, any failures now will be for the
|
||||||
* application job
|
* application job
|
||||||
*/
|
*/
|
||||||
failed_job = jdata->jobid;
|
failed_job = active_job;
|
||||||
if (ORTE_SUCCESS != (rc = orte_plm_base_launch_apps(jdata->jobid))) {
|
if (ORTE_SUCCESS != (rc = orte_plm_base_launch_apps(active_job))) {
|
||||||
OPAL_OUTPUT_VERBOSE((1, orte_plm_globals.output,
|
OPAL_OUTPUT_VERBOSE((1, orte_plm_globals.output,
|
||||||
"%s plm:tm: launch of apps failed for job %s on error %s",
|
"%s plm:tm: launch of apps failed for job %s on error %s",
|
||||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
|
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
|
||||||
ORTE_JOBID_PRINT(jdata->jobid), ORTE_ERROR_NAME(rc)));
|
ORTE_JOBID_PRINT(active_job), ORTE_ERROR_NAME(rc)));
|
||||||
goto cleanup;
|
goto cleanup;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -87,7 +87,10 @@ orte_job_t *orte_debugger_daemon=NULL;
|
|||||||
bool orte_enable_debug_cospawn_while_running;
|
bool orte_enable_debug_cospawn_while_running;
|
||||||
int orte_debugger_check_rate;
|
int orte_debugger_check_rate;
|
||||||
bool orte_output_debugger_proctable=false;
|
bool orte_output_debugger_proctable=false;
|
||||||
|
char *orte_debugger_test_daemon=NULL;
|
||||||
|
bool orte_debugger_test_attach=false;
|
||||||
|
|
||||||
|
/* exit triggers and flags */
|
||||||
orte_trigger_event_t orte_exit, orteds_exit;
|
orte_trigger_event_t orte_exit, orteds_exit;
|
||||||
int orte_exit_status = 0;
|
int orte_exit_status = 0;
|
||||||
bool orte_abnormal_term_ordered = false;
|
bool orte_abnormal_term_ordered = false;
|
||||||
|
@ -582,6 +582,8 @@ ORTE_DECLSPEC extern orte_job_t *orte_debugger_daemon;
|
|||||||
ORTE_DECLSPEC extern bool orte_enable_debug_cospawn_while_running;
|
ORTE_DECLSPEC extern bool orte_enable_debug_cospawn_while_running;
|
||||||
ORTE_DECLSPEC extern int orte_debugger_check_rate;
|
ORTE_DECLSPEC extern int orte_debugger_check_rate;
|
||||||
ORTE_DECLSPEC extern bool orte_output_debugger_proctable;
|
ORTE_DECLSPEC extern bool orte_output_debugger_proctable;
|
||||||
|
ORTE_DECLSPEC extern char *orte_debugger_test_daemon;
|
||||||
|
ORTE_DECLSPEC extern bool orte_debugger_test_attach;
|
||||||
|
|
||||||
/* exit triggers and flags */
|
/* exit triggers and flags */
|
||||||
ORTE_DECLSPEC extern orte_trigger_event_t orte_exit;
|
ORTE_DECLSPEC extern orte_trigger_event_t orte_exit;
|
||||||
|
@ -167,6 +167,16 @@ int orte_register_params(void)
|
|||||||
true, false, 0, &value);
|
true, false, 0, &value);
|
||||||
orte_output_debugger_proctable = OPAL_INT_TO_BOOL(value);
|
orte_output_debugger_proctable = OPAL_INT_TO_BOOL(value);
|
||||||
|
|
||||||
|
mca_base_param_reg_string_name("orte", "debugger_test_daemon",
|
||||||
|
"Name of the executable to be used to simulate a debugger colaunch (relative or absolute path)",
|
||||||
|
false, false, NULL, &orte_debugger_test_daemon);
|
||||||
|
|
||||||
|
mca_base_param_reg_int_name("orte",
|
||||||
|
"debugger_test_attach",
|
||||||
|
"Test debugger colaunch after debugger attachment",
|
||||||
|
false, false, 0, &value);
|
||||||
|
orte_debugger_test_attach = OPAL_INT_TO_BOOL(value);
|
||||||
|
|
||||||
mca_base_param_reg_int_name("orte", "do_not_launch",
|
mca_base_param_reg_int_name("orte", "do_not_launch",
|
||||||
"Perform all necessary operations to prepare to launch the application, but do not actually launch it",
|
"Perform all necessary operations to prepare to launch the application, but do not actually launch it",
|
||||||
false, false, (int)false, &value);
|
false, false, (int)false, &value);
|
||||||
|
@ -1,4 +1,4 @@
|
|||||||
PROGS = mpi_no_op mpi_barrier hello hello_nodename abort multi_abort simple_spawn concurrent_spawn spawn_multiple mpi_spin delayed_abort loop_spawn loop_child bad_exit pubsub hello_barrier segv accept connect hello_output hello_show_help crisscross read_write ziatest slave_spawn slave cell_spawn reduce-hang ziaprobe ziatest bcast_loop parallel_w8 parallel_w64 parallel_r8 parallel_r64 sio sendrecv_blaster hello++ hellof90 early_abort
|
PROGS = mpi_no_op mpi_barrier hello hello_nodename abort multi_abort simple_spawn concurrent_spawn spawn_multiple mpi_spin delayed_abort loop_spawn loop_child bad_exit pubsub hello_barrier segv accept connect hello_output hello_show_help crisscross read_write ziatest slave_spawn slave cell_spawn reduce-hang ziaprobe ziatest bcast_loop parallel_w8 parallel_w64 parallel_r8 parallel_r64 sio sendrecv_blaster hello++ hellof90 early_abort debugger
|
||||||
|
|
||||||
all: $(PROGS)
|
all: $(PROGS)
|
||||||
|
|
||||||
|
31
orte/test/mpi/debugger.c
Обычный файл
31
orte/test/mpi/debugger.c
Обычный файл
@ -0,0 +1,31 @@
|
|||||||
|
/* -*- C -*-
|
||||||
|
*
|
||||||
|
* $HEADER$
|
||||||
|
*
|
||||||
|
* A program that just spins - provides mechanism for testing user-driven
|
||||||
|
* abnormal program termination
|
||||||
|
*/
|
||||||
|
|
||||||
|
#include <stdio.h>
|
||||||
|
#include <unistd.h>
|
||||||
|
|
||||||
|
int main(int argc, char* argv[])
|
||||||
|
{
|
||||||
|
|
||||||
|
int i, rc, j=0;
|
||||||
|
double pi;
|
||||||
|
pid_t pid;
|
||||||
|
|
||||||
|
pid = getpid();
|
||||||
|
|
||||||
|
printf("spin: Pid %ld\n", (long)pid);
|
||||||
|
|
||||||
|
i = 0;
|
||||||
|
while (0 == j) {
|
||||||
|
i++;
|
||||||
|
pi = i / 3.14159256;
|
||||||
|
if (i > 100) i = 0;
|
||||||
|
}
|
||||||
|
|
||||||
|
return 0;
|
||||||
|
}
|
@ -9,24 +9,16 @@
|
|||||||
#include <stdio.h>
|
#include <stdio.h>
|
||||||
#include <unistd.h>
|
#include <unistd.h>
|
||||||
|
|
||||||
#include "orte/util/name_fns.h"
|
|
||||||
#include "orte/runtime/orte_globals.h"
|
|
||||||
#include "orte/runtime/runtime.h"
|
|
||||||
|
|
||||||
int main(int argc, char* argv[])
|
int main(int argc, char* argv[])
|
||||||
{
|
{
|
||||||
|
|
||||||
int i, rc, j=0;
|
int i, j=0;
|
||||||
double pi;
|
double pi;
|
||||||
pid_t pid;
|
pid_t pid;
|
||||||
|
|
||||||
if (0 > (rc = orte_init(&argc, &argv, ORTE_PROC_NON_MPI))) {
|
|
||||||
fprintf(stderr, "spin: couldn't init orte - error code %d\n", rc);
|
|
||||||
return rc;
|
|
||||||
}
|
|
||||||
pid = getpid();
|
pid = getpid();
|
||||||
|
|
||||||
printf("spin: Name %s Pid %ld\n", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), (long)pid);
|
printf("spin: Pid %ld\n", (long)pid);
|
||||||
|
|
||||||
i = 0;
|
i = 0;
|
||||||
while (0 == j) {
|
while (0 == j) {
|
||||||
|
@ -477,63 +477,60 @@ static void check_debugger(int fd, short event, void *arg)
|
|||||||
{
|
{
|
||||||
struct timeval now;
|
struct timeval now;
|
||||||
opal_event_t *tmp = (opal_event_t*)arg;
|
opal_event_t *tmp = (opal_event_t*)arg;
|
||||||
orte_job_t *jdata;
|
|
||||||
orte_app_context_t *app;
|
orte_app_context_t *app;
|
||||||
char cwd[OPAL_PATH_MAX];
|
|
||||||
int rc;
|
int rc;
|
||||||
int32_t ljob;
|
int32_t ljob;
|
||||||
|
|
||||||
if (MPIR_being_debugged) {
|
if (MPIR_being_debugged || orte_debugger_test_attach) {
|
||||||
if (orte_debug_flag) {
|
if (orte_debug_flag) {
|
||||||
opal_output(0, "%s Launching debugger %s", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
|
opal_output(0, "%s Launching debugger %s", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
|
||||||
MPIR_executable_path);
|
(NULL == orte_debugger_test_daemon) ? MPIR_executable_path : orte_debugger_test_daemon);
|
||||||
}
|
}
|
||||||
|
|
||||||
/* a debugger has attached! All the MPIR_Proctable
|
/* a debugger has attached! All the MPIR_Proctable
|
||||||
* data is already available, so we only need to
|
* data is already available, so we only need to
|
||||||
* check to see if we should spawn any daemons
|
* check to see if we should spawn any daemons
|
||||||
*/
|
*/
|
||||||
if ('\0' != MPIR_executable_path[0]) {
|
if ('\0' != MPIR_executable_path[0] || NULL != orte_debugger_test_daemon) {
|
||||||
|
/* can only have one debugger */
|
||||||
|
if (NULL != orte_debugger_daemon) {
|
||||||
|
opal_output(0, "-------------------------------------------\n"
|
||||||
|
"Only one debugger can be used on a job.\n"
|
||||||
|
"-------------------------------------------\n");
|
||||||
|
goto RELEASE;
|
||||||
|
}
|
||||||
/* this will be launched just like a regular job,
|
/* this will be launched just like a regular job,
|
||||||
* so we do not use the global orte_debugger_daemon
|
* so we do not use the global orte_debugger_daemon
|
||||||
* as this is reserved for co-location upon startup
|
* as this is reserved for co-location upon startup
|
||||||
*/
|
*/
|
||||||
jdata = OBJ_NEW(orte_job_t);
|
orte_debugger_daemon = OBJ_NEW(orte_job_t);
|
||||||
/* create a jobid for these daemons - this is done solely
|
/* create a jobid for these daemons - this is done solely
|
||||||
* to avoid confusing the rest of the system's bookkeeping
|
* to avoid confusing the rest of the system's bookkeeping
|
||||||
*/
|
*/
|
||||||
orte_plm_base_create_jobid(jdata);
|
orte_plm_base_create_jobid(orte_debugger_daemon);
|
||||||
/* flag the job as being debugger daemons */
|
/* flag the job as being debugger daemons */
|
||||||
jdata->controls |= ORTE_JOB_CONTROL_DEBUGGER_DAEMON;
|
orte_debugger_daemon->controls |= ORTE_JOB_CONTROL_DEBUGGER_DAEMON;
|
||||||
/* unless directed, we do not forward output */
|
/* unless directed, we do not forward output */
|
||||||
if (!MPIR_forward_output) {
|
if (!MPIR_forward_output) {
|
||||||
jdata->controls &= ~ORTE_JOB_CONTROL_FORWARD_OUTPUT;
|
orte_debugger_daemon->controls &= ~ORTE_JOB_CONTROL_FORWARD_OUTPUT;
|
||||||
}
|
}
|
||||||
/* set the mapping policy to "pernode" so we only get
|
orte_debugger_daemon->num_procs = orte_process_info.num_procs;
|
||||||
* one debugger daemon on each node
|
|
||||||
*/
|
|
||||||
jdata->map = OBJ_NEW(orte_job_map_t);
|
|
||||||
jdata->map->npernode = 1;
|
|
||||||
/* add it to the global job pool */
|
/* add it to the global job pool */
|
||||||
ljob = ORTE_LOCAL_JOBID(jdata->jobid);
|
ljob = ORTE_LOCAL_JOBID(orte_debugger_daemon->jobid);
|
||||||
opal_pointer_array_set_item(orte_job_data, ljob, jdata);
|
opal_pointer_array_set_item(orte_job_data, ljob, orte_debugger_daemon);
|
||||||
/* create an app_context for the debugger daemon */
|
/* create an app_context for the debugger daemon */
|
||||||
app = OBJ_NEW(orte_app_context_t);
|
app = OBJ_NEW(orte_app_context_t);
|
||||||
|
if (NULL != orte_debugger_test_daemon) {
|
||||||
|
app->app = strdup(orte_debugger_test_daemon);
|
||||||
|
} else {
|
||||||
app->app = strdup((char*)MPIR_executable_path);
|
app->app = strdup((char*)MPIR_executable_path);
|
||||||
if (OPAL_SUCCESS != (rc = opal_getcwd(cwd, sizeof(cwd)))) {
|
|
||||||
orte_show_help("help-orterun.txt", "orterun:init-failure",
|
|
||||||
true, "get the cwd", rc);
|
|
||||||
OBJ_RELEASE(jdata);
|
|
||||||
goto RELEASE;
|
|
||||||
}
|
}
|
||||||
app->cwd = strdup(cwd);
|
|
||||||
app->user_specified_cwd = false;
|
|
||||||
opal_argv_append_nosize(&app->argv, app->app);
|
opal_argv_append_nosize(&app->argv, app->app);
|
||||||
build_debugger_args(app);
|
build_debugger_args(app);
|
||||||
opal_pointer_array_add(jdata->apps, &app->super);
|
opal_pointer_array_add(orte_debugger_daemon->apps, &app->super);
|
||||||
jdata->num_apps = 1;
|
orte_debugger_daemon->num_apps = 1;
|
||||||
/* now go ahead and spawn this job */
|
/* now go ahead and spawn this job */
|
||||||
if (ORTE_SUCCESS != (rc = orte_plm.spawn(jdata))) {
|
if (ORTE_SUCCESS != (rc = orte_plm.spawn(NULL))) {
|
||||||
ORTE_ERROR_LOG(rc);
|
ORTE_ERROR_LOG(rc);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@ -557,7 +554,7 @@ static void check_debugger(int fd, short event, void *arg)
|
|||||||
* spawn we need to check if we are being run under a TotalView-like
|
* spawn we need to check if we are being run under a TotalView-like
|
||||||
* debugger; if so then inform applications via an MCA parameter.
|
* debugger; if so then inform applications via an MCA parameter.
|
||||||
*/
|
*/
|
||||||
void orte_debugger_init_before_spawn(orte_job_t *jdata)
|
int orte_debugger_init_before_spawn(orte_job_t *jdata)
|
||||||
{
|
{
|
||||||
char *env_name;
|
char *env_name;
|
||||||
orte_app_context_t *app;
|
orte_app_context_t *app;
|
||||||
@ -573,10 +570,18 @@ void orte_debugger_init_before_spawn(orte_job_t *jdata)
|
|||||||
* to check for debugger attach
|
* to check for debugger attach
|
||||||
*/
|
*/
|
||||||
ORTE_TIMER_EVENT(orte_debugger_check_rate, 0, check_debugger);
|
ORTE_TIMER_EVENT(orte_debugger_check_rate, 0, check_debugger);
|
||||||
|
return ORTE_SUCCESS;
|
||||||
}
|
}
|
||||||
return;
|
/* if we were given a test debugger, then we still want to
|
||||||
|
* colaunch it
|
||||||
|
*/
|
||||||
|
if (NULL != orte_debugger_test_daemon) {
|
||||||
|
goto launchit;
|
||||||
|
}
|
||||||
|
return ORTE_SUCCESS;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
launchit:
|
||||||
if (orte_debug_flag) {
|
if (orte_debug_flag) {
|
||||||
opal_output(0, "Info: Spawned by a debugger");
|
opal_output(0, "Info: Spawned by a debugger");
|
||||||
}
|
}
|
||||||
@ -594,7 +599,15 @@ void orte_debugger_init_before_spawn(orte_job_t *jdata)
|
|||||||
free(env_name);
|
free(env_name);
|
||||||
|
|
||||||
/* check if we need to co-spawn the debugger daemons */
|
/* check if we need to co-spawn the debugger daemons */
|
||||||
if ('\0' != MPIR_executable_path[0]) {
|
if ('\0' != MPIR_executable_path[0] || NULL != orte_debugger_test_daemon) {
|
||||||
|
/* can only have one debugger */
|
||||||
|
if (NULL != orte_debugger_daemon) {
|
||||||
|
opal_output(0, "-------------------------------------------\n"
|
||||||
|
"Only one debugger can be used on a job.\n"
|
||||||
|
"-------------------------------------------\n");
|
||||||
|
ORTE_UPDATE_EXIT_STATUS(ORTE_ERROR_DEFAULT_EXIT_CODE);
|
||||||
|
return ORTE_ERROR;
|
||||||
|
}
|
||||||
/* add debugger info to launch message */
|
/* add debugger info to launch message */
|
||||||
orte_debugger_daemon = OBJ_NEW(orte_job_t);
|
orte_debugger_daemon = OBJ_NEW(orte_job_t);
|
||||||
/* create a jobid for these daemons - this is done solely
|
/* create a jobid for these daemons - this is done solely
|
||||||
@ -612,12 +625,17 @@ void orte_debugger_init_before_spawn(orte_job_t *jdata)
|
|||||||
opal_pointer_array_set_item(orte_job_data, ljob, orte_debugger_daemon);
|
opal_pointer_array_set_item(orte_job_data, ljob, orte_debugger_daemon);
|
||||||
/* create an app_context for the debugger daemon */
|
/* create an app_context for the debugger daemon */
|
||||||
app = OBJ_NEW(orte_app_context_t);
|
app = OBJ_NEW(orte_app_context_t);
|
||||||
|
if (NULL != orte_debugger_test_daemon) {
|
||||||
|
app->app = strdup(orte_debugger_test_daemon);
|
||||||
|
} else {
|
||||||
app->app = strdup((char*)MPIR_executable_path);
|
app->app = strdup((char*)MPIR_executable_path);
|
||||||
|
}
|
||||||
opal_argv_append_nosize(&app->argv, app->app);
|
opal_argv_append_nosize(&app->argv, app->app);
|
||||||
build_debugger_args(app);
|
build_debugger_args(app);
|
||||||
opal_pointer_array_add(orte_debugger_daemon->apps, &app->super);
|
opal_pointer_array_add(orte_debugger_daemon->apps, &app->super);
|
||||||
orte_debugger_daemon->num_apps = 1;
|
orte_debugger_daemon->num_apps = 1;
|
||||||
}
|
}
|
||||||
|
return ORTE_SUCCESS;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
@ -28,7 +28,7 @@ BEGIN_C_DECLS
|
|||||||
|
|
||||||
void orte_run_debugger(char *basename, opal_cmd_line_t *cmd_line,
|
void orte_run_debugger(char *basename, opal_cmd_line_t *cmd_line,
|
||||||
int argc, char *argv[], int num_procs) __opal_attribute_noreturn__;
|
int argc, char *argv[], int num_procs) __opal_attribute_noreturn__;
|
||||||
void orte_debugger_init_before_spawn(orte_job_t *jdata);
|
int orte_debugger_init_before_spawn(orte_job_t *jdata);
|
||||||
void orte_debugger_init_after_spawn(orte_job_t *jdata);
|
void orte_debugger_init_after_spawn(orte_job_t *jdata);
|
||||||
void orte_debugger_finalize(void);
|
void orte_debugger_finalize(void);
|
||||||
|
|
||||||
|
@ -809,7 +809,9 @@ int orterun(int argc, char *argv[])
|
|||||||
}
|
}
|
||||||
|
|
||||||
/* setup for debugging */
|
/* setup for debugging */
|
||||||
orte_debugger_init_before_spawn(jdata);
|
if (ORTE_SUCCESS != orte_debugger_init_before_spawn(jdata)) {
|
||||||
|
goto DONE;
|
||||||
|
}
|
||||||
|
|
||||||
/* Spawn the job */
|
/* Spawn the job */
|
||||||
rc = orte_plm.spawn(jdata);
|
rc = orte_plm.spawn(jdata);
|
||||||
|
@ -627,6 +627,10 @@ int orte_util_encode_pidmap(opal_byte_object_t *boptr)
|
|||||||
if (NULL == jdata->map) {
|
if (NULL == jdata->map) {
|
||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
|
/* if this is a debugger job, ignore it */
|
||||||
|
if (jdata->controls & ORTE_JOB_CONTROL_DEBUGGER_DAEMON) {
|
||||||
|
continue;
|
||||||
|
}
|
||||||
/* pack the jobid */
|
/* pack the jobid */
|
||||||
if (ORTE_SUCCESS != (rc = opal_dss.pack(&buf, &jdata->jobid, 1, ORTE_JOBID))) {
|
if (ORTE_SUCCESS != (rc = opal_dss.pack(&buf, &jdata->jobid, 1, ORTE_JOBID))) {
|
||||||
ORTE_ERROR_LOG(rc);
|
ORTE_ERROR_LOG(rc);
|
||||||
|
Загрузка…
x
Ссылка в новой задаче
Block a user