1
1

Clean up the debugger daemon co-launch code and add the ability to test it. Implement the ability to co-launch debugger daemons upon attach to a running job for jobs launched under rsh, slurm, and tm environments (others can easily be added if desired).

Add new mca params to test:

orte_debugger_test_daemon: Name of the executable to be used to simulate a debugger colaunch
orte_debugger_test_attach: Test debugger colaunch after debugger attachment

To test co-launch at job start, just set the orte_debugger_test_daemon param.

To test co-launch upon attach:
set orte_debugger_test_daemon
set orte_debugger_test_attach=1
set orte_enable_debug_cospawn_while_running=1
set orte_debugger_check_rate=<N> - defines the number of seconds to wait before "checking" for a debugger attaching

Added a "debugger" program to orte/test/mpi that just spins to simulate a debugger daemon.

This commit was SVN r23144.
Этот коммит содержится в:
Ralph Castain 2010-05-14 18:44:49 +00:00
родитель e2ab4f2baf
Коммит 88f5217a12
15 изменённых файлов: 158 добавлений и 55 удалений

Просмотреть файл

@ -97,6 +97,11 @@ int orte_odls_base_default_get_add_procs_data(opal_buffer_t *data,
int j; int j;
orte_daemon_cmd_flag_t command; orte_daemon_cmd_flag_t command;
if (NULL != orte_debugger_daemon && ORTE_JOBID_INVALID == job) {
/* all we are doing is launching debugger daemons */
goto nodemap;
}
/* get the job data pointer */ /* get the job data pointer */
if (NULL == (jdata = orte_get_job_data_object(job))) { if (NULL == (jdata = orte_get_job_data_object(job))) {
ORTE_ERROR_LOG(ORTE_ERR_BAD_PARAM); ORTE_ERROR_LOG(ORTE_ERR_BAD_PARAM);
@ -191,6 +196,7 @@ int orte_odls_base_default_get_add_procs_data(opal_buffer_t *data,
return ORTE_SUCCESS; return ORTE_SUCCESS;
} }
nodemap:
/* if we are not passing a regexp, then pass the nodemap */ /* if we are not passing a regexp, then pass the nodemap */
flag = 0; flag = 0;
opal_dss.pack(data, &flag, 1, OPAL_INT8); opal_dss.pack(data, &flag, 1, OPAL_INT8);
@ -308,6 +314,11 @@ pack_add_procs:
} }
} }
if (NULL != orte_debugger_daemon && ORTE_JOBID_INVALID == job) {
/* all we are doing is launching debugger daemons, so we are done */
return ORTE_SUCCESS;
}
/* pack the jobid so it can be extracted later */ /* pack the jobid so it can be extracted later */
if (ORTE_SUCCESS != (rc = opal_dss.pack(data, &job, 1, ORTE_JOBID))) { if (ORTE_SUCCESS != (rc = opal_dss.pack(data, &job, 1, ORTE_JOBID))) {
ORTE_ERROR_LOG(rc); ORTE_ERROR_LOG(rc);
@ -724,6 +735,12 @@ int orte_odls_base_default_construct_child_list(opal_buffer_t *data,
/* unpack the jobid we are to launch */ /* unpack the jobid we are to launch */
cnt=1; cnt=1;
if (ORTE_SUCCESS != (rc = opal_dss.unpack(data, job, &cnt, ORTE_JOBID))) { if (ORTE_SUCCESS != (rc = opal_dss.unpack(data, job, &cnt, ORTE_JOBID))) {
/* if the buffer was empty, then we know that all we are doing is
* launching debugger daemons
*/
if (ORTE_ERR_UNPACK_READ_PAST_END_OF_BUFFER == rc) {
goto done;
}
*job = ORTE_JOBID_INVALID; *job = ORTE_JOBID_INVALID;
ORTE_ERROR_LOG(rc); ORTE_ERROR_LOG(rc);
goto REPORT_ERROR; goto REPORT_ERROR;
@ -1013,6 +1030,7 @@ find_my_procs:
opal_condition_broadcast(&jobdat->cond); opal_condition_broadcast(&jobdat->cond);
OPAL_THREAD_UNLOCK(&jobdat->lock); OPAL_THREAD_UNLOCK(&jobdat->lock);
done:
if (NULL != app_idx) { if (NULL != app_idx) {
free(app_idx); free(app_idx);
app_idx = NULL; app_idx = NULL;

Просмотреть файл

@ -304,13 +304,17 @@ int orte_plm_base_launch_apps(orte_jobid_t job)
gettimeofday(&app_launch_start, NULL); gettimeofday(&app_launch_start, NULL);
} }
/* find the job's data record */ if (ORTE_JOBID_INVALID == job) {
/* we are only launching debugger daemons */
jdata = orte_debugger_daemon;
} else {
if (NULL == (jdata = orte_get_job_data_object(job))) { if (NULL == (jdata = orte_get_job_data_object(job))) {
/* bad jobid */ /* bad jobid */
ORTE_ERROR_LOG(ORTE_ERR_BAD_PARAM); ORTE_ERROR_LOG(ORTE_ERR_BAD_PARAM);
rc = ORTE_ERR_BAD_PARAM; rc = ORTE_ERR_BAD_PARAM;
goto WAKEUP; goto WAKEUP;
} }
}
/* setup the buffer */ /* setup the buffer */
buffer = OBJ_NEW(opal_buffer_t); buffer = OBJ_NEW(opal_buffer_t);

Просмотреть файл

@ -1074,6 +1074,12 @@ int orte_plm_rsh_launch(orte_job_t *jdata)
orte_plm_globals.spawn_status = ORTE_ERR_FATAL; orte_plm_globals.spawn_status = ORTE_ERR_FATAL;
OPAL_THREAD_UNLOCK(&orte_plm_globals.spawn_lock); OPAL_THREAD_UNLOCK(&orte_plm_globals.spawn_lock);
if (NULL == jdata) {
/* just launching debugger daemons */
active_job = ORTE_JOBID_INVALID;
goto launch_apps;
}
if (jdata->controls & ORTE_JOB_CONTROL_LOCAL_SLAVE) { if (jdata->controls & ORTE_JOB_CONTROL_LOCAL_SLAVE) {
/* if this is a request to launch a local slave, /* if this is a request to launch a local slave,
* then we will not be launching an orted - we will * then we will not be launching an orted - we will

Просмотреть файл

@ -165,6 +165,12 @@ static int plm_slurm_launch_job(orte_job_t *jdata)
bool failed_launch=true; bool failed_launch=true;
bool using_regexp=false; bool using_regexp=false;
if (NULL == jdata) {
/* just launching debugger daemons */
active_job = ORTE_JOBID_INVALID;
goto launch_apps;
}
if (jdata->controls & ORTE_JOB_CONTROL_LOCAL_SLAVE) { if (jdata->controls & ORTE_JOB_CONTROL_LOCAL_SLAVE) {
/* if this is a request to launch a local slave, /* if this is a request to launch a local slave,
* then we will not be launching an orted - we will * then we will not be launching an orted - we will

Просмотреть файл

@ -156,9 +156,16 @@ static int plm_tm_launch_job(orte_job_t *jdata)
tm_event_t event; tm_event_t event;
bool failed_launch = true; bool failed_launch = true;
mode_t current_umask; mode_t current_umask;
orte_jobid_t failed_job; orte_jobid_t failed_job, active_job;
char *nodelist; char *nodelist;
if (NULL == jdata) {
/* just launching debugger daemons */
active_job = ORTE_JOBID_INVALID;
goto launch_apps;
}
active_job = jdata->jobid;
if (jdata->controls & ORTE_JOB_CONTROL_LOCAL_SLAVE) { if (jdata->controls & ORTE_JOB_CONTROL_LOCAL_SLAVE) {
/* if this is a request to launch a local slave, /* if this is a request to launch a local slave,
* then we will not be launching an orted - we will * then we will not be launching an orted - we will
@ -420,12 +427,12 @@ launch_apps:
/* since the daemons have launched, any failures now will be for the /* since the daemons have launched, any failures now will be for the
* application job * application job
*/ */
failed_job = jdata->jobid; failed_job = active_job;
if (ORTE_SUCCESS != (rc = orte_plm_base_launch_apps(jdata->jobid))) { if (ORTE_SUCCESS != (rc = orte_plm_base_launch_apps(active_job))) {
OPAL_OUTPUT_VERBOSE((1, orte_plm_globals.output, OPAL_OUTPUT_VERBOSE((1, orte_plm_globals.output,
"%s plm:tm: launch of apps failed for job %s on error %s", "%s plm:tm: launch of apps failed for job %s on error %s",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
ORTE_JOBID_PRINT(jdata->jobid), ORTE_ERROR_NAME(rc))); ORTE_JOBID_PRINT(active_job), ORTE_ERROR_NAME(rc)));
goto cleanup; goto cleanup;
} }

Просмотреть файл

@ -87,7 +87,10 @@ orte_job_t *orte_debugger_daemon=NULL;
bool orte_enable_debug_cospawn_while_running; bool orte_enable_debug_cospawn_while_running;
int orte_debugger_check_rate; int orte_debugger_check_rate;
bool orte_output_debugger_proctable=false; bool orte_output_debugger_proctable=false;
char *orte_debugger_test_daemon=NULL;
bool orte_debugger_test_attach=false;
/* exit triggers and flags */
orte_trigger_event_t orte_exit, orteds_exit; orte_trigger_event_t orte_exit, orteds_exit;
int orte_exit_status = 0; int orte_exit_status = 0;
bool orte_abnormal_term_ordered = false; bool orte_abnormal_term_ordered = false;

Просмотреть файл

@ -582,6 +582,8 @@ ORTE_DECLSPEC extern orte_job_t *orte_debugger_daemon;
ORTE_DECLSPEC extern bool orte_enable_debug_cospawn_while_running; ORTE_DECLSPEC extern bool orte_enable_debug_cospawn_while_running;
ORTE_DECLSPEC extern int orte_debugger_check_rate; ORTE_DECLSPEC extern int orte_debugger_check_rate;
ORTE_DECLSPEC extern bool orte_output_debugger_proctable; ORTE_DECLSPEC extern bool orte_output_debugger_proctable;
ORTE_DECLSPEC extern char *orte_debugger_test_daemon;
ORTE_DECLSPEC extern bool orte_debugger_test_attach;
/* exit triggers and flags */ /* exit triggers and flags */
ORTE_DECLSPEC extern orte_trigger_event_t orte_exit; ORTE_DECLSPEC extern orte_trigger_event_t orte_exit;

Просмотреть файл

@ -167,6 +167,16 @@ int orte_register_params(void)
true, false, 0, &value); true, false, 0, &value);
orte_output_debugger_proctable = OPAL_INT_TO_BOOL(value); orte_output_debugger_proctable = OPAL_INT_TO_BOOL(value);
mca_base_param_reg_string_name("orte", "debugger_test_daemon",
"Name of the executable to be used to simulate a debugger colaunch (relative or absolute path)",
false, false, NULL, &orte_debugger_test_daemon);
mca_base_param_reg_int_name("orte",
"debugger_test_attach",
"Test debugger colaunch after debugger attachment",
false, false, 0, &value);
orte_debugger_test_attach = OPAL_INT_TO_BOOL(value);
mca_base_param_reg_int_name("orte", "do_not_launch", mca_base_param_reg_int_name("orte", "do_not_launch",
"Perform all necessary operations to prepare to launch the application, but do not actually launch it", "Perform all necessary operations to prepare to launch the application, but do not actually launch it",
false, false, (int)false, &value); false, false, (int)false, &value);

Просмотреть файл

@ -1,4 +1,4 @@
PROGS = mpi_no_op mpi_barrier hello hello_nodename abort multi_abort simple_spawn concurrent_spawn spawn_multiple mpi_spin delayed_abort loop_spawn loop_child bad_exit pubsub hello_barrier segv accept connect hello_output hello_show_help crisscross read_write ziatest slave_spawn slave cell_spawn reduce-hang ziaprobe ziatest bcast_loop parallel_w8 parallel_w64 parallel_r8 parallel_r64 sio sendrecv_blaster hello++ hellof90 early_abort PROGS = mpi_no_op mpi_barrier hello hello_nodename abort multi_abort simple_spawn concurrent_spawn spawn_multiple mpi_spin delayed_abort loop_spawn loop_child bad_exit pubsub hello_barrier segv accept connect hello_output hello_show_help crisscross read_write ziatest slave_spawn slave cell_spawn reduce-hang ziaprobe ziatest bcast_loop parallel_w8 parallel_w64 parallel_r8 parallel_r64 sio sendrecv_blaster hello++ hellof90 early_abort debugger
all: $(PROGS) all: $(PROGS)

31
orte/test/mpi/debugger.c Обычный файл
Просмотреть файл

@ -0,0 +1,31 @@
/* -*- C -*-
*
* $HEADER$
*
* A program that just spins - used to simulate a debugger daemon when
* testing debugger daemon co-launch
*/
#include <stdio.h>
#include <unistd.h>
/*
 * Print this process's pid and then spin forever, burning CPU.
 *
 * The program never returns on its own: it runs until it is killed
 * (or until someone attaches a debugger and sets "spin" to non-zero).
 *
 * argc/argv are accepted but ignored.
 */
int main(int argc, char* argv[])
{
    /* volatile so the compiler must re-read the flag on every pass:
     * a loop with a non-constant controlling expression and no
     * observable side effects may otherwise be assumed to terminate
     * and be optimized away entirely
     */
    volatile int spin = 0;
    /* volatile so the busy-work store below is actually performed */
    volatile double pi;
    int i = 0;
    pid_t pid;

    (void)argc;
    (void)argv;

    pid = getpid();
    printf("spin: Pid %ld\n", (long)pid);
    /* we never exit, so nothing would ever flush a fully-buffered
     * stdout (e.g. when it is a pipe) - push the line out now
     */
    fflush(stdout);

    while (0 == spin) {
        i++;
        pi = i / 3.14159256;
        /* keep the counter small so it can never overflow */
        if (i > 100) {
            i = 0;
        }
    }

    return 0;
}

Просмотреть файл

@ -9,24 +9,16 @@
#include <stdio.h> #include <stdio.h>
#include <unistd.h> #include <unistd.h>
#include "orte/util/name_fns.h"
#include "orte/runtime/orte_globals.h"
#include "orte/runtime/runtime.h"
int main(int argc, char* argv[]) int main(int argc, char* argv[])
{ {
int i, rc, j=0; int i, j=0;
double pi; double pi;
pid_t pid; pid_t pid;
if (0 > (rc = orte_init(&argc, &argv, ORTE_PROC_NON_MPI))) {
fprintf(stderr, "spin: couldn't init orte - error code %d\n", rc);
return rc;
}
pid = getpid(); pid = getpid();
printf("spin: Name %s Pid %ld\n", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), (long)pid); printf("spin: Pid %ld\n", (long)pid);
i = 0; i = 0;
while (0 == j) { while (0 == j) {

Просмотреть файл

@ -477,63 +477,60 @@ static void check_debugger(int fd, short event, void *arg)
{ {
struct timeval now; struct timeval now;
opal_event_t *tmp = (opal_event_t*)arg; opal_event_t *tmp = (opal_event_t*)arg;
orte_job_t *jdata;
orte_app_context_t *app; orte_app_context_t *app;
char cwd[OPAL_PATH_MAX];
int rc; int rc;
int32_t ljob; int32_t ljob;
if (MPIR_being_debugged) { if (MPIR_being_debugged || orte_debugger_test_attach) {
if (orte_debug_flag) { if (orte_debug_flag) {
opal_output(0, "%s Launching debugger %s", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), opal_output(0, "%s Launching debugger %s", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
MPIR_executable_path); (NULL == orte_debugger_test_daemon) ? MPIR_executable_path : orte_debugger_test_daemon);
} }
/* a debugger has attached! All the MPIR_Proctable /* a debugger has attached! All the MPIR_Proctable
* data is already available, so we only need to * data is already available, so we only need to
* check to see if we should spawn any daemons * check to see if we should spawn any daemons
*/ */
if ('\0' != MPIR_executable_path[0]) { if ('\0' != MPIR_executable_path[0] || NULL != orte_debugger_test_daemon) {
/* can only have one debugger */
if (NULL != orte_debugger_daemon) {
opal_output(0, "-------------------------------------------\n"
"Only one debugger can be used on a job.\n"
"-------------------------------------------\n");
goto RELEASE;
}
/* this will be launched just like a regular job, /* this will be launched just like a regular job,
* so we do not use the global orte_debugger_daemon * so we do not use the global orte_debugger_daemon
* as this is reserved for co-location upon startup * as this is reserved for co-location upon startup
*/ */
jdata = OBJ_NEW(orte_job_t); orte_debugger_daemon = OBJ_NEW(orte_job_t);
/* create a jobid for these daemons - this is done solely /* create a jobid for these daemons - this is done solely
* to avoid confusing the rest of the system's bookkeeping * to avoid confusing the rest of the system's bookkeeping
*/ */
orte_plm_base_create_jobid(jdata); orte_plm_base_create_jobid(orte_debugger_daemon);
/* flag the job as being debugger daemons */ /* flag the job as being debugger daemons */
jdata->controls |= ORTE_JOB_CONTROL_DEBUGGER_DAEMON; orte_debugger_daemon->controls |= ORTE_JOB_CONTROL_DEBUGGER_DAEMON;
/* unless directed, we do not forward output */ /* unless directed, we do not forward output */
if (!MPIR_forward_output) { if (!MPIR_forward_output) {
jdata->controls &= ~ORTE_JOB_CONTROL_FORWARD_OUTPUT; orte_debugger_daemon->controls &= ~ORTE_JOB_CONTROL_FORWARD_OUTPUT;
} }
/* set the mapping policy to "pernode" so we only get orte_debugger_daemon->num_procs = orte_process_info.num_procs;
* one debugger daemon on each node
*/
jdata->map = OBJ_NEW(orte_job_map_t);
jdata->map->npernode = 1;
/* add it to the global job pool */ /* add it to the global job pool */
ljob = ORTE_LOCAL_JOBID(jdata->jobid); ljob = ORTE_LOCAL_JOBID(orte_debugger_daemon->jobid);
opal_pointer_array_set_item(orte_job_data, ljob, jdata); opal_pointer_array_set_item(orte_job_data, ljob, orte_debugger_daemon);
/* create an app_context for the debugger daemon */ /* create an app_context for the debugger daemon */
app = OBJ_NEW(orte_app_context_t); app = OBJ_NEW(orte_app_context_t);
if (NULL != orte_debugger_test_daemon) {
app->app = strdup(orte_debugger_test_daemon);
} else {
app->app = strdup((char*)MPIR_executable_path); app->app = strdup((char*)MPIR_executable_path);
if (OPAL_SUCCESS != (rc = opal_getcwd(cwd, sizeof(cwd)))) {
orte_show_help("help-orterun.txt", "orterun:init-failure",
true, "get the cwd", rc);
OBJ_RELEASE(jdata);
goto RELEASE;
} }
app->cwd = strdup(cwd);
app->user_specified_cwd = false;
opal_argv_append_nosize(&app->argv, app->app); opal_argv_append_nosize(&app->argv, app->app);
build_debugger_args(app); build_debugger_args(app);
opal_pointer_array_add(jdata->apps, &app->super); opal_pointer_array_add(orte_debugger_daemon->apps, &app->super);
jdata->num_apps = 1; orte_debugger_daemon->num_apps = 1;
/* now go ahead and spawn this job */ /* now go ahead and spawn this job */
if (ORTE_SUCCESS != (rc = orte_plm.spawn(jdata))) { if (ORTE_SUCCESS != (rc = orte_plm.spawn(NULL))) {
ORTE_ERROR_LOG(rc); ORTE_ERROR_LOG(rc);
} }
} }
@ -557,7 +554,7 @@ static void check_debugger(int fd, short event, void *arg)
* spawn we need to check if we are being run under a TotalView-like * spawn we need to check if we are being run under a TotalView-like
* debugger; if so then inform applications via an MCA parameter. * debugger; if so then inform applications via an MCA parameter.
*/ */
void orte_debugger_init_before_spawn(orte_job_t *jdata) int orte_debugger_init_before_spawn(orte_job_t *jdata)
{ {
char *env_name; char *env_name;
orte_app_context_t *app; orte_app_context_t *app;
@ -573,10 +570,18 @@ void orte_debugger_init_before_spawn(orte_job_t *jdata)
* to check for debugger attach * to check for debugger attach
*/ */
ORTE_TIMER_EVENT(orte_debugger_check_rate, 0, check_debugger); ORTE_TIMER_EVENT(orte_debugger_check_rate, 0, check_debugger);
return ORTE_SUCCESS;
} }
return; /* if we were given a test debugger, then we still want to
* colaunch it
*/
if (NULL != orte_debugger_test_daemon) {
goto launchit;
}
return ORTE_SUCCESS;
} }
launchit:
if (orte_debug_flag) { if (orte_debug_flag) {
opal_output(0, "Info: Spawned by a debugger"); opal_output(0, "Info: Spawned by a debugger");
} }
@ -594,7 +599,15 @@ void orte_debugger_init_before_spawn(orte_job_t *jdata)
free(env_name); free(env_name);
/* check if we need to co-spawn the debugger daemons */ /* check if we need to co-spawn the debugger daemons */
if ('\0' != MPIR_executable_path[0]) { if ('\0' != MPIR_executable_path[0] || NULL != orte_debugger_test_daemon) {
/* can only have one debugger */
if (NULL != orte_debugger_daemon) {
opal_output(0, "-------------------------------------------\n"
"Only one debugger can be used on a job.\n"
"-------------------------------------------\n");
ORTE_UPDATE_EXIT_STATUS(ORTE_ERROR_DEFAULT_EXIT_CODE);
return ORTE_ERROR;
}
/* add debugger info to launch message */ /* add debugger info to launch message */
orte_debugger_daemon = OBJ_NEW(orte_job_t); orte_debugger_daemon = OBJ_NEW(orte_job_t);
/* create a jobid for these daemons - this is done solely /* create a jobid for these daemons - this is done solely
@ -612,12 +625,17 @@ void orte_debugger_init_before_spawn(orte_job_t *jdata)
opal_pointer_array_set_item(orte_job_data, ljob, orte_debugger_daemon); opal_pointer_array_set_item(orte_job_data, ljob, orte_debugger_daemon);
/* create an app_context for the debugger daemon */ /* create an app_context for the debugger daemon */
app = OBJ_NEW(orte_app_context_t); app = OBJ_NEW(orte_app_context_t);
if (NULL != orte_debugger_test_daemon) {
app->app = strdup(orte_debugger_test_daemon);
} else {
app->app = strdup((char*)MPIR_executable_path); app->app = strdup((char*)MPIR_executable_path);
}
opal_argv_append_nosize(&app->argv, app->app); opal_argv_append_nosize(&app->argv, app->app);
build_debugger_args(app); build_debugger_args(app);
opal_pointer_array_add(orte_debugger_daemon->apps, &app->super); opal_pointer_array_add(orte_debugger_daemon->apps, &app->super);
orte_debugger_daemon->num_apps = 1; orte_debugger_daemon->num_apps = 1;
} }
return ORTE_SUCCESS;
} }

Просмотреть файл

@ -28,7 +28,7 @@ BEGIN_C_DECLS
void orte_run_debugger(char *basename, opal_cmd_line_t *cmd_line, void orte_run_debugger(char *basename, opal_cmd_line_t *cmd_line,
int argc, char *argv[], int num_procs) __opal_attribute_noreturn__; int argc, char *argv[], int num_procs) __opal_attribute_noreturn__;
void orte_debugger_init_before_spawn(orte_job_t *jdata); int orte_debugger_init_before_spawn(orte_job_t *jdata);
void orte_debugger_init_after_spawn(orte_job_t *jdata); void orte_debugger_init_after_spawn(orte_job_t *jdata);
void orte_debugger_finalize(void); void orte_debugger_finalize(void);

Просмотреть файл

@ -809,7 +809,9 @@ int orterun(int argc, char *argv[])
} }
/* setup for debugging */ /* setup for debugging */
orte_debugger_init_before_spawn(jdata); if (ORTE_SUCCESS != orte_debugger_init_before_spawn(jdata)) {
goto DONE;
}
/* Spawn the job */ /* Spawn the job */
rc = orte_plm.spawn(jdata); rc = orte_plm.spawn(jdata);

Просмотреть файл

@ -627,6 +627,10 @@ int orte_util_encode_pidmap(opal_byte_object_t *boptr)
if (NULL == jdata->map) { if (NULL == jdata->map) {
continue; continue;
} }
/* if this is a debugger job, ignore it */
if (jdata->controls & ORTE_JOB_CONTROL_DEBUGGER_DAEMON) {
continue;
}
/* pack the jobid */ /* pack the jobid */
if (ORTE_SUCCESS != (rc = opal_dss.pack(&buf, &jdata->jobid, 1, ORTE_JOBID))) { if (ORTE_SUCCESS != (rc = opal_dss.pack(&buf, &jdata->jobid, 1, ORTE_JOBID))) {
ORTE_ERROR_LOG(rc); ORTE_ERROR_LOG(rc);