1
1

Clean up the debugger daemon co-launch code and add the ability to test it. Implement the ability to co-launch debugger daemons upon attach to a running job for jobs launched under rsh, slurm, and tm environments (others can easily be added if desired).

Add new mca params to test:

orte_debugger_test_daemon: Name of the executable to be used to simulate a debugger colaunch
orte_debugger_test_attach: Test debugger colaunch after debugger attachment

To test co-launch at job start, just set the orte_debugger_test_daemon param.

To test co-launch upon attach:
set orte_debugger_test_daemon
set orte_debugger_test_attach=1
set orte_enable_debug_cospawn_while_running=1
set orte_debugger_check_rate=<N> - defines the number of seconds to wait before "checking" for a debugger attaching

Added a "debugger" program to orte/test/mpi that just spins to simulate a debugger daemon.

This commit was SVN r23144.
Этот коммит содержится в:
Ralph Castain 2010-05-14 18:44:49 +00:00
родитель e2ab4f2baf
Коммит 88f5217a12
15 изменённых файлов: 158 добавлений и 55 удалений

Просмотреть файл

@ -97,6 +97,11 @@ int orte_odls_base_default_get_add_procs_data(opal_buffer_t *data,
int j;
orte_daemon_cmd_flag_t command;
if (NULL != orte_debugger_daemon && ORTE_JOBID_INVALID == job) {
/* all we are doing is launching debugger daemons */
goto nodemap;
}
/* get the job data pointer */
if (NULL == (jdata = orte_get_job_data_object(job))) {
ORTE_ERROR_LOG(ORTE_ERR_BAD_PARAM);
@ -191,6 +196,7 @@ int orte_odls_base_default_get_add_procs_data(opal_buffer_t *data,
return ORTE_SUCCESS;
}
nodemap:
/* if we are not passing a regexp, then pass the nodemap */
flag = 0;
opal_dss.pack(data, &flag, 1, OPAL_INT8);
@ -308,6 +314,11 @@ pack_add_procs:
}
}
if (NULL != orte_debugger_daemon && ORTE_JOBID_INVALID == job) {
/* all we are doing is launching debugger daemons, so we are done */
return ORTE_SUCCESS;
}
/* pack the jobid so it can be extracted later */
if (ORTE_SUCCESS != (rc = opal_dss.pack(data, &job, 1, ORTE_JOBID))) {
ORTE_ERROR_LOG(rc);
@ -724,6 +735,12 @@ int orte_odls_base_default_construct_child_list(opal_buffer_t *data,
/* unpack the jobid we are to launch */
cnt=1;
if (ORTE_SUCCESS != (rc = opal_dss.unpack(data, job, &cnt, ORTE_JOBID))) {
/* if the buffer was empty, then we know that all we are doing is
* launching debugger daemons
*/
if (ORTE_ERR_UNPACK_READ_PAST_END_OF_BUFFER == rc) {
goto done;
}
*job = ORTE_JOBID_INVALID;
ORTE_ERROR_LOG(rc);
goto REPORT_ERROR;
@ -1013,6 +1030,7 @@ find_my_procs:
opal_condition_broadcast(&jobdat->cond);
OPAL_THREAD_UNLOCK(&jobdat->lock);
done:
if (NULL != app_idx) {
free(app_idx);
app_idx = NULL;

Просмотреть файл

@ -304,12 +304,16 @@ int orte_plm_base_launch_apps(orte_jobid_t job)
gettimeofday(&app_launch_start, NULL);
}
/* find the job's data record */
if (NULL == (jdata = orte_get_job_data_object(job))) {
/* bad jobid */
ORTE_ERROR_LOG(ORTE_ERR_BAD_PARAM);
rc = ORTE_ERR_BAD_PARAM;
goto WAKEUP;
if (ORTE_JOBID_INVALID == job) {
/* we are only launching debugger daemons */
jdata = orte_debugger_daemon;
} else {
if (NULL == (jdata = orte_get_job_data_object(job))) {
/* bad jobid */
ORTE_ERROR_LOG(ORTE_ERR_BAD_PARAM);
rc = ORTE_ERR_BAD_PARAM;
goto WAKEUP;
}
}
/* setup the buffer */

Просмотреть файл

@ -1074,6 +1074,12 @@ int orte_plm_rsh_launch(orte_job_t *jdata)
orte_plm_globals.spawn_status = ORTE_ERR_FATAL;
OPAL_THREAD_UNLOCK(&orte_plm_globals.spawn_lock);
if (NULL == jdata) {
/* just launching debugger daemons */
active_job = ORTE_JOBID_INVALID;
goto launch_apps;
}
if (jdata->controls & ORTE_JOB_CONTROL_LOCAL_SLAVE) {
/* if this is a request to launch a local slave,
* then we will not be launching an orted - we will

Просмотреть файл

@ -165,6 +165,12 @@ static int plm_slurm_launch_job(orte_job_t *jdata)
bool failed_launch=true;
bool using_regexp=false;
if (NULL == jdata) {
/* just launching debugger daemons */
active_job = ORTE_JOBID_INVALID;
goto launch_apps;
}
if (jdata->controls & ORTE_JOB_CONTROL_LOCAL_SLAVE) {
/* if this is a request to launch a local slave,
* then we will not be launching an orted - we will

Просмотреть файл

@ -156,9 +156,16 @@ static int plm_tm_launch_job(orte_job_t *jdata)
tm_event_t event;
bool failed_launch = true;
mode_t current_umask;
orte_jobid_t failed_job;
orte_jobid_t failed_job, active_job;
char *nodelist;
if (NULL == jdata) {
/* just launching debugger daemons */
active_job = ORTE_JOBID_INVALID;
goto launch_apps;
}
active_job = jdata->jobid;
if (jdata->controls & ORTE_JOB_CONTROL_LOCAL_SLAVE) {
/* if this is a request to launch a local slave,
* then we will not be launching an orted - we will
@ -420,12 +427,12 @@ launch_apps:
/* since the daemons have launched, any failures now will be for the
* application job
*/
failed_job = jdata->jobid;
if (ORTE_SUCCESS != (rc = orte_plm_base_launch_apps(jdata->jobid))) {
failed_job = active_job;
if (ORTE_SUCCESS != (rc = orte_plm_base_launch_apps(active_job))) {
OPAL_OUTPUT_VERBOSE((1, orte_plm_globals.output,
"%s plm:tm: launch of apps failed for job %s on error %s",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
ORTE_JOBID_PRINT(jdata->jobid), ORTE_ERROR_NAME(rc)));
ORTE_JOBID_PRINT(active_job), ORTE_ERROR_NAME(rc)));
goto cleanup;
}

Просмотреть файл

@ -87,7 +87,10 @@ orte_job_t *orte_debugger_daemon=NULL;
bool orte_enable_debug_cospawn_while_running;
int orte_debugger_check_rate;
bool orte_output_debugger_proctable=false;
char *orte_debugger_test_daemon=NULL;
bool orte_debugger_test_attach=false;
/* exit triggers and flags */
orte_trigger_event_t orte_exit, orteds_exit;
int orte_exit_status = 0;
bool orte_abnormal_term_ordered = false;

Просмотреть файл

@ -582,6 +582,8 @@ ORTE_DECLSPEC extern orte_job_t *orte_debugger_daemon;
ORTE_DECLSPEC extern bool orte_enable_debug_cospawn_while_running;
ORTE_DECLSPEC extern int orte_debugger_check_rate;
ORTE_DECLSPEC extern bool orte_output_debugger_proctable;
ORTE_DECLSPEC extern char *orte_debugger_test_daemon;
ORTE_DECLSPEC extern bool orte_debugger_test_attach;
/* exit triggers and flags */
ORTE_DECLSPEC extern orte_trigger_event_t orte_exit;

Просмотреть файл

@ -167,6 +167,16 @@ int orte_register_params(void)
true, false, 0, &value);
orte_output_debugger_proctable = OPAL_INT_TO_BOOL(value);
mca_base_param_reg_string_name("orte", "debugger_test_daemon",
"Name of the executable to be used to simulate a debugger colaunch (relative or absolute path)",
false, false, NULL, &orte_debugger_test_daemon);
mca_base_param_reg_int_name("orte",
"debugger_test_attach",
"Test debugger colaunch after debugger attachment",
false, false, 0, &value);
orte_debugger_test_attach = OPAL_INT_TO_BOOL(value);
mca_base_param_reg_int_name("orte", "do_not_launch",
"Perform all necessary operations to prepare to launch the application, but do not actually launch it",
false, false, (int)false, &value);

Просмотреть файл

@ -1,4 +1,4 @@
PROGS = mpi_no_op mpi_barrier hello hello_nodename abort multi_abort simple_spawn concurrent_spawn spawn_multiple mpi_spin delayed_abort loop_spawn loop_child bad_exit pubsub hello_barrier segv accept connect hello_output hello_show_help crisscross read_write ziatest slave_spawn slave cell_spawn reduce-hang ziaprobe ziatest bcast_loop parallel_w8 parallel_w64 parallel_r8 parallel_r64 sio sendrecv_blaster hello++ hellof90 early_abort
PROGS = mpi_no_op mpi_barrier hello hello_nodename abort multi_abort simple_spawn concurrent_spawn spawn_multiple mpi_spin delayed_abort loop_spawn loop_child bad_exit pubsub hello_barrier segv accept connect hello_output hello_show_help crisscross read_write ziatest slave_spawn slave cell_spawn reduce-hang ziaprobe ziatest bcast_loop parallel_w8 parallel_w64 parallel_r8 parallel_r64 sio sendrecv_blaster hello++ hellof90 early_abort debugger
all: $(PROGS)

31
orte/test/mpi/debugger.c Обычный файл
Просмотреть файл

@ -0,0 +1,31 @@
/* -*- C -*-
*
* $HEADER$
*
* A program that just spins - stands in for a debugger daemon so that
* debugger daemon co-launch can be tested without a real debugger
*/
#include <stdio.h>
#include <unistd.h>
/*
 * Print this process's pid once, then spin forever.
 *
 * The process is meant to be terminated externally; the trailing
 * "return 0" is never reached in normal operation.
 *
 * argc/argv are accepted but not inspected.
 */
int main(int argc, char* argv[])
{
    /* Read through a volatile object on every iteration.  Without a
     * volatile access (or I/O) in the loop, C11 6.8.5p6 lets the compiler
     * assume the loop terminates and remove it, which would make this
     * program exit immediately instead of spinning.
     */
    volatile int j = 0;
    int i = 0;
    pid_t pid;

    (void)argc;
    (void)argv;

    pid = getpid();
    printf("spin: Pid %ld\n", (long)pid);
    /* We never exit normally, so stdio would never flush for us; when
     * stdout is a pipe the line would otherwise stay in the buffer.
     */
    fflush(stdout);

    while (0 == j) {
        /* keep the counter bounded so it can never overflow */
        i++;
        if (i > 100) {
            i = 0;
        }
    }

    return 0;
}

Просмотреть файл

@ -9,24 +9,16 @@
#include <stdio.h>
#include <unistd.h>
#include "orte/util/name_fns.h"
#include "orte/runtime/orte_globals.h"
#include "orte/runtime/runtime.h"
int main(int argc, char* argv[])
{
int i, rc, j=0;
int i, j=0;
double pi;
pid_t pid;
if (0 > (rc = orte_init(&argc, &argv, ORTE_PROC_NON_MPI))) {
fprintf(stderr, "spin: couldn't init orte - error code %d\n", rc);
return rc;
}
pid = getpid();
printf("spin: Name %s Pid %ld\n", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), (long)pid);
printf("spin: Pid %ld\n", (long)pid);
i = 0;
while (0 == j) {

Просмотреть файл

@ -477,63 +477,60 @@ static void check_debugger(int fd, short event, void *arg)
{
struct timeval now;
opal_event_t *tmp = (opal_event_t*)arg;
orte_job_t *jdata;
orte_app_context_t *app;
char cwd[OPAL_PATH_MAX];
int rc;
int32_t ljob;
if (MPIR_being_debugged) {
if (MPIR_being_debugged || orte_debugger_test_attach) {
if (orte_debug_flag) {
opal_output(0, "%s Launching debugger %s", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
MPIR_executable_path);
(NULL == orte_debugger_test_daemon) ? MPIR_executable_path : orte_debugger_test_daemon);
}
/* a debugger has attached! All the MPIR_Proctable
* data is already available, so we only need to
* check to see if we should spawn any daemons
*/
if ('\0' != MPIR_executable_path[0]) {
if ('\0' != MPIR_executable_path[0] || NULL != orte_debugger_test_daemon) {
/* can only have one debugger */
if (NULL != orte_debugger_daemon) {
opal_output(0, "-------------------------------------------\n"
"Only one debugger can be used on a job.\n"
"-------------------------------------------\n");
goto RELEASE;
}
/* this will be launched just like a regular job,
* so we do not use the global orte_debugger_daemon
* as this is reserved for co-location upon startup
*/
jdata = OBJ_NEW(orte_job_t);
orte_debugger_daemon = OBJ_NEW(orte_job_t);
/* create a jobid for these daemons - this is done solely
* to avoid confusing the rest of the system's bookkeeping
*/
orte_plm_base_create_jobid(jdata);
orte_plm_base_create_jobid(orte_debugger_daemon);
/* flag the job as being debugger daemons */
jdata->controls |= ORTE_JOB_CONTROL_DEBUGGER_DAEMON;
orte_debugger_daemon->controls |= ORTE_JOB_CONTROL_DEBUGGER_DAEMON;
/* unless directed, we do not forward output */
if (!MPIR_forward_output) {
jdata->controls &= ~ORTE_JOB_CONTROL_FORWARD_OUTPUT;
orte_debugger_daemon->controls &= ~ORTE_JOB_CONTROL_FORWARD_OUTPUT;
}
/* set the mapping policy to "pernode" so we only get
* one debugger daemon on each node
*/
jdata->map = OBJ_NEW(orte_job_map_t);
jdata->map->npernode = 1;
orte_debugger_daemon->num_procs = orte_process_info.num_procs;
/* add it to the global job pool */
ljob = ORTE_LOCAL_JOBID(jdata->jobid);
opal_pointer_array_set_item(orte_job_data, ljob, jdata);
ljob = ORTE_LOCAL_JOBID(orte_debugger_daemon->jobid);
opal_pointer_array_set_item(orte_job_data, ljob, orte_debugger_daemon);
/* create an app_context for the debugger daemon */
app = OBJ_NEW(orte_app_context_t);
app->app = strdup((char*)MPIR_executable_path);
if (OPAL_SUCCESS != (rc = opal_getcwd(cwd, sizeof(cwd)))) {
orte_show_help("help-orterun.txt", "orterun:init-failure",
true, "get the cwd", rc);
OBJ_RELEASE(jdata);
goto RELEASE;
if (NULL != orte_debugger_test_daemon) {
app->app = strdup(orte_debugger_test_daemon);
} else {
app->app = strdup((char*)MPIR_executable_path);
}
app->cwd = strdup(cwd);
app->user_specified_cwd = false;
opal_argv_append_nosize(&app->argv, app->app);
build_debugger_args(app);
opal_pointer_array_add(jdata->apps, &app->super);
jdata->num_apps = 1;
opal_pointer_array_add(orte_debugger_daemon->apps, &app->super);
orte_debugger_daemon->num_apps = 1;
/* now go ahead and spawn this job */
if (ORTE_SUCCESS != (rc = orte_plm.spawn(jdata))) {
if (ORTE_SUCCESS != (rc = orte_plm.spawn(NULL))) {
ORTE_ERROR_LOG(rc);
}
}
@ -557,7 +554,7 @@ static void check_debugger(int fd, short event, void *arg)
* spawn we need to check if we are being run under a TotalView-like
* debugger; if so then inform applications via an MCA parameter.
*/
void orte_debugger_init_before_spawn(orte_job_t *jdata)
int orte_debugger_init_before_spawn(orte_job_t *jdata)
{
char *env_name;
orte_app_context_t *app;
@ -573,14 +570,22 @@ void orte_debugger_init_before_spawn(orte_job_t *jdata)
* to check for debugger attach
*/
ORTE_TIMER_EVENT(orte_debugger_check_rate, 0, check_debugger);
return ORTE_SUCCESS;
}
return;
/* if we were given a test debugger, then we still want to
* colaunch it
*/
if (NULL != orte_debugger_test_daemon) {
goto launchit;
}
return ORTE_SUCCESS;
}
launchit:
if (orte_debug_flag) {
opal_output(0, "Info: Spawned by a debugger");
}
/* tell the procs they are being debugged */
env_name = mca_base_param_environ_variable("orte",
"in_parallel_debugger", NULL);
@ -594,7 +599,15 @@ void orte_debugger_init_before_spawn(orte_job_t *jdata)
free(env_name);
/* check if we need to co-spawn the debugger daemons */
if ('\0' != MPIR_executable_path[0]) {
if ('\0' != MPIR_executable_path[0] || NULL != orte_debugger_test_daemon) {
/* can only have one debugger */
if (NULL != orte_debugger_daemon) {
opal_output(0, "-------------------------------------------\n"
"Only one debugger can be used on a job.\n"
"-------------------------------------------\n");
ORTE_UPDATE_EXIT_STATUS(ORTE_ERROR_DEFAULT_EXIT_CODE);
return ORTE_ERROR;
}
/* add debugger info to launch message */
orte_debugger_daemon = OBJ_NEW(orte_job_t);
/* create a jobid for these daemons - this is done solely
@ -612,12 +625,17 @@ void orte_debugger_init_before_spawn(orte_job_t *jdata)
opal_pointer_array_set_item(orte_job_data, ljob, orte_debugger_daemon);
/* create an app_context for the debugger daemon */
app = OBJ_NEW(orte_app_context_t);
app->app = strdup((char*)MPIR_executable_path);
if (NULL != orte_debugger_test_daemon) {
app->app = strdup(orte_debugger_test_daemon);
} else {
app->app = strdup((char*)MPIR_executable_path);
}
opal_argv_append_nosize(&app->argv, app->app);
build_debugger_args(app);
opal_pointer_array_add(orte_debugger_daemon->apps, &app->super);
orte_debugger_daemon->num_apps = 1;
}
return ORTE_SUCCESS;
}

Просмотреть файл

@ -28,7 +28,7 @@ BEGIN_C_DECLS
void orte_run_debugger(char *basename, opal_cmd_line_t *cmd_line,
int argc, char *argv[], int num_procs) __opal_attribute_noreturn__;
void orte_debugger_init_before_spawn(orte_job_t *jdata);
int orte_debugger_init_before_spawn(orte_job_t *jdata);
void orte_debugger_init_after_spawn(orte_job_t *jdata);
void orte_debugger_finalize(void);

Просмотреть файл

@ -809,7 +809,9 @@ int orterun(int argc, char *argv[])
}
/* setup for debugging */
orte_debugger_init_before_spawn(jdata);
if (ORTE_SUCCESS != orte_debugger_init_before_spawn(jdata)) {
goto DONE;
}
/* Spawn the job */
rc = orte_plm.spawn(jdata);

Просмотреть файл

@ -627,6 +627,10 @@ int orte_util_encode_pidmap(opal_byte_object_t *boptr)
if (NULL == jdata->map) {
continue;
}
/* if this is a debugger job, ignore it */
if (jdata->controls & ORTE_JOB_CONTROL_DEBUGGER_DAEMON) {
continue;
}
/* pack the jobid */
if (ORTE_SUCCESS != (rc = opal_dss.pack(&buf, &jdata->jobid, 1, ORTE_JOBID))) {
ORTE_ERROR_LOG(rc);