Clean up the debugger daemon co-launch code and add the ability to test it. Implement the ability to co-launch debugger daemons upon attach to a running job for jobs launched under rsh, slurm, and tm environments (others can easily be added if desired).
Add new MCA params for testing:
  orte_debugger_test_daemon: name of the executable to be used to simulate a debugger co-launch
  orte_debugger_test_attach: test debugger co-launch after debugger attachment
To test co-launch at job start, just set the orte_debugger_test_daemon param.
To test co-launch upon attach, set:
  orte_debugger_test_daemon
  orte_debugger_test_attach=1
  orte_enable_debug_cospawn_while_running=1
  orte_debugger_check_rate=<N> - defines the number of seconds to wait before "checking" for a debugger attaching
Added a "debugger" program to orte/test/mpi that just spins to simulate a debugger daemon.
This commit was SVN r23144.
Этот коммит содержится в:
родитель
e2ab4f2baf
Коммит
88f5217a12
@ -97,6 +97,11 @@ int orte_odls_base_default_get_add_procs_data(opal_buffer_t *data,
|
||||
int j;
|
||||
orte_daemon_cmd_flag_t command;
|
||||
|
||||
if (NULL != orte_debugger_daemon && ORTE_JOBID_INVALID == job) {
|
||||
/* all we are doing is launching debugger daemons */
|
||||
goto nodemap;
|
||||
}
|
||||
|
||||
/* get the job data pointer */
|
||||
if (NULL == (jdata = orte_get_job_data_object(job))) {
|
||||
ORTE_ERROR_LOG(ORTE_ERR_BAD_PARAM);
|
||||
@ -191,6 +196,7 @@ int orte_odls_base_default_get_add_procs_data(opal_buffer_t *data,
|
||||
return ORTE_SUCCESS;
|
||||
}
|
||||
|
||||
nodemap:
|
||||
/* if we are not passing a regexp, then pass the nodemap */
|
||||
flag = 0;
|
||||
opal_dss.pack(data, &flag, 1, OPAL_INT8);
|
||||
@ -308,6 +314,11 @@ pack_add_procs:
|
||||
}
|
||||
}
|
||||
|
||||
if (NULL != orte_debugger_daemon && ORTE_JOBID_INVALID == job) {
|
||||
/* all we are doing is launching debugger daemons, so we are done */
|
||||
return ORTE_SUCCESS;
|
||||
}
|
||||
|
||||
/* pack the jobid so it can be extracted later */
|
||||
if (ORTE_SUCCESS != (rc = opal_dss.pack(data, &job, 1, ORTE_JOBID))) {
|
||||
ORTE_ERROR_LOG(rc);
|
||||
@ -724,6 +735,12 @@ int orte_odls_base_default_construct_child_list(opal_buffer_t *data,
|
||||
/* unpack the jobid we are to launch */
|
||||
cnt=1;
|
||||
if (ORTE_SUCCESS != (rc = opal_dss.unpack(data, job, &cnt, ORTE_JOBID))) {
|
||||
/* if the buffer was empty, then we know that all we are doing is
|
||||
* launching debugger daemons
|
||||
*/
|
||||
if (ORTE_ERR_UNPACK_READ_PAST_END_OF_BUFFER == rc) {
|
||||
goto done;
|
||||
}
|
||||
*job = ORTE_JOBID_INVALID;
|
||||
ORTE_ERROR_LOG(rc);
|
||||
goto REPORT_ERROR;
|
||||
@ -1013,6 +1030,7 @@ find_my_procs:
|
||||
opal_condition_broadcast(&jobdat->cond);
|
||||
OPAL_THREAD_UNLOCK(&jobdat->lock);
|
||||
|
||||
done:
|
||||
if (NULL != app_idx) {
|
||||
free(app_idx);
|
||||
app_idx = NULL;
|
||||
|
@ -304,12 +304,16 @@ int orte_plm_base_launch_apps(orte_jobid_t job)
|
||||
gettimeofday(&app_launch_start, NULL);
|
||||
}
|
||||
|
||||
/* find the job's data record */
|
||||
if (NULL == (jdata = orte_get_job_data_object(job))) {
|
||||
/* bad jobid */
|
||||
ORTE_ERROR_LOG(ORTE_ERR_BAD_PARAM);
|
||||
rc = ORTE_ERR_BAD_PARAM;
|
||||
goto WAKEUP;
|
||||
if (ORTE_JOBID_INVALID == job) {
|
||||
/* we are only launching debugger daemons */
|
||||
jdata = orte_debugger_daemon;
|
||||
} else {
|
||||
if (NULL == (jdata = orte_get_job_data_object(job))) {
|
||||
/* bad jobid */
|
||||
ORTE_ERROR_LOG(ORTE_ERR_BAD_PARAM);
|
||||
rc = ORTE_ERR_BAD_PARAM;
|
||||
goto WAKEUP;
|
||||
}
|
||||
}
|
||||
|
||||
/* setup the buffer */
|
||||
|
@ -1074,6 +1074,12 @@ int orte_plm_rsh_launch(orte_job_t *jdata)
|
||||
orte_plm_globals.spawn_status = ORTE_ERR_FATAL;
|
||||
OPAL_THREAD_UNLOCK(&orte_plm_globals.spawn_lock);
|
||||
|
||||
if (NULL == jdata) {
|
||||
/* just launching debugger daemons */
|
||||
active_job = ORTE_JOBID_INVALID;
|
||||
goto launch_apps;
|
||||
}
|
||||
|
||||
if (jdata->controls & ORTE_JOB_CONTROL_LOCAL_SLAVE) {
|
||||
/* if this is a request to launch a local slave,
|
||||
* then we will not be launching an orted - we will
|
||||
|
@ -165,6 +165,12 @@ static int plm_slurm_launch_job(orte_job_t *jdata)
|
||||
bool failed_launch=true;
|
||||
bool using_regexp=false;
|
||||
|
||||
if (NULL == jdata) {
|
||||
/* just launching debugger daemons */
|
||||
active_job = ORTE_JOBID_INVALID;
|
||||
goto launch_apps;
|
||||
}
|
||||
|
||||
if (jdata->controls & ORTE_JOB_CONTROL_LOCAL_SLAVE) {
|
||||
/* if this is a request to launch a local slave,
|
||||
* then we will not be launching an orted - we will
|
||||
|
@ -156,9 +156,16 @@ static int plm_tm_launch_job(orte_job_t *jdata)
|
||||
tm_event_t event;
|
||||
bool failed_launch = true;
|
||||
mode_t current_umask;
|
||||
orte_jobid_t failed_job;
|
||||
orte_jobid_t failed_job, active_job;
|
||||
char *nodelist;
|
||||
|
||||
if (NULL == jdata) {
|
||||
/* just launching debugger daemons */
|
||||
active_job = ORTE_JOBID_INVALID;
|
||||
goto launch_apps;
|
||||
}
|
||||
active_job = jdata->jobid;
|
||||
|
||||
if (jdata->controls & ORTE_JOB_CONTROL_LOCAL_SLAVE) {
|
||||
/* if this is a request to launch a local slave,
|
||||
* then we will not be launching an orted - we will
|
||||
@ -420,12 +427,12 @@ launch_apps:
|
||||
/* since the daemons have launched, any failures now will be for the
|
||||
* application job
|
||||
*/
|
||||
failed_job = jdata->jobid;
|
||||
if (ORTE_SUCCESS != (rc = orte_plm_base_launch_apps(jdata->jobid))) {
|
||||
failed_job = active_job;
|
||||
if (ORTE_SUCCESS != (rc = orte_plm_base_launch_apps(active_job))) {
|
||||
OPAL_OUTPUT_VERBOSE((1, orte_plm_globals.output,
|
||||
"%s plm:tm: launch of apps failed for job %s on error %s",
|
||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
|
||||
ORTE_JOBID_PRINT(jdata->jobid), ORTE_ERROR_NAME(rc)));
|
||||
ORTE_JOBID_PRINT(active_job), ORTE_ERROR_NAME(rc)));
|
||||
goto cleanup;
|
||||
}
|
||||
|
||||
|
@ -87,7 +87,10 @@ orte_job_t *orte_debugger_daemon=NULL;
|
||||
bool orte_enable_debug_cospawn_while_running;
|
||||
int orte_debugger_check_rate;
|
||||
bool orte_output_debugger_proctable=false;
|
||||
char *orte_debugger_test_daemon=NULL;
|
||||
bool orte_debugger_test_attach=false;
|
||||
|
||||
/* exit triggers and flags */
|
||||
orte_trigger_event_t orte_exit, orteds_exit;
|
||||
int orte_exit_status = 0;
|
||||
bool orte_abnormal_term_ordered = false;
|
||||
|
@ -582,6 +582,8 @@ ORTE_DECLSPEC extern orte_job_t *orte_debugger_daemon;
|
||||
ORTE_DECLSPEC extern bool orte_enable_debug_cospawn_while_running;
|
||||
ORTE_DECLSPEC extern int orte_debugger_check_rate;
|
||||
ORTE_DECLSPEC extern bool orte_output_debugger_proctable;
|
||||
ORTE_DECLSPEC extern char *orte_debugger_test_daemon;
|
||||
ORTE_DECLSPEC extern bool orte_debugger_test_attach;
|
||||
|
||||
/* exit triggers and flags */
|
||||
ORTE_DECLSPEC extern orte_trigger_event_t orte_exit;
|
||||
|
@ -167,6 +167,16 @@ int orte_register_params(void)
|
||||
true, false, 0, &value);
|
||||
orte_output_debugger_proctable = OPAL_INT_TO_BOOL(value);
|
||||
|
||||
mca_base_param_reg_string_name("orte", "debugger_test_daemon",
|
||||
"Name of the executable to be used to simulate a debugger colaunch (relative or absolute path)",
|
||||
false, false, NULL, &orte_debugger_test_daemon);
|
||||
|
||||
mca_base_param_reg_int_name("orte",
|
||||
"debugger_test_attach",
|
||||
"Test debugger colaunch after debugger attachment",
|
||||
false, false, 0, &value);
|
||||
orte_debugger_test_attach = OPAL_INT_TO_BOOL(value);
|
||||
|
||||
mca_base_param_reg_int_name("orte", "do_not_launch",
|
||||
"Perform all necessary operations to prepare to launch the application, but do not actually launch it",
|
||||
false, false, (int)false, &value);
|
||||
|
@ -1,4 +1,4 @@
|
||||
PROGS = mpi_no_op mpi_barrier hello hello_nodename abort multi_abort simple_spawn concurrent_spawn spawn_multiple mpi_spin delayed_abort loop_spawn loop_child bad_exit pubsub hello_barrier segv accept connect hello_output hello_show_help crisscross read_write ziatest slave_spawn slave cell_spawn reduce-hang ziaprobe ziatest bcast_loop parallel_w8 parallel_w64 parallel_r8 parallel_r64 sio sendrecv_blaster hello++ hellof90 early_abort
|
||||
PROGS = mpi_no_op mpi_barrier hello hello_nodename abort multi_abort simple_spawn concurrent_spawn spawn_multiple mpi_spin delayed_abort loop_spawn loop_child bad_exit pubsub hello_barrier segv accept connect hello_output hello_show_help crisscross read_write ziatest slave_spawn slave cell_spawn reduce-hang ziaprobe ziatest bcast_loop parallel_w8 parallel_w64 parallel_r8 parallel_r64 sio sendrecv_blaster hello++ hellof90 early_abort debugger
|
||||
|
||||
all: $(PROGS)
|
||||
|
||||
|
31
orte/test/mpi/debugger.c
Обычный файл
31
orte/test/mpi/debugger.c
Обычный файл
@ -0,0 +1,31 @@
|
||||
/* -*- C -*-
|
||||
*
|
||||
* $HEADER$
|
||||
*
|
||||
* A program that just spins - simulates a debugger daemon so that
|
||||
* debugger co-launch can be tested
|
||||
*/
|
||||
|
||||
#include <stdio.h>
|
||||
#include <unistd.h>
|
||||
|
||||
/*
 * Simulated debugger daemon: report our pid once, then spin forever.
 *
 * The process is expected to be terminated externally; the trailing
 * "return 0" is never reached in normal operation.
 *
 * argc/argv are accepted but ignored.
 */
int main(int argc, char* argv[])
{
    /* volatile: the loop has no other observable side effects, so without
     * these qualifiers a C11 compiler is allowed to assume the loop
     * terminates and optimize the spin away entirely
     */
    volatile int j = 0;
    volatile double pi;
    int i = 0;

    (void)argc;
    (void)argv;

    printf("spin: Pid %ld\n", (long)getpid());
    /* we never exit normally, so push the line out now in case stdout
     * is a pipe or file and therefore fully buffered
     */
    fflush(stdout);

    while (0 == j) {
        i++;
        pi = i / 3.14159256;
        if (i > 100) {
            i = 0;
        }
    }

    return 0;
}
|
@ -9,24 +9,16 @@
|
||||
#include <stdio.h>
|
||||
#include <unistd.h>
|
||||
|
||||
#include "orte/util/name_fns.h"
|
||||
#include "orte/runtime/orte_globals.h"
|
||||
#include "orte/runtime/runtime.h"
|
||||
|
||||
int main(int argc, char* argv[])
|
||||
{
|
||||
|
||||
int i, rc, j=0;
|
||||
int i, j=0;
|
||||
double pi;
|
||||
pid_t pid;
|
||||
|
||||
if (0 > (rc = orte_init(&argc, &argv, ORTE_PROC_NON_MPI))) {
|
||||
fprintf(stderr, "spin: couldn't init orte - error code %d\n", rc);
|
||||
return rc;
|
||||
}
|
||||
pid = getpid();
|
||||
|
||||
printf("spin: Name %s Pid %ld\n", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), (long)pid);
|
||||
printf("spin: Pid %ld\n", (long)pid);
|
||||
|
||||
i = 0;
|
||||
while (0 == j) {
|
||||
|
@ -477,63 +477,60 @@ static void check_debugger(int fd, short event, void *arg)
|
||||
{
|
||||
struct timeval now;
|
||||
opal_event_t *tmp = (opal_event_t*)arg;
|
||||
orte_job_t *jdata;
|
||||
orte_app_context_t *app;
|
||||
char cwd[OPAL_PATH_MAX];
|
||||
int rc;
|
||||
int32_t ljob;
|
||||
|
||||
if (MPIR_being_debugged) {
|
||||
if (MPIR_being_debugged || orte_debugger_test_attach) {
|
||||
if (orte_debug_flag) {
|
||||
opal_output(0, "%s Launching debugger %s", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
|
||||
MPIR_executable_path);
|
||||
(NULL == orte_debugger_test_daemon) ? MPIR_executable_path : orte_debugger_test_daemon);
|
||||
}
|
||||
|
||||
/* a debugger has attached! All the MPIR_Proctable
|
||||
* data is already available, so we only need to
|
||||
* check to see if we should spawn any daemons
|
||||
*/
|
||||
if ('\0' != MPIR_executable_path[0]) {
|
||||
if ('\0' != MPIR_executable_path[0] || NULL != orte_debugger_test_daemon) {
|
||||
/* can only have one debugger */
|
||||
if (NULL != orte_debugger_daemon) {
|
||||
opal_output(0, "-------------------------------------------\n"
|
||||
"Only one debugger can be used on a job.\n"
|
||||
"-------------------------------------------\n");
|
||||
goto RELEASE;
|
||||
}
|
||||
/* this will be launched just like a regular job,
|
||||
* so we do not use the global orte_debugger_daemon
|
||||
* as this is reserved for co-location upon startup
|
||||
*/
|
||||
jdata = OBJ_NEW(orte_job_t);
|
||||
orte_debugger_daemon = OBJ_NEW(orte_job_t);
|
||||
/* create a jobid for these daemons - this is done solely
|
||||
* to avoid confusing the rest of the system's bookkeeping
|
||||
*/
|
||||
orte_plm_base_create_jobid(jdata);
|
||||
orte_plm_base_create_jobid(orte_debugger_daemon);
|
||||
/* flag the job as being debugger daemons */
|
||||
jdata->controls |= ORTE_JOB_CONTROL_DEBUGGER_DAEMON;
|
||||
orte_debugger_daemon->controls |= ORTE_JOB_CONTROL_DEBUGGER_DAEMON;
|
||||
/* unless directed, we do not forward output */
|
||||
if (!MPIR_forward_output) {
|
||||
jdata->controls &= ~ORTE_JOB_CONTROL_FORWARD_OUTPUT;
|
||||
orte_debugger_daemon->controls &= ~ORTE_JOB_CONTROL_FORWARD_OUTPUT;
|
||||
}
|
||||
/* set the mapping policy to "pernode" so we only get
|
||||
* one debugger daemon on each node
|
||||
*/
|
||||
jdata->map = OBJ_NEW(orte_job_map_t);
|
||||
jdata->map->npernode = 1;
|
||||
orte_debugger_daemon->num_procs = orte_process_info.num_procs;
|
||||
/* add it to the global job pool */
|
||||
ljob = ORTE_LOCAL_JOBID(jdata->jobid);
|
||||
opal_pointer_array_set_item(orte_job_data, ljob, jdata);
|
||||
ljob = ORTE_LOCAL_JOBID(orte_debugger_daemon->jobid);
|
||||
opal_pointer_array_set_item(orte_job_data, ljob, orte_debugger_daemon);
|
||||
/* create an app_context for the debugger daemon */
|
||||
app = OBJ_NEW(orte_app_context_t);
|
||||
app->app = strdup((char*)MPIR_executable_path);
|
||||
if (OPAL_SUCCESS != (rc = opal_getcwd(cwd, sizeof(cwd)))) {
|
||||
orte_show_help("help-orterun.txt", "orterun:init-failure",
|
||||
true, "get the cwd", rc);
|
||||
OBJ_RELEASE(jdata);
|
||||
goto RELEASE;
|
||||
if (NULL != orte_debugger_test_daemon) {
|
||||
app->app = strdup(orte_debugger_test_daemon);
|
||||
} else {
|
||||
app->app = strdup((char*)MPIR_executable_path);
|
||||
}
|
||||
app->cwd = strdup(cwd);
|
||||
app->user_specified_cwd = false;
|
||||
opal_argv_append_nosize(&app->argv, app->app);
|
||||
build_debugger_args(app);
|
||||
opal_pointer_array_add(jdata->apps, &app->super);
|
||||
jdata->num_apps = 1;
|
||||
opal_pointer_array_add(orte_debugger_daemon->apps, &app->super);
|
||||
orte_debugger_daemon->num_apps = 1;
|
||||
/* now go ahead and spawn this job */
|
||||
if (ORTE_SUCCESS != (rc = orte_plm.spawn(jdata))) {
|
||||
if (ORTE_SUCCESS != (rc = orte_plm.spawn(NULL))) {
|
||||
ORTE_ERROR_LOG(rc);
|
||||
}
|
||||
}
|
||||
@ -557,7 +554,7 @@ static void check_debugger(int fd, short event, void *arg)
|
||||
* spawn we need to check if we are being run under a TotalView-like
|
||||
* debugger; if so then inform applications via an MCA parameter.
|
||||
*/
|
||||
void orte_debugger_init_before_spawn(orte_job_t *jdata)
|
||||
int orte_debugger_init_before_spawn(orte_job_t *jdata)
|
||||
{
|
||||
char *env_name;
|
||||
orte_app_context_t *app;
|
||||
@ -573,14 +570,22 @@ void orte_debugger_init_before_spawn(orte_job_t *jdata)
|
||||
* to check for debugger attach
|
||||
*/
|
||||
ORTE_TIMER_EVENT(orte_debugger_check_rate, 0, check_debugger);
|
||||
return ORTE_SUCCESS;
|
||||
}
|
||||
return;
|
||||
/* if we were given a test debugger, then we still want to
|
||||
* colaunch it
|
||||
*/
|
||||
if (NULL != orte_debugger_test_daemon) {
|
||||
goto launchit;
|
||||
}
|
||||
return ORTE_SUCCESS;
|
||||
}
|
||||
|
||||
launchit:
|
||||
if (orte_debug_flag) {
|
||||
opal_output(0, "Info: Spawned by a debugger");
|
||||
}
|
||||
|
||||
|
||||
/* tell the procs they are being debugged */
|
||||
env_name = mca_base_param_environ_variable("orte",
|
||||
"in_parallel_debugger", NULL);
|
||||
@ -594,7 +599,15 @@ void orte_debugger_init_before_spawn(orte_job_t *jdata)
|
||||
free(env_name);
|
||||
|
||||
/* check if we need to co-spawn the debugger daemons */
|
||||
if ('\0' != MPIR_executable_path[0]) {
|
||||
if ('\0' != MPIR_executable_path[0] || NULL != orte_debugger_test_daemon) {
|
||||
/* can only have one debugger */
|
||||
if (NULL != orte_debugger_daemon) {
|
||||
opal_output(0, "-------------------------------------------\n"
|
||||
"Only one debugger can be used on a job.\n"
|
||||
"-------------------------------------------\n");
|
||||
ORTE_UPDATE_EXIT_STATUS(ORTE_ERROR_DEFAULT_EXIT_CODE);
|
||||
return ORTE_ERROR;
|
||||
}
|
||||
/* add debugger info to launch message */
|
||||
orte_debugger_daemon = OBJ_NEW(orte_job_t);
|
||||
/* create a jobid for these daemons - this is done solely
|
||||
@ -612,12 +625,17 @@ void orte_debugger_init_before_spawn(orte_job_t *jdata)
|
||||
opal_pointer_array_set_item(orte_job_data, ljob, orte_debugger_daemon);
|
||||
/* create an app_context for the debugger daemon */
|
||||
app = OBJ_NEW(orte_app_context_t);
|
||||
app->app = strdup((char*)MPIR_executable_path);
|
||||
if (NULL != orte_debugger_test_daemon) {
|
||||
app->app = strdup(orte_debugger_test_daemon);
|
||||
} else {
|
||||
app->app = strdup((char*)MPIR_executable_path);
|
||||
}
|
||||
opal_argv_append_nosize(&app->argv, app->app);
|
||||
build_debugger_args(app);
|
||||
opal_pointer_array_add(orte_debugger_daemon->apps, &app->super);
|
||||
orte_debugger_daemon->num_apps = 1;
|
||||
}
|
||||
return ORTE_SUCCESS;
|
||||
}
|
||||
|
||||
|
||||
|
@ -28,7 +28,7 @@ BEGIN_C_DECLS
|
||||
|
||||
void orte_run_debugger(char *basename, opal_cmd_line_t *cmd_line,
|
||||
int argc, char *argv[], int num_procs) __opal_attribute_noreturn__;
|
||||
void orte_debugger_init_before_spawn(orte_job_t *jdata);
|
||||
int orte_debugger_init_before_spawn(orte_job_t *jdata);
|
||||
void orte_debugger_init_after_spawn(orte_job_t *jdata);
|
||||
void orte_debugger_finalize(void);
|
||||
|
||||
|
@ -809,7 +809,9 @@ int orterun(int argc, char *argv[])
|
||||
}
|
||||
|
||||
/* setup for debugging */
|
||||
orte_debugger_init_before_spawn(jdata);
|
||||
if (ORTE_SUCCESS != orte_debugger_init_before_spawn(jdata)) {
|
||||
goto DONE;
|
||||
}
|
||||
|
||||
/* Spawn the job */
|
||||
rc = orte_plm.spawn(jdata);
|
||||
|
@ -627,6 +627,10 @@ int orte_util_encode_pidmap(opal_byte_object_t *boptr)
|
||||
if (NULL == jdata->map) {
|
||||
continue;
|
||||
}
|
||||
/* if this is a debugger job, ignore it */
|
||||
if (jdata->controls & ORTE_JOB_CONTROL_DEBUGGER_DAEMON) {
|
||||
continue;
|
||||
}
|
||||
/* pack the jobid */
|
||||
if (ORTE_SUCCESS != (rc = opal_dss.pack(&buf, &jdata->jobid, 1, ORTE_JOBID))) {
|
||||
ORTE_ERROR_LOG(rc);
|
||||
|
Загрузка…
Ссылка в новой задаче
Block a user