My apologies for doing this outside of the usual time restrictions, but we need to get this in so we can make progress.
Move the ORTE-level debugger code back into orterun and out of the ORTE library to resolve symbol conflicts. This commit was SVN r25713.
Этот коммит содержится в:
родитель
686ee387c8
Коммит
bf103de66c
@ -27,7 +27,6 @@
|
||||
#define OMPI_DEBUGGERS_H
|
||||
|
||||
#include "ompi_config.h"
|
||||
#include "orte/mca/debugger/base/base.h"
|
||||
|
||||
BEGIN_C_DECLS
|
||||
|
||||
@ -43,8 +42,10 @@ extern void ompi_debugger_notify_abort(char *string);
|
||||
|
||||
/**
|
||||
* Breakpoint function for parallel debuggers.
|
||||
* This function is also defined in orterun for the starter.
|
||||
* It should never conflict with this one
|
||||
*/
|
||||
OMPI_DECLSPEC extern void MPIR_Breakpoint(void);
|
||||
OMPI_DECLSPEC void* MPIR_Breakpoint(void);
|
||||
|
||||
END_C_DECLS
|
||||
|
||||
|
@ -74,7 +74,6 @@
|
||||
|
||||
#include "orte/mca/errmgr/errmgr.h"
|
||||
#include "orte/mca/rml/rml.h"
|
||||
#include "orte/mca/debugger/base/base.h"
|
||||
#include "orte/runtime/orte_globals.h"
|
||||
|
||||
#if defined(OMPI_MSGQ_DLL)
|
||||
@ -126,8 +125,8 @@ OMPI_DECLSPEC opal_datatype_t* opal_datatype_t_type_force_inclusion = NULL;
|
||||
OMPI_DECLSPEC ompi_datatype_t* ompi_datatype_t_type_force_inclusion = NULL;
|
||||
|
||||
OMPI_DECLSPEC volatile int MPIR_debug_gate = 0;
|
||||
OMPI_DECLSPEC extern volatile int MPIR_being_debugged;
|
||||
OMPI_DECLSPEC extern volatile int MPIR_debug_state;
|
||||
OMPI_DECLSPEC volatile int MPIR_being_debugged = 0;
|
||||
OMPI_DECLSPEC volatile int MPIR_debug_state = 0;
|
||||
OMPI_DECLSPEC char *MPIR_debug_abort_string = "";
|
||||
|
||||
/* Check for a file in few direct ways for portability */
|
||||
@ -283,3 +282,13 @@ void ompi_debugger_notify_abort(char *reason)
|
||||
/* Now tell the debugger */
|
||||
MPIR_Breakpoint();
|
||||
}
|
||||
|
||||
/**
 * Breakpoint function for parallel debuggers.
 *
 * The body does no work; the function exists so that a debugger has a
 * symbol to place a breakpoint on. orterun (the starter) defines a function
 * with the same name, and the two should never conflict.
 *
 * @return always NULL.
 */
void* MPIR_Breakpoint(void)
{
    void *nothing = NULL;

    return nothing;
}
|
||||
|
@ -110,8 +110,6 @@
|
||||
#if !ORTE_DISABLE_FULL_SUPPORT
|
||||
#include "orte/mca/notifier/notifier.h"
|
||||
#include "orte/mca/notifier/base/base.h"
|
||||
#include "orte/mca/debugger/debugger.h"
|
||||
#include "orte/mca/debugger/base/base.h"
|
||||
#include "orte/mca/iof/iof.h"
|
||||
#include "orte/mca/iof/base/base.h"
|
||||
#include "orte/mca/oob/oob.h"
|
||||
@ -431,14 +429,6 @@ void ompi_info_open_components(void)
|
||||
map->components = &orte_notifier_base_components_available;
|
||||
opal_pointer_array_add(&component_map, map);
|
||||
|
||||
if (ORTE_SUCCESS != orte_debugger_base_open()) {
|
||||
goto error;
|
||||
}
|
||||
map = OBJ_NEW(ompi_info_component_map_t);
|
||||
map->type = strdup("debugger");
|
||||
map->components = &orte_debugger_base_components_available;
|
||||
opal_pointer_array_add(&component_map, map);
|
||||
|
||||
if (ORTE_SUCCESS != mca_oob_base_open()) {
|
||||
goto error;
|
||||
}
|
||||
|
@ -233,7 +233,6 @@ int main(int argc, char *argv[])
|
||||
#endif
|
||||
|
||||
#if !ORTE_DISABLE_FULL_SUPPORT
|
||||
opal_pointer_array_add(&mca_types, "debugger");
|
||||
opal_pointer_array_add(&mca_types, "iof");
|
||||
opal_pointer_array_add(&mca_types, "oob");
|
||||
opal_pointer_array_add(&mca_types, "odls");
|
||||
|
@ -1,28 +0,0 @@
|
||||
#
|
||||
# Copyright (c) 2010 Cisco Systems, Inc.
|
||||
# $COPYRIGHT$
|
||||
#
|
||||
# Additional copyrights may follow
|
||||
#
|
||||
# $HEADER$
|
||||
#
|
||||
|
||||
# main library setup
|
||||
noinst_LTLIBRARIES = libmca_debugger.la
|
||||
libmca_debugger_la_SOURCES =
|
||||
|
||||
# local files
|
||||
headers = debugger.h
|
||||
|
||||
libmca_debugger_la_SOURCES += $(headers)
|
||||
|
||||
# Conditionally install the header files
|
||||
if WANT_INSTALL_HEADERS
|
||||
ortedir = $(includedir)/openmpi/$(subdir)
|
||||
nobase_orte_HEADERS = $(headers)
|
||||
endif
|
||||
|
||||
include base/Makefile.am
|
||||
|
||||
distclean-local:
|
||||
rm -f base/static-components.h
|
@ -1,18 +0,0 @@
|
||||
#
|
||||
# Copyright (c) 2010 Cisco Systems, Inc. All rights reserved.
|
||||
# $COPYRIGHT$
|
||||
#
|
||||
# Additional copyrights may follow
|
||||
#
|
||||
# $HEADER$
|
||||
#
|
||||
|
||||
headers += \
|
||||
base/base.h
|
||||
|
||||
libmca_debugger_la_SOURCES += \
|
||||
base/debugger_base_close.c \
|
||||
base/debugger_base_select.c \
|
||||
base/debugger_base_open.c \
|
||||
base/debugger_base_fns.c
|
||||
|
@ -1,82 +0,0 @@
|
||||
/*
|
||||
* Copyright (c) 2010 Cisco Systems, Inc. All rights reserved.
|
||||
* $COPYRIGHT$
|
||||
*
|
||||
* Additional copyrights may follow
|
||||
*
|
||||
* $HEADER$
|
||||
*/
|
||||
/** @file:
|
||||
*/
|
||||
|
||||
#ifndef MCA_DEBUGGER_BASE_H
|
||||
#define MCA_DEBUGGER_BASE_H
|
||||
|
||||
/*
|
||||
* includes
|
||||
*/
|
||||
#include "orte_config.h"
|
||||
|
||||
#include "opal/class/opal_list.h"
|
||||
|
||||
#include "orte/mca/debugger/debugger.h"
|
||||
|
||||
BEGIN_C_DECLS
|
||||
|
||||
/*
 * State shared by the whole debugger framework.
 */
typedef struct {
    /* opal_output stream used for the framework's verbose output */
    int output;
    /* when true, the MPIR proctable is printed after it has been built */
    bool dump_proctable;
    /* string registered as the ("orte", "debugger_test_daemon") MCA parameter */
    char *test_daemon;
    /* boolean form of the ("orte", "debugger_test_attach") MCA parameter */
    bool test_attach;
} orte_debugger_base_t;
|
||||
|
||||
ORTE_DECLSPEC extern orte_debugger_base_t orte_debugger_base;
|
||||
|
||||
/*
|
||||
* function definitions
|
||||
*/
|
||||
ORTE_DECLSPEC int orte_debugger_base_open(void);
|
||||
ORTE_DECLSPEC int orte_debugger_base_close(void);
|
||||
|
||||
ORTE_DECLSPEC int orte_debugger_base_select(void);
|
||||
ORTE_DECLSPEC void orte_debugger_base_run_debugger(char *basename, opal_cmd_line_t *cmd_line,
|
||||
int argc, char *argv[], int num_procs);
|
||||
ORTE_DECLSPEC void orte_debugger_base_init_after_spawn(orte_job_t *jdata);
|
||||
ORTE_DECLSPEC void orte_debugger_base_dump(void);
|
||||
|
||||
ORTE_DECLSPEC extern opal_list_t orte_debugger_base_components_available;
|
||||
|
||||
/* +++ begin MPICH/TotalView std debugger interface definitions */
|
||||
|
||||
#define MPIR_MAX_PATH_LENGTH 512
|
||||
#define MPIR_MAX_ARG_LENGTH 1024
|
||||
|
||||
/* One row of the MPIR process table: describes a single process. */
struct MPIR_PROCDESC {
    /* host of the process; something that can be passed to inet_addr */
    char *host_name;
    /* name of the binary */
    char *executable_name;
    /* pid of the process */
    int pid;
};
|
||||
|
||||
ORTE_DECLSPEC extern struct MPIR_PROCDESC *MPIR_proctable;
|
||||
ORTE_DECLSPEC extern int MPIR_proctable_size;
|
||||
ORTE_DECLSPEC extern volatile int MPIR_being_debugged;
|
||||
ORTE_DECLSPEC extern volatile int MPIR_debug_state;
|
||||
ORTE_DECLSPEC extern int MPIR_i_am_starter;
|
||||
ORTE_DECLSPEC extern int MPIR_partial_attach_ok;
|
||||
ORTE_DECLSPEC extern char MPIR_executable_path[MPIR_MAX_PATH_LENGTH];
|
||||
ORTE_DECLSPEC extern char MPIR_server_arguments[MPIR_MAX_ARG_LENGTH];
|
||||
ORTE_DECLSPEC extern volatile int MPIR_forward_output;
|
||||
ORTE_DECLSPEC extern volatile int MPIR_forward_comm;
|
||||
ORTE_DECLSPEC extern char MPIR_attach_fifo[MPIR_MAX_PATH_LENGTH];
|
||||
ORTE_DECLSPEC extern int MPIR_force_to_main;
|
||||
|
||||
typedef void (*orte_debugger_breakpoint_fn_t)(void);
|
||||
|
||||
ORTE_DECLSPEC void MPIR_Breakpoint(void);
|
||||
|
||||
ORTE_DECLSPEC void orte_debugger_base_pull_mpir_breakpoint(void);
|
||||
|
||||
/* --- end MPICH/TotalView std debugger interface definitions */
|
||||
|
||||
END_C_DECLS
|
||||
#endif
|
@ -1,33 +0,0 @@
|
||||
/*
|
||||
* Copyright (c) 2010 Cisco Systems, Inc. All rights reserved.
|
||||
* $COPYRIGHT$
|
||||
*
|
||||
* Additional copyrights may follow
|
||||
*
|
||||
* $HEADER$
|
||||
*/
|
||||
|
||||
#include "orte_config.h"
|
||||
#include "orte/constants.h"
|
||||
|
||||
#include "opal/mca/mca.h"
|
||||
#include "opal/mca/base/base.h"
|
||||
|
||||
#include "orte/mca/debugger/base/base.h"
|
||||
#include "orte/mca/debugger/debugger.h"
|
||||
|
||||
int orte_debugger_base_close(void)
|
||||
{
|
||||
#if !ORTE_DISABLE_FULL_SUPPORT
|
||||
if (NULL != orte_debugger.finalize) {
|
||||
orte_debugger.finalize();
|
||||
}
|
||||
|
||||
/* Close all remaining available components */
|
||||
mca_base_components_close(orte_debugger_base.output,
|
||||
&orte_debugger_base_components_available, NULL);
|
||||
#endif
|
||||
|
||||
/* All done */
|
||||
return ORTE_SUCCESS;
|
||||
}
|
@ -1,201 +0,0 @@
|
||||
/*
|
||||
* Copyright (c) 2010 Cisco Systems, Inc. All rights reserved.
|
||||
* $COPYRIGHT$
|
||||
*
|
||||
* Additional copyrights may follow
|
||||
*
|
||||
* $HEADER$
|
||||
*/
|
||||
|
||||
|
||||
#include "orte_config.h"
|
||||
#include "orte/constants.h"
|
||||
|
||||
#include "opal/util/output.h"
|
||||
|
||||
#ifdef HAVE_STRING_H
|
||||
#include <string.h>
|
||||
#endif
|
||||
#include <stdio.h>
|
||||
#ifdef HAVE_STDLIB_H
|
||||
#include <stdlib.h>
|
||||
#endif /* HAVE_STDLIB_H */
|
||||
#ifdef HAVE_STRINGS_H
|
||||
#include <strings.h>
|
||||
#endif /* HAVE_STRINGS_H */
|
||||
#ifdef HAVE_UNISTD_H
|
||||
#include <unistd.h>
|
||||
#endif /* HAVE_UNISTD_H */
|
||||
#include <ctype.h>
|
||||
|
||||
#include "opal/util/argv.h"
|
||||
#include "opal/util/os_path.h"
|
||||
#include "opal/util/path.h"
|
||||
#include "opal/util/opal_environ.h"
|
||||
|
||||
#include "orte/util/name_fns.h"
|
||||
#include "orte/util/show_help.h"
|
||||
#include "orte/mca/rml/rml.h"
|
||||
#include "orte/mca/errmgr/errmgr.h"
|
||||
#include "orte/runtime/orte_globals.h"
|
||||
|
||||
#include "orte/mca/debugger/base/base.h"
|
||||
|
||||
/*
 * Print "<expression text> = <value>" on stderr for an int expression.
 *
 * The expansion is a single do { } while (0) statement with no trailing
 * semicolon, so "DUMP_INT(x);" behaves like an ordinary statement and can be
 * used as the unbraced body of an if/else. The argument is parenthesized so
 * that compound expressions are passed to fprintf as one value.
 */
#define DUMP_INT(X) do { fprintf(stderr, " %s = %d\n", # X, (X)); } while (0)
|
||||
|
||||
#if !ORTE_DISABLE_FULL_SUPPORT
|
||||
|
||||
void orte_debugger_base_dump(void)
|
||||
{
|
||||
int i;
|
||||
|
||||
DUMP_INT(MPIR_being_debugged);
|
||||
DUMP_INT(MPIR_debug_state);
|
||||
DUMP_INT(MPIR_partial_attach_ok);
|
||||
DUMP_INT(MPIR_i_am_starter);
|
||||
DUMP_INT(MPIR_forward_output);
|
||||
DUMP_INT(MPIR_proctable_size);
|
||||
fprintf(stderr, " MPIR_proctable:\n");
|
||||
for (i = 0; i < MPIR_proctable_size; i++) {
|
||||
fprintf(stderr,
|
||||
" (i, host, exe, pid) = (%d, %s, %s, %d)\n",
|
||||
i,
|
||||
MPIR_proctable[i].host_name,
|
||||
MPIR_proctable[i].executable_name,
|
||||
MPIR_proctable[i].pid);
|
||||
}
|
||||
fprintf(stderr, "MPIR_executable_path: %s\n",
|
||||
('\0' == MPIR_executable_path[0]) ?
|
||||
"NULL" : (char*) MPIR_executable_path);
|
||||
fprintf(stderr, "MPIR_server_arguments: %s\n",
|
||||
('\0' == MPIR_server_arguments[0]) ?
|
||||
"NULL" : (char*) MPIR_server_arguments);
|
||||
}
|
||||
|
||||
/*
|
||||
* Initialization of data structures for running under a debugger
|
||||
* using the MPICH/TotalView parallel debugger interface. This stage
|
||||
* of initialization must occur after spawn
|
||||
*
|
||||
* NOTE: We -always- perform this step to ensure that any debugger
|
||||
* that attaches to us post-launch of the application can get a
|
||||
* completed proctable
|
||||
*/
|
||||
void orte_debugger_base_init_after_spawn(orte_job_t *jdata)
|
||||
{
|
||||
orte_proc_t *proc;
|
||||
orte_app_context_t *appctx;
|
||||
orte_vpid_t i, j;
|
||||
opal_buffer_t buf;
|
||||
orte_process_name_t rank0;
|
||||
int rc;
|
||||
|
||||
/* if we couldn't get thru the mapper stage, we might
|
||||
* enter here with no procs. Avoid the "zero byte malloc"
|
||||
* message by checking here
|
||||
*/
|
||||
if (MPIR_proctable || 0 == jdata->num_procs) {
|
||||
/* already initialized */
|
||||
opal_output_verbose(5, orte_debugger_base.output,
|
||||
"%s: debugger already initialized or zero procs",
|
||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME));
|
||||
return;
|
||||
}
|
||||
|
||||
/* fill in the proc table for the application processes */
|
||||
|
||||
opal_output_verbose(5, orte_debugger_base.output,
|
||||
"%s: Setting up debugger process table for applications",
|
||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME));
|
||||
|
||||
MPIR_debug_state = 1;
|
||||
|
||||
/* set the total number of processes in the job */
|
||||
MPIR_proctable_size = jdata->num_procs;
|
||||
|
||||
/* allocate MPIR_proctable */
|
||||
MPIR_proctable = (struct MPIR_PROCDESC *) malloc(sizeof(struct MPIR_PROCDESC) *
|
||||
MPIR_proctable_size);
|
||||
if (MPIR_proctable == NULL) {
|
||||
ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE);
|
||||
return;
|
||||
}
|
||||
|
||||
if (orte_debugger_base.dump_proctable) {
|
||||
opal_output(orte_clean_output, "MPIR Proctable for job %s", ORTE_JOBID_PRINT(jdata->jobid));
|
||||
}
|
||||
|
||||
/* initialize MPIR_proctable */
|
||||
for (j=0; j < jdata->num_procs; j++) {
|
||||
if (NULL == (proc = (orte_proc_t*)opal_pointer_array_get_item(jdata->procs, j))) {
|
||||
continue;
|
||||
}
|
||||
/* store this data in the location whose index
|
||||
* corresponds to the proc's rank
|
||||
*/
|
||||
i = proc->name.vpid;
|
||||
if (NULL == (appctx = (orte_app_context_t*)opal_pointer_array_get_item(jdata->apps, proc->app_idx))) {
|
||||
continue;
|
||||
}
|
||||
|
||||
MPIR_proctable[i].host_name = strdup(proc->node->name);
|
||||
if ( 0 == strncmp(appctx->app, OPAL_PATH_SEP, 1 )) {
|
||||
MPIR_proctable[i].executable_name =
|
||||
opal_os_path( false, appctx->app, NULL );
|
||||
} else {
|
||||
MPIR_proctable[i].executable_name =
|
||||
opal_os_path( false, appctx->cwd, appctx->app, NULL );
|
||||
}
|
||||
MPIR_proctable[i].pid = proc->pid;
|
||||
if (orte_debugger_base.dump_proctable) {
|
||||
opal_output(orte_clean_output, "%s: Host %s Exe %s Pid %d",
|
||||
ORTE_VPID_PRINT(i), MPIR_proctable[i].host_name,
|
||||
MPIR_proctable[i].executable_name, MPIR_proctable[i].pid);
|
||||
}
|
||||
}
|
||||
|
||||
if (0 < opal_output_get_verbosity(orte_debugger_base.output)) {
|
||||
orte_debugger_base_dump();
|
||||
}
|
||||
|
||||
/* if we are being launched under a debugger, then we must wait
|
||||
* for it to be ready to go and do some things to start the job
|
||||
*/
|
||||
if (MPIR_being_debugged) {
|
||||
/* wait for all procs to have reported their contact info - this
|
||||
* ensures that (a) they are all into mpi_init, and (b) the system
|
||||
* has the contact info to successfully send a message to rank=0
|
||||
*/
|
||||
ORTE_PROGRESSED_WAIT(false, jdata->num_reported, jdata->num_procs);
|
||||
|
||||
MPIR_Breakpoint();
|
||||
|
||||
/* send a message to rank=0 to release it */
|
||||
OBJ_CONSTRUCT(&buf, opal_buffer_t); /* don't need anything in this */
|
||||
rank0.jobid = jdata->jobid;
|
||||
rank0.vpid = 0;
|
||||
if (0 > (rc = orte_rml.send_buffer(&rank0, &buf, ORTE_RML_TAG_DEBUGGER_RELEASE, 0))) {
|
||||
opal_output(0, "Error: could not send debugger release to MPI procs - error %s", ORTE_ERROR_NAME(rc));
|
||||
}
|
||||
OBJ_DESTRUCT(&buf);
|
||||
}
|
||||
}
|
||||
|
||||
#endif
|
||||
|
||||
/*
 * Empty function that exists only to be referenced, so that the linker
 * pulls in all the symbols from this file.
 */
void orte_debugger_base_pull_mpir_breakpoint(void)
{
    /* intentionally empty */
}
|
||||
|
||||
/*
 * Breakpoint function for parallel debuggers. It has no body on purpose:
 * callers invoke it and it simply returns.
 */
void MPIR_Breakpoint(void)
{
    /* intentionally empty */
}
|
@ -1,105 +0,0 @@
|
||||
/*
|
||||
* Copyright (c) 2010 Cisco Systems, Inc. All rights reserved.
|
||||
* $COPYRIGHT$
|
||||
*
|
||||
* Additional copyrights may follow
|
||||
*
|
||||
* $HEADER$
|
||||
*/
|
||||
|
||||
|
||||
#include "orte_config.h"
|
||||
#include "orte/constants.h"
|
||||
|
||||
#include "opal/mca/mca.h"
|
||||
#include "opal/util/output.h"
|
||||
#include "opal/mca/base/base.h"
|
||||
#include "opal/mca/base/mca_base_param.h"
|
||||
|
||||
#ifdef HAVE_STRING_H
|
||||
#include <string.h>
|
||||
#endif
|
||||
|
||||
#include "orte/mca/debugger/base/base.h"
|
||||
|
||||
/*
|
||||
* The following file was created by configure. It contains extern
|
||||
* statements and the definition of an array of pointers to each
|
||||
* component's public mca_base_component_t struct.
|
||||
*/
|
||||
|
||||
#include "orte/mca/debugger/base/static-components.h"
|
||||
|
||||
/*
|
||||
* Global variables
|
||||
*/
|
||||
orte_debugger_base_t orte_debugger_base;
|
||||
opal_list_t orte_debugger_base_components_available;
|
||||
|
||||
orte_debugger_base_module_t orte_debugger;
|
||||
|
||||
/* instance the standard MPIR interfaces */
|
||||
struct MPIR_PROCDESC *MPIR_proctable = NULL;
|
||||
int MPIR_proctable_size = 0;
|
||||
volatile int MPIR_being_debugged = 0;
|
||||
volatile int MPIR_debug_state = 0;
|
||||
int MPIR_i_am_starter = 0;
|
||||
int MPIR_partial_attach_ok = 1;
|
||||
char MPIR_executable_path[MPIR_MAX_PATH_LENGTH];
|
||||
char MPIR_server_arguments[MPIR_MAX_ARG_LENGTH];
|
||||
volatile int MPIR_forward_output = 0;
|
||||
volatile int MPIR_forward_comm = 0;
|
||||
char MPIR_attach_fifo[MPIR_MAX_PATH_LENGTH];
|
||||
int MPIR_force_to_main = 0;
|
||||
|
||||
#if ORTE_DISABLE_FULL_SUPPORT
|
||||
int orte_debugger_base_open(void)
|
||||
{
|
||||
return ORTE_SUCCESS;
|
||||
}
|
||||
|
||||
#else
|
||||
|
||||
/**
|
||||
* Function for finding and opening either all MCA components, or the one
|
||||
* that was specifically requested via a MCA parameter.
|
||||
*/
|
||||
int orte_debugger_base_open(void)
|
||||
{
|
||||
int value;
|
||||
|
||||
/* Debugging / verbose output. Always have stream open, with
|
||||
verbose set by the mca open system... */
|
||||
orte_debugger_base.output = opal_output_open(NULL);
|
||||
|
||||
mca_base_param_reg_int_name("orte",
|
||||
"output_debugger_proctable",
|
||||
"Whether or not to output the debugger proctable after launch (default: false)",
|
||||
true, false, 0, &value);
|
||||
orte_debugger_base.dump_proctable = OPAL_INT_TO_BOOL(value);
|
||||
|
||||
mca_base_param_reg_string_name("orte", "debugger_test_daemon",
|
||||
"Name of the executable to be used to simulate a debugger colaunch (relative or absolute path)",
|
||||
false, false, NULL, &orte_debugger_base.test_daemon);
|
||||
|
||||
mca_base_param_reg_int_name("orte",
|
||||
"debugger_test_attach",
|
||||
"Test debugger colaunch after debugger attachment",
|
||||
false, false, 0, &value);
|
||||
orte_debugger_base.test_attach = OPAL_INT_TO_BOOL(value);
|
||||
|
||||
/* Open up all available components */
|
||||
|
||||
if (ORTE_SUCCESS !=
|
||||
mca_base_components_open("debugger", orte_debugger_base.output,
|
||||
mca_debugger_base_static_components,
|
||||
&orte_debugger_base_components_available,
|
||||
true)) {
|
||||
return ORTE_ERROR;
|
||||
}
|
||||
|
||||
/* All done */
|
||||
|
||||
return ORTE_SUCCESS;
|
||||
}
|
||||
#endif
|
@ -1,54 +0,0 @@
|
||||
/*
|
||||
* Copyright (c) 2010 Cisco Systems, Inc. All rights reserved.
|
||||
* $COPYRIGHT$
|
||||
*
|
||||
* Additional copyrights may follow
|
||||
*
|
||||
* $HEADER$
|
||||
*/
|
||||
|
||||
|
||||
#include "orte_config.h"
|
||||
#include "orte/constants.h"
|
||||
|
||||
#ifdef HAVE_STRING_H
|
||||
#include <string.h>
|
||||
#endif
|
||||
|
||||
#include "opal/mca/mca.h"
|
||||
#include "opal/mca/base/base.h"
|
||||
#include "opal/util/output.h"
|
||||
|
||||
#include "orte/mca/debugger/base/base.h"
|
||||
|
||||
int orte_debugger_base_select(void)
|
||||
{
|
||||
#if !ORTE_DISABLE_FULL_SUPPORT
|
||||
orte_debugger_base_module_t *best_module=NULL;
|
||||
orte_debugger_base_component_t *best_component=NULL;
|
||||
int ret;
|
||||
|
||||
/*
|
||||
* Select the best component
|
||||
*/
|
||||
if( OPAL_SUCCESS != mca_base_select("debugger", orte_debugger_base.output,
|
||||
&orte_debugger_base_components_available,
|
||||
(mca_base_module_t **) &best_module,
|
||||
(mca_base_component_t **) &best_component) ) {
|
||||
/* This will only happen if no component was selected */
|
||||
ret = ORTE_ERR_NOT_FOUND;
|
||||
goto cleanup;
|
||||
}
|
||||
|
||||
/* Save the winner */
|
||||
/* No global component structure */
|
||||
orte_debugger = *best_module;
|
||||
|
||||
ret = orte_debugger.init();
|
||||
|
||||
cleanup:
|
||||
return ret;
|
||||
#else
|
||||
return ORTE_ERR_NOT_IMPLEMENTED;
|
||||
#endif
|
||||
}
|
@ -1,77 +0,0 @@
|
||||
/*
|
||||
* Copyright (c) 2010 Cisco Systems, Inc. All Rights Reserved.
|
||||
* $COPYRIGHT$
|
||||
*
|
||||
* Additional copyrights may follow
|
||||
*
|
||||
* $HEADER$
|
||||
*/
|
||||
|
||||
#ifndef MCA_DEBUGGER_H
|
||||
#define MCA_DEBUGGER_H
|
||||
|
||||
/*
|
||||
* includes
|
||||
*/
|
||||
|
||||
#include "orte_config.h"
|
||||
|
||||
#include "opal/mca/mca.h"
|
||||
|
||||
#include "orte/runtime/orte_globals.h"
|
||||
|
||||
BEGIN_C_DECLS
|
||||
|
||||
/*
|
||||
* Component functions - all MUST be provided!
|
||||
*/
|
||||
|
||||
/* initialize the selected module */
|
||||
typedef int (*orte_debugger_base_module_init_fn_t)(void);
|
||||
|
||||
/* finalize the selected module */
|
||||
typedef void (*orte_debugger_base_module_finalize_fn_t)(void);
|
||||
|
||||
/* init debuggers before spawn */
|
||||
typedef void (*orte_debugger_base_module_init_before_spawn_fn_t)(orte_job_t *jdata);
|
||||
|
||||
/* init debuggers after spawn */
|
||||
typedef void (*orte_debugger_base_module_init_after_spawn_fn_t)(orte_job_t *jdata);
|
||||
|
||||
/*
|
||||
* Ver 1.0
|
||||
*/
|
||||
struct orte_debugger_base_module_1_0_0_t {
|
||||
orte_debugger_base_module_init_fn_t init;
|
||||
orte_debugger_base_module_finalize_fn_t finalize;
|
||||
orte_debugger_base_module_init_before_spawn_fn_t init_before_spawn;
|
||||
orte_debugger_base_module_init_after_spawn_fn_t init_after_spawn;
|
||||
};
|
||||
|
||||
typedef struct orte_debugger_base_module_1_0_0_t orte_debugger_base_module_1_0_0_t;
|
||||
typedef orte_debugger_base_module_1_0_0_t orte_debugger_base_module_t;
|
||||
|
||||
ORTE_DECLSPEC extern orte_debugger_base_module_t orte_debugger;
|
||||
|
||||
/*
|
||||
* the standard component data structure
|
||||
*/
|
||||
struct orte_debugger_base_component_1_0_0_t {
|
||||
mca_base_component_t base_version;
|
||||
mca_base_component_data_t base_data;
|
||||
};
|
||||
typedef struct orte_debugger_base_component_1_0_0_t orte_debugger_base_component_1_0_0_t;
|
||||
typedef orte_debugger_base_component_1_0_0_t orte_debugger_base_component_t;
|
||||
|
||||
/*
|
||||
* Macro for use in components that are of type debugger v1.0.0
|
||||
*/
|
||||
#define ORTE_DEBUGGER_BASE_VERSION_1_0_0 \
|
||||
/* debugger v1.0 is chained to MCA v2.0 */ \
|
||||
MCA_BASE_VERSION_2_0_0, \
|
||||
/* debugger v1.0 */ \
|
||||
"debugger", 1, 0, 0
|
||||
|
||||
END_C_DECLS
|
||||
|
||||
#endif /* MCA_DEBUGGER_H */
|
@ -1,12 +0,0 @@
|
||||
#
|
||||
# Copyright (c) 2008-2010 High Performance Computing Center Stuttgart,
|
||||
# University of Stuttgart. All rights reserved.
|
||||
# $COPYRIGHT$
|
||||
#
|
||||
# Additional copyrights may follow
|
||||
#
|
||||
# $HEADER$
|
||||
#
|
||||
|
||||
# Specific to this module
|
||||
mca_link_libraries=libopen-rte Ws2_32.lib
|
@ -1,36 +0,0 @@
|
||||
#
|
||||
# Copyright (c) 2010 Cisco Systems, Inc. All rights reserved.
|
||||
# $COPYRIGHT$
|
||||
#
|
||||
# Additional copyrights may follow
|
||||
#
|
||||
# $HEADER$
|
||||
#
|
||||
|
||||
EXTRA_DIST = .windows
|
||||
|
||||
sources = \
|
||||
mpir.h \
|
||||
mpir.c \
|
||||
mpir_component.c
|
||||
|
||||
# Make the output library in this directory, and name it either
|
||||
# mca_<type>_<name>.la (for DSO builds) or libmca_<type>_<name>.la
|
||||
# (for static builds).
|
||||
|
||||
if MCA_BUILD_orte_debugger_mpir_DSO
|
||||
component_noinst =
|
||||
component_install = mca_debugger_mpir.la
|
||||
else
|
||||
component_noinst = libmca_debugger_mpir.la
|
||||
component_install =
|
||||
endif
|
||||
|
||||
mcacomponentdir = $(pkglibdir)
|
||||
mcacomponent_LTLIBRARIES = $(component_install)
|
||||
mca_debugger_mpir_la_SOURCES = $(sources)
|
||||
mca_debugger_mpir_la_LDFLAGS = -module -avoid-version
|
||||
|
||||
noinst_LTLIBRARIES = $(component_noinst)
|
||||
libmca_debugger_mpir_la_SOURCES =$(sources)
|
||||
libmca_debugger_mpir_la_LDFLAGS = -module -avoid-version
|
@ -1,19 +0,0 @@
|
||||
# -*- shell-script -*-
|
||||
#
|
||||
# Copyright (c) 2011 Los Alamos National Security, LLC.
|
||||
# All rights reserved.
|
||||
# $COPYRIGHT$
|
||||
#
|
||||
# Additional copyrights may follow
|
||||
#
|
||||
# $HEADER$
|
||||
#
|
||||
# MCA_debugger_mpir_CONFIG([action-if-found], [action-if-not-found])
|
||||
# -----------------------------------------------------------
|
||||
AC_DEFUN([MCA_orte_debugger_mpir_CONFIG], [
|
||||
AC_CONFIG_FILES([orte/mca/debugger/mpir/Makefile])
|
||||
|
||||
AS_IF([test "$orte_without_full_support" = 0],
|
||||
[$1],
|
||||
[$2])
|
||||
])
|
@ -1,196 +0,0 @@
|
||||
/* -*- C -*-
|
||||
*
|
||||
* Copyright (c) 2004-2006 The Trustees of Indiana University and Indiana
|
||||
* University Research and Technology
|
||||
* Corporation. All rights reserved.
|
||||
* Copyright (c) 2004-2006 The University of Tennessee and The University
|
||||
* of Tennessee Research Foundation. All rights
|
||||
* reserved.
|
||||
* Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
|
||||
* University of Stuttgart. All rights reserved.
|
||||
* Copyright (c) 2004-2005 The Regents of the University of California.
|
||||
* All rights reserved.
|
||||
* Copyright (c) 2007 Sun Microsystems, Inc. All rights reserved.
|
||||
* Copyright (c) 2007 Los Alamos National Security, LLC. All rights
|
||||
* reserved.
|
||||
* Copyright (c) 2007-2010 Cisco Systems, Inc. All rights reserved.
|
||||
* Copyright (c) 2010 Oracle and/or its affiliates. All rights reserved.
|
||||
* $COPYRIGHT$
|
||||
*
|
||||
* Additional copyrights may follow
|
||||
*
|
||||
* $HEADER$
|
||||
*/
|
||||
/*
|
||||
* Debugger support for orterun
|
||||
*
|
||||
* We interpret the MPICH debugger interface as follows:
|
||||
*
|
||||
* a) The launcher
|
||||
* - spawns the other processes,
|
||||
* - fills in the table MPIR_proctable, and sets MPIR_proctable_size
|
||||
* - sets MPIR_debug_state to MPIR_DEBUG_SPAWNED ( = 1)
|
||||
* - calls MPIR_Breakpoint() which the debugger will have a
|
||||
* breakpoint on.
|
||||
*
|
||||
* b) Applications start and then spin until MPIR_debug_gate is set
|
||||
* non-zero by the debugger.
|
||||
*
|
||||
* This file implements (a).
|
||||
*
|
||||
**************************************************************************
|
||||
*
|
||||
* Note that we have presently tested both TotalView and DDT parallel
|
||||
* debuggers. They both nominally subscribe to the Etnus attaching
|
||||
* interface, but there are differences between the two.
|
||||
*
|
||||
* TotalView: user launches "totalview mpirun -a ...<mpirun args>...".
|
||||
* TV launches mpirun. mpirun launches the application and then calls
|
||||
* MPIR_Breakpoint(). This is the signal to TV that it's a parallel
|
||||
* MPI job. TV then reads the proctable in mpirun and attaches itself
|
||||
* to all the processes (it takes care of launching itself on the
|
||||
* remote nodes). Upon attaching to all the MPI processes, the
|
||||
* variable MPIR_being_debugged is set to 1. When it has finished
|
||||
* attaching itself to all the MPI processes that it wants to,
|
||||
* MPIR_Breakpoint() returns.
|
||||
*
|
||||
* DDT: user launches "ddt bin -np X <mpi app name>". DDT fork/exec's
|
||||
* mpirun to launch ddt-debugger on the back-end nodes via "mpirun -np
|
||||
* X ddt-debugger" (note the lack of other arguments -- we can't pass
|
||||
* anything to mpirun). This app will eventually fork/exec the MPI
|
||||
* app. DDT does not currently set MPIR_being_debugged in the MPI app.
|
||||
*
|
||||
**************************************************************************
|
||||
*
|
||||
* We support two ways of waiting for attaching debuggers. The
|
||||
* implementation spans this file and ompi/debuggers/ompi_debuggers.c.
|
||||
*
|
||||
* 1. If using orterun: MPI processes will have the
|
||||
* orte_in_parallel_debugger MCA param set to true (because not all
|
||||
* debuggers consistently set MPIR_being_debugged in both the launcher
|
||||
* and in the MPI procs). The HNP will call MPIR_Breakpoint() and
|
||||
* then RML send a message to VPID 0 (MCW rank 0) when it returns
|
||||
* (MPIR_Breakpoint() doesn't return until the debugger has attached
|
||||
* to all relevant processes). Meanwhile, VPID 0 blocks waiting for
|
||||
* the RML message. All other VPIDs immediately call the grpcomm
|
||||
* barrier (and therefore block until the debugger attaches). Once
|
||||
* VPID 0 receives the RML message, we know that the debugger has
|
||||
* attached to all processes that it cares about, and VPID 0 then
|
||||
* joins the grpcomm barrier, allowing the job to continue. This
|
||||
* scheme has the side effect of nicely supporting partial attaches by
|
||||
* parallel debuggers (i.e., attaching to only some of the MPI
|
||||
* processes; not necessarily all of them).
|
||||
*
|
||||
* 2. If not using orterun: in this case, ORTE_DISABLE_FULL_SUPPORT
|
||||
* will be true, and we know that there will not be an RML message
|
||||
* sent to VPID 0. So we have to look for a magic environment
|
||||
* variable from the launcher to know if the jobs will be attached by
|
||||
* a debugger (e.g., set by yod, srun, ...etc.), and if so, spin on
|
||||
* MPIR_debug_gate. These environment variable names must be
|
||||
* hard-coded in the OMPI layer (see ompi/debuggers/ompi_debuggers.c).
|
||||
*/
|
||||
|
||||
#include "orte_config.h"
|
||||
|
||||
#ifdef HAVE_STRING_H
|
||||
#include <string.h>
|
||||
#endif
|
||||
#include <stdio.h>
|
||||
#ifdef HAVE_STDLIB_H
|
||||
#include <stdlib.h>
|
||||
#endif /* HAVE_STDLIB_H */
|
||||
#ifdef HAVE_STRINGS_H
|
||||
#include <strings.h>
|
||||
#endif /* HAVE_STRINGS_H */
|
||||
#ifdef HAVE_UNISTD_H
|
||||
#include <unistd.h>
|
||||
#endif /* HAVE_UNISTD_H */
|
||||
#include <ctype.h>
|
||||
|
||||
#include "opal/util/argv.h"
|
||||
#include "opal/util/output.h"
|
||||
#include "opal/util/path.h"
|
||||
#include "opal/util/os_path.h"
|
||||
#include "opal/class/opal_pointer_array.h"
|
||||
#include "opal/mca/base/mca_base_param.h"
|
||||
#include "opal/util/opal_getcwd.h"
|
||||
|
||||
#include "orte/mca/errmgr/errmgr.h"
|
||||
#include "orte/mca/rml/rml.h"
|
||||
#include "orte/mca/rml/rml_types.h"
|
||||
#include "orte/mca/plm/plm.h"
|
||||
#include "orte/mca/plm/base/plm_private.h"
|
||||
#include "orte/mca/rmaps/rmaps_types.h"
|
||||
#include "orte/runtime/orte_globals.h"
|
||||
#include "orte/runtime/orte_wait.h"
|
||||
#include "orte/util/show_help.h"
|
||||
#include "orte/util/name_fns.h"
|
||||
|
||||
#include "orte/mca/debugger/base/base.h"
|
||||
#include "mpir.h"
|
||||
|
||||
|
||||
#include "mpir.h"
|
||||
|
||||
/* Static API's */
|
||||
static int init(void);
|
||||
static void finalize(void);
|
||||
static void init_before_spawn(orte_job_t *jdata);
|
||||
|
||||
/* Module definition */
|
||||
orte_debugger_base_module_t orte_debugger_mpir_module = {
|
||||
init,
|
||||
finalize,
|
||||
init_before_spawn,
|
||||
orte_debugger_base_init_after_spawn
|
||||
};
|
||||
|
||||
/* local globals */
|
||||
|
||||
static int init(void)
|
||||
{
|
||||
return ORTE_SUCCESS;
|
||||
}
|
||||
|
||||
/**
|
||||
* Release resources associated with data structures for running under
|
||||
* a debugger using the MPICH/TotalView parallel debugger interface.
|
||||
*/
|
||||
void finalize(void)
|
||||
{
|
||||
if (MPIR_proctable) {
|
||||
free(MPIR_proctable);
|
||||
MPIR_proctable = NULL;
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Initialization of data structures for running under a debugger
|
||||
* using the MPICH/TotalView parallel debugger interface. Before the
|
||||
* spawn we need to check if we are being run under a TotalView-like
|
||||
* debugger; if so then inform applications via an MCA parameter.
|
||||
*/
|
||||
void init_before_spawn(orte_job_t *jdata)
|
||||
{
|
||||
char *env_name;
|
||||
orte_app_context_t *app;
|
||||
int i;
|
||||
|
||||
if (!MPIR_being_debugged && !orte_in_parallel_debugger) {
|
||||
return;
|
||||
}
|
||||
|
||||
opal_output_verbose(1, orte_debugger_base.output, "Info: Spawned by a debugger");
|
||||
|
||||
/* tell the procs they are being debugged */
|
||||
env_name = mca_base_param_environ_variable("orte",
|
||||
"in_parallel_debugger", NULL);
|
||||
|
||||
for (i=0; i < jdata->apps->size; i++) {
|
||||
if (NULL == (app = (orte_app_context_t*)opal_pointer_array_get_item(jdata->apps, i))) {
|
||||
continue;
|
||||
}
|
||||
opal_setenv(env_name, "1", true, &app->env);
|
||||
}
|
||||
free(env_name);
|
||||
}
|
@ -1,34 +0,0 @@
|
||||
/*
|
||||
* Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
|
||||
* University Research and Technology
|
||||
* Corporation. All rights reserved.
|
||||
* Copyright (c) 2004-2009 The University of Tennessee and The University
|
||||
* of Tennessee Research Foundation. All rights
|
||||
* reserved.
|
||||
* Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
|
||||
* University of Stuttgart. All rights reserved.
|
||||
* Copyright (c) 2004-2005 The Regents of the University of California.
|
||||
* All rights reserved.
|
||||
* Copyright (c) 2007-2010 Cisco Systems, Inc. All rights reserved.
|
||||
* $COPYRIGHT$
|
||||
*
|
||||
* Additional copyrights may follow
|
||||
*
|
||||
* $HEADER$
|
||||
*/
|
||||
|
||||
#ifndef DEBUGGER_MPIR_H
|
||||
#define DEBUGGER_MPIR_H
|
||||
|
||||
#include "orte_config.h"
|
||||
|
||||
#include "orte/mca/debugger/debugger.h"
|
||||
|
||||
BEGIN_C_DECLS
|
||||
|
||||
ORTE_MODULE_DECLSPEC extern orte_debugger_base_component_t mca_debugger_mpir_component;
|
||||
extern orte_debugger_base_module_t orte_debugger_mpir_module;
|
||||
|
||||
END_C_DECLS
|
||||
|
||||
#endif /* DEBUGGER_MPIR_H */
|
@ -1,47 +0,0 @@
|
||||
/* -*- C -*-
|
||||
*
|
||||
* Copyright (c) 2010 Cisco Systems, Inc. All rights reserved.
|
||||
* $COPYRIGHT$
|
||||
*
|
||||
* Additional copyrights may follow
|
||||
*
|
||||
* $HEADER$
|
||||
*/
|
||||
|
||||
#include "orte_config.h"
|
||||
#include "orte/constants.h"
|
||||
|
||||
#include "mpir.h"
|
||||
|
||||
|
||||
static int component_query(mca_base_module_t **module, int *priority);
|
||||
|
||||
|
||||
/*
|
||||
* Struct of function pointers that need to be initialized
|
||||
*/
|
||||
orte_debugger_base_component_t mca_debugger_mpir_component = {
|
||||
{
|
||||
ORTE_DEBUGGER_BASE_VERSION_1_0_0,
|
||||
|
||||
"mpir", /* MCA module name */
|
||||
ORTE_MAJOR_VERSION, /* MCA module major version */
|
||||
ORTE_MINOR_VERSION, /* MCA module minor version */
|
||||
ORTE_RELEASE_VERSION, /* MCA module release version */
|
||||
|
||||
NULL,
|
||||
NULL,
|
||||
component_query /* module query */
|
||||
},
|
||||
{
|
||||
/* The component is checkpoint ready */
|
||||
MCA_BASE_METADATA_PARAM_CHECKPOINT
|
||||
}
|
||||
};
|
||||
|
||||
/*
 * Offer the MPIR module for selection.
 *
 * @param module    receives the address of orte_debugger_mpir_module
 * @param priority  receives the priority of this component (100)
 * @return ORTE_SUCCESS, unconditionally
 */
static int component_query(mca_base_module_t **module, int *priority)
{
    *module = (mca_base_module_t *) &orte_debugger_mpir_module;
    *priority = 100;

    return ORTE_SUCCESS;
}
|
@ -1,34 +0,0 @@
|
||||
#
|
||||
# Copyright (c) 2010 Cisco Systems, Inc. All rights reserved.
|
||||
# $COPYRIGHT$
|
||||
#
|
||||
# Additional copyrights may follow
|
||||
#
|
||||
# $HEADER$
|
||||
#
|
||||
|
||||
sources = \
        mpirx.h \
        mpirx.c \
        mpirx_component.c

# Build the component in this directory.  A DSO build installs
# mca_<type>_<name>.la; a static build produces the non-installed
# convenience library libmca_<type>_<name>.la instead.

if MCA_BUILD_orte_debugger_mpirx_DSO
component_noinst =
component_install = mca_debugger_mpirx.la
else
component_noinst = libmca_debugger_mpirx.la
component_install =
endif

mcacomponentdir = $(pkglibdir)
mcacomponent_LTLIBRARIES = $(component_install)
mca_debugger_mpirx_la_SOURCES = $(sources)
mca_debugger_mpirx_la_LDFLAGS = -module -avoid-version

noinst_LTLIBRARIES = $(component_noinst)
libmca_debugger_mpirx_la_SOURCES = $(sources)
libmca_debugger_mpirx_la_LDFLAGS = -module -avoid-version
|
@ -1,19 +0,0 @@
|
||||
# -*- shell-script -*-
|
||||
#
|
||||
# Copyright (c) 2011 Los Alamos National Security, LLC.
|
||||
# All rights reserved.
|
||||
# $COPYRIGHT$
|
||||
#
|
||||
# Additional copyrights may follow
|
||||
#
|
||||
# $HEADER$
|
||||
#
|
||||
# MCA_orte_debugger_mpirx_CONFIG([action-if-found], [action-if-not-found])
|
||||
# -----------------------------------------------------------
|
||||
AC_DEFUN([MCA_orte_debugger_mpirx_CONFIG], [
    AC_CONFIG_FILES([orte/mca/debugger/mpirx/Makefile])

    dnl the first action runs only when orte_without_full_support is 0
    AS_IF([test "$orte_without_full_support" = 0],
          [$1],
          [$2])
])
|
@ -1,366 +0,0 @@
|
||||
/*
|
||||
* Copyright (c) 2010-2011 Cisco Systems, Inc. All rights reserved.
|
||||
* Copyright (c) 2010 Oracle and/or its affiliates. All rights reserved.
|
||||
* Copyright (c) 2010-2011 Los Alamos National Security, LLC.
|
||||
* All rights reserved.
|
||||
* $COPYRIGHT$
|
||||
*
|
||||
* Additional copyrights may follow
|
||||
*
|
||||
* $HEADER$
|
||||
*/
|
||||
|
||||
#include "orte_config.h"
|
||||
|
||||
#ifdef HAVE_STRING_H
|
||||
#include <string.h>
|
||||
#endif
|
||||
#include <stdio.h>
|
||||
#ifdef HAVE_STDLIB_H
|
||||
#include <stdlib.h>
|
||||
#endif /* HAVE_STDLIB_H */
|
||||
#ifdef HAVE_STRINGS_H
|
||||
#include <strings.h>
|
||||
#endif /* HAVE_STRINGS_H */
|
||||
#ifdef HAVE_UNISTD_H
|
||||
#include <unistd.h>
|
||||
#endif /* HAVE_UNISTD_H */
|
||||
#ifdef HAVE_SYS_TYPES_H
|
||||
#include <sys/types.h>
|
||||
#endif
|
||||
#include <sys/stat.h>
|
||||
#include <ctype.h>
|
||||
#include <fcntl.h>
|
||||
#include <errno.h>
|
||||
|
||||
#include "opal/util/argv.h"
|
||||
#include "opal/util/output.h"
|
||||
#include "opal/util/path.h"
|
||||
#include "opal/util/os_path.h"
|
||||
#include "opal/class/opal_pointer_array.h"
|
||||
#include "opal/mca/base/mca_base_param.h"
|
||||
#include "opal/util/opal_getcwd.h"
|
||||
#include "opal/mca/event/event.h"
|
||||
|
||||
#include "orte/mca/errmgr/errmgr.h"
|
||||
#include "orte/mca/rml/rml.h"
|
||||
#include "orte/mca/rml/rml_types.h"
|
||||
#include "orte/mca/plm/plm.h"
|
||||
#include "orte/mca/plm/base/plm_private.h"
|
||||
#include "orte/mca/rmaps/rmaps_types.h"
|
||||
#include "orte/runtime/orte_globals.h"
|
||||
#include "orte/runtime/orte_wait.h"
|
||||
#include "orte/util/show_help.h"
|
||||
#include "orte/util/name_fns.h"
|
||||
|
||||
#include "orte/mca/debugger/base/base.h"
|
||||
#include "mpirx.h"
|
||||
|
||||
#define FILE_MODE (S_IRUSR | S_IWUSR | S_IRGRP | S_IROTH)
|
||||
|
||||
/* Static API's */
|
||||
static int init(void);
|
||||
static void finalize(void);
|
||||
static void init_before_spawn(orte_job_t *jdata);
|
||||
|
||||
/* Module definition */
|
||||
orte_debugger_base_module_t orte_debugger_mpirx_module = {
|
||||
init,
|
||||
finalize,
|
||||
init_before_spawn,
|
||||
orte_debugger_base_init_after_spawn
|
||||
};
|
||||
|
||||
/* local globals and functions */
|
||||
static void attach_debugger(int fd, short event, void *arg);
|
||||
static void build_debugger_args(orte_app_context_t *debugger);
|
||||
static void open_fifo(void);
|
||||
static opal_event_t attach;
|
||||
static int attach_fd = -1;
|
||||
static bool fifo_active=false;
|
||||
|
||||
static int init(void)
|
||||
{
|
||||
return ORTE_SUCCESS;
|
||||
}
|
||||
|
||||
/**
|
||||
* Release resources associated with data structures for running under
|
||||
* a debugger using the MPICH/TotalView parallel debugger interface.
|
||||
*/
|
||||
void finalize(void)
|
||||
{
|
||||
if (fifo_active) {
|
||||
opal_event_del(&attach);
|
||||
close(attach_fd);
|
||||
}
|
||||
|
||||
if (MPIR_proctable) {
|
||||
free(MPIR_proctable);
|
||||
MPIR_proctable = NULL;
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Initialization of data structures for running under a debugger
|
||||
* using an extended MPICH/TotalView parallel debugger interface. Before the
|
||||
* spawn we need to check if we are being run under a TotalView-like
|
||||
* debugger; if so then inform applications via an MCA parameter.
|
||||
*/
|
||||
void init_before_spawn(orte_job_t *jdata)
|
||||
{
|
||||
char *env_name;
|
||||
orte_app_context_t *app;
|
||||
int i;
|
||||
int32_t ljob;
|
||||
char *attach_fifo;
|
||||
|
||||
if (!MPIR_being_debugged && !orte_in_parallel_debugger) {
|
||||
/* if we were given a test debugger, then we still want to
|
||||
* colaunch it
|
||||
*/
|
||||
if (NULL != orte_debugger_base.test_daemon) {
|
||||
opal_output_verbose(2, orte_debugger_base.output,
|
||||
"%s No debugger test daemon specified",
|
||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME));
|
||||
goto launchit;
|
||||
}
|
||||
/* if we were given an auto-detect rate, then we want to setup
|
||||
* an event so we periodically do the check
|
||||
*/
|
||||
if (0 < orte_debugger_mpirx_check_rate) {
|
||||
opal_output_verbose(2, orte_debugger_base.output,
|
||||
"%s Setting debugger attach check rate for %d seconds",
|
||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
|
||||
orte_debugger_mpirx_check_rate);
|
||||
ORTE_TIMER_EVENT(orte_debugger_mpirx_check_rate, 0, attach_debugger);
|
||||
} else {
|
||||
/* create the attachment FIFO and put it into MPIR, setup readevent */
|
||||
/* create a FIFO name in the session dir */
|
||||
attach_fifo = opal_os_path(false, orte_process_info.job_session_dir, "debugger_attach_fifo", NULL);
|
||||
if ((mkfifo(attach_fifo, FILE_MODE) < 0) && errno != EEXIST) {
|
||||
opal_output(0, "CANNOT CREATE FIFO %s: errno %d", attach_fifo, errno);
|
||||
free(attach_fifo);
|
||||
return;
|
||||
}
|
||||
strncpy(MPIR_attach_fifo, attach_fifo, MPIR_MAX_PATH_LENGTH - 1);
|
||||
free (attach_fifo);
|
||||
open_fifo ();
|
||||
}
|
||||
return;
|
||||
}
|
||||
|
||||
launchit:
|
||||
opal_output_verbose(2, orte_debugger_base.output,
|
||||
"%s: Spawned by a debugger",
|
||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME));
|
||||
|
||||
/* tell the procs they are being debugged */
|
||||
env_name = mca_base_param_environ_variable("orte",
|
||||
"in_parallel_debugger", NULL);
|
||||
|
||||
for (i=0; i < jdata->apps->size; i++) {
|
||||
if (NULL == (app = (orte_app_context_t*)opal_pointer_array_get_item(jdata->apps, i))) {
|
||||
continue;
|
||||
}
|
||||
opal_setenv(env_name, "1", true, &app->env);
|
||||
}
|
||||
free(env_name);
|
||||
|
||||
/* check if we need to co-spawn the debugger daemons */
|
||||
if ('\0' != MPIR_executable_path[0] || NULL != orte_debugger_base.test_daemon) {
|
||||
/* can only have one debugger */
|
||||
if (NULL != orte_debugger_daemon) {
|
||||
opal_output(0, "-------------------------------------------\n"
|
||||
"Only one debugger can be used on a job.\n"
|
||||
"-------------------------------------------\n");
|
||||
ORTE_UPDATE_EXIT_STATUS(ORTE_ERROR_DEFAULT_EXIT_CODE);
|
||||
return;
|
||||
}
|
||||
opal_output_verbose(2, orte_debugger_base.output,
|
||||
"%s Cospawning debugger daemons %s",
|
||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
|
||||
(NULL == orte_debugger_base.test_daemon) ?
|
||||
MPIR_executable_path : orte_debugger_base.test_daemon);
|
||||
/* add debugger info to launch message */
|
||||
orte_debugger_daemon = OBJ_NEW(orte_job_t);
|
||||
/* create a jobid for these daemons - this is done solely
|
||||
* to avoid confusing the rest of the system's bookkeeping
|
||||
*/
|
||||
orte_plm_base_create_jobid(orte_debugger_daemon);
|
||||
/* flag the job as being debugger daemons */
|
||||
orte_debugger_daemon->controls |= ORTE_JOB_CONTROL_DEBUGGER_DAEMON;
|
||||
/* unless directed, we do not forward output */
|
||||
if (!MPIR_forward_output) {
|
||||
orte_debugger_daemon->controls &= ~ORTE_JOB_CONTROL_FORWARD_OUTPUT;
|
||||
}
|
||||
/* add it to the global job pool */
|
||||
ljob = ORTE_LOCAL_JOBID(orte_debugger_daemon->jobid);
|
||||
opal_pointer_array_set_item(orte_job_data, ljob, orte_debugger_daemon);
|
||||
/* create an app_context for the debugger daemon */
|
||||
app = OBJ_NEW(orte_app_context_t);
|
||||
if (NULL != orte_debugger_base.test_daemon) {
|
||||
app->app = strdup(orte_debugger_base.test_daemon);
|
||||
} else {
|
||||
app->app = strdup((char*)MPIR_executable_path);
|
||||
}
|
||||
opal_argv_append_nosize(&app->argv, app->app);
|
||||
build_debugger_args(app);
|
||||
opal_pointer_array_add(orte_debugger_daemon->apps, app);
|
||||
orte_debugger_daemon->num_apps = 1;
|
||||
}
|
||||
return;
|
||||
}
|
||||
|
||||
static void open_fifo (void)
|
||||
{
|
||||
if (attach_fd > 0) {
|
||||
close(attach_fd);
|
||||
}
|
||||
|
||||
attach_fd = open(MPIR_attach_fifo, O_RDONLY | O_NONBLOCK, 0);
|
||||
if (attach_fd < 0) {
|
||||
opal_output(0, "%s unable to open debugger attach fifo",
|
||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME));
|
||||
return;
|
||||
}
|
||||
opal_output_verbose(2, orte_debugger_base.output,
|
||||
"%s Monitoring debugger attach fifo %s",
|
||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
|
||||
MPIR_attach_fifo);
|
||||
opal_event_set(opal_event_base, &attach, attach_fd, OPAL_EV_READ, attach_debugger, NULL);
|
||||
|
||||
fifo_active = true;
|
||||
opal_event_add(&attach, 0);
|
||||
}
|
||||
|
||||
static void attach_debugger(int fd, short event, void *arg)
|
||||
{
|
||||
orte_app_context_t *app;
|
||||
unsigned char fifo_cmd;
|
||||
int rc;
|
||||
int32_t ljob;
|
||||
orte_job_t *jdata;
|
||||
|
||||
/* read the file descriptor to clear that event, if necessary */
|
||||
if (fifo_active) {
|
||||
opal_event_del(&attach);
|
||||
fifo_active = false;
|
||||
|
||||
rc = read(attach_fd, &fifo_cmd, sizeof(fifo_cmd));
|
||||
if (!rc) {
|
||||
/* reopen device to clear hangup */
|
||||
open_fifo();
|
||||
return;
|
||||
}
|
||||
if (1 != fifo_cmd) {
|
||||
/* ignore the cmd */
|
||||
goto RELEASE;
|
||||
}
|
||||
}
|
||||
|
||||
if (!MPIR_being_debugged && !orte_debugger_base.test_attach) {
|
||||
/* false alarm */
|
||||
goto RELEASE;
|
||||
}
|
||||
|
||||
opal_output_verbose(1, orte_debugger_base.output,
|
||||
"%s Attaching debugger %s", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
|
||||
(NULL == orte_debugger_base.test_daemon) ? MPIR_executable_path : orte_debugger_base.test_daemon);
|
||||
|
||||
/* a debugger has attached! All the MPIR_Proctable
|
||||
* data is already available, so we only need to
|
||||
* check to see if we should spawn any daemons
|
||||
*/
|
||||
if ('\0' != MPIR_executable_path[0] || NULL != orte_debugger_base.test_daemon) {
|
||||
/* can only have one debugger */
|
||||
if (NULL != orte_debugger_daemon) {
|
||||
opal_output(0, "-------------------------------------------\n"
|
||||
"Only one debugger can be used on a job.\n"
|
||||
"-------------------------------------------\n");
|
||||
goto RELEASE;
|
||||
}
|
||||
opal_output_verbose(2, orte_debugger_base.output,
|
||||
"%s Spawning debugger daemons %s",
|
||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
|
||||
(NULL == orte_debugger_base.test_daemon) ?
|
||||
MPIR_executable_path : orte_debugger_base.test_daemon);
|
||||
/* this will be launched just like a regular job,
|
||||
* so we do not use the global orte_debugger_daemon
|
||||
* as this is reserved for co-location upon startup
|
||||
*/
|
||||
jdata = OBJ_NEW(orte_job_t);
|
||||
/* create a jobid for these daemons - this is done solely
|
||||
* to avoid confusing the rest of the system's bookkeeping
|
||||
*/
|
||||
orte_plm_base_create_jobid(jdata);
|
||||
/* flag the job as being debugger daemons */
|
||||
jdata->controls |= ORTE_JOB_CONTROL_DEBUGGER_DAEMON;
|
||||
/* unless directed, we do not forward output */
|
||||
if (!MPIR_forward_output) {
|
||||
jdata->controls &= ~ORTE_JOB_CONTROL_FORWARD_OUTPUT;
|
||||
}
|
||||
/* add it to the global job pool */
|
||||
ljob = ORTE_LOCAL_JOBID(jdata->jobid);
|
||||
opal_pointer_array_set_item(orte_job_data, ljob, jdata);
|
||||
/* create an app_context for the debugger daemon */
|
||||
app = OBJ_NEW(orte_app_context_t);
|
||||
if (NULL != orte_debugger_base.test_daemon) {
|
||||
app->app = strdup(orte_debugger_base.test_daemon);
|
||||
} else {
|
||||
app->app = strdup((char*)MPIR_executable_path);
|
||||
}
|
||||
|
||||
jdata->state = ORTE_JOB_STATE_INIT;
|
||||
|
||||
opal_argv_append_nosize(&app->argv, app->app);
|
||||
build_debugger_args(app);
|
||||
opal_pointer_array_add(jdata->apps, app);
|
||||
jdata->num_apps = 1;
|
||||
/* setup the mapping policy to pernode so we get one
|
||||
* daemon on each node
|
||||
*/
|
||||
jdata->map = OBJ_NEW(orte_job_map_t);
|
||||
jdata->map->mapping = ORTE_MAPPING_PPR;
|
||||
jdata->map->ppr = strdup("1:n");
|
||||
/* now go ahead and spawn this job */
|
||||
if (ORTE_SUCCESS != (rc = orte_plm.spawn(jdata))) {
|
||||
ORTE_ERROR_LOG(rc);
|
||||
}
|
||||
}
|
||||
|
||||
RELEASE:
|
||||
/* reset the read or timer event */
|
||||
if (0 == orte_debugger_mpirx_check_rate) {
|
||||
fifo_active = true;
|
||||
opal_event_add(&attach, 0);
|
||||
} else if (!MPIR_being_debugged) {
|
||||
ORTE_TIMER_EVENT(orte_debugger_mpirx_check_rate, 0, attach_debugger);
|
||||
}
|
||||
|
||||
/* notify the debugger that all is ready */
|
||||
MPIR_Breakpoint();
|
||||
}
|
||||
|
||||
/*
 * Append the debugger server arguments to the argv of the given app
 * context.
 *
 * MPIR_server_arguments is treated as a sequence of NUL-separated
 * strings inside a buffer of MPIR_MAX_ARG_LENGTH bytes.  Every
 * non-empty string that is terminated by a NUL inside the buffer is
 * appended; a trailing run that reaches the end of the buffer without a
 * terminator is not.  Nothing happens when the first byte of the buffer
 * is NUL.
 *
 * @param debugger  app context whose argv receives the arguments
 */
static void build_debugger_args(orte_app_context_t *debugger)
{
    int pos = 0;
    int start;

    if ('\0' == MPIR_server_arguments[0]) {
        return;
    }

    while (pos < MPIR_MAX_ARG_LENGTH) {
        /* step over separators in front of the next argument */
        while (pos < MPIR_MAX_ARG_LENGTH && '\0' == MPIR_server_arguments[pos]) {
            ++pos;
        }
        start = pos;

        /* find the end of the argument */
        while (pos < MPIR_MAX_ARG_LENGTH && '\0' != MPIR_server_arguments[pos]) {
            ++pos;
        }

        /* only an argument that is terminated inside the buffer is used */
        if (start < pos && pos < MPIR_MAX_ARG_LENGTH) {
            opal_argv_append_nosize(&debugger->argv, &MPIR_server_arguments[start]);
        }
    }
}
|
@ -1,26 +0,0 @@
|
||||
/* -*- C -*-
|
||||
*
|
||||
* Copyright (c) 2010 Cisco Systems, Inc. All rights reserved.
|
||||
* $COPYRIGHT$
|
||||
*
|
||||
* Additional copyrights may follow
|
||||
*
|
||||
* $HEADER$
|
||||
*
|
||||
*/
|
||||
#ifndef DEBUGGER_MPIRX_H
|
||||
#define DEBUGGER_MPIRX_H
|
||||
|
||||
#include "orte_config.h"
|
||||
|
||||
#include "orte/mca/debugger/debugger.h"
|
||||
|
||||
BEGIN_C_DECLS
|
||||
|
||||
ORTE_MODULE_DECLSPEC extern orte_debugger_base_component_t mca_debugger_mpirx_component;
|
||||
extern int orte_debugger_mpirx_check_rate;
|
||||
extern orte_debugger_base_module_t orte_debugger_mpirx_module;
|
||||
|
||||
END_C_DECLS
|
||||
|
||||
#endif
|
@ -1,58 +0,0 @@
|
||||
/* -*- C -*-
|
||||
*
|
||||
* Copyright (c) 2010 Cisco Systems, Inc. All rights reserved.
|
||||
* $COPYRIGHT$
|
||||
*
|
||||
* Additional copyrights may follow
|
||||
*
|
||||
* $HEADER$
|
||||
*/
|
||||
|
||||
#include "orte_config.h"
|
||||
#include "orte/constants.h"
|
||||
|
||||
#include "mpirx.h"
|
||||
|
||||
int orte_debugger_mpirx_check_rate=0;
|
||||
|
||||
static int component_open(void);
|
||||
static int component_query(mca_base_module_t **module, int *priority);
|
||||
|
||||
/*
|
||||
* Struct of function pointers that need to be initialized
|
||||
*/
|
||||
orte_debugger_base_component_t mca_debugger_mpirx_component = {
|
||||
{
|
||||
ORTE_DEBUGGER_BASE_VERSION_1_0_0,
|
||||
|
||||
"mpirx", /* MCA module name */
|
||||
ORTE_MAJOR_VERSION, /* MCA module major version */
|
||||
ORTE_MINOR_VERSION, /* MCA module minor version */
|
||||
ORTE_RELEASE_VERSION, /* MCA module release version */
|
||||
|
||||
component_open,
|
||||
NULL,
|
||||
component_query /* module query */
|
||||
},
|
||||
{
|
||||
/* The component is checkpoint ready */
|
||||
MCA_BASE_METADATA_PARAM_CHECKPOINT
|
||||
}
|
||||
};
|
||||
|
||||
static int component_open(void)
|
||||
{
|
||||
mca_base_component_t *c = &mca_debugger_mpirx_component.base_version;
|
||||
|
||||
mca_base_param_reg_int(c, "check_rate",
|
||||
"Set rate (in secs) for auto-detect of debugger attachment (0 => do not check)",
|
||||
false, false, 0, &orte_debugger_mpirx_check_rate);
|
||||
return ORTE_SUCCESS;
|
||||
}
|
||||
|
||||
/*
 * Offer the extended MPIR module for selection.
 *
 * @param module    receives the address of orte_debugger_mpirx_module
 * @param priority  receives the priority of this component (10)
 * @return ORTE_SUCCESS, unconditionally
 */
static int component_query(mca_base_module_t **module, int *priority)
{
    *module = (mca_base_module_t *) &orte_debugger_mpirx_module;
    *priority = 10;

    return ORTE_SUCCESS;
}
|
@ -41,7 +41,6 @@
|
||||
#include "orte/mca/rmaps/rmaps_types.h"
|
||||
#include "orte/mca/sensor/sensor.h"
|
||||
#include "orte/mca/routed/routed.h"
|
||||
#include "orte/mca/debugger/base/base.h"
|
||||
#include "orte/mca/notifier/notifier.h"
|
||||
#include "orte/mca/grpcomm/grpcomm.h"
|
||||
#include "orte/mca/ess/ess.h"
|
||||
@ -526,9 +525,6 @@ static void default_hnp_abort(orte_jobid_t job, orte_exit_code_t exit_code)
|
||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
|
||||
ORTE_JOBID_PRINT(job), exit_code));
|
||||
|
||||
/* if debuggers are running, clean up */
|
||||
orte_debugger.finalize();
|
||||
|
||||
/* set control params to indicate we are terminating */
|
||||
orte_job_term_ordered = true;
|
||||
orte_abnormal_term_ordered = true;
|
||||
|
@ -58,8 +58,6 @@
|
||||
#include "orte/mca/notifier/base/base.h"
|
||||
#include "orte/mca/sensor/base/base.h"
|
||||
#include "orte/mca/sensor/sensor.h"
|
||||
#include "orte/mca/debugger/base/base.h"
|
||||
#include "orte/mca/debugger/debugger.h"
|
||||
#include "orte/mca/rmaps/base/base.h"
|
||||
#if OPAL_ENABLE_FT_CR == 1
|
||||
#include "orte/mca/snapc/base/base.h"
|
||||
@ -622,18 +620,6 @@ static int rte_init(void)
|
||||
/* start the local sensors */
|
||||
orte_sensor.start(ORTE_PROC_MY_NAME->jobid);
|
||||
|
||||
/* start the debuggers */
|
||||
if (ORTE_SUCCESS != (ret = orte_debugger_base_open())) {
|
||||
ORTE_ERROR_LOG(ret);
|
||||
error = "orte_debugger_open";
|
||||
goto error;
|
||||
}
|
||||
if (ORTE_SUCCESS != (ret = orte_debugger_base_select())) {
|
||||
ORTE_ERROR_LOG(ret);
|
||||
error = "orte_debugger_select";
|
||||
goto error;
|
||||
}
|
||||
|
||||
/* if a tool has launched us and is requesting event reports,
|
||||
* then set its contact info into the comm system
|
||||
*/
|
||||
@ -701,9 +687,6 @@ static int rte_finalize(void)
|
||||
signals_set = false;
|
||||
}
|
||||
|
||||
/* stop the debuggers */
|
||||
orte_debugger_base_close();
|
||||
|
||||
/* stop the local sensors */
|
||||
orte_sensor.stop(ORTE_PROC_MY_NAME->jobid);
|
||||
|
||||
@ -1003,9 +986,6 @@ static void abort_exit_callback(int fd, short ign, void *arg)
|
||||
* to terminate!
|
||||
*/
|
||||
if (!orte_never_launched) {
|
||||
/* if the debuggers were run, clean up */
|
||||
orte_debugger.finalize();
|
||||
|
||||
/*
|
||||
* Turn off the process recovery functionality, if it was enabled.
|
||||
* This keeps the errmgr from trying to recover from the shutdown
|
||||
|
@ -41,7 +41,6 @@
|
||||
#include "opal/mca/hwloc/hwloc.h"
|
||||
|
||||
#include "orte/util/show_help.h"
|
||||
#include "orte/mca/debugger/debugger.h"
|
||||
#include "orte/mca/errmgr/errmgr.h"
|
||||
#include "orte/mca/ess/ess.h"
|
||||
#include "orte/mca/iof/iof.h"
|
||||
@ -230,8 +229,6 @@ int orte_plm_base_launch_apps(orte_jobid_t job)
|
||||
rc = ORTE_ERR_BAD_PARAM;
|
||||
goto WAKEUP;
|
||||
}
|
||||
/* setup for debugging */
|
||||
orte_debugger.init_before_spawn(jdata);
|
||||
}
|
||||
|
||||
/* setup the buffer */
|
||||
@ -310,10 +307,7 @@ int orte_plm_base_launch_apps(orte_jobid_t job)
|
||||
ORTE_ERROR_LOG(rc);
|
||||
goto WAKEUP;
|
||||
}
|
||||
|
||||
/* complete debugger interface */
|
||||
orte_debugger.init_after_spawn(jdata);
|
||||
|
||||
|
||||
OPAL_OUTPUT_VERBOSE((5, orte_plm_globals.output,
|
||||
"%s plm:base:launch completed for job %s",
|
||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
|
||||
|
@ -41,7 +41,6 @@
|
||||
#include "orte/types.h"
|
||||
#include "orte/util/proc_info.h"
|
||||
#include "orte/util/error_strings.h"
|
||||
#include "orte/mca/debugger/base/base.h"
|
||||
#include "orte/mca/errmgr/errmgr.h"
|
||||
#include "orte/mca/ess/ess.h"
|
||||
#include "orte/mca/rml/rml.h"
|
||||
@ -263,7 +262,7 @@ static void process_msg(int fd, short event, void *data)
|
||||
job = jdata->jobid;
|
||||
|
||||
/* output debugger proctable, if requested */
|
||||
if (orte_debugger_base.dump_proctable) {
|
||||
if (orte_debugger_dump_proctable && !jdata->map->display_map) {
|
||||
char *output;
|
||||
opal_dss.print(&output, NULL, jdata->map, ORTE_JOB_MAP);
|
||||
if (orte_xml_output) {
|
||||
|
@ -80,7 +80,6 @@
|
||||
#include "orte/util/nidmap.h"
|
||||
#include "orte/util/proc_info.h"
|
||||
|
||||
#include "orte/mca/debugger/debugger.h"
|
||||
#include "orte/mca/rml/rml.h"
|
||||
#include "orte/mca/rml/rml_types.h"
|
||||
#include "orte/mca/ess/ess.h"
|
||||
|
@ -91,6 +91,11 @@ char **orte_fork_agent=NULL;
|
||||
|
||||
/* debugger job */
|
||||
orte_job_t *orte_debugger_daemon=NULL;
|
||||
bool orte_debugger_dump_proctable;
|
||||
char *orte_debugger_test_daemon;
|
||||
bool orte_debugger_test_attach;
|
||||
bool orte_debugger_enable_fifo_attach;
|
||||
int orte_debugger_check_rate;
|
||||
|
||||
/* exit flags */
|
||||
int orte_exit_status = 0;
|
||||
|
@ -78,8 +78,6 @@ ORTE_DECLSPEC extern orte_process_name_t orte_name_invalid; /** instantiated in
|
||||
/* define the name of my daemon */
|
||||
#define ORTE_PROC_MY_DAEMON (&orte_process_info.my_daemon)
|
||||
|
||||
/* See comment in orte/tools/orterun/debuggers.c about this MCA
|
||||
param */
|
||||
ORTE_DECLSPEC extern bool orte_in_parallel_debugger;
|
||||
|
||||
/* error manager callback function */
|
||||
@ -559,6 +557,11 @@ ORTE_DECLSPEC extern char **orte_fork_agent;
|
||||
|
||||
/* debugger job */
|
||||
ORTE_DECLSPEC extern orte_job_t *orte_debugger_daemon;
|
||||
ORTE_DECLSPEC extern bool orte_debugger_dump_proctable;
|
||||
ORTE_DECLSPEC extern char *orte_debugger_test_daemon;
|
||||
ORTE_DECLSPEC extern bool orte_debugger_test_attach;
|
||||
ORTE_DECLSPEC extern bool orte_debugger_enable_fifo_attach;
|
||||
ORTE_DECLSPEC extern int orte_debugger_check_rate;
|
||||
|
||||
/* exit flags */
|
||||
ORTE_DECLSPEC extern bool orte_abnormal_term_ordered;
|
||||
|
@ -154,6 +154,33 @@ int orte_register_params(void)
|
||||
true, false, 0, &value);
|
||||
orte_in_parallel_debugger = OPAL_INT_TO_BOOL(value);
|
||||
|
||||
mca_base_param_reg_int_name("orte",
|
||||
"output_debugger_proctable",
|
||||
"Whether or not to output the debugger proctable after launch (default: false)",
|
||||
false, false, 0, &value);
|
||||
orte_debugger_dump_proctable = OPAL_INT_TO_BOOL(value);
|
||||
|
||||
mca_base_param_reg_string_name("orte", "debugger_test_daemon",
|
||||
"Name of the executable to be used to simulate a debugger colaunch (relative or absolute path)",
|
||||
false, false, NULL, &orte_debugger_test_daemon);
|
||||
|
||||
mca_base_param_reg_int_name("orte",
|
||||
"debugger_test_attach",
|
||||
"Test debugger colaunch after debugger attachment",
|
||||
false, false, 0, &value);
|
||||
orte_debugger_test_attach = OPAL_INT_TO_BOOL(value);
|
||||
|
||||
mca_base_param_reg_int_name("orte",
|
||||
"debugger_fifo_attach",
|
||||
"Create a fifo to support debugger attachment",
|
||||
false, false, 0, &value);
|
||||
orte_debugger_enable_fifo_attach = OPAL_INT_TO_BOOL(value);
|
||||
|
||||
mca_base_param_reg_int_name("orte",
|
||||
"debugger_check_rate",
|
||||
"Set rate (in secs) for auto-detect of debugger attachment (0 => do not check)",
|
||||
false, false, 0, &orte_debugger_check_rate);
|
||||
|
||||
mca_base_param_reg_int_name("orte", "do_not_launch",
|
||||
"Perform all necessary operations to prepare to launch the application, but do not actually launch it",
|
||||
false, false, (int)false, &value);
|
||||
|
@ -49,7 +49,6 @@
|
||||
|
||||
#include "orte/mca/plm/plm.h"
|
||||
#include "orte/mca/errmgr/errmgr.h"
|
||||
#include "orte/mca/debugger/debugger.h"
|
||||
#include "orte/mca/routed/routed.h"
|
||||
|
||||
#include "orte/util/session_dir.h"
|
||||
@ -128,9 +127,6 @@ void orte_jobs_complete(void)
|
||||
}
|
||||
}
|
||||
|
||||
/* if the debuggers were run, clean up */
|
||||
orte_debugger.finalize();
|
||||
|
||||
if (0 < orte_routed.num_routes()) {
|
||||
orte_plm.terminate_orteds();
|
||||
}
|
||||
|
@ -69,8 +69,6 @@
|
||||
#if !ORTE_DISABLE_FULL_SUPPORT
|
||||
#include "orte/mca/notifier/notifier.h"
|
||||
#include "orte/mca/notifier/base/base.h"
|
||||
#include "orte/mca/debugger/debugger.h"
|
||||
#include "orte/mca/debugger/base/base.h"
|
||||
#include "orte/mca/iof/iof.h"
|
||||
#include "orte/mca/iof/base/base.h"
|
||||
#include "orte/mca/oob/oob.h"
|
||||
@ -373,14 +371,6 @@ void orte_info_open_components(void)
|
||||
map->components = &orte_notifier_base_components_available;
|
||||
opal_pointer_array_add(&component_map, map);
|
||||
|
||||
if (ORTE_SUCCESS != orte_debugger_base_open()) {
|
||||
goto error;
|
||||
}
|
||||
map = OBJ_NEW(orte_info_component_map_t);
|
||||
map->type = strdup("debugger");
|
||||
map->components = &orte_debugger_base_components_available;
|
||||
opal_pointer_array_add(&component_map, map);
|
||||
|
||||
if (ORTE_SUCCESS != mca_oob_base_open()) {
|
||||
goto error;
|
||||
}
|
||||
|
@ -205,7 +205,6 @@ int main(int argc, char *argv[])
|
||||
opal_pointer_array_add(&mca_types, "event");
|
||||
|
||||
#if !ORTE_DISABLE_FULL_SUPPORT
|
||||
opal_pointer_array_add(&mca_types, "debugger");
|
||||
opal_pointer_array_add(&mca_types, "iof");
|
||||
opal_pointer_array_add(&mca_types, "oob");
|
||||
opal_pointer_array_add(&mca_types, "odls");
|
||||
|
@ -12,7 +12,7 @@
|
||||
* All rights reserved.
|
||||
* Copyright (c) 2006-2011 Cisco Systems, Inc. All rights reserved.
|
||||
* Copyright (c) 2007-2009 Sun Microsystems, Inc. All rights reserved.
|
||||
* Copyright (c) 2007 Los Alamos National Security, LLC. All rights
|
||||
* Copyright (c) 2007-2011 Los Alamos National Security, LLC. All rights
|
||||
* reserved.
|
||||
* $COPYRIGHT$
|
||||
*
|
||||
@ -28,6 +28,12 @@
|
||||
#include <string.h>
|
||||
#endif
|
||||
#include <stdio.h>
|
||||
#ifdef HAVE_STDLIB_H
|
||||
#include <stdlib.h>
|
||||
#endif /* HAVE_STDLIB_H */
|
||||
#ifdef HAVE_STRINGS_H
|
||||
#include <strings.h>
|
||||
#endif /* HAVE_STRINGS_H */
|
||||
#ifdef HAVE_UNISTD_H
|
||||
#include <unistd.h>
|
||||
#endif
|
||||
@ -46,6 +52,10 @@
|
||||
#ifdef HAVE_SYS_TIME_H
|
||||
#include <sys/time.h>
|
||||
#endif /* HAVE_SYS_TIME_H */
|
||||
#include <fcntl.h>
|
||||
#ifdef HAVE_SYS_STAT_H
|
||||
#include <sys/stat.h>
|
||||
#endif
|
||||
|
||||
#include "opal/mca/event/event.h"
|
||||
#include "opal/mca/installdirs/installdirs.h"
|
||||
@ -75,7 +85,6 @@
|
||||
#include "orte/util/session_dir.h"
|
||||
#include "orte/util/hnp_contact.h"
|
||||
|
||||
#include "orte/mca/debugger/base/base.h"
|
||||
#include "orte/mca/odls/odls.h"
|
||||
#include "orte/mca/plm/plm.h"
|
||||
#include "orte/mca/plm/base/plm_private.h"
|
||||
@ -99,6 +108,37 @@
|
||||
|
||||
#include "orterun.h"
|
||||
|
||||
/* instance the standard MPIR interfaces */
|
||||
#define MPIR_MAX_PATH_LENGTH 512
|
||||
#define MPIR_MAX_ARG_LENGTH 1024
|
||||
struct MPIR_PROCDESC *MPIR_proctable = NULL;
|
||||
int MPIR_proctable_size = 0;
|
||||
volatile int MPIR_being_debugged = 0;
|
||||
volatile int MPIR_debug_state = 0;
|
||||
int MPIR_i_am_starter = 0;
|
||||
int MPIR_partial_attach_ok = 1;
|
||||
char MPIR_executable_path[MPIR_MAX_PATH_LENGTH];
|
||||
char MPIR_server_arguments[MPIR_MAX_ARG_LENGTH];
|
||||
volatile int MPIR_forward_output = 0;
|
||||
volatile int MPIR_forward_comm = 0;
|
||||
char MPIR_attach_fifo[MPIR_MAX_PATH_LENGTH];
|
||||
int MPIR_force_to_main = 0;
|
||||
static void orte_debugger_dump(void);
|
||||
static void orte_debugger_init_before_spawn(orte_job_t *jdata);
|
||||
static void orte_debugger_init_after_spawn(orte_job_t *jdata);
|
||||
static void attach_debugger(int fd, short event, void *arg);
|
||||
static void build_debugger_args(orte_app_context_t *debugger);
|
||||
static void open_fifo (void);
|
||||
ORTE_DECLSPEC void* MPIR_Breakpoint(void);
|
||||
|
||||
/*
 * Breakpoint function for parallel debuggers.
 *
 * The function performs no work of its own and always returns NULL; it
 * exists so that it can be called to notify the debugger.
 */
void *MPIR_Breakpoint(void)
{
    void *nothing = NULL;

    return nothing;
}
|
||||
|
||||
/*
|
||||
* Globals
|
||||
*/
|
||||
@ -549,49 +589,6 @@ int orterun(int argc, char *argv[])
|
||||
true);
|
||||
}
|
||||
|
||||
/* force the debugger symbols to be included in orterun.
|
||||
* this is required since the symbols are instantiated in
|
||||
* the orte library, yet they need to be accessed
|
||||
* prior to orte_init when a debugger wants to launch
|
||||
* us
|
||||
*/
|
||||
if (NULL == MPIR_proctable) {
|
||||
rc = ORTE_SUCCESS;
|
||||
}
|
||||
if (0 == MPIR_proctable_size) {
|
||||
rc = ORTE_SUCCESS;
|
||||
}
|
||||
if (0 == MPIR_being_debugged) {
|
||||
rc = ORTE_SUCCESS;
|
||||
}
|
||||
if (0 == MPIR_debug_state) {
|
||||
rc = ORTE_SUCCESS;
|
||||
}
|
||||
if (0 == MPIR_i_am_starter) {
|
||||
rc = ORTE_SUCCESS;
|
||||
}
|
||||
if (1 == MPIR_partial_attach_ok) {
|
||||
rc = ORTE_SUCCESS;
|
||||
}
|
||||
if (NULL == MPIR_executable_path) {
|
||||
rc = ORTE_SUCCESS;
|
||||
}
|
||||
if (NULL == MPIR_server_arguments) {
|
||||
rc = ORTE_SUCCESS;
|
||||
}
|
||||
if (0 == MPIR_forward_output) {
|
||||
rc = ORTE_SUCCESS;
|
||||
}
|
||||
if (0 == MPIR_forward_comm) {
|
||||
rc = ORTE_SUCCESS;
|
||||
}
|
||||
MPIR_force_to_main = 0;
|
||||
memset(MPIR_attach_fifo, 0, MPIR_MAX_PATH_LENGTH);
|
||||
/* This function call simply ensures that all the symbols --
|
||||
including MPIR_Breakpoint -- are pulled in via the linker from
|
||||
orte/mca/debugger/base/debugger_base_fns.c. */
|
||||
orte_debugger_base_pull_mpir_breakpoint();
|
||||
|
||||
/* Check for some "global" command line params */
|
||||
parse_globals(argc, argv, &cmd_line);
|
||||
OBJ_DESTRUCT(&cmd_line);
|
||||
@ -848,9 +845,15 @@ int orterun(int argc, char *argv[])
|
||||
ljob = ORTE_LOCAL_JOBID(jdata->jobid);
|
||||
opal_pointer_array_set_item(orte_job_data, ljob, jdata);
|
||||
|
||||
/* setup for debugging */
|
||||
orte_debugger_init_before_spawn(jdata);
|
||||
|
||||
/* spawn the job and its daemons */
|
||||
rc = orte_plm.spawn(jdata);
|
||||
|
||||
/* complete debugger interface */
|
||||
orte_debugger_init_after_spawn(jdata);
|
||||
|
||||
/* now wait until the termination event fires */
|
||||
opal_event_dispatch(opal_event_base);
|
||||
|
||||
@ -2110,3 +2113,488 @@ static void run_debugger(char *basename, opal_cmd_line_t *cmd_line,
|
||||
opal_argv_free(new_argv);
|
||||
exit(1);
|
||||
}
|
||||
|
||||
/**** DEBUGGER CODE ****/
|
||||
/*
|
||||
* Debugger support for orterun
|
||||
*
|
||||
* We interpret the MPICH debugger interface as follows:
|
||||
*
|
||||
* a) The launcher
|
||||
* - spawns the other processes,
|
||||
* - fills in the table MPIR_proctable, and sets MPIR_proctable_size
|
||||
* - sets MPIR_debug_state to MPIR_DEBUG_SPAWNED ( = 1)
|
||||
* - calls MPIR_Breakpoint() which the debugger will have a
|
||||
* breakpoint on.
|
||||
*
|
||||
* b) Applications start and then spin until MPIR_debug_gate is set
|
||||
* non-zero by the debugger.
|
||||
*
|
||||
* This file implements (a).
|
||||
*
|
||||
**************************************************************************
|
||||
*
|
||||
* Note that we have presently tested both TotalView and DDT parallel
|
||||
* debuggers. They both nominally subscribe to the Etnus attaching
|
||||
* interface, but there are differences between the two.
|
||||
*
|
||||
* TotalView: user launches "totalview mpirun -a ...<mpirun args>...".
|
||||
* TV launches mpirun. mpirun launches the application and then calls
|
||||
* MPIR_Breakpoint(). This is the signal to TV that it's a parallel
|
||||
* MPI job. TV then reads the proctable in mpirun and attaches itself
|
||||
* to all the processes (it takes care of launching itself on the
|
||||
* remote nodes). Upon attaching to all the MPI processes, the
|
||||
* variable MPIR_being_debugged is set to 1. When it has finished
|
||||
* attaching itself to all the MPI processes that it wants to,
|
||||
* MPIR_Breakpoint() returns.
|
||||
*
|
||||
* DDT: user launches "ddt bin -np X <mpi app name>". DDT fork/exec's
|
||||
* mpirun to launch ddt-debugger on the back-end nodes via "mpirun -np
|
||||
* X ddt-debugger" (note the lack of other arguments -- we can't pass
|
||||
* anything to mpirun). This app will eventually fork/exec the MPI
|
||||
* app. DDT does not currently set MPIR_being_debugged in the MPI app.
|
||||
*
|
||||
**************************************************************************
|
||||
*
|
||||
* We support two ways of waiting for attaching debuggers. The
|
||||
* implementation spans this file and ompi/debuggers/ompi_debuggers.c.
|
||||
*
|
||||
* 1. If using orterun: MPI processes will have the
|
||||
* orte_in_parallel_debugger MCA param set to true (because not all
|
||||
* debuggers consistently set MPIR_being_debugged in both the launcher
|
||||
* and in the MPI procs). The HNP will call MPIR_Breakpoint() and
|
||||
* then RML send a message to VPID 0 (MCW rank 0) when it returns
|
||||
* (MPIR_Breakpoint() doesn't return until the debugger has attached
|
||||
* to all relevant processes). Meanwhile, VPID 0 blocks waiting for
|
||||
* the RML message. All other VPIDs immediately call the grpcomm
|
||||
* barrier (and therefore block until the debugger attaches). Once
|
||||
* VPID 0 receives the RML message, we know that the debugger has
|
||||
* attached to all processes that it cares about, and VPID 0 then
|
||||
* joins the grpcomm barrier, allowing the job to continue. This
|
||||
* scheme has the side effect of nicely supporting partial attaches by
|
||||
* parallel debuggers (i.e., attaching to only some of the MPI
|
||||
* processes; not necessarily all of them).
|
||||
*
|
||||
* 2. If not using orterun: in this case, ORTE_DISABLE_FULL_SUPPORT
|
||||
* will be true, and we know that there will not be an RML message
|
||||
* sent to VPID 0. So we have to look for a magic environment
|
||||
* variable from the launcher to know if the jobs will be attached by
|
||||
* a debugger (e.g., set by yod, srun, ...etc.), and if so, spin on
|
||||
* MPIR_debug_gate. These environment variable names must be
|
||||
* hard-coded in the OMPI layer (see ompi/debuggers/ompi_debuggers.c).
|
||||
*/
|
||||
|
||||
/* local globals and functions */
|
||||
static void attach_debugger(int fd, short event, void *arg);
|
||||
static void build_debugger_args(orte_app_context_t *debugger);
|
||||
static void open_fifo(void);
|
||||
static opal_event_t attach;
|
||||
static int attach_fd = -1;
|
||||
static bool fifo_active=false;
|
||||
#define DUMP_INT(X) fprintf(stderr, " %s = %d\n", # X, X);
|
||||
#define FILE_MODE (S_IRUSR | S_IWUSR | S_IRGRP | S_IROTH)
|
||||
|
||||
/*
 * One entry of the MPIR process table.  The field order and types are
 * read by the attaching debugger, so the layout must stay as it is.
 */
struct MPIR_PROCDESC {
    /* host the process runs on; something that can be passed to inet_addr */
    char *host_name;
    /* name of the binary */
    char *executable_name;
    /* process pid */
    int pid;
};
|
||||
|
||||
|
||||
static void orte_debugger_dump(void)
|
||||
{
|
||||
int i;
|
||||
|
||||
DUMP_INT(MPIR_being_debugged);
|
||||
DUMP_INT(MPIR_debug_state);
|
||||
DUMP_INT(MPIR_partial_attach_ok);
|
||||
DUMP_INT(MPIR_i_am_starter);
|
||||
DUMP_INT(MPIR_forward_output);
|
||||
DUMP_INT(MPIR_proctable_size);
|
||||
fprintf(stderr, " MPIR_proctable:\n");
|
||||
for (i = 0; i < MPIR_proctable_size; i++) {
|
||||
fprintf(stderr,
|
||||
" (i, host, exe, pid) = (%d, %s, %s, %d)\n",
|
||||
i,
|
||||
MPIR_proctable[i].host_name,
|
||||
MPIR_proctable[i].executable_name,
|
||||
MPIR_proctable[i].pid);
|
||||
}
|
||||
fprintf(stderr, "MPIR_executable_path: %s\n",
|
||||
('\0' == MPIR_executable_path[0]) ?
|
||||
"NULL" : (char*) MPIR_executable_path);
|
||||
fprintf(stderr, "MPIR_server_arguments: %s\n",
|
||||
('\0' == MPIR_server_arguments[0]) ?
|
||||
"NULL" : (char*) MPIR_server_arguments);
|
||||
}
|
||||
|
||||
/**
|
||||
* Initialization of data structures for running under a debugger
|
||||
* using the MPICH/TotalView parallel debugger interface. Before the
|
||||
* spawn we need to check if we are being run under a TotalView-like
|
||||
* debugger; if so then inform applications via an MCA parameter.
|
||||
*/
|
||||
static void orte_debugger_init_before_spawn(orte_job_t *jdata)
|
||||
{
|
||||
char *env_name;
|
||||
orte_app_context_t *app;
|
||||
int i;
|
||||
int32_t ljob;
|
||||
char *attach_fifo;
|
||||
|
||||
if (!MPIR_being_debugged && !orte_in_parallel_debugger) {
|
||||
/* if we were given a test debugger, then we still want to
|
||||
* colaunch it
|
||||
*/
|
||||
if (NULL != orte_debugger_test_daemon) {
|
||||
opal_output_verbose(2, orte_debug_output,
|
||||
"%s No debugger test daemon specified",
|
||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME));
|
||||
goto launchit;
|
||||
}
|
||||
/* if we were given an auto-detect rate, then we want to setup
|
||||
* an event so we periodically do the check
|
||||
*/
|
||||
if (0 < orte_debugger_check_rate) {
|
||||
opal_output_verbose(2, orte_debug_output,
|
||||
"%s Setting debugger attach check rate for %d seconds",
|
||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
|
||||
orte_debugger_check_rate);
|
||||
ORTE_TIMER_EVENT(orte_debugger_check_rate, 0, attach_debugger);
|
||||
} else if (orte_debugger_enable_fifo_attach) {
|
||||
/* create the attachment FIFO and put it into MPIR, setup readevent */
|
||||
/* create a FIFO name in the session dir */
|
||||
attach_fifo = opal_os_path(false, orte_process_info.job_session_dir, "debugger_attach_fifo", NULL);
|
||||
if ((mkfifo(attach_fifo, FILE_MODE) < 0) && errno != EEXIST) {
|
||||
opal_output(0, "CANNOT CREATE FIFO %s: errno %d", attach_fifo, errno);
|
||||
free(attach_fifo);
|
||||
return;
|
||||
}
|
||||
strncpy(MPIR_attach_fifo, attach_fifo, MPIR_MAX_PATH_LENGTH - 1);
|
||||
free(attach_fifo);
|
||||
open_fifo();
|
||||
}
|
||||
return;
|
||||
}
|
||||
|
||||
launchit:
|
||||
opal_output_verbose(1, orte_debug_output, "Info: Spawned by a debugger");
|
||||
|
||||
/* tell the procs they are being debugged */
|
||||
env_name = mca_base_param_environ_variable("orte",
|
||||
"in_parallel_debugger", NULL);
|
||||
|
||||
for (i=0; i < jdata->apps->size; i++) {
|
||||
if (NULL == (app = (orte_app_context_t*)opal_pointer_array_get_item(jdata->apps, i))) {
|
||||
continue;
|
||||
}
|
||||
opal_setenv(env_name, "1", true, &app->env);
|
||||
}
|
||||
free(env_name);
|
||||
|
||||
/* check if we need to co-spawn the debugger daemons */
|
||||
if ('\0' != MPIR_executable_path[0] || NULL != orte_debugger_test_daemon) {
|
||||
/* can only have one debugger */
|
||||
if (NULL != orte_debugger_daemon) {
|
||||
opal_output(0, "-------------------------------------------\n"
|
||||
"Only one debugger can be used on a job.\n"
|
||||
"-------------------------------------------\n");
|
||||
ORTE_UPDATE_EXIT_STATUS(ORTE_ERROR_DEFAULT_EXIT_CODE);
|
||||
return;
|
||||
}
|
||||
opal_output_verbose(2, orte_debug_output,
|
||||
"%s Cospawning debugger daemons %s",
|
||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
|
||||
(NULL == orte_debugger_test_daemon) ?
|
||||
MPIR_executable_path : orte_debugger_test_daemon);
|
||||
/* add debugger info to launch message */
|
||||
orte_debugger_daemon = OBJ_NEW(orte_job_t);
|
||||
/* create a jobid for these daemons - this is done solely
|
||||
* to avoid confusing the rest of the system's bookkeeping
|
||||
*/
|
||||
orte_plm_base_create_jobid(orte_debugger_daemon);
|
||||
/* flag the job as being debugger daemons */
|
||||
orte_debugger_daemon->controls |= ORTE_JOB_CONTROL_DEBUGGER_DAEMON;
|
||||
/* unless directed, we do not forward output */
|
||||
if (!MPIR_forward_output) {
|
||||
orte_debugger_daemon->controls &= ~ORTE_JOB_CONTROL_FORWARD_OUTPUT;
|
||||
}
|
||||
/* add it to the global job pool */
|
||||
ljob = ORTE_LOCAL_JOBID(orte_debugger_daemon->jobid);
|
||||
opal_pointer_array_set_item(orte_job_data, ljob, orte_debugger_daemon);
|
||||
/* create an app_context for the debugger daemon */
|
||||
app = OBJ_NEW(orte_app_context_t);
|
||||
if (NULL != orte_debugger_test_daemon) {
|
||||
app->app = strdup(orte_debugger_test_daemon);
|
||||
} else {
|
||||
app->app = strdup((char*)MPIR_executable_path);
|
||||
}
|
||||
opal_argv_append_nosize(&app->argv, app->app);
|
||||
build_debugger_args(app);
|
||||
opal_pointer_array_add(orte_debugger_daemon->apps, app);
|
||||
orte_debugger_daemon->num_apps = 1;
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
/*
|
||||
* Initialization of data structures for running under a debugger
|
||||
* using the MPICH/TotalView parallel debugger interface. This stage
|
||||
* of initialization must occur after spawn
|
||||
*
|
||||
* NOTE: We -always- perform this step to ensure that any debugger
|
||||
* that attaches to us post-launch of the application can get a
|
||||
* completed proctable
|
||||
*/
|
||||
static void orte_debugger_init_after_spawn(orte_job_t *jdata)
|
||||
{
|
||||
orte_proc_t *proc;
|
||||
orte_app_context_t *appctx;
|
||||
orte_vpid_t i, j;
|
||||
opal_buffer_t buf;
|
||||
orte_process_name_t rank0;
|
||||
int rc;
|
||||
|
||||
/* if we couldn't get thru the mapper stage, we might
|
||||
* enter here with no procs. Avoid the "zero byte malloc"
|
||||
* message by checking here
|
||||
*/
|
||||
if (MPIR_proctable || 0 == jdata->num_procs) {
|
||||
/* already initialized */
|
||||
opal_output_verbose(5, orte_debug_output,
|
||||
"%s: debugger already initialized or zero procs",
|
||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME));
|
||||
return;
|
||||
}
|
||||
|
||||
/* fill in the proc table for the application processes */
|
||||
|
||||
opal_output_verbose(5, orte_debug_output,
|
||||
"%s: Setting up debugger process table for applications",
|
||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME));
|
||||
|
||||
MPIR_debug_state = 1;
|
||||
|
||||
/* set the total number of processes in the job */
|
||||
MPIR_proctable_size = jdata->num_procs;
|
||||
|
||||
/* allocate MPIR_proctable */
|
||||
MPIR_proctable = (struct MPIR_PROCDESC *)malloc(sizeof(struct MPIR_PROCDESC) *
|
||||
MPIR_proctable_size);
|
||||
if (MPIR_proctable == NULL) {
|
||||
ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE);
|
||||
return;
|
||||
}
|
||||
|
||||
if (orte_debugger_dump_proctable) {
|
||||
opal_output(orte_clean_output, "MPIR Proctable for job %s", ORTE_JOBID_PRINT(jdata->jobid));
|
||||
}
|
||||
|
||||
/* initialize MPIR_proctable */
|
||||
for (j=0; j < jdata->num_procs; j++) {
|
||||
if (NULL == (proc = (orte_proc_t*)opal_pointer_array_get_item(jdata->procs, j))) {
|
||||
continue;
|
||||
}
|
||||
/* store this data in the location whose index
|
||||
* corresponds to the proc's rank
|
||||
*/
|
||||
i = proc->name.vpid;
|
||||
if (NULL == (appctx = (orte_app_context_t*)opal_pointer_array_get_item(jdata->apps, proc->app_idx))) {
|
||||
continue;
|
||||
}
|
||||
|
||||
MPIR_proctable[i].host_name = strdup(proc->node->name);
|
||||
if ( 0 == strncmp(appctx->app, OPAL_PATH_SEP, 1 )) {
|
||||
MPIR_proctable[i].executable_name =
|
||||
opal_os_path( false, appctx->app, NULL );
|
||||
} else {
|
||||
MPIR_proctable[i].executable_name =
|
||||
opal_os_path( false, appctx->cwd, appctx->app, NULL );
|
||||
}
|
||||
MPIR_proctable[i].pid = proc->pid;
|
||||
if (orte_debugger_dump_proctable) {
|
||||
opal_output(orte_clean_output, "%s: Host %s Exe %s Pid %d",
|
||||
ORTE_VPID_PRINT(i), MPIR_proctable[i].host_name,
|
||||
MPIR_proctable[i].executable_name, MPIR_proctable[i].pid);
|
||||
}
|
||||
}
|
||||
|
||||
if (0 < opal_output_get_verbosity(orte_debug_output)) {
|
||||
orte_debugger_dump();
|
||||
}
|
||||
|
||||
/* if we are being launched under a debugger, then we must wait
|
||||
* for it to be ready to go and do some things to start the job
|
||||
*/
|
||||
if (MPIR_being_debugged) {
|
||||
/* wait for all procs to have reported their contact info - this
|
||||
* ensures that (a) they are all into mpi_init, and (b) the system
|
||||
* has the contact info to successfully send a message to rank=0
|
||||
*/
|
||||
ORTE_PROGRESSED_WAIT(false, jdata->num_reported, jdata->num_procs);
|
||||
|
||||
MPIR_Breakpoint();
|
||||
|
||||
/* send a message to rank=0 to release it */
|
||||
OBJ_CONSTRUCT(&buf, opal_buffer_t); /* don't need anything in this */
|
||||
rank0.jobid = jdata->jobid;
|
||||
rank0.vpid = 0;
|
||||
if (0 > (rc = orte_rml.send_buffer(&rank0, &buf, ORTE_RML_TAG_DEBUGGER_RELEASE, 0))) {
|
||||
opal_output(0, "Error: could not send debugger release to MPI procs - error %s", ORTE_ERROR_NAME(rc));
|
||||
}
|
||||
OBJ_DESTRUCT(&buf);
|
||||
}
|
||||
}
|
||||
|
||||
static void open_fifo (void)
|
||||
{
|
||||
if (attach_fd > 0) {
|
||||
close(attach_fd);
|
||||
}
|
||||
|
||||
attach_fd = open(MPIR_attach_fifo, O_RDONLY | O_NONBLOCK, 0);
|
||||
if (attach_fd < 0) {
|
||||
opal_output(0, "%s unable to open debugger attach fifo",
|
||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME));
|
||||
return;
|
||||
}
|
||||
opal_output_verbose(2, orte_debug_output,
|
||||
"%s Monitoring debugger attach fifo %s",
|
||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
|
||||
MPIR_attach_fifo);
|
||||
opal_event_set(opal_event_base, &attach, attach_fd, OPAL_EV_READ, attach_debugger, NULL);
|
||||
|
||||
fifo_active = true;
|
||||
opal_event_add(&attach, 0);
|
||||
}
|
||||
|
||||
static void attach_debugger(int fd, short event, void *arg)
|
||||
{
|
||||
orte_app_context_t *app;
|
||||
unsigned char fifo_cmd;
|
||||
int rc;
|
||||
int32_t ljob;
|
||||
orte_job_t *jdata;
|
||||
|
||||
/* read the file descriptor to clear that event, if necessary */
|
||||
if (fifo_active) {
|
||||
opal_event_del(&attach);
|
||||
fifo_active = false;
|
||||
|
||||
rc = read(attach_fd, &fifo_cmd, sizeof(fifo_cmd));
|
||||
if (!rc) {
|
||||
/* reopen device to clear hangup */
|
||||
open_fifo();
|
||||
return;
|
||||
}
|
||||
if (1 != fifo_cmd) {
|
||||
/* ignore the cmd */
|
||||
goto RELEASE;
|
||||
}
|
||||
}
|
||||
|
||||
if (!MPIR_being_debugged && !orte_debugger_test_attach) {
|
||||
/* false alarm */
|
||||
goto RELEASE;
|
||||
}
|
||||
|
||||
opal_output_verbose(1, orte_debug_output,
|
||||
"%s Attaching debugger %s", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
|
||||
(NULL == orte_debugger_test_daemon) ? MPIR_executable_path : orte_debugger_test_daemon);
|
||||
|
||||
/* a debugger has attached! All the MPIR_Proctable
|
||||
* data is already available, so we only need to
|
||||
* check to see if we should spawn any daemons
|
||||
*/
|
||||
if ('\0' != MPIR_executable_path[0] || NULL != orte_debugger_test_daemon) {
|
||||
/* can only have one debugger */
|
||||
if (NULL != orte_debugger_daemon) {
|
||||
opal_output(0, "-------------------------------------------\n"
|
||||
"Only one debugger can be used on a job.\n"
|
||||
"-------------------------------------------\n");
|
||||
goto RELEASE;
|
||||
}
|
||||
opal_output_verbose(2, orte_debug_output,
|
||||
"%s Spawning debugger daemons %s",
|
||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
|
||||
(NULL == orte_debugger_test_daemon) ?
|
||||
MPIR_executable_path : orte_debugger_test_daemon);
|
||||
/* this will be launched just like a regular job,
|
||||
* so we do not use the global orte_debugger_daemon
|
||||
* as this is reserved for co-location upon startup
|
||||
*/
|
||||
jdata = OBJ_NEW(orte_job_t);
|
||||
/* create a jobid for these daemons - this is done solely
|
||||
* to avoid confusing the rest of the system's bookkeeping
|
||||
*/
|
||||
orte_plm_base_create_jobid(jdata);
|
||||
/* flag the job as being debugger daemons */
|
||||
jdata->controls |= ORTE_JOB_CONTROL_DEBUGGER_DAEMON;
|
||||
/* unless directed, we do not forward output */
|
||||
if (!MPIR_forward_output) {
|
||||
jdata->controls &= ~ORTE_JOB_CONTROL_FORWARD_OUTPUT;
|
||||
}
|
||||
/* add it to the global job pool */
|
||||
ljob = ORTE_LOCAL_JOBID(jdata->jobid);
|
||||
opal_pointer_array_set_item(orte_job_data, ljob, jdata);
|
||||
/* create an app_context for the debugger daemon */
|
||||
app = OBJ_NEW(orte_app_context_t);
|
||||
if (NULL != orte_debugger_test_daemon) {
|
||||
app->app = strdup(orte_debugger_test_daemon);
|
||||
} else {
|
||||
app->app = strdup((char*)MPIR_executable_path);
|
||||
}
|
||||
|
||||
jdata->state = ORTE_JOB_STATE_INIT;
|
||||
|
||||
opal_argv_append_nosize(&app->argv, app->app);
|
||||
build_debugger_args(app);
|
||||
opal_pointer_array_add(jdata->apps, app);
|
||||
jdata->num_apps = 1;
|
||||
/* setup the mapping policy to pernode so we get one
|
||||
* daemon on each node
|
||||
*/
|
||||
jdata->map = OBJ_NEW(orte_job_map_t);
|
||||
jdata->map->mapping = ORTE_MAPPING_PPR;
|
||||
jdata->map->ppr = strdup("1:n");
|
||||
/* now go ahead and spawn this job */
|
||||
if (ORTE_SUCCESS != (rc = orte_plm.spawn(jdata))) {
|
||||
ORTE_ERROR_LOG(rc);
|
||||
}
|
||||
}
|
||||
|
||||
RELEASE:
|
||||
/* reset the read or timer event */
|
||||
if (0 == orte_debugger_check_rate) {
|
||||
fifo_active = true;
|
||||
opal_event_add(&attach, 0);
|
||||
} else if (!MPIR_being_debugged) {
|
||||
ORTE_TIMER_EVENT(orte_debugger_check_rate, 0, attach_debugger);
|
||||
}
|
||||
|
||||
/* notify the debugger that all is ready */
|
||||
MPIR_Breakpoint();
|
||||
}
|
||||
|
||||
/*
 * Append the debugger daemon's arguments to the app context's argv.
 *
 * MPIR_server_arguments is scanned over its full length and treated as
 * a sequence of NUL-separated strings; every non-empty run of
 * characters becomes one argv entry.  A final argument that reaches the
 * end of the buffer without a terminating NUL is appended as well
 * rather than being silently dropped.
 *
 * @param debugger  app context of the debugger daemon whose argv is extended
 */
static void build_debugger_args(orte_app_context_t *debugger)
{
    int i, j;
    /* one extra byte so that even a full-length argument stays terminated */
    char mpir_arg[MPIR_MAX_ARG_LENGTH + 1];

    if ('\0' == MPIR_server_arguments[0]) {
        /* no arguments were provided */
        return;
    }

    j = 0;
    memset(mpir_arg, 0, sizeof(mpir_arg));
    for (i = 0; i < MPIR_MAX_ARG_LENGTH; i++) {
        if ('\0' == MPIR_server_arguments[i]) {
            /* separator: flush the argument collected so far, if any */
            if (0 < j) {
                opal_argv_append_nosize(&debugger->argv, mpir_arg);
                memset(mpir_arg, 0, sizeof(mpir_arg));
                j = 0;
            }
        } else {
            mpir_arg[j] = MPIR_server_arguments[i];
            j++;
        }
    }

    /* flush a trailing argument that was not NUL-terminated */
    if (0 < j) {
        opal_argv_append_nosize(&debugger->argv, mpir_arg);
    }
}
|
||||
|
Загрузка…
Ссылка в новой задаче
Block a user