1
1

My apologies for doing this outside of the usual time restrictions, but we need to get this in so we can make progress.

Move the ORTE-level debugger code back into orterun and out of the ORTE library to resolve symbol conflicts.

This commit was SVN r25713.
This commit is contained in:
Ralph Castain 2012-01-11 15:53:09 +00:00
parent 686ee387c8
commit bf103de66c
35 changed files with 587 additions and 1557 deletions

View File

@ -27,7 +27,6 @@
#define OMPI_DEBUGGERS_H
#include "ompi_config.h"
#include "orte/mca/debugger/base/base.h"
BEGIN_C_DECLS
@ -43,8 +42,10 @@ extern void ompi_debugger_notify_abort(char *string);
/**
* Breakpoint function for parallel debuggers.
* This function is also defined in orterun for the starter.
* It should never conflict with this one
*/
OMPI_DECLSPEC extern void MPIR_Breakpoint(void);
OMPI_DECLSPEC void* MPIR_Breakpoint(void);
END_C_DECLS

View File

@ -74,7 +74,6 @@
#include "orte/mca/errmgr/errmgr.h"
#include "orte/mca/rml/rml.h"
#include "orte/mca/debugger/base/base.h"
#include "orte/runtime/orte_globals.h"
#if defined(OMPI_MSGQ_DLL)
@ -126,8 +125,8 @@ OMPI_DECLSPEC opal_datatype_t* opal_datatype_t_type_force_inclusion = NULL;
OMPI_DECLSPEC ompi_datatype_t* ompi_datatype_t_type_force_inclusion = NULL;
OMPI_DECLSPEC volatile int MPIR_debug_gate = 0;
OMPI_DECLSPEC extern volatile int MPIR_being_debugged;
OMPI_DECLSPEC extern volatile int MPIR_debug_state;
OMPI_DECLSPEC volatile int MPIR_being_debugged = 0;
OMPI_DECLSPEC volatile int MPIR_debug_state = 0;
OMPI_DECLSPEC char *MPIR_debug_abort_string = "";
/* Check for a file in few direct ways for portability */
@ -283,3 +282,13 @@ void ompi_debugger_notify_abort(char *reason)
/* Now tell the debugger */
MPIR_Breakpoint();
}
/*
 * Hook for parallel debuggers: the debugger is expected to have a
 * breakpoint on this symbol, so the body deliberately does no work.
 * orterun carries its own definition for the starter process; the two
 * are not expected to conflict.
 *
 * Returns: always NULL.
 */
void *MPIR_Breakpoint(void)
{
    void *nothing = NULL;

    return nothing;
}

View File

@ -110,8 +110,6 @@
#if !ORTE_DISABLE_FULL_SUPPORT
#include "orte/mca/notifier/notifier.h"
#include "orte/mca/notifier/base/base.h"
#include "orte/mca/debugger/debugger.h"
#include "orte/mca/debugger/base/base.h"
#include "orte/mca/iof/iof.h"
#include "orte/mca/iof/base/base.h"
#include "orte/mca/oob/oob.h"
@ -431,14 +429,6 @@ void ompi_info_open_components(void)
map->components = &orte_notifier_base_components_available;
opal_pointer_array_add(&component_map, map);
if (ORTE_SUCCESS != orte_debugger_base_open()) {
goto error;
}
map = OBJ_NEW(ompi_info_component_map_t);
map->type = strdup("debugger");
map->components = &orte_debugger_base_components_available;
opal_pointer_array_add(&component_map, map);
if (ORTE_SUCCESS != mca_oob_base_open()) {
goto error;
}

View File

@ -233,7 +233,6 @@ int main(int argc, char *argv[])
#endif
#if !ORTE_DISABLE_FULL_SUPPORT
opal_pointer_array_add(&mca_types, "debugger");
opal_pointer_array_add(&mca_types, "iof");
opal_pointer_array_add(&mca_types, "oob");
opal_pointer_array_add(&mca_types, "odls");

View File

@ -1,28 +0,0 @@
#
# Copyright (c) 2010 Cisco Systems, Inc.
# $COPYRIGHT$
#
# Additional copyrights may follow
#
# $HEADER$
#
# main library setup
# (framework convenience library; sources are appended below and by base/Makefile.am)
noinst_LTLIBRARIES = libmca_debugger.la
libmca_debugger_la_SOURCES =
# local files
headers = debugger.h
libmca_debugger_la_SOURCES += $(headers)
# Conditionally install the header files
if WANT_INSTALL_HEADERS
ortedir = $(includedir)/openmpi/$(subdir)
nobase_orte_HEADERS = $(headers)
endif
# pull in the base/ headers and sources (they extend the variables defined above)
include base/Makefile.am
# static-components.h is generated by configure, so remove it on distclean
distclean-local:
rm -f base/static-components.h

View File

@ -1,18 +0,0 @@
#
# Copyright (c) 2010 Cisco Systems, Inc. All rights reserved.
# $COPYRIGHT$
#
# Additional copyrights may follow
#
# $HEADER$
#
# This fragment is included by the framework-level Makefile.am; it only
# appends to the "headers" and "libmca_debugger_la_SOURCES" variables
# that the including file defines.
headers += \
base/base.h
libmca_debugger_la_SOURCES += \
base/debugger_base_close.c \
base/debugger_base_select.c \
base/debugger_base_open.c \
base/debugger_base_fns.c

View File

@ -1,82 +0,0 @@
/*
* Copyright (c) 2010 Cisco Systems, Inc. All rights reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
*
* $HEADER$
*/
/** @file:
*/
#ifndef MCA_DEBUGGER_BASE_H
#define MCA_DEBUGGER_BASE_H
/*
* includes
*/
#include "orte_config.h"
#include "opal/class/opal_list.h"
#include "orte/mca/debugger/debugger.h"
BEGIN_C_DECLS
/*
 * Framework-wide state of the debugger base. All four fields are
 * filled in by orte_debugger_base_open().
 */
typedef struct {
/* output stream handle returned by opal_output_open() */
int output;
/* "output_debugger_proctable" parameter: print the proctable after launch */
bool dump_proctable;
/* "debugger_test_daemon" parameter: executable used to simulate a debugger colaunch */
char *test_daemon;
/* "debugger_test_attach" parameter: test debugger colaunch after debugger attachment */
bool test_attach;
} orte_debugger_base_t;
/* the single instance of the framework state */
ORTE_DECLSPEC extern orte_debugger_base_t orte_debugger_base;
/*
* function declarations
*/
/* open the framework: register its parameters and open the available components */
ORTE_DECLSPEC int orte_debugger_base_open(void);
/* finalize the selected module (if it has a finalize function) and close the components */
ORTE_DECLSPEC int orte_debugger_base_close(void);
/* select the best component, copy its module into orte_debugger and call its init */
ORTE_DECLSPEC int orte_debugger_base_select(void);
/* NOTE(review): the implementation is not in base/debugger_base_fns.c as shown here; confirm where it lives */
ORTE_DECLSPEC void orte_debugger_base_run_debugger(char *basename, opal_cmd_line_t *cmd_line,
int argc, char *argv[], int num_procs);
/* build MPIR_proctable for the job after spawn; see base/debugger_base_fns.c */
ORTE_DECLSPEC void orte_debugger_base_init_after_spawn(orte_job_t *jdata);
/* print the MPIR interface variables and the proctable to stderr */
ORTE_DECLSPEC void orte_debugger_base_dump(void);
/* list of components opened by orte_debugger_base_open() */
ORTE_DECLSPEC extern opal_list_t orte_debugger_base_components_available;
/* +++ begin MPICH/TotalView std debugger interface definitions */
/* sizes of the fixed-length MPIR character buffers declared below */
#define MPIR_MAX_PATH_LENGTH 512
#define MPIR_MAX_ARG_LENGTH 1024
/* one MPIR_proctable entry, describing a single application process */
struct MPIR_PROCDESC {
char *host_name; /* something that can be passed to inet_addr */
char *executable_name; /* name of binary */
int pid; /* process pid */
};
/* process table (indexed by the process's vpid) and its number of entries */
ORTE_DECLSPEC extern struct MPIR_PROCDESC *MPIR_proctable;
ORTE_DECLSPEC extern int MPIR_proctable_size;
/* non-zero when running under a debugger; checked before and after spawn */
ORTE_DECLSPEC extern volatile int MPIR_being_debugged;
/* set to 1 once the application processes have been spawned */
ORTE_DECLSPEC extern volatile int MPIR_debug_state;
ORTE_DECLSPEC extern int MPIR_i_am_starter;
ORTE_DECLSPEC extern int MPIR_partial_attach_ok;
/* an empty string (first byte '\0') is treated as "not set" for both buffers */
ORTE_DECLSPEC extern char MPIR_executable_path[MPIR_MAX_PATH_LENGTH];
ORTE_DECLSPEC extern char MPIR_server_arguments[MPIR_MAX_ARG_LENGTH];
ORTE_DECLSPEC extern volatile int MPIR_forward_output;
ORTE_DECLSPEC extern volatile int MPIR_forward_comm;
/* path of the attach FIFO; written by the mpirx component */
ORTE_DECLSPEC extern char MPIR_attach_fifo[MPIR_MAX_PATH_LENGTH];
ORTE_DECLSPEC extern int MPIR_force_to_main;
/* signature of a breakpoint hook such as MPIR_Breakpoint */
typedef void (*orte_debugger_breakpoint_fn_t)(void);
/* empty function on which the debugger places its breakpoint */
ORTE_DECLSPEC void MPIR_Breakpoint(void);
/* dummy function that lets the linker pull in the file defining MPIR_Breakpoint */
ORTE_DECLSPEC void orte_debugger_base_pull_mpir_breakpoint(void);
/* --- end MPICH/TotalView std debugger interface definitions */
END_C_DECLS
#endif

View File

@ -1,33 +0,0 @@
/*
* Copyright (c) 2010 Cisco Systems, Inc. All rights reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
*
* $HEADER$
*/
#include "orte_config.h"
#include "orte/constants.h"
#include "opal/mca/mca.h"
#include "opal/mca/base/base.h"
#include "orte/mca/debugger/base/base.h"
#include "orte/mca/debugger/debugger.h"
/*
 * Shut down the debugger framework.
 *
 * With full ORTE support the selected module's finalize function is
 * called when one was supplied, then all components still on the
 * available list are closed. Without full support this is a no-op.
 *
 * Returns: ORTE_SUCCESS in every case.
 */
int orte_debugger_base_close(void)
{
#if ORTE_DISABLE_FULL_SUPPORT
    return ORTE_SUCCESS;
#else
    /* the finalize entry may have been left unset */
    if (orte_debugger.finalize != NULL) {
        orte_debugger.finalize();
    }

    /* release whatever components remain open */
    mca_base_components_close(orte_debugger_base.output,
                              &orte_debugger_base_components_available,
                              NULL);

    return ORTE_SUCCESS;
#endif
}

View File

@ -1,201 +0,0 @@
/*
* Copyright (c) 2010 Cisco Systems, Inc. All rights reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
*
* $HEADER$
*/
#include "orte_config.h"
#include "orte/constants.h"
#include "opal/util/output.h"
#ifdef HAVE_STRING_H
#include <string.h>
#endif
#include <stdio.h>
#ifdef HAVE_STDLIB_H
#include <stdlib.h>
#endif /* HAVE_STDLIB_H */
#ifdef HAVE_STRINGS_H
#include <strings.h>
#endif /* HAVE_STRINGS_H */
#ifdef HAVE_UNISTD_H
#include <unistd.h>
#endif /* HAVE_UNISTD_H */
#include <ctype.h>
#include "opal/util/argv.h"
#include "opal/util/os_path.h"
#include "opal/util/path.h"
#include "opal/util/opal_environ.h"
#include "orte/util/name_fns.h"
#include "orte/util/show_help.h"
#include "orte/mca/rml/rml.h"
#include "orte/mca/errmgr/errmgr.h"
#include "orte/runtime/orte_globals.h"
#include "orte/mca/debugger/base/base.h"
/*
 * Print "<expression text> = <value>" for an int-valued expression on
 * stderr. The expansion is a single statement with no trailing
 * semicolon, so "DUMP_INT(x);" is safe inside unbraced if/else bodies.
 */
#define DUMP_INT(X) \
    do { \
        fprintf(stderr, " %s = %d\n", # X, (X)); \
    } while (0)
#if !ORTE_DISABLE_FULL_SUPPORT
/*
 * Write the current MPIR debugger-interface state to stderr: the
 * integer flags, every proctable entry, and the executable-path and
 * server-arguments buffers (shown as "NULL" when they are empty).
 */
void orte_debugger_base_dump(void)
{
    int idx;
    const char *exe_path;
    const char *server_args;

    DUMP_INT(MPIR_being_debugged);
    DUMP_INT(MPIR_debug_state);
    DUMP_INT(MPIR_partial_attach_ok);
    DUMP_INT(MPIR_i_am_starter);
    DUMP_INT(MPIR_forward_output);
    DUMP_INT(MPIR_proctable_size);

    fprintf(stderr, " MPIR_proctable:\n");
    idx = 0;
    while (idx < MPIR_proctable_size) {
        fprintf(stderr,
                " (i, host, exe, pid) = (%d, %s, %s, %d)\n",
                idx,
                MPIR_proctable[idx].host_name,
                MPIR_proctable[idx].executable_name,
                MPIR_proctable[idx].pid);
        ++idx;
    }

    /* an empty buffer is reported as the literal text "NULL" */
    if ('\0' == MPIR_executable_path[0]) {
        exe_path = "NULL";
    } else {
        exe_path = MPIR_executable_path;
    }
    fprintf(stderr, "MPIR_executable_path: %s\n", exe_path);

    if ('\0' == MPIR_server_arguments[0]) {
        server_args = "NULL";
    } else {
        server_args = MPIR_server_arguments;
    }
    fprintf(stderr, "MPIR_server_arguments: %s\n", server_args);
}
/*
 * Initialization of data structures for running under a debugger
 * using the MPICH/TotalView parallel debugger interface. This stage
 * of initialization must occur after spawn.
 *
 * NOTE: We -always- perform this step to ensure that any debugger
 * that attaches to us post-launch of the application can get a
 * completed proctable.
 *
 * The table is allocated zero-filled, so a slot that is skipped below
 * (missing proc or app context) holds NULL/0 instead of indeterminate
 * values. The MPIR globals are only updated once the allocation has
 * succeeded, so a failed allocation never advertises a non-zero table
 * size together with a NULL table.
 *
 * jdata: the job whose processes are published in MPIR_proctable.
 */
void orte_debugger_base_init_after_spawn(orte_job_t *jdata)
{
    orte_proc_t *proc;
    orte_app_context_t *appctx;
    orte_vpid_t i, j;
    opal_buffer_t buf;
    orte_process_name_t rank0;
    struct MPIR_PROCDESC *table;
    int rc;

    /* if we couldn't get thru the mapper stage, we might
     * enter here with no procs. Avoid the "zero byte malloc"
     * message by checking here
     */
    if (MPIR_proctable || 0 == jdata->num_procs) {
        /* already initialized */
        opal_output_verbose(5, orte_debugger_base.output,
                            "%s: debugger already initialized or zero procs",
                            ORTE_NAME_PRINT(ORTE_PROC_MY_NAME));
        return;
    }

    /* fill in the proc table for the application processes */
    opal_output_verbose(5, orte_debugger_base.output,
                        "%s: Setting up debugger process table for applications",
                        ORTE_NAME_PRINT(ORTE_PROC_MY_NAME));

    /* allocate the table zero-filled; leave the MPIR globals untouched on failure */
    table = calloc(jdata->num_procs, sizeof(*table));
    if (NULL == table) {
        ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE);
        return;
    }

    MPIR_debug_state = 1;
    /* set the total number of processes in the job */
    MPIR_proctable_size = jdata->num_procs;
    MPIR_proctable = table;

    if (orte_debugger_base.dump_proctable) {
        opal_output(orte_clean_output, "MPIR Proctable for job %s", ORTE_JOBID_PRINT(jdata->jobid));
    }

    /* initialize MPIR_proctable */
    for (j=0; j < jdata->num_procs; j++) {
        if (NULL == (proc = (orte_proc_t*)opal_pointer_array_get_item(jdata->procs, j))) {
            continue;
        }
        /* store this data in the location whose index
         * corresponds to the proc's rank
         */
        i = proc->name.vpid;
        if (NULL == (appctx = (orte_app_context_t*)opal_pointer_array_get_item(jdata->apps, proc->app_idx))) {
            continue;
        }

        MPIR_proctable[i].host_name = strdup(proc->node->name);
        /* an app that starts with the path separator is used as is;
         * anything else is prefixed with the app's working directory
         */
        if ( 0 == strncmp(appctx->app, OPAL_PATH_SEP, 1 )) {
            MPIR_proctable[i].executable_name =
                opal_os_path( false, appctx->app, NULL );
        } else {
            MPIR_proctable[i].executable_name =
                opal_os_path( false, appctx->cwd, appctx->app, NULL );
        }
        MPIR_proctable[i].pid = proc->pid;
        if (orte_debugger_base.dump_proctable) {
            opal_output(orte_clean_output, "%s: Host %s Exe %s Pid %d",
                        ORTE_VPID_PRINT(i), MPIR_proctable[i].host_name,
                        MPIR_proctable[i].executable_name, MPIR_proctable[i].pid);
        }
    }

    if (0 < opal_output_get_verbosity(orte_debugger_base.output)) {
        orte_debugger_base_dump();
    }

    /* if we are being launched under a debugger, then we must wait
     * for it to be ready to go and do some things to start the job
     */
    if (MPIR_being_debugged) {
        /* wait for all procs to have reported their contact info - this
         * ensures that (a) they are all into mpi_init, and (b) the system
         * has the contact info to successfully send a message to rank=0
         */
        ORTE_PROGRESSED_WAIT(false, jdata->num_reported, jdata->num_procs);

        MPIR_Breakpoint();

        /* send a message to rank=0 to release it */
        OBJ_CONSTRUCT(&buf, opal_buffer_t); /* don't need anything in this */
        rank0.jobid = jdata->jobid;
        rank0.vpid = 0;
        if (0 > (rc = orte_rml.send_buffer(&rank0, &buf, ORTE_RML_TAG_DEBUGGER_RELEASE, 0))) {
            opal_output(0, "Error: could not send debugger release to MPI procs - error %s", ORTE_ERROR_NAME(rc));
        }
        OBJ_DESTRUCT(&buf);
    }
}
#endif
/*
 * Intentionally empty. Its only purpose is to be referenced from
 * elsewhere so that the linker pulls in all the symbols defined in
 * this file.
 */
void orte_debugger_base_pull_mpir_breakpoint(void)
{
}
/*
 * Breakpoint hook for parallel debuggers. The body is deliberately
 * empty: callers invoke it after the proctable has been set up, and
 * the debugger is expected to have a breakpoint on this symbol.
 */
void MPIR_Breakpoint(void)
{
}

View File

@ -1,105 +0,0 @@
/*
* Copyright (c) 2010 Cisco Systems, Inc. All rights reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
*
* $HEADER$
*/
#include "orte_config.h"
#include "orte/constants.h"
#include "opal/mca/mca.h"
#include "opal/util/output.h"
#include "opal/mca/base/base.h"
#include "opal/mca/base/mca_base_param.h"
#ifdef HAVE_STRING_H
#include <string.h>
#endif
#include "orte/mca/debugger/base/base.h"
/*
* The following file was created by configure. It contains extern
* statements and the definition of an array of pointers to each
* component's public mca_base_component_t struct.
*/
#include "orte/mca/debugger/base/static-components.h"
/*
* Global variables
*/
/* framework state, filled in by orte_debugger_base_open() */
orte_debugger_base_t orte_debugger_base;
/* components opened by orte_debugger_base_open() */
opal_list_t orte_debugger_base_components_available;
/* copy of the module chosen by orte_debugger_base_select() */
orte_debugger_base_module_t orte_debugger;
/* instance the standard MPIR interfaces */
/* process table and its entry count; allocated and filled after spawn */
struct MPIR_PROCDESC *MPIR_proctable = NULL;
int MPIR_proctable_size = 0;
/* non-zero when running under a debugger */
volatile int MPIR_being_debugged = 0;
/* set to 1 once the application processes have been spawned */
volatile int MPIR_debug_state = 0;
int MPIR_i_am_starter = 0;
int MPIR_partial_attach_ok = 1;
/* file-scope arrays are zero-initialized, i.e. both start out as empty strings */
char MPIR_executable_path[MPIR_MAX_PATH_LENGTH];
char MPIR_server_arguments[MPIR_MAX_ARG_LENGTH];
volatile int MPIR_forward_output = 0;
volatile int MPIR_forward_comm = 0;
/* path of the attach FIFO; written by the mpirx component */
char MPIR_attach_fifo[MPIR_MAX_PATH_LENGTH];
int MPIR_force_to_main = 0;
/**
 * Open the debugger framework.
 *
 * Without full ORTE support this does nothing and reports success.
 * Otherwise it opens the framework's output stream, registers the
 * "orte" debugger-related MCA parameters (storing their values in
 * orte_debugger_base), and opens either all MCA components or the one
 * that was specifically requested via an MCA parameter.
 *
 * @return ORTE_SUCCESS, or ORTE_ERROR if the components could not be
 *         opened.
 */
int orte_debugger_base_open(void)
{
#if !ORTE_DISABLE_FULL_SUPPORT
    int dump_flag;
    int attach_flag;
    int rc;

    /* Debugging / verbose output. The stream is always open; its
       verbosity is set by the mca open system. */
    orte_debugger_base.output = opal_output_open(NULL);

    mca_base_param_reg_int_name("orte",
                                "output_debugger_proctable",
                                "Whether or not to output the debugger proctable after launch (default: false)",
                                true, false, 0, &dump_flag);
    orte_debugger_base.dump_proctable = OPAL_INT_TO_BOOL(dump_flag);

    mca_base_param_reg_string_name("orte", "debugger_test_daemon",
                                   "Name of the executable to be used to simulate a debugger colaunch (relative or absolute path)",
                                   false, false, NULL, &orte_debugger_base.test_daemon);

    mca_base_param_reg_int_name("orte",
                                "debugger_test_attach",
                                "Test debugger colaunch after debugger attachment",
                                false, false, 0, &attach_flag);
    orte_debugger_base.test_attach = OPAL_INT_TO_BOOL(attach_flag);

    /* Open up all available components */
    rc = mca_base_components_open("debugger", orte_debugger_base.output,
                                  mca_debugger_base_static_components,
                                  &orte_debugger_base_components_available,
                                  true);
    if (ORTE_SUCCESS != rc) {
        return ORTE_ERROR;
    }
#endif

    /* All done */
    return ORTE_SUCCESS;
}

View File

@ -1,54 +0,0 @@
/*
* Copyright (c) 2010 Cisco Systems, Inc. All rights reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
*
* $HEADER$
*/
#include "orte_config.h"
#include "orte/constants.h"
#ifdef HAVE_STRING_H
#include <string.h>
#endif
#include "opal/mca/mca.h"
#include "opal/mca/base/base.h"
#include "opal/util/output.h"
#include "orte/mca/debugger/base/base.h"
/*
 * Pick the debugger component to use.
 *
 * With full ORTE support the best available component is selected, its
 * module is copied into the global orte_debugger, and the module's
 * init function is run.
 *
 * Returns: the result of the module's init(); ORTE_ERR_NOT_FOUND when
 * no component was selected; ORTE_ERR_NOT_IMPLEMENTED when full
 * support is disabled.
 */
int orte_debugger_base_select(void)
{
#if ORTE_DISABLE_FULL_SUPPORT
    return ORTE_ERR_NOT_IMPLEMENTED;
#else
    orte_debugger_base_module_t *winner = NULL;
    orte_debugger_base_component_t *winner_component = NULL;

    if (OPAL_SUCCESS != mca_base_select("debugger", orte_debugger_base.output,
                                        &orte_debugger_base_components_available,
                                        (mca_base_module_t **) &winner,
                                        (mca_base_component_t **) &winner_component)) {
        /* This will only happen if no component was selected */
        return ORTE_ERR_NOT_FOUND;
    }

    /* Keep a copy of the winning module; there is no global component
     * structure to save.
     */
    orte_debugger = *winner;

    return orte_debugger.init();
#endif
}

View File

@ -1,77 +0,0 @@
/*
* Copyright (c) 2010 Cisco Systems, Inc. All Rights Reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
*
* $HEADER$
*/
#ifndef MCA_DEBUGGER_H
#define MCA_DEBUGGER_H
/*
* includes
*/
#include "orte_config.h"
#include "opal/mca/mca.h"
#include "orte/runtime/orte_globals.h"
BEGIN_C_DECLS
/*
* Component functions - all MUST be provided!
* (NOTE(review): orte_debugger_base_close() nevertheless checks
* finalize for NULL before calling it.)
*/
/* initialize the selected module */
typedef int (*orte_debugger_base_module_init_fn_t)(void);
/* finalize the selected module */
typedef void (*orte_debugger_base_module_finalize_fn_t)(void);
/* init debuggers before spawn */
typedef void (*orte_debugger_base_module_init_before_spawn_fn_t)(orte_job_t *jdata);
/* init debuggers after spawn */
typedef void (*orte_debugger_base_module_init_after_spawn_fn_t)(orte_job_t *jdata);
/*
* Ver 1.0
*
* Function table exported by each debugger component. Components
* initialize it positionally, so the member order matters.
*/
struct orte_debugger_base_module_1_0_0_t {
orte_debugger_base_module_init_fn_t init;
orte_debugger_base_module_finalize_fn_t finalize;
orte_debugger_base_module_init_before_spawn_fn_t init_before_spawn;
orte_debugger_base_module_init_after_spawn_fn_t init_after_spawn;
};
typedef struct orte_debugger_base_module_1_0_0_t orte_debugger_base_module_1_0_0_t;
typedef orte_debugger_base_module_1_0_0_t orte_debugger_base_module_t;
/* the selected module; orte_debugger_base_select() copies the winner here */
ORTE_DECLSPEC extern orte_debugger_base_module_t orte_debugger;
/*
* the standard component data structure
*/
struct orte_debugger_base_component_1_0_0_t {
/* MCA component header (version, name and query function) */
mca_base_component_t base_version;
/* MCA component metadata */
mca_base_component_data_t base_data;
};
typedef struct orte_debugger_base_component_1_0_0_t orte_debugger_base_component_1_0_0_t;
typedef orte_debugger_base_component_1_0_0_t orte_debugger_base_component_t;
/*
* Macro for use in components that are of type debugger v1.0.0
* (expands to the leading initializers of a component's base_version)
*/
#define ORTE_DEBUGGER_BASE_VERSION_1_0_0 \
/* debugger v1.0 is chained to MCA v2.0 */ \
MCA_BASE_VERSION_2_0_0, \
/* debugger v1.0 */ \
"debugger", 1, 0, 0
END_C_DECLS
#endif /* MCA_DEBUGGER_H */

View File

@ -1,12 +0,0 @@
#
# Copyright (c) 2008-2010 High Performance Computing Center Stuttgart,
# University of Stuttgart. All rights reserved.
# $COPYRIGHT$
#
# Additional copyrights may follow
#
# $HEADER$
#
# Specific to this module
mca_link_libraries=libopen-rte Ws2_32.lib

View File

@ -1,36 +0,0 @@
#
# Copyright (c) 2010 Cisco Systems, Inc. All rights reserved.
# $COPYRIGHT$
#
# Additional copyrights may follow
#
# $HEADER$
#
EXTRA_DIST = .windows
# files that make up the mpir component
sources = \
mpir.h \
mpir.c \
mpir_component.c
# Make the output library in this directory, and name it either
# mca_<type>_<name>.la (for DSO builds) or libmca_<type>_<name>.la
# (for static builds).
if MCA_BUILD_orte_debugger_mpir_DSO
component_noinst =
component_install = mca_debugger_mpir.la
else
component_noinst = libmca_debugger_mpir.la
component_install =
endif
# DSO build: installed into $(pkglibdir)
mcacomponentdir = $(pkglibdir)
mcacomponent_LTLIBRARIES = $(component_install)
mca_debugger_mpir_la_SOURCES = $(sources)
mca_debugger_mpir_la_LDFLAGS = -module -avoid-version
# static build: built as a non-installed library
noinst_LTLIBRARIES = $(component_noinst)
libmca_debugger_mpir_la_SOURCES =$(sources)
libmca_debugger_mpir_la_LDFLAGS = -module -avoid-version

View File

@ -1,19 +0,0 @@
# -*- shell-script -*-
#
# Copyright (c) 2011 Los Alamos National Security, LLC.
# All rights reserved.
# $COPYRIGHT$
#
# Additional copyrights may follow
#
# $HEADER$
#
# MCA_orte_debugger_mpir_CONFIG([action-if-found], [action-if-not-found])
# -----------------------------------------------------------
# Registers the component's Makefile, then runs action-if-found when
# $orte_without_full_support is 0 and action-if-not-found otherwise.
AC_DEFUN([MCA_orte_debugger_mpir_CONFIG], [
AC_CONFIG_FILES([orte/mca/debugger/mpir/Makefile])
AS_IF([test "$orte_without_full_support" = 0],
[$1],
[$2])
])

View File

@ -1,196 +0,0 @@
/* -*- C -*-
*
* Copyright (c) 2004-2006 The Trustees of Indiana University and Indiana
* University Research and Technology
* Corporation. All rights reserved.
* Copyright (c) 2004-2006 The University of Tennessee and The University
* of Tennessee Research Foundation. All rights
* reserved.
* Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
* University of Stuttgart. All rights reserved.
* Copyright (c) 2004-2005 The Regents of the University of California.
* All rights reserved.
* Copyright (c) 2007 Sun Microsystems, Inc. All rights reserved.
* Copyright (c) 2007 Los Alamos National Security, LLC. All rights
* reserved.
* Copyright (c) 2007-2010 Cisco Systems, Inc. All rights reserved.
* Copyright (c) 2010 Oracle and/or its affiliates. All rights reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
*
* $HEADER$
*/
/*
* Debugger support for orterun
*
* We interpret the MPICH debugger interface as follows:
*
* a) The launcher
* - spawns the other processes,
* - fills in the table MPIR_proctable, and sets MPIR_proctable_size
* - sets MPIR_debug_state to MPIR_DEBUG_SPAWNED ( = 1)
* - calls MPIR_Breakpoint() which the debugger will have a
* breakpoint on.
*
* b) Applications start and then spin until MPIR_debug_gate is set
* non-zero by the debugger.
*
* This file implements (a).
*
**************************************************************************
*
* Note that we have presently tested both TotalView and DDT parallel
* debuggers. They both nominally subscribe to the Etnus attaching
* interface, but there are differences between the two.
*
* TotalView: user launches "totalview mpirun -a ...<mpirun args>...".
* TV launches mpirun. mpirun launches the application and then calls
* MPIR_Breakpoint(). This is the signal to TV that it's a parallel
* MPI job. TV then reads the proctable in mpirun and attaches itself
* to all the processes (it takes care of launching itself on the
* remote nodes). Upon attaching to all the MPI processes, the
* variable MPIR_being_debugged is set to 1. When it has finished
* attaching itself to all the MPI processes that it wants to,
* MPIR_Breakpoint() returns.
*
* DDT: user launches "ddt bin -np X <mpi app name>". DDT fork/exec's
* mpirun to launch ddt-debugger on the back-end nodes via "mpirun -np
* X ddt-debugger" (note the lack of other arguments -- we can't pass
* anything to mpirun). This app will eventually fork/exec the MPI
* app. DDT does not currently set MPIR_being_debugged in the MPI app.
*
**************************************************************************
*
* We support two ways of waiting for attaching debuggers. The
* implementation spans this file and ompi/debuggers/ompi_debuggers.c.
*
* 1. If using orterun: MPI processes will have the
* orte_in_parallel_debugger MCA param set to true (because not all
* debuggers consistently set MPIR_being_debugged in both the launcher
* and in the MPI procs). The HNP will call MPIR_Breakpoint() and
* then RML send a message to VPID 0 (MCW rank 0) when it returns
* (MPIR_Breakpoint() doesn't return until the debugger has attached
* to all relevant processes). Meanwhile, VPID 0 blocks waiting for
* the RML message. All other VPIDs immediately call the grpcomm
* barrier (and therefore block until the debugger attaches). Once
* VPID 0 receives the RML message, we know that the debugger has
* attached to all processes that it cares about, and VPID 0 then
* joins the grpcomm barrier, allowing the job to continue. This
* scheme has the side effect of nicely supporting partial attaches by
* parallel debuggers (i.e., attaching to only some of the MPI
* processes; not necessarily all of them).
*
* 2. If not using orterun: in this case, ORTE_DISABLE_FULL_SUPPORT
* will be true, and we know that there will not be an RML message
* sent to VPID 0. So we have to look for a magic environment
* variable from the launcher to know if the jobs will be attached by
* a debugger (e.g., set by yod, srun, ...etc.), and if so, spin on
* MPIR_debug_gate. These environment variable names must be
* hard-coded in the OMPI layer (see ompi/debuggers/ompi_debuggers.c).
*/
#include "orte_config.h"
#ifdef HAVE_STRING_H
#include <string.h>
#endif
#include <stdio.h>
#ifdef HAVE_STDLIB_H
#include <stdlib.h>
#endif /* HAVE_STDLIB_H */
#ifdef HAVE_STRINGS_H
#include <strings.h>
#endif /* HAVE_STRINGS_H */
#ifdef HAVE_UNISTD_H
#include <unistd.h>
#endif /* HAVE_UNISTD_H */
#include <ctype.h>
#include "opal/util/argv.h"
#include "opal/util/output.h"
#include "opal/util/path.h"
#include "opal/util/os_path.h"
#include "opal/class/opal_pointer_array.h"
#include "opal/mca/base/mca_base_param.h"
#include "opal/util/opal_getcwd.h"
#include "orte/mca/errmgr/errmgr.h"
#include "orte/mca/rml/rml.h"
#include "orte/mca/rml/rml_types.h"
#include "orte/mca/plm/plm.h"
#include "orte/mca/plm/base/plm_private.h"
#include "orte/mca/rmaps/rmaps_types.h"
#include "orte/runtime/orte_globals.h"
#include "orte/runtime/orte_wait.h"
#include "orte/util/show_help.h"
#include "orte/util/name_fns.h"
#include "orte/mca/debugger/base/base.h"
#include "mpir.h"
#include "mpir.h"
/* Static API's */
/* NOTE: finalize() and init_before_spawn() are defined further down
 * without the "static" keyword; they still have internal linkage
 * because these static prototypes are visible at that point.
 */
static int init(void);
static void finalize(void);
static void init_before_spawn(orte_job_t *jdata);
/* Module definition */
/* Positional initializer: init, finalize, init_before_spawn,
 * init_after_spawn. The after-spawn step is the shared base
 * implementation.
 */
orte_debugger_base_module_t orte_debugger_mpir_module = {
init,
finalize,
init_before_spawn,
orte_debugger_base_init_after_spawn
};
/* local globals */
/* Module init hook: there is nothing to set up, so report success. */
static int init(void)
{
    int status = ORTE_SUCCESS;

    return status;
}
/**
* Release resources associated with data structures for running under
* a debugger using the MPICH/TotalView parallel debugger interface.
*/
void finalize(void)
{
if (MPIR_proctable) {
free(MPIR_proctable);
MPIR_proctable = NULL;
}
}
/**
 * Pre-spawn setup for the MPICH/TotalView parallel debugger interface.
 *
 * When we are being run under a TotalView-like debugger (either
 * MPIR_being_debugged or orte_in_parallel_debugger is set), export the
 * "orte_in_parallel_debugger" MCA parameter with value "1" into the
 * environment of every app context of the job, so the application
 * processes know they are being debugged. Otherwise do nothing.
 *
 * @param jdata  job about to be spawned
 */
void init_before_spawn(orte_job_t *jdata)
{
    char *param_env;
    orte_app_context_t *ctx;
    int idx;

    /* nothing to do unless a debugger is involved */
    if (!(MPIR_being_debugged || orte_in_parallel_debugger)) {
        return;
    }

    opal_output_verbose(1, orte_debugger_base.output, "Info: Spawned by a debugger");

    /* tell the procs they are being debugged */
    param_env = mca_base_param_environ_variable("orte",
                                                "in_parallel_debugger", NULL);
    for (idx = 0; idx < jdata->apps->size; ++idx) {
        ctx = (orte_app_context_t*)opal_pointer_array_get_item(jdata->apps, idx);
        /* the app array can contain empty slots */
        if (NULL != ctx) {
            opal_setenv(param_env, "1", true, &ctx->env);
        }
    }
    free(param_env);
}

View File

@ -1,34 +0,0 @@
/*
* Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
* University Research and Technology
* Corporation. All rights reserved.
* Copyright (c) 2004-2009 The University of Tennessee and The University
* of Tennessee Research Foundation. All rights
* reserved.
* Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
* University of Stuttgart. All rights reserved.
* Copyright (c) 2004-2005 The Regents of the University of California.
* All rights reserved.
* Copyright (c) 2007-2010 Cisco Systems, Inc. All rights reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
*
* $HEADER$
*/
#ifndef DEBUGGER_MPIR_H
#define DEBUGGER_MPIR_H
#include "orte_config.h"
#include "orte/mca/debugger/debugger.h"
BEGIN_C_DECLS
/* component descriptor, defined in mpir_component.c */
ORTE_MODULE_DECLSPEC extern orte_debugger_base_component_t mca_debugger_mpir_component;
/* module function table, defined in mpir.c */
extern orte_debugger_base_module_t orte_debugger_mpir_module;
END_C_DECLS
#endif /* DEBUGGER_MPIR_H */

View File

@ -1,47 +0,0 @@
/* -*- C -*-
*
* Copyright (c) 2010 Cisco Systems, Inc. All rights reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
*
* $HEADER$
*/
#include "orte_config.h"
#include "orte/constants.h"
#include "mpir.h"
static int component_query(mca_base_module_t **module, int *priority);
/*
* Struct of function pointers that need to be initialized
*/
orte_debugger_base_component_t mca_debugger_mpir_component = {
{
ORTE_DEBUGGER_BASE_VERSION_1_0_0,
"mpir", /* MCA module name */
ORTE_MAJOR_VERSION, /* MCA module major version */
ORTE_MINOR_VERSION, /* MCA module minor version */
ORTE_RELEASE_VERSION, /* MCA module release version */
NULL,
NULL,
component_query /* module query */
},
{
/* The component is checkpoint ready */
MCA_BASE_METADATA_PARAM_CHECKPOINT
}
};
static int component_query(mca_base_module_t **module, int *priority)
{
*priority = 100;
*module = (mca_base_module_t *)&orte_debugger_mpir_module;
return ORTE_SUCCESS;
}

View File

@ -1,34 +0,0 @@
#
# Copyright (c) 2010 Cisco Systems, Inc. All rights reserved.
# $COPYRIGHT$
#
# Additional copyrights may follow
#
# $HEADER$
#
# files that make up the mpirx component
sources = \
mpirx.h \
mpirx.c \
mpirx_component.c
# Make the output library in this directory, and name it either
# mca_<type>_<name>.la (for DSO builds) or libmca_<type>_<name>.la
# (for static builds).
if MCA_BUILD_orte_debugger_mpirx_DSO
component_noinst =
component_install = mca_debugger_mpirx.la
else
component_noinst = libmca_debugger_mpirx.la
component_install =
endif
# DSO build: installed into $(pkglibdir)
mcacomponentdir = $(pkglibdir)
mcacomponent_LTLIBRARIES = $(component_install)
mca_debugger_mpirx_la_SOURCES = $(sources)
mca_debugger_mpirx_la_LDFLAGS = -module -avoid-version
# static build: built as a non-installed library
noinst_LTLIBRARIES = $(component_noinst)
libmca_debugger_mpirx_la_SOURCES =$(sources)
libmca_debugger_mpirx_la_LDFLAGS = -module -avoid-version

View File

@ -1,19 +0,0 @@
# -*- shell-script -*-
#
# Copyright (c) 2011 Los Alamos National Security, LLC.
# All rights reserved.
# $COPYRIGHT$
#
# Additional copyrights may follow
#
# $HEADER$
#
# MCA_orte_debugger_mpirx_CONFIG([action-if-found], [action-if-not-found])
# -----------------------------------------------------------
# Registers the component's Makefile, then runs action-if-found when
# $orte_without_full_support is 0 and action-if-not-found otherwise.
AC_DEFUN([MCA_orte_debugger_mpirx_CONFIG], [
AC_CONFIG_FILES([orte/mca/debugger/mpirx/Makefile])
AS_IF([test "$orte_without_full_support" = 0],
[$1],
[$2])
])

View File

@ -1,366 +0,0 @@
/*
* Copyright (c) 2010-2011 Cisco Systems, Inc. All rights reserved.
* Copyright (c) 2010 Oracle and/or its affiliates. All rights reserved.
* Copyright (c) 2010-2011 Los Alamos National Security, LLC.
* All rights reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
*
* $HEADER$
*/
#include "orte_config.h"
#ifdef HAVE_STRING_H
#include <string.h>
#endif
#include <stdio.h>
#ifdef HAVE_STDLIB_H
#include <stdlib.h>
#endif /* HAVE_STDLIB_H */
#ifdef HAVE_STRINGS_H
#include <strings.h>
#endif /* HAVE_STRINGS_H */
#ifdef HAVE_UNISTD_H
#include <unistd.h>
#endif /* HAVE_UNISTD_H */
#ifdef HAVE_SYS_TYPES_H
#include <sys/types.h>
#endif
#include <sys/stat.h>
#include <ctype.h>
#include <fcntl.h>
#include <errno.h>
#include "opal/util/argv.h"
#include "opal/util/output.h"
#include "opal/util/path.h"
#include "opal/util/os_path.h"
#include "opal/class/opal_pointer_array.h"
#include "opal/mca/base/mca_base_param.h"
#include "opal/util/opal_getcwd.h"
#include "opal/mca/event/event.h"
#include "orte/mca/errmgr/errmgr.h"
#include "orte/mca/rml/rml.h"
#include "orte/mca/rml/rml_types.h"
#include "orte/mca/plm/plm.h"
#include "orte/mca/plm/base/plm_private.h"
#include "orte/mca/rmaps/rmaps_types.h"
#include "orte/runtime/orte_globals.h"
#include "orte/runtime/orte_wait.h"
#include "orte/util/show_help.h"
#include "orte/util/name_fns.h"
#include "orte/mca/debugger/base/base.h"
#include "mpirx.h"
/* permission bits used when creating the attach FIFO with mkfifo():
 * read/write for the owner, read-only for group and others
 */
#define FILE_MODE (S_IRUSR | S_IWUSR | S_IRGRP | S_IROTH)
/* Static API's */
/* NOTE: finalize() and init_before_spawn() are defined further down
 * without the "static" keyword; they still have internal linkage
 * because these static prototypes are visible at that point.
 */
static int init(void);
static void finalize(void);
static void init_before_spawn(orte_job_t *jdata);
/* Module definition */
/* Positional initializer: init, finalize, init_before_spawn,
 * init_after_spawn. The after-spawn step is the shared base
 * implementation.
 */
orte_debugger_base_module_t orte_debugger_mpirx_module = {
init,
finalize,
init_before_spawn,
orte_debugger_base_init_after_spawn
};
/* local globals and functions */
static void attach_debugger(int fd, short event, void *arg);
static void build_debugger_args(orte_app_context_t *debugger);
static void open_fifo(void);
/* event deleted in finalize() while the FIFO is active */
static opal_event_t attach;
/* descriptor closed in finalize() while the FIFO is active; -1 = none */
static int attach_fd = -1;
/* true while the attach event and descriptor above are in use */
static bool fifo_active=false;
/* Module init hook: there is nothing to set up, so report success. */
static int init(void)
{
    int status = ORTE_SUCCESS;

    return status;
}
/**
* Release resources associated with data structures for running under
* a debugger using the MPICH/TotalView parallel debugger interface.
*/
void finalize(void)
{
if (fifo_active) {
opal_event_del(&attach);
close(attach_fd);
}
if (MPIR_proctable) {
free(MPIR_proctable);
MPIR_proctable = NULL;
}
}
/**
 * Initialization of data structures for running under a debugger
 * using an extended MPICH/TotalView parallel debugger interface. Before the
 * spawn we need to check if we are being run under a TotalView-like
 * debugger; if so then inform applications via an MCA parameter.
 *
 * When no debugger is present and no test daemon was given, this only
 * arranges for a later attach to be detected: either a periodic timer
 * (when orte_debugger_mpirx_check_rate is positive) or a read event on
 * a FIFO created in the job session directory, whose name is published
 * in MPIR_attach_fifo.
 *
 * Otherwise every app_context of the job gets the
 * "orte_in_parallel_debugger" MCA environment variable set to "1",
 * and, if a debugger daemon executable is known, a separate job
 * holding that daemon is created and stored in orte_debugger_daemon.
 *
 * @param jdata  job that is about to be spawned; its app_contexts
 *               receive the environment variable
 */
static void init_before_spawn(orte_job_t *jdata)
{
    char *env_name;
    orte_app_context_t *app;
    int i;
    int32_t ljob;
    char *attach_fifo;

    if (!MPIR_being_debugged && !orte_in_parallel_debugger) {
        /* if we were given a test debugger, then we still want to
         * colaunch it
         */
        if (NULL != orte_debugger_base.test_daemon) {
            opal_output_verbose(2, orte_debugger_base.output,
                                "%s Using debugger test daemon %s",
                                ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                                orte_debugger_base.test_daemon);
            goto launchit;
        }
        /* if we were given an auto-detect rate, then we want to setup
         * an event so we periodically do the check
         */
        if (0 < orte_debugger_mpirx_check_rate) {
            opal_output_verbose(2, orte_debugger_base.output,
                                "%s Setting debugger attach check rate for %d seconds",
                                ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                                orte_debugger_mpirx_check_rate);
            ORTE_TIMER_EVENT(orte_debugger_mpirx_check_rate, 0, attach_debugger);
        } else {
            /* create the attachment FIFO and put it into MPIR, setup readevent */
            /* create a FIFO name in the session dir */
            attach_fifo = opal_os_path(false, orte_process_info.job_session_dir, "debugger_attach_fifo", NULL);
            if (NULL == attach_fifo) {
                opal_output(0, "CANNOT CREATE FIFO: unable to build a path in the session directory");
                return;
            }
            /* refuse a name that will not fit: a truncated copy in
             * MPIR_attach_fifo would point the debugger (and our own
             * open) at a path that is not the FIFO we created
             */
            if (strlen(attach_fifo) >= (size_t)MPIR_MAX_PATH_LENGTH) {
                opal_output(0, "CANNOT CREATE FIFO %s: name exceeds %d characters",
                            attach_fifo, (int)(MPIR_MAX_PATH_LENGTH - 1));
                free(attach_fifo);
                return;
            }
            if ((mkfifo(attach_fifo, FILE_MODE) < 0) && errno != EEXIST) {
                opal_output(0, "CANNOT CREATE FIFO %s: errno %d", attach_fifo, errno);
                free(attach_fifo);
                return;
            }
            strncpy(MPIR_attach_fifo, attach_fifo, MPIR_MAX_PATH_LENGTH - 1);
            /* strncpy does not terminate when the source fills the
             * bound; assumes MPIR_attach_fifo holds
             * MPIR_MAX_PATH_LENGTH bytes - TODO confirm against its
             * declaration
             */
            MPIR_attach_fifo[MPIR_MAX_PATH_LENGTH - 1] = '\0';
            free(attach_fifo);
            open_fifo();
        }
        return;
    }

 launchit:
    opal_output_verbose(2, orte_debugger_base.output,
                        "%s: Spawned by a debugger",
                        ORTE_NAME_PRINT(ORTE_PROC_MY_NAME));

    /* tell the procs they are being debugged */
    env_name = mca_base_param_environ_variable("orte",
                                               "in_parallel_debugger", NULL);
    for (i=0; i < jdata->apps->size; i++) {
        /* the pointer array may contain empty slots */
        if (NULL == (app = (orte_app_context_t*)opal_pointer_array_get_item(jdata->apps, i))) {
            continue;
        }
        opal_setenv(env_name, "1", true, &app->env);
    }
    free(env_name);

    /* check if we need to co-spawn the debugger daemons */
    if ('\0' != MPIR_executable_path[0] || NULL != orte_debugger_base.test_daemon) {
        /* can only have one debugger */
        if (NULL != orte_debugger_daemon) {
            opal_output(0, "-------------------------------------------\n"
                        "Only one debugger can be used on a job.\n"
                        "-------------------------------------------\n");
            ORTE_UPDATE_EXIT_STATUS(ORTE_ERROR_DEFAULT_EXIT_CODE);
            return;
        }
        opal_output_verbose(2, orte_debugger_base.output,
                            "%s Cospawning debugger daemons %s",
                            ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                            (NULL == orte_debugger_base.test_daemon) ?
                            MPIR_executable_path : orte_debugger_base.test_daemon);
        /* add debugger info to launch message */
        orte_debugger_daemon = OBJ_NEW(orte_job_t);
        /* create a jobid for these daemons - this is done solely
         * to avoid confusing the rest of the system's bookkeeping
         */
        orte_plm_base_create_jobid(orte_debugger_daemon);
        /* flag the job as being debugger daemons */
        orte_debugger_daemon->controls |= ORTE_JOB_CONTROL_DEBUGGER_DAEMON;
        /* unless directed, we do not forward output */
        if (!MPIR_forward_output) {
            orte_debugger_daemon->controls &= ~ORTE_JOB_CONTROL_FORWARD_OUTPUT;
        }
        /* add it to the global job pool */
        ljob = ORTE_LOCAL_JOBID(orte_debugger_daemon->jobid);
        opal_pointer_array_set_item(orte_job_data, ljob, orte_debugger_daemon);
        /* create an app_context for the debugger daemon; the test
         * daemon takes precedence over the debugger-provided path
         */
        app = OBJ_NEW(orte_app_context_t);
        if (NULL != orte_debugger_base.test_daemon) {
            app->app = strdup(orte_debugger_base.test_daemon);
        } else {
            app->app = strdup((char*)MPIR_executable_path);
        }
        opal_argv_append_nosize(&app->argv, app->app);
        build_debugger_args(app);
        opal_pointer_array_add(orte_debugger_daemon->apps, app);
        orte_debugger_daemon->num_apps = 1;
    }
    return;
}
/**
 * Open (or reopen) the debugger attach FIFO named by MPIR_attach_fifo
 * and register a read event on it that invokes attach_debugger.
 *
 * Any descriptor left over from a previous open is closed first. If
 * the FIFO cannot be opened an error is reported, attach_fd is left
 * negative, and no event is registered.
 */
static void open_fifo (void)
{
    /* 0 is a valid descriptor, so every non-negative value must be
     * closed to avoid leaking it
     */
    if (0 <= attach_fd) {
        close(attach_fd);
        attach_fd = -1;
    }

    /* O_NONBLOCK lets a read-only open of a FIFO return without
     * waiting for a writer to show up
     */
    attach_fd = open(MPIR_attach_fifo, O_RDONLY | O_NONBLOCK, 0);
    if (attach_fd < 0) {
        opal_output(0, "%s unable to open debugger attach fifo",
                    ORTE_NAME_PRINT(ORTE_PROC_MY_NAME));
        return;
    }

    opal_output_verbose(2, orte_debugger_base.output,
                        "%s Monitoring debugger attach fifo %s",
                        ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                        MPIR_attach_fifo);

    opal_event_set(opal_event_base, &attach, attach_fd, OPAL_EV_READ, attach_debugger, NULL);
    /* mark the event as registered before adding it so the callback
     * and finalize know it has to be deleted
     */
    fifo_active = true;
    opal_event_add(&attach, 0);
}
static void attach_debugger(int fd, short event, void *arg)
{
orte_app_context_t *app;
unsigned char fifo_cmd;
int rc;
int32_t ljob;
orte_job_t *jdata;
/* read the file descriptor to clear that event, if necessary */
if (fifo_active) {
opal_event_del(&attach);
fifo_active = false;
rc = read(attach_fd, &fifo_cmd, sizeof(fifo_cmd));
if (!rc) {
/* reopen device to clear hangup */
open_fifo();
return;
}
if (1 != fifo_cmd) {
/* ignore the cmd */
goto RELEASE;
}
}
if (!MPIR_being_debugged && !orte_debugger_base.test_attach) {
/* false alarm */
goto RELEASE;
}
opal_output_verbose(1, orte_debugger_base.output,
"%s Attaching debugger %s", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
(NULL == orte_debugger_base.test_daemon) ? MPIR_executable_path : orte_debugger_base.test_daemon);
/* a debugger has attached! All the MPIR_Proctable
* data is already available, so we only need to
* check to see if we should spawn any daemons
*/
if ('\0' != MPIR_executable_path[0] || NULL != orte_debugger_base.test_daemon) {
/* can only have one debugger */
if (NULL != orte_debugger_daemon) {
opal_output(0, "-------------------------------------------\n"
"Only one debugger can be used on a job.\n"
"-------------------------------------------\n");
goto RELEASE;
}
opal_output_verbose(2, orte_debugger_base.output,
"%s Spawning debugger daemons %s",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
(NULL == orte_debugger_base.test_daemon) ?
MPIR_executable_path : orte_debugger_base.test_daemon);
/* this will be launched just like a regular job,
* so we do not use the global orte_debugger_daemon
* as this is reserved for co-location upon startup
*/
jdata = OBJ_NEW(orte_job_t);
/* create a jobid for these daemons - this is done solely
* to avoid confusing the rest of the system's bookkeeping
*/
orte_plm_base_create_jobid(jdata);
/* flag the job as being debugger daemons */
jdata->controls |= ORTE_JOB_CONTROL_DEBUGGER_DAEMON;
/* unless directed, we do not forward output */
if (!MPIR_forward_output) {
jdata->controls &= ~ORTE_JOB_CONTROL_FORWARD_OUTPUT;
}
/* add it to the global job pool */
ljob = ORTE_LOCAL_JOBID(jdata->jobid);
opal_pointer_array_set_item(orte_job_data, ljob, jdata);
/* create an app_context for the debugger daemon */
app = OBJ_NEW(orte_app_context_t);
if (NULL != orte_debugger_base.test_daemon) {
app->app = strdup(orte_debugger_base.test_daemon);
} else {
app->app = strdup((char*)MPIR_executable_path);
}
jdata->state = ORTE_JOB_STATE_INIT;
opal_argv_append_nosize(&app->argv, app->app);
build_debugger_args(app);
opal_pointer_array_add(jdata->apps, app);
jdata->num_apps = 1;
/* setup the mapping policy to pernode so we get one
* daemon on each node
*/
jdata->map = OBJ_NEW(orte_job_map_t);
jdata->map->mapping = ORTE_MAPPING_PPR;
jdata->map->ppr = strdup("1:n");
/* now go ahead and spawn this job */
if (ORTE_SUCCESS != (rc = orte_plm.spawn(jdata))) {
ORTE_ERROR_LOG(rc);
}
}
RELEASE:
/* reset the read or timer event */