1
1

As discussed on today's telecon, reorganize the debugger attachment code in orte to better support efforts within the tool community aimed at exploring alternative methods. Move the debugger attachment code from the orterun directory to a new debugger framework. Organize the existing standard support code into an "mpir" component. Organize the current extensions for co-spawning debugger daemons into a separate "mpirx" component.

Since the MPIR symbols are now included in the ORTE library, remove duplicate declarations in OMPI and replace them with extern references to their ORTE instantiations.

This commit was SVN r23360.
Этот коммит содержится в:
Ralph Castain 2010-07-06 23:35:42 +00:00
родитель a3aba8f2b7
Коммит 31295e8dc2
28 изменённых файлов: 1573 добавлений и 828 удалений

Просмотреть файл

@ -43,7 +43,7 @@ BEGIN_C_DECLS
/**
* Breakpoint function for parallel debuggers.
*/
OMPI_DECLSPEC void *MPIR_Breakpoint(void);
extern void *MPIR_Breakpoint(void);
END_C_DECLS

Просмотреть файл

@ -74,6 +74,7 @@
#include "orte/mca/errmgr/errmgr.h"
#include "orte/mca/rml/rml.h"
#include "orte/mca/debugger/base/base.h"
#include "orte/runtime/orte_globals.h"
#if defined(OMPI_MSGQ_DLL)
@ -125,8 +126,8 @@ OMPI_DECLSPEC opal_datatype_t* opal_datatype_t_type_force_inclusion = NULL;
OMPI_DECLSPEC ompi_datatype_t* ompi_datatype_t_type_force_inclusion = NULL;
OMPI_DECLSPEC volatile int MPIR_debug_gate = 0;
OMPI_DECLSPEC volatile int MPIR_being_debugged = 0;
OMPI_DECLSPEC volatile int MPIR_debug_state = 0;
extern volatile int MPIR_being_debugged;
extern volatile int MPIR_debug_state;
OMPI_DECLSPEC char *MPIR_debug_abort_string = "";
/* Check for a file in few direct ways for portability */
@ -259,16 +260,6 @@ void ompi_wait_for_debugger(void)
}
}
/*
* Breakpoint function for parallel debuggers. This function is also
* defined in orterun for the starter. It should never conflict with
* this one.
*/
void *MPIR_Breakpoint(void)
{
return NULL;
}
/*
* Tell the debugger that we are about to abort
*/

33
orte/mca/debugger/Makefile.am Обычный файл
Просмотреть файл

@ -0,0 +1,33 @@
#
# Copyright (c) 2010 Cisco Systems, Inc.
# $COPYRIGHT$
#
# Additional copyrights may follow
#
# $HEADER$
#
# main library setup
noinst_LTLIBRARIES = libmca_debugger.la
libmca_debugger_la_SOURCES =
# header setup
nobase_orte_HEADERS =
# local files
headers = debugger.h
libmca_debugger_la_SOURCES += $(headers)
# Conditionally install the header files
if WANT_INSTALL_HEADERS
nobase_orte_HEADERS += $(headers)
ortedir = $(includedir)/openmpi/orte/mca/debugger
else
ortedir = $(includedir)
endif
include base/Makefile.am
distclean-local:
rm -f base/static-components.h

18
orte/mca/debugger/base/Makefile.am Обычный файл
Просмотреть файл

@ -0,0 +1,18 @@
#
# Copyright (c) 2010 Cisco Systems, Inc. All rights reserved.
# $COPYRIGHT$
#
# Additional copyrights may follow
#
# $HEADER$
#
headers += \
base/base.h
libmca_debugger_la_SOURCES += \
base/debugger_base_close.c \
base/debugger_base_select.c \
base/debugger_base_open.c \
base/debugger_base_fns.c

73
orte/mca/debugger/base/base.h Обычный файл
Просмотреть файл

@ -0,0 +1,73 @@
/*
* Copyright (c) 2010 Cisco Systems, Inc. All rights reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
*
* $HEADER$
*/
/** @file:
*/
/*
 * Include guard. It is named after the debugger framework so that it
 * cannot clash with the guard of another framework's base.h; a clash
 * would silently hide every declaration in this header.
 */
#ifndef MCA_DEBUGGER_BASE_H
#define MCA_DEBUGGER_BASE_H
/*
 * includes
 */
#include "orte_config.h"
#include "opal/class/opal_list.h"
#include "orte/mca/debugger/debugger.h"
BEGIN_C_DECLS
/*
 * function definitions
 */
/* Open the debugger framework; declared for every build configuration. */
ORTE_DECLSPEC int orte_debugger_base_open(void);
#if !ORTE_DISABLE_FULL_SUPPORT
/* Finalize the active module (if any) and close the framework's components. */
ORTE_DECLSPEC int orte_debugger_base_close(void);
/* Select the best available debugger component and initialize its module. */
ORTE_DECLSPEC int orte_debugger_base_select(void);
/*
 * NOTE(review): opal_cmd_line_t is not declared by any header included
 * directly above - presumably it arrives transitively; confirm.
 */
ORTE_DECLSPEC void orte_debugger_base_run_debugger(char *basename, opal_cmd_line_t *cmd_line,
                                                   int argc, char *argv[], int num_procs);
/* Build MPIR_proctable for the given job once it has been spawned. */
ORTE_DECLSPEC void orte_debugger_base_init_after_spawn(orte_job_t *jdata);
/* Print the MPIR variables and the process table to stderr. */
ORTE_DECLSPEC void orte_debugger_base_dump(void);
/* Output stream id used by the framework */
ORTE_DECLSPEC extern int orte_debugger_base_output;
/* List of the debugger components that were successfully opened */
ORTE_DECLSPEC extern opal_list_t orte_debugger_base_components_available;
/* +++ begin MPICH/TotalView std debugger interface definitions */
#define MPIR_MAX_PATH_LENGTH 512
#define MPIR_MAX_ARG_LENGTH 1024
/* One entry of the process table that an attaching debugger reads */
struct MPIR_PROCDESC {
    char *host_name;        /* something that can be passed to inet_addr */
    char *executable_name;  /* name of binary */
    int pid;                /* process pid */
};
ORTE_DECLSPEC extern struct MPIR_PROCDESC *MPIR_proctable;
ORTE_DECLSPEC extern int MPIR_proctable_size;
ORTE_DECLSPEC extern volatile int MPIR_being_debugged;
ORTE_DECLSPEC extern volatile int MPIR_debug_state;
ORTE_DECLSPEC extern volatile int MPIR_i_am_starter;
ORTE_DECLSPEC extern volatile int MPIR_partial_attach_ok;
ORTE_DECLSPEC extern volatile char MPIR_executable_path[MPIR_MAX_PATH_LENGTH];
ORTE_DECLSPEC extern volatile char MPIR_server_arguments[MPIR_MAX_ARG_LENGTH];
ORTE_DECLSPEC extern volatile int MPIR_forward_output;
ORTE_DECLSPEC extern volatile int MPIR_forward_comm;
ORTE_DECLSPEC extern char MPIR_attach_fifo[MPIR_MAX_PATH_LENGTH];
/* Function on which the debugger places its breakpoint */
ORTE_DECLSPEC void *MPIR_Breakpoint(void);
/* --- end MPICH/TotalView std debugger interface definitions */
#endif /* !ORTE_DISABLE_FULL_SUPPORT */
END_C_DECLS
#endif /* MCA_DEBUGGER_BASE_H */

Просмотреть файл

@ -0,0 +1,31 @@
/*
* Copyright (c) 2010 Cisco Systems, Inc. All rights reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
*
* $HEADER$
*/
#include "orte_config.h"
#include "orte/constants.h"
#include "opal/mca/mca.h"
#include "opal/mca/base/base.h"
#include "orte/mca/debugger/base/base.h"
#include "orte/mca/debugger/debugger.h"
int orte_debugger_base_close(void)
{
if (NULL != orte_debugger.finalize) {
orte_debugger.finalize();
}
/* Close all remaining available components */
mca_base_components_close(orte_debugger_base_output,
&orte_debugger_base_components_available, NULL);
/* All done */
return ORTE_SUCCESS;
}

194
orte/mca/debugger/base/debugger_base_fns.c Обычный файл
Просмотреть файл

@ -0,0 +1,194 @@
/*
* Copyright (c) 2010 Cisco Systems, Inc. All rights reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
*
* $HEADER$
*/
#include "orte_config.h"
#include "orte/constants.h"
#include "opal/util/output.h"
#ifdef HAVE_STRING_H
#include <string.h>
#endif
#include <stdio.h>
#ifdef HAVE_STDLIB_H
#include <stdlib.h>
#endif /* HAVE_STDLIB_H */
#ifdef HAVE_STRINGS_H
#include <strings.h>
#endif /* HAVE_STRINGS_H */
#ifdef HAVE_UNISTD_H
#include <unistd.h>
#endif /* HAVE_UNISTD_H */
#include <ctype.h>
#include "opal/util/argv.h"
#include "opal/util/os_path.h"
#include "opal/util/path.h"
#include "opal/util/opal_environ.h"
#include "orte/util/name_fns.h"
#include "orte/util/show_help.h"
#include "orte/mca/rml/rml.h"
#include "orte/mca/errmgr/errmgr.h"
#include "orte/runtime/orte_globals.h"
#include "orte/mca/debugger/base/base.h"
/* instance the standard MPIR interfaces */
/*
 * These symbols are the data half of the MPICH/TotalView debugger
 * interface: an external debugger reads and writes them directly in
 * this process's memory.
 */
/* process table, allocated and filled by orte_debugger_base_init_after_spawn() */
struct MPIR_PROCDESC *MPIR_proctable = NULL;
/* number of entries in MPIR_proctable (set from the job's num_procs) */
int MPIR_proctable_size = 0;
/* non-zero when a debugger is driving this launch; tested before and after spawn */
volatile int MPIR_being_debugged = 0;
/* set to 1 by orte_debugger_base_init_after_spawn() */
volatile int MPIR_debug_state = 0;
/* only printed by orte_debugger_base_dump() in this file */
volatile int MPIR_i_am_starter = 0;
/* defaults to 1; presumably advertises support for partial attach - TODO confirm */
volatile int MPIR_partial_attach_ok = 1;
/* when non-empty, the mpirx component uses it as the debugger daemon executable */
volatile char MPIR_executable_path[MPIR_MAX_PATH_LENGTH];
/* presumably the arguments for that daemon - only printed in this file */
volatile char MPIR_server_arguments[MPIR_MAX_ARG_LENGTH];
/* when zero, the mpirx component turns off output forwarding for the debugger daemons */
volatile int MPIR_forward_output = 0;
/* not referenced elsewhere in this file */
volatile int MPIR_forward_comm = 0;
/* path of the attach FIFO; filled in by the mpirx component */
char MPIR_attach_fifo[MPIR_MAX_PATH_LENGTH];
/*
 * Print one integer MPIR variable to stderr as "name = value".
 * The expansion is a single expression with no trailing semicolon, so
 * the macro can be used like an ordinary statement.
 */
#define DUMP_INT(X) fprintf(stderr, " %s = %d\n", # X, (X))

/*
 * Dump the MPIR debugger-interface variables and the process table to
 * stderr.  Intended purely as a debugging aid.
 */
void orte_debugger_base_dump(void)
{
    int i;
    const char *host;
    const char *exe;

    DUMP_INT(MPIR_being_debugged);
    DUMP_INT(MPIR_debug_state);
    DUMP_INT(MPIR_partial_attach_ok);
    DUMP_INT(MPIR_i_am_starter);
    DUMP_INT(MPIR_forward_output);
    DUMP_INT(MPIR_proctable_size);
    fprintf(stderr, " MPIR_proctable:\n");
    /* the size can be non-zero while the table itself is absent, so
     * check the pointer as well before walking the entries
     */
    if (NULL != MPIR_proctable) {
        for (i = 0; i < MPIR_proctable_size; i++) {
            /* passing NULL for a %s conversion is undefined behavior */
            host = MPIR_proctable[i].host_name;
            exe = MPIR_proctable[i].executable_name;
            fprintf(stderr,
                    " (i, host, exe, pid) = (%d, %s, %s, %d)\n",
                    i,
                    (NULL != host) ? host : "(null)",
                    (NULL != exe) ? exe : "(null)",
                    MPIR_proctable[i].pid);
        }
    }
    /* an empty buffer is reported as "NULL" */
    fprintf(stderr, "MPIR_executable_path: %s\n",
            ('\0' == MPIR_executable_path[0]) ?
            "NULL" : (char*) MPIR_executable_path);
    fprintf(stderr, "MPIR_server_arguments: %s\n",
            ('\0' == MPIR_server_arguments[0]) ?
            "NULL" : (char*) MPIR_server_arguments);
}
/*
 * Initialization of data structures for running under a debugger
 * using the MPICH/TotalView parallel debugger interface. This stage
 * of initialization must occur after spawn
 *
 * NOTE: We -always- perform this step to ensure that any debugger
 * that attaches to us post-launch of the application can get a
 * completed proctable
 *
 * jdata - the job whose processes are to be described.  The table is
 *         built only once; later calls return immediately.
 *
 * If MPIR_being_debugged is set, the function additionally waits for
 * every process to report in, calls MPIR_Breakpoint() and then sends
 * the release message to vpid 0 of the job.
 */
void orte_debugger_base_init_after_spawn(orte_job_t *jdata)
{
    orte_proc_t *proc;
    orte_app_context_t *appctx;
    orte_vpid_t i, j;
    opal_buffer_t buf;
    orte_process_name_t rank0;
    int rc;

    if (MPIR_proctable) {
        /* already initialized */
        return;
    }

    /* fill in the proc table for the application processes */
    if (orte_debug_flag) {
        opal_output(0, "Info: Setting up debugger process table for applications\n");
    }

    MPIR_debug_state = 1;

    /* allocate MPIR_proctable zero-filled: slots that are skipped below
     * must read as NULL/0 rather than as uninitialized memory
     */
    MPIR_proctable = calloc(jdata->num_procs, sizeof(*MPIR_proctable));
    if (MPIR_proctable == NULL) {
        /* never advertise a size for a table that does not exist */
        MPIR_proctable_size = 0;
        opal_output(0, "Error: Out of memory\n");
        return;
    }

    /* set the total number of processes in the job */
    MPIR_proctable_size = jdata->num_procs;

    if (orte_output_debugger_proctable) {
        opal_output(orte_clean_output, "MPIR Proctable for job %s", ORTE_JOBID_PRINT(jdata->jobid));
    }

    /* initialize MPIR_proctable */
    for (j=0; j < jdata->num_procs; j++) {
        if (NULL == (proc = (orte_proc_t*)opal_pointer_array_get_item(jdata->procs, j))) {
            continue;
        }
        /* store this data in the location whose index
         * corresponds to the proc's rank
         */
        i = proc->name.vpid;
        if (i >= jdata->num_procs) {
            /* a vpid beyond the table would write past the allocation */
            opal_output(0, "Error: vpid %s is outside the debugger process table",
                        ORTE_VPID_PRINT(i));
            continue;
        }
        if (NULL == (appctx = (orte_app_context_t*)opal_pointer_array_get_item(jdata->apps, proc->app_idx))) {
            continue;
        }

        MPIR_proctable[i].host_name = strdup(proc->node->name);
        /* an absolute application path is used as-is, a relative one
         * is resolved against the app's working directory
         */
        if ( 0 == strncmp(appctx->app, OPAL_PATH_SEP, 1 )) {
            MPIR_proctable[i].executable_name =
                opal_os_path( false, appctx->app, NULL );
        } else {
            MPIR_proctable[i].executable_name =
                opal_os_path( false, appctx->cwd, appctx->app, NULL );
        }
        MPIR_proctable[i].pid = proc->pid;
        if (orte_output_debugger_proctable) {
            opal_output(orte_clean_output, "%s: Host %s Exe %s Pid %d",
                        ORTE_VPID_PRINT(i), MPIR_proctable[i].host_name,
                        MPIR_proctable[i].executable_name, MPIR_proctable[i].pid);
        }
    }

    if (orte_debug_flag) {
        orte_debugger_base_dump();
    }

    /* if we are being launched under a debugger, then we must wait
     * for it to be ready to go and do some things to start the job
     */
    if (MPIR_being_debugged) {
        /* wait for all procs to have reported their contact info - this
         * ensures that (a) they are all into mpi_init, and (b) the system
         * has the contact info to successfully send a message to rank=0
         */
        ORTE_PROGRESSED_WAIT(false, jdata->num_reported, jdata->num_procs);

        (void) MPIR_Breakpoint();

        /* send a message to rank=0 to release it */
        OBJ_CONSTRUCT(&buf, opal_buffer_t); /* don't need anything in this */
        rank0.jobid = jdata->jobid;
        rank0.vpid = 0;
        if (0 > (rc = orte_rml.send_buffer(&rank0, &buf, ORTE_RML_TAG_DEBUGGER_RELEASE, 0))) {
            opal_output(0, "Error: could not send debugger release to MPI procs - error %s", ORTE_ERROR_NAME(rc));
        }
        OBJ_DESTRUCT(&buf);
    }
}
/*
 * Breakpoint function for parallel debuggers.
 *
 * The function does no work of its own and always returns NULL; it is
 * called by the launcher once the process table is ready so that a
 * debugger can stop on it.
 */
void *MPIR_Breakpoint(void)
{
    void *nothing = NULL;

    return nothing;
}

73
orte/mca/debugger/base/debugger_base_open.c Обычный файл
Просмотреть файл

@ -0,0 +1,73 @@
/*
* Copyright (c) 2010 Cisco Systems, Inc. All rights reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
*
* $HEADER$
*/
#include "orte_config.h"
#include "orte/constants.h"
#include "opal/mca/mca.h"
#include "opal/util/output.h"
#include "opal/mca/base/base.h"
#include "opal/mca/base/mca_base_param.h"
#ifdef HAVE_STRING_H
#include <string.h>
#endif
#include "orte/mca/debugger/base/base.h"
/*
* The following file was created by configure. It contains extern
* statements and the definition of an array of pointers to each
* component's public mca_base_component_t struct.
*/
#include "orte/mca/debugger/base/static-components.h"
/*
* Global variables
*/
/* output stream id of the framework; -1 until orte_debugger_base_open() assigns one */
int orte_debugger_base_output = -1;
/* list handed to mca_base_components_open() to receive the opened components */
opal_list_t orte_debugger_base_components_available;
/* function table of the active debugger module */
orte_debugger_base_module_t orte_debugger;
#if !ORTE_DISABLE_FULL_SUPPORT
/**
 * Open the debugger framework.
 *
 * Obtains an output stream for the framework and then opens the
 * debugger components, collecting them on
 * orte_debugger_base_components_available.
 *
 * @return ORTE_SUCCESS when the components were opened, ORTE_ERROR
 *         otherwise.
 */
int orte_debugger_base_open(void)
{
    int status;

    /* the stream is always opened; its verbosity is controlled by the
     * mca open system
     */
    orte_debugger_base_output = opal_output_open(NULL);

    status = mca_base_components_open("debugger", orte_debugger_base_output,
                                      mca_debugger_base_static_components,
                                      &orte_debugger_base_components_available,
                                      true);

    return (ORTE_SUCCESS == status) ? ORTE_SUCCESS : ORTE_ERROR;
}
#else
/**
 * Variant used when ORTE_DISABLE_FULL_SUPPORT is set: nothing is
 * opened and the call always succeeds.
 */
int orte_debugger_base_open(void)
{
    return ORTE_SUCCESS;
}
#endif /* !ORTE_DISABLE_FULL_SUPPORT */

Просмотреть файл

@ -0,0 +1,50 @@
/*
* Copyright (c) 2010 Cisco Systems, Inc. All rights reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
*
* $HEADER$
*/
#include "orte_config.h"
#include "orte/constants.h"
#ifdef HAVE_STRING_H
#include <string.h>
#endif
#include "opal/mca/mca.h"
#include "opal/mca/base/base.h"
#include "opal/util/output.h"
#include "orte/mca/debugger/base/base.h"
/*
 * Pick the debugger component to use and start its module.
 *
 * The module chosen by mca_base_select() is copied into the global
 * orte_debugger table and its init() hook is invoked.
 *
 * Returns ORTE_ERR_NOT_FOUND when the selection fails, otherwise the
 * value returned by the module's init().
 */
int orte_debugger_base_select(void)
{
    orte_debugger_base_module_t *winner = NULL;
    orte_debugger_base_component_t *winner_component = NULL;
    int status;

    status = mca_base_select("debugger", orte_debugger_base_output,
                             &orte_debugger_base_components_available,
                             (mca_base_module_t **) &winner,
                             (mca_base_component_t **) &winner_component);
    if (OPAL_SUCCESS != status) {
        /* This will only happen if no component was selected */
        return ORTE_ERR_NOT_FOUND;
    }

    /* only the module is kept - there is no global component structure */
    orte_debugger = *winner;

    return orte_debugger.init();
}

77
orte/mca/debugger/debugger.h Обычный файл
Просмотреть файл

@ -0,0 +1,77 @@
/*
* Copyright (c) 2010 Cisco Systems, Inc. All Rights Reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
*
* $HEADER$
*/
#ifndef MCA_DEBUGGER_H
#define MCA_DEBUGGER_H
/*
* includes
*/
#include "orte_config.h"
#include "opal/mca/mca.h"
#include "orte/runtime/orte_globals.h"
BEGIN_C_DECLS
/*
* Component functions - all MUST be provided!
*/
/* initialize the selected module */
typedef int (*orte_debugger_base_module_init_fn_t)(void);
/* finalize the selected module */
typedef void (*orte_debugger_base_module_finalize_fn_t)(void);
/* init debuggers before spawn */
typedef void (*orte_debugger_base_module_init_before_spawn_fn_t)(orte_job_t *jdata);
/* init debuggers after spawn */
typedef void (*orte_debugger_base_module_init_after_spawn_fn_t)(orte_job_t *jdata);
/*
* Ver 1.0
*
* Function table that every debugger module provides.  The framework
* copies the selected module's table into the global orte_debugger.
*/
struct orte_debugger_base_module_1_0_0_t {
/* called once, right after the module has been selected */
orte_debugger_base_module_init_fn_t init;
/* called when the framework is closed */
orte_debugger_base_module_finalize_fn_t finalize;
/* per-job hook for debugger setup before the job is spawned */
orte_debugger_base_module_init_before_spawn_fn_t init_before_spawn;
/* per-job hook for debugger setup after the job is spawned */
orte_debugger_base_module_init_after_spawn_fn_t init_after_spawn;
};
typedef struct orte_debugger_base_module_1_0_0_t orte_debugger_base_module_1_0_0_t;
/* unversioned alias used by the rest of the code */
typedef orte_debugger_base_module_1_0_0_t orte_debugger_base_module_t;
/* the active module's function table */
ORTE_DECLSPEC extern orte_debugger_base_module_t orte_debugger;
/*
* the standard component data structure
*
* A debugger component adds no fields of its own: it consists only of
* the two generic MCA parts below.
*/
struct orte_debugger_base_component_1_0_0_t {
/* generic MCA component description (version, name, query function, ...) */
mca_base_component_t base_version;
/* generic MCA component metadata */
mca_base_component_data_t base_data;
};
typedef struct orte_debugger_base_component_1_0_0_t orte_debugger_base_component_1_0_0_t;
/* unversioned alias used by the rest of the code */
typedef orte_debugger_base_component_1_0_0_t orte_debugger_base_component_t;
/*
* Macro for use in components that are of type debugger v1.0.0
*/
#define ORTE_DEBUGGER_BASE_VERSION_1_0_0 \
/* debugger v1.0 is chained to MCA v2.0 */ \
MCA_BASE_VERSION_2_0_0, \
/* debugger v1.0 */ \
"debugger", 1, 0, 0
END_C_DECLS
#endif /* MCA_DEBUGGER_H */

34
orte/mca/debugger/mpir/Makefile.am Обычный файл
Просмотреть файл

@ -0,0 +1,34 @@
#
# Copyright (c) 2010 Cisco Systems, Inc. All rights reserved.
# $COPYRIGHT$
#
# Additional copyrights may follow
#
# $HEADER$
#
sources = \
mpir.h \
mpir.c \
mpir_component.c
# Make the output library in this directory, and name it either
# mca_<type>_<name>.la (for DSO builds) or libmca_<type>_<name>.la
# (for static builds).
if OMPI_BUILD_debugger_mpir_DSO
component_noinst =
component_install = mca_debugger_mpir.la
else
component_noinst = libmca_debugger_mpir.la
component_install =
endif
mcacomponentdir = $(pkglibdir)
mcacomponent_LTLIBRARIES = $(component_install)
mca_debugger_mpir_la_SOURCES = $(sources)
mca_debugger_mpir_la_LDFLAGS = -module -avoid-version
noinst_LTLIBRARIES = $(component_noinst)
libmca_debugger_mpir_la_SOURCES =$(sources)
libmca_debugger_mpir_la_LDFLAGS = -module -avoid-version

24
orte/mca/debugger/mpir/configure.params Обычный файл
Просмотреть файл

@ -0,0 +1,24 @@
# -*- shell-script -*-
#
# Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
# University Research and Technology
# Corporation. All rights reserved.
# Copyright (c) 2004-2005 The University of Tennessee and The University
# of Tennessee Research Foundation. All rights
# reserved.
# Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
# University of Stuttgart. All rights reserved.
# Copyright (c) 2004-2005 The Regents of the University of California.
# All rights reserved.
# Copyright (c) 2007 Los Alamos National Security, LLC. All rights
# reserved.
# $COPYRIGHT$
#
# Additional copyrights may follow
#
# $HEADER$
#
# Specific to this module
PARAM_CONFIG_FILES="Makefile"

196
orte/mca/debugger/mpir/mpir.c Обычный файл
Просмотреть файл

@ -0,0 +1,196 @@
/* -*- C -*-
*
* Copyright (c) 2004-2006 The Trustees of Indiana University and Indiana
* University Research and Technology
* Corporation. All rights reserved.
* Copyright (c) 2004-2006 The University of Tennessee and The University
* of Tennessee Research Foundation. All rights
* reserved.
* Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
* University of Stuttgart. All rights reserved.
* Copyright (c) 2004-2005 The Regents of the University of California.
* All rights reserved.
* Copyright (c) 2007 Sun Microsystems, Inc. All rights reserved.
* Copyright (c) 2007 Los Alamos National Security, LLC. All rights
* reserved.
* Copyright (c) 2007-2010 Cisco Systems, Inc. All rights reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
*
* $HEADER$
*/
/*
* Debugger support for orterun
*
* We interpret the MPICH debugger interface as follows:
*
* a) The launcher
* - spawns the other processes,
* - fills in the table MPIR_proctable, and sets MPIR_proctable_size
* - sets MPIR_debug_state to MPIR_DEBUG_SPAWNED ( = 1)
* - calls MPIR_Breakpoint() which the debugger will have a
* breakpoint on.
*
* b) Applications start and then spin until MPIR_debug_gate is set
* non-zero by the debugger.
*
* This file implements (a).
*
**************************************************************************
*
* Note that we have presently tested both TotalView and DDT parallel
* debuggers. They both nominally subscribe to the Etnus attaching
* interface, but there are differences between the two.
*
* TotalView: user launches "totalview mpirun -a ...<mpirun args>...".
* TV launches mpirun. mpirun launches the application and then calls
* MPIR_Breakpoint(). This is the signal to TV that it's a parallel
* MPI job. TV then reads the proctable in mpirun and attaches itself
* to all the processes (it takes care of launching itself on the
* remote nodes). Upon attaching to all the MPI processes, the
* variable MPIR_being_debugged is set to 1. When it has finished
* attaching itself to all the MPI processes that it wants to,
* MPIR_Breakpoint() returns.
*
* DDT: user launches "ddt bin -np X <mpi app name>". DDT fork/exec's
* mpirun to launch ddt-debugger on the back-end nodes via "mpirun -np
* X ddt-debugger" (note the lack of other arguments -- we can't pass
* anything to mpirun). This app will eventually fork/exec the MPI
* app. DDT does not currently set MPIR_being_debugged in the MPI app.
*
**************************************************************************
*
* We support two ways of waiting for attaching debuggers. The
* implementation spans this file and ompi/debuggers/ompi_debuggers.c.
*
* 1. If using orterun: MPI processes will have the
* orte_in_parallel_debugger MCA param set to true (because not all
* debuggers consistently set MPIR_being_debugged in both the launcher
* and in the MPI procs). The HNP will call MPIR_Breakpoint() and
* then RML send a message to VPID 0 (MCW rank 0) when it returns
* (MPIR_Breakpoint() doesn't return until the debugger has attached
* to all relevant processes). Meanwhile, VPID 0 blocks waiting for
* the RML message. All other VPIDs immediately call the grpcomm
* barrier (and therefore block until the debugger attaches). Once
* VPID 0 receives the RML message, we know that the debugger has
* attached to all processes that it cares about, and VPID 0 then
* joins the grpcomm barrier, allowing the job to continue. This
* scheme has the side effect of nicely supporting partial attaches by
* parallel debuggers (i.e., attaching to only some of the MPI
* processes; not necessarily all of them).
*
* 2. If not using orterun: in this case, ORTE_DISABLE_FULL_SUPPORT
* will be true, and we know that there will not be an RML message
* sent to VPID 0. So we have to look for a magic environment
* variable from the launcher to know if the jobs will be attached by
* a debugger (e.g., set by yod, srun, ...etc.), and if so, spin on
* MPIR_debug_gate. These environment variable names must be
* hard-coded in the OMPI layer (see ompi/debuggers/ompi_debuggers.c).
*/
#ifdef HAVE_STRING_H
#include <string.h>
#endif
#include <stdio.h>
#ifdef HAVE_STDLIB_H
#include <stdlib.h>
#endif /* HAVE_STDLIB_H */
#ifdef HAVE_STRINGS_H
#include <strings.h>
#endif /* HAVE_STRINGS_H */
#ifdef HAVE_UNISTD_H
#include <unistd.h>
#endif /* HAVE_UNISTD_H */
#include <ctype.h>
#include "opal/util/argv.h"
#include "opal/util/output.h"
#include "opal/util/path.h"
#include "opal/util/os_path.h"
#include "opal/util/opal_sos.h"
#include "opal/class/opal_pointer_array.h"
#include "opal/mca/base/mca_base_param.h"
#include "opal/util/opal_getcwd.h"
#include "orte/mca/errmgr/errmgr.h"
#include "orte/mca/rml/rml.h"
#include "orte/mca/rml/rml_types.h"
#include "orte/mca/plm/plm.h"
#include "orte/mca/plm/base/plm_private.h"
#include "orte/mca/rmaps/rmaps_types.h"
#include "orte/runtime/orte_globals.h"
#include "orte/runtime/orte_wait.h"
#include "orte/util/show_help.h"
#include "orte/util/name_fns.h"
#include "orte/mca/debugger/base/base.h"
#include "mpir.h"
#include "mpir.h"
/* Static API's */
/* forward declarations of the module functions defined further down */
static int init(void);
static void finalize(void);
static void init_before_spawn(orte_job_t *jdata);
/* Module definition */
/*
 * Function table of the mpir module, in the field order of
 * orte_debugger_base_module_t: init, finalize, init_before_spawn,
 * init_after_spawn.  The after-spawn step is the shared base
 * implementation rather than a function local to this file.
 */
orte_debugger_base_module_t orte_debugger_mpir_module = {
init,
finalize,
init_before_spawn,
orte_debugger_base_init_after_spawn
};
/*
 * Module init hook.  The mpir module has no state of its own to set
 * up, so this only reports success.
 */
static int init(void)
{
return ORTE_SUCCESS;
}
/**
* Release resources associated with data structures for running under
* a debugger using the MPICH/TotalView parallel debugger interface.
*/
void finalize(void)
{
if (MPIR_proctable) {
free(MPIR_proctable);
MPIR_proctable = NULL;
}
}
/**
 * Pre-spawn hook of the mpir module.
 *
 * When the launch is under the control of a debugger (either
 * MPIR_being_debugged or orte_in_parallel_debugger is set), every
 * application context of the job gets the environment form of the
 * "orte_in_parallel_debugger" MCA parameter set to "1", so that the
 * application processes know they are being debugged.  Otherwise the
 * job is left untouched.
 *
 * @param jdata  the job that is about to be spawned
 */
void init_before_spawn(orte_job_t *jdata)
{
    orte_app_context_t *context;
    char *param_env;
    int idx;

    /* no debugger involved - nothing to prepare */
    if (!(MPIR_being_debugged || orte_in_parallel_debugger)) {
        return;
    }

    if (orte_debug_flag) {
        opal_output(0, "Info: Spawned by a debugger");
    }

    /* name of the environment variable that carries the MCA parameter */
    param_env = mca_base_param_environ_variable("orte",
                                                "in_parallel_debugger", NULL);

    /* the apps array may contain empty slots, which are passed over */
    for (idx = 0; idx < jdata->apps->size; idx++) {
        context = (orte_app_context_t*)opal_pointer_array_get_item(jdata->apps, idx);
        if (NULL != context) {
            opal_setenv(param_env, "1", true, &context->env);
        }
    }

    free(param_env);
}

Просмотреть файл

@ -9,7 +9,7 @@
* University of Stuttgart. All rights reserved.
* Copyright (c) 2004-2005 The Regents of the University of California.
* All rights reserved.
* Copyright (c) 2007 Cisco Systems, Inc. All rights reserved.
* Copyright (c) 2007-2010 Cisco Systems, Inc. All rights reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
@ -17,22 +17,17 @@
* $HEADER$
*/
#ifndef ORTE_DEBUGGERS_H
#define ORTE_DEBUGGERS_H
#ifndef DEBUGGER_MPIR_H
#define DEBUGGER_MPIR_H
#include "orte_config.h"
#include "orte/runtime/orte_globals.h"
#include "orte/mca/debugger/debugger.h"
BEGIN_C_DECLS
void orte_run_debugger(char *basename, opal_cmd_line_t *cmd_line,
int argc, char *argv[], int num_procs) __opal_attribute_noreturn__;
int orte_debugger_init_before_spawn(orte_job_t *jdata);
void orte_debugger_init_after_spawn(orte_job_t *jdata);
void orte_debugger_finalize(void);
ORTE_DECLSPEC void *MPIR_Breakpoint(void);
ORTE_MODULE_DECLSPEC extern orte_debugger_base_component_t mca_debugger_mpir_component;
extern orte_debugger_base_module_t orte_debugger_mpir_module;
END_C_DECLS

47
orte/mca/debugger/mpir/mpir_component.c Обычный файл
Просмотреть файл

@ -0,0 +1,47 @@
/* -*- C -*-
*
* Copyright (c) 2010 Cisco Systems, Inc. All rights reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
*
* $HEADER$
*/
#include "orte_config.h"
#include "orte/constants.h"
#include "mpir.h"
static int component_query(mca_base_module_t **module, int *priority);
/*
* Struct of function pointers that need to be initialized
*
* Public description of the "mpir" debugger component: the generic MCA
* component header followed by the component metadata.
*/
orte_debugger_base_component_t mca_debugger_mpir_component = {
{
/* framework type and version this component is written against */
ORTE_DEBUGGER_BASE_VERSION_1_0_0,
"mpir", /* MCA module name */
ORTE_MAJOR_VERSION, /* MCA module major version */
ORTE_MINOR_VERSION, /* MCA module minor version */
ORTE_RELEASE_VERSION, /* MCA module release version */
/* presumably the component open and close hooks, both unused here -
 * TODO confirm against mca_base_component_t */
NULL,
NULL,
component_query /* module query */
},
{
/* The component is checkpoint ready */
MCA_BASE_METADATA_PARAM_CHECKPOINT
}
};
/*
 * Query function of the mpir component.
 *
 * Always offers the mpir module, with a fixed priority of 100.
 *
 * module   - receives the address of orte_debugger_mpir_module
 * priority - receives the priority of this component
 *
 * Returns ORTE_SUCCESS.
 */
static int component_query(mca_base_module_t **module, int *priority)
{
    mca_base_module_t *offered = (mca_base_module_t *)&orte_debugger_mpir_module;

    *module = offered;
    *priority = 100;

    return ORTE_SUCCESS;
}

34
orte/mca/debugger/mpirx/Makefile.am Обычный файл
Просмотреть файл

@ -0,0 +1,34 @@
#
# Copyright (c) 2010 Cisco Systems, Inc. All rights reserved.
# $COPYRIGHT$
#
# Additional copyrights may follow
#
# $HEADER$
#
sources = \
mpirx.h \
mpirx.c \
mpirx_component.c
# Make the output library in this directory, and name it either
# mca_<type>_<name>.la (for DSO builds) or libmca_<type>_<name>.la
# (for static builds).
if OMPI_BUILD_debugger_mpirx_DSO
component_noinst =
component_install = mca_debugger_mpirx.la
else
component_noinst = libmca_debugger_mpirx.la
component_install =
endif
mcacomponentdir = $(pkglibdir)
mcacomponent_LTLIBRARIES = $(component_install)
mca_debugger_mpirx_la_SOURCES = $(sources)
mca_debugger_mpirx_la_LDFLAGS = -module -avoid-version
noinst_LTLIBRARIES = $(component_noinst)
libmca_debugger_mpirx_la_SOURCES =$(sources)
libmca_debugger_mpirx_la_LDFLAGS = -module -avoid-version

24
orte/mca/debugger/mpirx/configure.params Обычный файл
Просмотреть файл

@ -0,0 +1,24 @@
# -*- shell-script -*-
#
# Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
# University Research and Technology
# Corporation. All rights reserved.
# Copyright (c) 2004-2005 The University of Tennessee and The University
# of Tennessee Research Foundation. All rights
# reserved.
# Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
# University of Stuttgart. All rights reserved.
# Copyright (c) 2004-2005 The Regents of the University of California.
# All rights reserved.
# Copyright (c) 2007 Los Alamos National Security, LLC. All rights
# reserved.
# $COPYRIGHT$
#
# Additional copyrights may follow
#
# $HEADER$
#
# Specific to this module
PARAM_CONFIG_FILES="Makefile"

290
orte/mca/debugger/mpirx/mpirx.c Обычный файл
Просмотреть файл

@ -0,0 +1,290 @@
/*
* Copyright (c) 2010 Cisco Systems, Inc. All rights reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
*
* $HEADER$
*/
#ifdef HAVE_STRING_H
#include <string.h>
#endif
#include <stdio.h>
#include <errno.h>
#ifdef HAVE_STDLIB_H
#include <stdlib.h>
#endif /* HAVE_STDLIB_H */
#ifdef HAVE_STRINGS_H
#include <strings.h>
#endif /* HAVE_STRINGS_H */
#ifdef HAVE_UNISTD_H
#include <unistd.h>
#endif /* HAVE_UNISTD_H */
#ifdef HAVE_SYS_TYPES_H
#include <sys/types.h>
#endif
#include <sys/stat.h>
#include <ctype.h>
#include <sys/fcntl.h>
#include "opal/util/argv.h"
#include "opal/util/output.h"
#include "opal/util/path.h"
#include "opal/util/os_path.h"
#include "opal/util/opal_sos.h"
#include "opal/class/opal_pointer_array.h"
#include "opal/mca/base/mca_base_param.h"
#include "opal/util/opal_getcwd.h"
#include "orte/mca/errmgr/errmgr.h"
#include "orte/mca/rml/rml.h"
#include "orte/mca/rml/rml_types.h"
#include "orte/mca/plm/plm.h"
#include "orte/mca/plm/base/plm_private.h"
#include "orte/mca/rmaps/rmaps_types.h"
#include "orte/runtime/orte_globals.h"
#include "orte/runtime/orte_wait.h"
#include "orte/util/show_help.h"
#include "orte/util/name_fns.h"
#include "orte/mca/debugger/base/base.h"
#include "mpirx.h"
/* permissions of the attach FIFO: read/write for the owner, read-only for group and others */
#define FILE_MODE (S_IRUSR | S_IWUSR | S_IRGRP | S_IROTH)
/* Static API's */
/* forward declarations of the module functions defined further down */
static int init(void);
static void finalize(void);
static void init_before_spawn(orte_job_t *jdata);
/* Module definition */
/*
 * Function table of the mpirx module, in the field order of
 * orte_debugger_base_module_t: init, finalize, init_before_spawn,
 * init_after_spawn.  The after-spawn step is the shared base
 * implementation rather than a function local to this file.
 */
orte_debugger_base_module_t orte_debugger_mpirx_module = {
init,
finalize,
init_before_spawn,
orte_debugger_base_init_after_spawn
};
/* local globals and functions */
/* event callback registered on the attach FIFO */
static void attach_debugger(int fd, short event, void *arg);
/* fills in the arguments of the debugger daemon's app context */
static void build_debugger_args(orte_app_context_t *debugger);
/* read event watching the attach FIFO */
static opal_event_t attach;
/* file descriptor of the opened attach FIFO */
static int attach_fd;
/*
 * Module init hook.  Nothing is set up at this point; the attach FIFO
 * is created later, in init_before_spawn().
 */
static int init(void)
{
return ORTE_SUCCESS;
}
/**
* Release resources associated with data structures for running under
* a debugger using the MPICH/TotalView parallel debugger interface.
*/
void finalize(void)
{
if (MPIR_proctable) {
free(MPIR_proctable);
MPIR_proctable = NULL;
}
}
/**
 * Initialization of data structures for running under a debugger
 * using the MPICH/TotalView parallel debugger interface. Before the
 * spawn we need to check if we are being run under a TotalView-like
 * debugger; if so then inform applications via an MCA parameter.
 *
 * Three cases are handled:
 *  - no debugger and no test daemon: create the attach FIFO in the job
 *    session directory, publish its path in MPIR_attach_fifo and watch
 *    it for a debugger that attaches later; the job itself is left
 *    untouched.
 *  - a debugger is present (or a test daemon was given): flag every
 *    application context as running inside a parallel debugger.
 *  - additionally, when a debugger daemon executable is known, set up
 *    the orte_debugger_daemon job so that the daemons are co-spawned.
 *
 * @param jdata  the job that is about to be spawned
 */
static void init_before_spawn(orte_job_t *jdata)
{
    char *env_name;
    orte_app_context_t *app;
    int i;
    int32_t ljob;
    char *attach_fifo;

    if (!MPIR_being_debugged && !orte_in_parallel_debugger) {
        /* if we were given a test debugger, then we still want to
         * colaunch it
         */
        if (NULL != orte_debugger_test_daemon) {
            goto launchit;
        }
        /* create the attachment FIFO and put it into MPIR, setup readevent */
        memset(&attach,0,sizeof(attach));
        /* create a FIFO name in the session dir */
        attach_fifo = opal_os_path(false, orte_process_info.job_session_dir, "debugger_attach_fifo", NULL);
        if (NULL == attach_fifo) {
            opal_output(0, "CANNOT CREATE FIFO");
            return;
        }
        if ((mkfifo(attach_fifo, FILE_MODE) < 0) && errno != EEXIST) {
            opal_output(0, "CANNOT CREATE FIFO");
            free(attach_fifo);
            return;
        }
        /* strncpy() leaves the destination unterminated when the source
         * fills it completely, so terminate explicitly
         */
        strncpy(MPIR_attach_fifo, attach_fifo, MPIR_MAX_PATH_LENGTH - 1);
        MPIR_attach_fifo[MPIR_MAX_PATH_LENGTH - 1] = '\0';
        /* a blocking read-only open of a FIFO does not return until a
         * writer shows up, which would stall the launch until a
         * debugger attaches - open non-blocking and let the event
         * library report when there is something to read
         */
        attach_fd = open(attach_fifo, O_RDONLY | O_NONBLOCK, 0);
        free(attach_fifo);
        if (attach_fd < 0) {
            /* do not register an event on an invalid descriptor */
            opal_output(0, "CANNOT OPEN FIFO");
            return;
        }
        opal_event_set(&attach, attach_fd, OPAL_EV_READ|OPAL_EV_PERSIST, attach_debugger, NULL);
        opal_event_add(&attach, 0);
        return;
    }

 launchit:
    if (orte_debug_flag) {
        opal_output(0, "Info: Spawned by a debugger");
    }

    /* tell the procs they are being debugged */
    env_name = mca_base_param_environ_variable("orte",
                                               "in_parallel_debugger", NULL);
    for (i=0; i < jdata->apps->size; i++) {
        if (NULL == (app = (orte_app_context_t*)opal_pointer_array_get_item(jdata->apps, i))) {
            continue;
        }
        opal_setenv(env_name, "1", true, &app->env);
    }
    free(env_name);

    /* check if we need to co-spawn the debugger daemons */
    if ('\0' != MPIR_executable_path[0] || NULL != orte_debugger_test_daemon) {
        /* can only have one debugger */
        if (NULL != orte_debugger_daemon) {
            opal_output(0, "-------------------------------------------\n"
                        "Only one debugger can be used on a job.\n"
                        "-------------------------------------------\n");
            ORTE_UPDATE_EXIT_STATUS(ORTE_ERROR_DEFAULT_EXIT_CODE);
            return;
        }
        /* add debugger info to launch message */
        orte_debugger_daemon = OBJ_NEW(orte_job_t);
        /* create a jobid for these daemons - this is done solely
         * to avoid confusing the rest of the system's bookkeeping
         */
        orte_plm_base_create_jobid(orte_debugger_daemon);
        /* flag the job as being debugger daemons */
        orte_debugger_daemon->controls |= ORTE_JOB_CONTROL_DEBUGGER_DAEMON;
        /* unless directed, we do not forward output */
        if (!MPIR_forward_output) {
            orte_debugger_daemon->controls &= ~ORTE_JOB_CONTROL_FORWARD_OUTPUT;
        }
        /* add it to the global job pool */
        ljob = ORTE_LOCAL_JOBID(orte_debugger_daemon->jobid);
        opal_pointer_array_set_item(orte_job_data, ljob, orte_debugger_daemon);
        /* create an app_context for the debugger daemon */
        app = OBJ_NEW(orte_app_context_t);
        /* the test daemon, when given, takes precedence over the
         * executable named by the debugger
         */
        if (NULL != orte_debugger_test_daemon) {
            app->app = strdup(orte_debugger_test_daemon);
        } else {
            app->app = strdup((char*)MPIR_executable_path);
        }
        opal_argv_append_nosize(&app->argv, app->app);
        build_debugger_args(app);
        opal_pointer_array_add(orte_debugger_daemon->apps, &app->super);
        orte_debugger_daemon->num_apps = 1;
    }
    return;
}
static void attach_debugger(int fd, short event, void *arg)
{
orte_app_context_t *app;
int rc;
int32_t ljob;
orte_job_t *jdata;
opal_output(0, "ATTACHING DEBUGGER");
if (!MPIR_being_debugged && !orte_debugger_test_attach) {
/* false alarm */
return;
}
if (orte_debug_flag) {
opal_output(0, "%s Attaching debugger %s", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
(NULL == orte_debugger_test_daemon) ? MPIR_executable_path : orte_debugger_test_daemon);
}
/* a debugger has attached! All the MPIR_Proctable
* data is already available, so we only need to
* check to see if we should spawn any daemons
*/
if ('\0' != MPIR_executable_path[0] || NULL != orte_debugger_test_daemon) {
/* can only have one debugger */
if (NULL != orte_debugger_daemon) {
opal_output(0, "-------------------------------------------\n"
"Only one debugger can be used on a job.\n"
"-------------------------------------------\n");
goto RELEASE;
}
/* this will be launched just like a regular job,
* so we do not use the global orte_debugger_daemon
* as this is reserved for co-location upon startup
*/
jdata = OBJ_NEW(orte_job_t);
/* create a jobid for these daemons - this is done solely
* to avoid confusing the rest of the system's bookkeeping
*/
orte_plm_base_create_jobid(jdata);
/* flag the job as being debugger daemons */
jdata->controls |= ORTE_JOB_CONTROL_DEBUGGER_DAEMON;
/* unless directed, we do not forward output */
if (!MPIR_forward_output) {
jdata->controls &= ~ORTE_JOB_CONTROL_FORWARD_OUTPUT;
}
/* add it to the global job pool */
ljob = ORTE_LOCAL_JOBID(jdata->jobid);
opal_pointer_array_set_item(orte_job_data, ljob, jdata);
/* create an app_context for the debugger daemon */
app = OBJ_NEW(orte_app_context_t);
if (NULL != orte_debugger_test_daemon) {
app->app = strdup(orte_debugger_test_daemon);
} else {
app->app = strdup((char*)MPIR_executable_path);
}
if (orte_hnp_is_allocated) {
app->num_procs = orte_process_info.num_procs;
} else {
app->num_procs = orte_process_info.num_procs - 1;
}
opal_argv_append_nosize(&app->argv, app->app);
build_debugger_args(app);
opal_pointer_array_add(jdata->apps, &app->super);
jdata->num_apps = 1;
/* setup the mapping policy to bynode so we get one
* daemon on each node
*/
jdata->map = OBJ_NEW(orte_job_map_t);
jdata->map->policy = ORTE_MAPPING_BYNODE;
/* now go ahead and spawn this job */
if (ORTE_SUCCESS != (rc = orte_plm.spawn(jdata))) {
ORTE_ERROR_LOG(rc);
}
}
RELEASE:
/* notify the debugger that all is ready */
MPIR_Breakpoint();
}
/*
 * Append the arguments the debugger left in MPIR_server_arguments to
 * the argv of the debugger daemon's app_context.
 *
 * The buffer is scanned over its full MPIR_MAX_ARG_LENGTH bytes as a
 * sequence of NUL-separated strings; every non-empty string is appended
 * in order and runs of NULs are skipped.  Nothing is appended when the
 * first byte of the buffer is NUL.
 *
 * @param debugger  app_context whose argv receives the arguments
 */
static void build_debugger_args(orte_app_context_t *debugger)
{
    int i, j;
    /* one spare byte: an argument that runs right up to the end of
     * MPIR_server_arguments must still be NUL-terminated here
     */
    char mpir_arg[MPIR_MAX_ARG_LENGTH + 1];
    char c;

    if ('\0' == MPIR_server_arguments[0]) {
        return;
    }

    j = 0;
    memset(mpir_arg, 0, sizeof(mpir_arg));
    for (i = 0; i < MPIR_MAX_ARG_LENGTH; i++) {
        c = MPIR_server_arguments[i];
        if ('\0' != c) {
            mpir_arg[j++] = c;
            continue;
        }
        /* end of a string - emit it unless it is empty */
        if (0 < j) {
            opal_argv_append_nosize(&debugger->argv, mpir_arg);
            memset(mpir_arg, 0, sizeof(mpir_arg));
            j = 0;
        }
    }

    /* the last argument may reach the end of the buffer without a
     * terminating NUL; do not silently drop it
     */
    if (0 < j) {
        opal_argv_append_nosize(&debugger->argv, mpir_arg);
    }
}

25
orte/mca/debugger/mpirx/mpirx.h Обычный файл
Просмотреть файл

@ -0,0 +1,25 @@
/* -*- C -*-
*
* Copyright (c) 2010 Cisco Systems, Inc. All rights reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
*
* $HEADER$
*
*/
/* Public declarations of the "mpirx" debugger component. */
#ifndef DEBUGGER_MPIRX_H
#define DEBUGGER_MPIRX_H
#include "orte_config.h"
#include "orte/mca/debugger/debugger.h"
BEGIN_C_DECLS
/* Component descriptor; it is defined in mpirx_component.c */
ORTE_MODULE_DECLSPEC extern orte_debugger_base_component_t mca_debugger_mpirx_component;
/* Module object that the component's query function hands back to the
 * caller; its definition lives elsewhere in the component */
extern orte_debugger_base_module_t orte_debugger_mpirx_module;
END_C_DECLS
#endif /* DEBUGGER_MPIRX_H */

47
orte/mca/debugger/mpirx/mpirx_component.c Обычный файл
Просмотреть файл

@ -0,0 +1,47 @@
/* -*- C -*-
*
* Copyright (c) 2010 Cisco Systems, Inc. All rights reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
*
* $HEADER$
*/
#include "orte_config.h"
#include "orte/constants.h"
#include "mpirx.h"
/* Query hook referenced by the component descriptor below */
static int component_query(mca_base_module_t **module, int *priority);
/*
* Descriptor of the "mpirx" debugger component: version information,
* the component name and the function pointers of the component
*/
orte_debugger_base_component_t mca_debugger_mpirx_component = {
{
/* version of the debugger framework interface this component is built against */
ORTE_DEBUGGER_BASE_VERSION_1_0_0,
"mpirx", /* MCA module name */
ORTE_MAJOR_VERSION, /* MCA module major version */
ORTE_MINOR_VERSION, /* MCA module minor version */
ORTE_RELEASE_VERSION, /* MCA module release version */
/* NOTE(review): the two NULL slots are presumably the component
* open and close hooks, which this component does not provide -
* confirm against the MCA base component structure */
NULL,
NULL,
component_query /* module query */
},
{
/* The component is checkpoint ready */
MCA_BASE_METADATA_PARAM_CHECKPOINT
}
};
/*
 * Query hook of the mpirx component: always offers the mpirx module,
 * at a fixed priority of 10.
 *
 * @param module    receives a pointer to orte_debugger_mpirx_module
 * @param priority  receives the priority (10)
 * @return ORTE_SUCCESS, unconditionally
 */
static int component_query(mca_base_module_t **module, int *priority)
{
    mca_base_module_t *mpirx = (mca_base_module_t *)&orte_debugger_mpirx_module;

    *module = mpirx;
    *priority = 10;

    return ORTE_SUCCESS;
}

Просмотреть файл

@ -41,7 +41,7 @@
#include "orte/mca/rmaps/rmaps_types.h"
#include "orte/mca/sensor/sensor.h"
#include "orte/mca/routed/routed.h"
#include "orte/tools/orterun/debuggers.h"
#include "orte/mca/debugger/base/base.h"
#include "orte/mca/errmgr/errmgr.h"
#include "orte/mca/errmgr/base/base.h"
@ -516,7 +516,7 @@ static void hnp_abort(orte_jobid_t job, orte_exit_code_t exit_code)
ORTE_JOBID_PRINT(job), exit_code));
/* if debuggers are running, clean up */
orte_debugger_finalize();
orte_debugger.finalize();
/* set control params to indicate we are terminating */
orte_job_term_ordered = true;

Просмотреть файл

@ -59,6 +59,7 @@
#include "orte/mca/db/base/base.h"
#include "orte/mca/sensor/base/base.h"
#include "orte/mca/sensor/sensor.h"
#include "orte/mca/debugger/base/base.h"
#include "orte/mca/rmaps/base/base.h"
#if OPAL_ENABLE_FT_CR == 1
@ -553,6 +554,18 @@ static int rte_init(void)
/* start the local sensors */
orte_sensor.start(ORTE_PROC_MY_NAME->jobid);
/* start the debuggers */
if (ORTE_SUCCESS != (ret = orte_debugger_base_open())) {
ORTE_ERROR_LOG(ret);
error = "orte_debugger_open";
goto error;
}
if (ORTE_SUCCESS != (ret = orte_debugger_base_select())) {
ORTE_ERROR_LOG(ret);
error = "orte_debugger_select";
goto error;
}
/* if a tool has launched us and is requesting event reports,
* then set its contact info into the comm system
*/
@ -602,6 +615,9 @@ static int rte_finalize(void)
orte_job_t *job;
int i;
/* stop the debuggers */
orte_debugger_base_close();
/* stop the local sensors */
orte_sensor.stop(ORTE_PROC_MY_NAME->jobid);

Просмотреть файл

@ -85,8 +85,6 @@ char **orted_cmd_line=NULL;
/* debugger flags */
orte_job_t *orte_debugger_daemon=NULL;
bool orte_enable_debug_cospawn_while_running;
int orte_debugger_check_rate;
bool orte_output_debugger_proctable=false;
char *orte_debugger_test_daemon=NULL;
bool orte_debugger_test_attach=false;

Просмотреть файл

@ -592,8 +592,6 @@ ORTE_DECLSPEC extern char **orted_cmd_line;
/* debugger flags */
ORTE_DECLSPEC extern orte_job_t *orte_debugger_daemon;
ORTE_DECLSPEC extern bool orte_enable_debug_cospawn_while_running;
ORTE_DECLSPEC extern int orte_debugger_check_rate;
ORTE_DECLSPEC extern bool orte_output_debugger_proctable;
ORTE_DECLSPEC extern char *orte_debugger_test_daemon;
ORTE_DECLSPEC extern bool orte_debugger_test_attach;

Просмотреть файл

@ -148,19 +148,6 @@ int orte_register_params(void)
true, false, 0, &value);
orte_in_parallel_debugger = OPAL_INT_TO_BOOL(value);
mca_base_param_reg_int_name("orte",
"enable_debug_cospawn_while_running",
"Whether a debugger can attach to the job "
"while it is running and request it co-locate debugger daemons (default: false)",
false, false, (int)false, &value);
orte_enable_debug_cospawn_while_running = OPAL_INT_TO_BOOL(value);
mca_base_param_reg_int_name("orte",
"debugger_check_rate",
"How often (in seconds) to check if a debugger "
"has attached to a running job and requested cospawn support (default: 2 sec)",
false, false, 2, &orte_debugger_check_rate);
mca_base_param_reg_int_name("orte",
"output_debugger_proctable",
"Whether or not to output the debugger proctable after launch (default: false)",

Просмотреть файл

@ -51,9 +51,7 @@ endif # OMPI_INSTALL_BINARIES
orterun_SOURCES = \
main.c \
orterun.c \
orterun.h \
debuggers.h \
debuggers.c
orterun.h
orterun_LDADD = $(top_builddir)/orte/libopen-rte.la

Просмотреть файл

@ -1,774 +0,0 @@
/* -*- C -*-
*
* Copyright (c) 2004-2006 The Trustees of Indiana University and Indiana
* University Research and Technology
* Corporation. All rights reserved.
* Copyright (c) 2004-2006 The University of Tennessee and The University
* of Tennessee Research Foundation. All rights
* reserved.
* Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
* University of Stuttgart. All rights reserved.
* Copyright (c) 2004-2005 The Regents of the University of California.
* All rights reserved.
* Copyright (c) 2007 Sun Microsystems, Inc. All rights reserved.
* Copyright (c) 2007 Los Alamos National Security, LLC. All rights
* reserved.
* Copyright (c) 2007-2010 Cisco Systems, Inc. All rights reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
*
* $HEADER$
*/
#include "orte_config.h"
/*
* Debugger support for orterun
*
* We interpret the MPICH debugger interface as follows:
*
* a) The launcher
* - spawns the other processes,
* - fills in the table MPIR_proctable, and sets MPIR_proctable_size
* - sets MPIR_debug_state to MPIR_DEBUG_SPAWNED ( = 1)
* - calls MPIR_Breakpoint() which the debugger will have a
* breakpoint on.
*
* b) Applications start and then spin until MPIR_debug_gate is set
* non-zero by the debugger.
*
* This file implements (a).
*
**************************************************************************
*
* Note that we have presently tested both TotalView and DDT parallel
* debuggers. They both nominally subscribe to the Etnus attaching
* interface, but there are differences between the two.
*
* TotalView: user launches "totalview mpirun -a ...<mpirun args>...".
* TV launches mpirun. mpirun launches the application and then calls
* MPIR_Breakpoint(). This is the signal to TV that it's a parallel
* MPI job. TV then reads the proctable in mpirun and attaches itself
* to all the processes (it takes care of launching itself on the
* remote nodes). Upon attaching to all the MPI processes, the
* variable MPIR_being_debugged is set to 1. When it has finished
* attaching itself to all the MPI processes that it wants to,
* MPIR_Breakpoint() returns.
*
* DDT: user launches "ddt bin -np X <mpi app name>". DDT fork/exec's
* mpirun to launch ddt-debugger on the back-end nodes via "mpirun -np
* X ddt-debugger" (note the lack of other arguments -- we can't pass
* anything to mpirun). This app will eventually fork/exec the MPI
* app. DDT does not currently set MPIR_being_debugged in the MPI app.
*
**************************************************************************
*
* We support two ways of waiting for attaching debuggers. The
* implementation spans this file and ompi/debuggers/ompi_debuggers.c.
*
* 1. If using orterun: MPI processes will have the
* orte_in_parallel_debugger MCA param set to true (because not all
* debuggers consistently set MPIR_being_debugged in both the launcher
* and in the MPI procs). The HNP will call MPIR_Breakpoint() and
* then RML send a message to VPID 0 (MCW rank 0) when it returns
* (MPIR_Breakpoint() doesn't return until the debugger has attached
* to all relevant processes). Meanwhile, VPID 0 blocks waiting for
* the RML message. All other VPIDs immediately call the grpcomm
* barrier (and therefore block until the debugger attaches). Once
* VPID 0 receives the RML message, we know that the debugger has
* attached to all processes that it cares about, and VPID 0 then
* joins the grpcomm barrier, allowing the job to continue. This
* scheme has the side effect of nicely supporting partial attaches by
* parallel debuggers (i.e., attaching to only some of the MPI
* processes; not necessarily all of them).
*
* 2. If not using orterun: in this case, ORTE_DISABLE_FULL_SUPPORT
* will be true, and we know that there will not be an RML message
* sent to VPID 0. So we have to look for a magic environment
* variable from the launcher to know if the jobs will be attached by
* a debugger (e.g., set by yod, srun, ...etc.), and if so, spin on
* MPIR_debug_gate. These environment variable names must be
* hard-coded in the OMPI layer (see ompi/debuggers/ompi_debuggers.c).
*/
#ifdef HAVE_STRING_H
#include <string.h>
#endif
#include <stdio.h>
#ifdef HAVE_STDLIB_H
#include <stdlib.h>
#endif /* HAVE_STDLIB_H */
#ifdef HAVE_STRINGS_H
#include <strings.h>
#endif /* HAVE_STRINGS_H */
#ifdef HAVE_UNISTD_H
#include <unistd.h>
#endif /* HAVE_UNISTD_H */
#include <ctype.h>
#include "opal/util/argv.h"
#include "opal/util/output.h"
#include "opal/util/path.h"
#include "opal/util/os_path.h"
#include "opal/util/opal_sos.h"
#include "opal/class/opal_pointer_array.h"
#include "opal/mca/base/mca_base_param.h"
#include "opal/util/opal_getcwd.h"
#include "orte/mca/errmgr/errmgr.h"
#include "orte/mca/rml/rml.h"
#include "orte/mca/rml/rml_types.h"
#include "orte/mca/plm/plm.h"
#include "orte/mca/plm/base/plm_private.h"
#include "orte/mca/rmaps/rmaps_types.h"
#include "orte/runtime/orte_globals.h"
#include "orte/runtime/orte_wait.h"
#include "orte/util/show_help.h"
#include "orte/util/name_fns.h"
#include "debuggers.h"
/* +++ begin MPICH/TotalView std debugger interface definitions */
/* Size in bytes of the MPIR_executable_path buffer */
#define MPIR_MAX_PATH_LENGTH 256
/* Size in bytes of the MPIR_server_arguments buffer */
#define MPIR_MAX_ARG_LENGTH 1024
/* One entry of the process table; orte_debugger_init_after_spawn()
* stores each process at the index given by its vpid */
struct MPIR_PROCDESC {
char *host_name; /* something that can be passed to inet_addr */
char *executable_name; /* name of binary */
int pid; /* process pid */
};
/* Process table: allocated and filled by orte_debugger_init_after_spawn(),
* released by orte_debugger_finalize() */
struct MPIR_PROCDESC *MPIR_proctable = NULL;
/* Number of entries in MPIR_proctable */
int MPIR_proctable_size = 0;
/* Only read in this file; per the header comment above, the debugger
* sets it when it attaches */
volatile int MPIR_being_debugged = 0;
/* Set to 1 (MPIR_DEBUG_SPAWNED) by orte_debugger_init_after_spawn() */
volatile int MPIR_debug_state = 0;
/* Never written in this file and only printed by dump();
* NOTE(review): presumably part of the interface for the debugger's benefit */
volatile int MPIR_i_am_starter = 0;
/* Initialized to 1 and only printed by dump() in this file */
volatile int MPIR_partial_attach_ok = 1;
/* When its first byte is non-NUL, used as the executable of the
* co-spawned debugger daemons; cleared by orte_run_debugger() */
volatile char MPIR_executable_path[MPIR_MAX_PATH_LENGTH];
/* NUL-separated arguments for the debugger daemons, split up by
* build_debugger_args(); cleared by orte_run_debugger() */
volatile char MPIR_server_arguments[MPIR_MAX_ARG_LENGTH];
/* When zero, output forwarding is switched off for the debugger-daemon job */
volatile int MPIR_forward_output = 0;
/* Not referenced anywhere else in this file */
volatile int MPIR_forward_comm = 0;
/* --- end MPICH/TotalView std debugger interface definitions */
#define DUMP_INT(X) fprintf(stderr, " %s = %d\n", # X, X);
static void dump(void)
{
int i;
DUMP_INT(MPIR_being_debugged);
DUMP_INT(MPIR_debug_state);
DUMP_INT(MPIR_partial_attach_ok);
DUMP_INT(MPIR_i_am_starter);
DUMP_INT(MPIR_forward_output);
DUMP_INT(MPIR_proctable_size);
fprintf(stderr, " MPIR_proctable:\n");
for (i = 0; i < MPIR_proctable_size; i++) {
fprintf(stderr,
" (i, host, exe, pid) = (%d, %s, %s, %d)\n",
i,
MPIR_proctable[i].host_name,
MPIR_proctable[i].executable_name,
MPIR_proctable[i].pid);
}
fprintf(stderr, "MPIR_executable_path: %s\n",
('\0' == MPIR_executable_path[0]) ?
"NULL" : (char*) MPIR_executable_path);
fprintf(stderr, "MPIR_server_arguments: %s\n",
('\0' == MPIR_server_arguments[0]) ?
"NULL" : (char*) MPIR_server_arguments);
}
/*
 * Process one line from the orte_base_user_debugger MCA param and
 * look for that debugger in the path.  If we find it, fill in
 * new_argv.
 *
 * The line is copied, stripped of leading and trailing whitespace, and
 * its @token@ placeholders (@mpirun@, @orterun@, @mpirun_args@,
 * @orterun_args@, @np@, @single_app@, @executable@, @executable_argv@)
 * are expanded.  The result is split on spaces and its first word is
 * searched for as an executable.
 *
 * @param orig_line  one debugger command template (not modified)
 * @param basename   unused here
 * @param cmd_line   parsed command line; its tail supplies the user
 *                   executable and that executable's arguments
 * @param argc,argv  the original mpirun/orterun command line
 * @param new_argv   on ORTE_SUCCESS receives the debugger argv (caller
 *                   frees); set to NULL on ORTE_ERR_NOT_FOUND
 * @param num_procs  value substituted for @np@
 *
 * @return ORTE_SUCCESS             the debugger was found and is usable
 *         ORTE_ERR_OUT_OF_RESOURCE the line could not be copied
 *         ORTE_ERROR               the line is empty after trimming
 *         ORTE_ERR_NOT_FOUND       the debugger was not found, or one of
 *                                  its requirements was not met (a help
 *                                  message has been shown in that case)
 */
static int process(char *orig_line, char *basename, opal_cmd_line_t *cmd_line,
                   int argc, char **argv, char ***new_argv, int num_procs)
{
    int i;
    char *line, *full_line = strdup(orig_line);
    char *user_argv, *tmp, *tmp2, **tmp_argv, **executable;
    char cwd[OPAL_PATH_MAX];
    bool used_num_procs = false;
    bool single_app = false;
    bool fail_needed_executable = false;

    line = full_line;
    if (NULL == line) {
        return ORTE_ERR_OUT_OF_RESOURCE;
    }

    /* Trim off whitespace at the beginning and ending of line.  The
       casts keep isspace() defined for chars with the high bit set. */
    while ('\0' != *line && isspace((unsigned char) *line)) {
        ++line;
    }
    for (i = (int) strlen(line) - 1;
         i >= 0 && isspace((unsigned char) line[i]); --i) {
        line[i] = '\0';
    }
    if ('\0' == line[0]) {
        /* nothing left - release our copy before bailing out */
        free(full_line);
        return ORTE_ERROR;
    }

    /* Get the tail of the command line (i.e., the user executable /
       argv) */
    opal_cmd_line_get_tail(cmd_line, &i, &executable);

    /* Remove --debug, --debugger, and -tv from the user command line
       params */
    if (1 == argc) {
        user_argv = strdup("");
    } else {
        tmp_argv = opal_argv_copy(argv);
        for (i = 0; NULL != tmp_argv[i]; ++i) {
            if (0 == strcmp(tmp_argv[i], "-debug") ||
                0 == strcmp(tmp_argv[i], "--debug")) {
                free(tmp_argv[i]);
                tmp_argv[i] = strdup("");
            } else if (0 == strcmp(tmp_argv[i], "-tv") ||
                       0 == strcmp(tmp_argv[i], "--tv")) {
                free(tmp_argv[i]);
                tmp_argv[i] = strdup("");
            } else if (0 == strcmp(tmp_argv[i], "--debugger") ||
                       0 == strcmp(tmp_argv[i], "-debugger")) {
                free(tmp_argv[i]);
                tmp_argv[i] = strdup("");
                /* the option's value goes as well */
                if (NULL != tmp_argv[i + 1]) {
                    ++i;
                    free(tmp_argv[i]);
                    tmp_argv[i] = strdup("");
                }
            }
        }
        user_argv = opal_argv_join(tmp_argv + 1, ' ');
        opal_argv_free(tmp_argv);
    }

    /* Replace @@ tokens - line should never realistically be bigger
       than MAX_INT, so just cast to int to remove compiler warning */
    for (i = 0; i < (int) strlen(line); ++i) {
        tmp = NULL;
        if (0 == strncmp(line + i, "@mpirun@", 8)) {
            line[i] = '\0';
            asprintf(&tmp, "%s%s%s", line, argv[0], line + i + 8);
        } else if (0 == strncmp(line + i, "@orterun@", 9)) {
            line[i] = '\0';
            asprintf(&tmp, "%s%s%s", line, argv[0], line + i + 9);
        } else if (0 == strncmp(line + i, "@mpirun_args@", 13)) {
            line[i] = '\0';
            asprintf(&tmp, "%s%s%s", line, user_argv, line + i + 13);
        } else if (0 == strncmp(line + i, "@orterun_args@", 14)) {
            line[i] = '\0';
            asprintf(&tmp, "%s%s%s", line, user_argv, line + i + 14);
        } else if (0 == strncmp(line + i, "@np@", 4)) {
            line[i] = '\0';
            asprintf(&tmp, "%s%d%s", line, num_procs,
                     line + i + 4);
            used_num_procs = true;
        } else if (0 == strncmp(line + i, "@single_app@", 12)) {
            line[i] = '\0';
            /* This token is only a flag; it is not replaced with any
               alternate text */
            asprintf(&tmp, "%s%s", line, line + i + 12);
            single_app = true;
        } else if (0 == strncmp(line + i, "@executable@", 12)) {
            line[i] = '\0';
            /* If we found the executable, paste it in.  Otherwise,
               this is a possible error. */
            if (NULL != executable) {
                asprintf(&tmp, "%s%s%s", line, executable[0], line + i + 12);
            } else {
                fail_needed_executable = true;
            }
        } else if (0 == strncmp(line + i, "@executable_argv@", 17)) {
            line[i] = '\0';
            /* If we found the tail, paste in the argv.  Otherwise,
               this is a possible error. */
            if (NULL != executable) {
                if (NULL != executable[1]) {
                    /* Put in the argv */
                    tmp2 = opal_argv_join(executable + 1, ' ');
                    asprintf(&tmp, "%s%s%s", line, tmp2, line + i + 17);
                    free(tmp2);
                } else {
                    /* There is no argv; just paste the front and back
                       together, removing the @token@ */
                    asprintf(&tmp, "%s%s", line, line + i + 17);
                }
            } else {
                fail_needed_executable = true;
            }
        }

        if (NULL != tmp) {
            /* continue scanning in the expanded line, re-examining the
               position where the token used to start */
            free(full_line);
            full_line = line = tmp;
            --i;
        }
    }

    /* Split up into argv */
    *new_argv = opal_argv_split(line, ' ');
    free(full_line);
    if (NULL == *new_argv || NULL == (*new_argv)[0]) {
        /* nothing usable came out of the split */
        if (NULL != *new_argv) {
            opal_argv_free(*new_argv);
            *new_argv = NULL;
        }
        free(user_argv);
        return ORTE_ERR_NOT_FOUND;
    }

    /* Can we find argv[0] in the path? */
    if (NULL == getcwd(cwd, OPAL_PATH_MAX)) {
        /* never hand an uninitialized buffer to the path search */
        cwd[0] = '\0';
    }
    tmp = opal_path_findv((*new_argv)[0], X_OK, environ, cwd);
    if (NULL != tmp) {
        free(tmp);

        /* Ok, we found a good debugger.  Check for some error
           conditions. */
        tmp = opal_argv_join(argv, ' ');

        /* We do not support launching a debugger that requires the
           -np value if the user did not specify -np on the command
           line. */
        if (used_num_procs && 0 == num_procs) {
            orte_show_help("help-orterun.txt", "debugger requires -np",
                           true, (*new_argv)[0], argv[0], user_argv,
                           (*new_argv)[0]);
            /* Fall through to free / fail, below */
        }

        /* Some debuggers do not support launching MPMD */
        else if (single_app && NULL != tmp && NULL != strchr(tmp, ':')) {
            orte_show_help("help-orterun.txt",
                           "debugger only accepts single app", true,
                           (*new_argv)[0], (*new_argv)[0]);
            /* Fall through to free / fail, below */
        }

        /* Some debuggers do not use orterun/mpirun, and therefore
           must have an executable to run (e.g., cannot use mpirun's
           app context file feature). */
        else if (fail_needed_executable) {
            orte_show_help("help-orterun.txt",
                           "debugger requires executable", true,
                           (*new_argv)[0], argv[0], (*new_argv)[0], argv[0],
                           (*new_argv)[0]);
            /* Fall through to free / fail, below */
        }

        /* Otherwise, we succeeded.  Return happiness. */
        else {
            free(tmp);
            free(user_argv);
            return ORTE_SUCCESS;
        }
        free(tmp);
    }

    /* All done -- didn't find it */
    opal_argv_free(*new_argv);
    *new_argv = NULL;
    free(user_argv);
    return ORTE_ERR_NOT_FOUND;
}
/**
 * Replace this process with a user-level debugger.
 *
 * The colon-separated debugger templates held in the
 * orte_base_user_debugger MCA parameter are tried in order via
 * process(); the first one that resolves is exec'ed.  This function
 * does not return: it either execs the debugger or prints a help
 * message and exits with status 1.
 */
void orte_run_debugger(char *basename, opal_cmd_line_t *cmd_line,
                       int argc, char *argv[], int num_procs)
{
    char **debugger_argv = NULL;
    char **templates, **candidate;
    char *param_value = NULL;
    char *flag_name, *cmd_string;
    int param_id;

    /* Get the orte_base_user_debugger MCA parameter and search it for
       a debugger that can run */
    param_id = mca_base_param_find("orte", NULL, "base_user_debugger");
    if (param_id < 0) {
        orte_show_help("help-orterun.txt", "debugger-mca-param-not-found",
                       true);
        exit(1);
    }
    mca_base_param_lookup_string(param_id, &param_value);
    if (NULL == param_value) {
        orte_show_help("help-orterun.txt", "debugger-orte_base_user_debugger-empty",
                       true);
        exit(1);
    }

    /* Try each template of the MCA param in turn; stop at the first
       one that process() accepts */
    templates = opal_argv_split(param_value, ':');
    free(param_value);
    for (candidate = templates; NULL != *candidate; ++candidate) {
        if (ORTE_SUCCESS == process(*candidate, basename, cmd_line, argc, argv,
                                    &debugger_argv, num_procs)) {
            break;
        }
    }

    /* Ran off the end of the list: no usable debugger, so abort */
    if (NULL == *candidate) {
        orte_show_help("help-orterun.txt", "debugger-not-found", true);
        exit(1);
    }
    opal_argv_free(templates);

    /* We have a debugger.  Zero the MPIR arrays in case the debugger
       does not set them itself */
    memset((char*)MPIR_executable_path, 0, MPIR_MAX_PATH_LENGTH);
    memset((char*)MPIR_server_arguments, 0, MPIR_MAX_ARG_LENGTH);

    /* Export an MCA param so that everyone knows they are being
       launched under a debugger; debuggers are not consistent about
       setting MPIR_being_debugged in both the launcher and the MPI
       processes */
    flag_name = mca_base_param_environ_variable("orte",
                                                "in_parallel_debugger", NULL);
    if (NULL != flag_name) {
        opal_setenv(flag_name, "1", true, &environ);
        free(flag_name);
    }

    /* Launch the debugger; everything below only runs if exec failed */
    execvp(debugger_argv[0], debugger_argv);

    cmd_string = opal_argv_join(debugger_argv, ' ');
    orte_show_help("help-orterun.txt", "debugger-exec-failed",
                   true, basename, cmd_string, debugger_argv[0]);
    free(cmd_string);
    opal_argv_free(debugger_argv);
    exit(1);
}
/*
 * Append the arguments the debugger left in MPIR_server_arguments to
 * the argv of the debugger daemon's app_context.
 *
 * The buffer is scanned over its full MPIR_MAX_ARG_LENGTH bytes as a
 * sequence of NUL-separated strings; every non-empty string is appended
 * in order and runs of NULs are skipped.  Nothing is appended when the
 * first byte of the buffer is NUL.
 *
 * @param debugger  app_context whose argv receives the arguments
 */
static void build_debugger_args(orte_app_context_t *debugger)
{
    int i, j;
    /* one spare byte: an argument that runs right up to the end of
     * MPIR_server_arguments must still be NUL-terminated here
     */
    char mpir_arg[MPIR_MAX_ARG_LENGTH + 1];
    char c;

    if ('\0' == MPIR_server_arguments[0]) {
        return;
    }

    j = 0;
    memset(mpir_arg, 0, sizeof(mpir_arg));
    for (i = 0; i < MPIR_MAX_ARG_LENGTH; i++) {
        c = MPIR_server_arguments[i];
        if ('\0' != c) {
            mpir_arg[j++] = c;
            continue;
        }
        /* end of a string - emit it unless it is empty */
        if (0 < j) {
            opal_argv_append_nosize(&debugger->argv, mpir_arg);
            memset(mpir_arg, 0, sizeof(mpir_arg));
            j = 0;
        }
    }

    /* the last argument may reach the end of the buffer without a
     * terminating NUL; do not silently drop it
     */
    if (0 < j) {
        opal_argv_append_nosize(&debugger->argv, mpir_arg);
    }
}
static void check_debugger(int fd, short event, void *arg)
{
struct timeval now;
opal_event_t *tmp = (opal_event_t*)arg;
orte_app_context_t *app;
int rc;
int32_t ljob;
orte_job_t *jdata;
if (MPIR_being_debugged || orte_debugger_test_attach) {
if (orte_debug_flag) {
opal_output(0, "%s Launching debugger %s", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
(NULL == orte_debugger_test_daemon) ? MPIR_executable_path : orte_debugger_test_daemon);
}
/* a debugger has attached! All the MPIR_Proctable
* data is already available, so we only need to
* check to see if we should spawn any daemons
*/
if ('\0' != MPIR_executable_path[0] || NULL != orte_debugger_test_daemon) {
/* can only have one debugger */
if (NULL != orte_debugger_daemon) {
opal_output(0, "-------------------------------------------\n"
"Only one debugger can be used on a job.\n"
"-------------------------------------------\n");
goto RELEASE;
}
/* this will be launched just like a regular job,
* so we do not use the global orte_debugger_daemon
* as this is reserved for co-location upon startup
*/
jdata = OBJ_NEW(orte_job_t);
/* create a jobid for these daemons - this is done solely
* to avoid confusing the rest of the system's bookkeeping
*/
orte_plm_base_create_jobid(jdata);
/* flag the job as being debugger daemons */
jdata->controls |= ORTE_JOB_CONTROL_DEBUGGER_DAEMON;
/* unless directed, we do not forward output */
if (!MPIR_forward_output) {
jdata->controls &= ~ORTE_JOB_CONTROL_FORWARD_OUTPUT;
}
/* add it to the global job pool */
ljob = ORTE_LOCAL_JOBID(jdata->jobid);
opal_pointer_array_set_item(orte_job_data, ljob, jdata);
/* create an app_context for the debugger daemon */
app = OBJ_NEW(orte_app_context_t);
if (NULL != orte_debugger_test_daemon) {
app->app = strdup(orte_debugger_test_daemon);
} else {
app->app = strdup((char*)MPIR_executable_path);
}
if (orte_hnp_is_allocated) {
app->num_procs = orte_process_info.num_procs;
} else {
app->num_procs = orte_process_info.num_procs - 1;
}
opal_argv_append_nosize(&app->argv, app->app);
build_debugger_args(app);
opal_pointer_array_add(jdata->apps, &app->super);
jdata->num_apps = 1;
/* setup the mapping policy to bynode so we get one
* daemon on each node
*/
jdata->map = OBJ_NEW(orte_job_map_t);
jdata->map->policy = ORTE_MAPPING_BYNODE;
/* now go ahead and spawn this job */
if (ORTE_SUCCESS != (rc = orte_plm.spawn(jdata))) {
ORTE_ERROR_LOG(rc);
}
}
RELEASE:
/* notify the debugger that all is ready */
MPIR_Breakpoint();
} else {
/* reissue the timer to wake us up again */
now.tv_sec = orte_debugger_check_rate;
now.tv_usec = 0;
opal_evtimer_add(tmp, &now);
}
}
/**
 * Initialization of data structures for running under a debugger
 * using the MPICH/TotalView parallel debugger interface.  Before the
 * spawn we need to check if we are being run under a TotalView-like
 * debugger; if so then inform applications via an MCA parameter.
 *
 * When no debugger is present, this either arms the periodic attach
 * check (orte_enable_debug_cospawn_while_running) or, if a test
 * daemon was given, proceeds as if a debugger were present.
 *
 * @param jdata  the job about to be spawned; the environment of each
 *               of its app_contexts is marked as "in parallel debugger"
 *
 * @return ORTE_SUCCESS, or ORTE_ERROR if debugger daemons are
 *         requested but a debugger-daemon job already exists
 */
int orte_debugger_init_before_spawn(orte_job_t *jdata)
{
    char *env_name;
    orte_app_context_t *app;
    int i;
    int32_t ljob;

    if (!MPIR_being_debugged && !orte_in_parallel_debugger) {
        /* not being debugged - check if we want to enable
         * later attachment by debugger
         */
        if (orte_enable_debug_cospawn_while_running) {
            /* setup a timer to wake us up periodically
             * to check for debugger attach
             */
            ORTE_TIMER_EVENT(orte_debugger_check_rate, 0, check_debugger);
            return ORTE_SUCCESS;
        }
        /* if we were given a test debugger, then we still want to
         * colaunch it; otherwise there is nothing to do
         */
        if (NULL == orte_debugger_test_daemon) {
            return ORTE_SUCCESS;
        }
    }

    if (orte_debug_flag) {
        opal_output(0, "Info: Spawned by a debugger");
    }

    /* tell the procs they are being debugged - the variable name may
     * not be available, so check it just as orte_run_debugger does
     */
    env_name = mca_base_param_environ_variable("orte",
                                               "in_parallel_debugger", NULL);
    if (NULL != env_name) {
        for (i=0; i < jdata->apps->size; i++) {
            if (NULL == (app = (orte_app_context_t*)opal_pointer_array_get_item(jdata->apps, i))) {
                continue;
            }
            opal_setenv(env_name, "1", true, &app->env);
        }
        free(env_name);
    }

    /* check if we need to co-spawn the debugger daemons */
    if ('\0' != MPIR_executable_path[0] || NULL != orte_debugger_test_daemon) {
        /* can only have one debugger */
        if (NULL != orte_debugger_daemon) {
            opal_output(0, "-------------------------------------------\n"
                        "Only one debugger can be used on a job.\n"
                        "-------------------------------------------\n");
            ORTE_UPDATE_EXIT_STATUS(ORTE_ERROR_DEFAULT_EXIT_CODE);
            return ORTE_ERROR;
        }
        /* add debugger info to launch message */
        orte_debugger_daemon = OBJ_NEW(orte_job_t);
        /* create a jobid for these daemons - this is done solely
         * to avoid confusing the rest of the system's bookkeeping
         */
        orte_plm_base_create_jobid(orte_debugger_daemon);
        /* flag the job as being debugger daemons */
        orte_debugger_daemon->controls |= ORTE_JOB_CONTROL_DEBUGGER_DAEMON;
        /* unless directed, we do not forward output */
        if (!MPIR_forward_output) {
            orte_debugger_daemon->controls &= ~ORTE_JOB_CONTROL_FORWARD_OUTPUT;
        }
        /* add it to the global job pool */
        ljob = ORTE_LOCAL_JOBID(orte_debugger_daemon->jobid);
        opal_pointer_array_set_item(orte_job_data, ljob, orte_debugger_daemon);
        /* create an app_context for the debugger daemon; the test
         * daemon takes precedence over the debugger-provided path
         */
        app = OBJ_NEW(orte_app_context_t);
        if (NULL != orte_debugger_test_daemon) {
            app->app = strdup(orte_debugger_test_daemon);
        } else {
            app->app = strdup((char*)MPIR_executable_path);
        }
        opal_argv_append_nosize(&app->argv, app->app);
        build_debugger_args(app);
        opal_pointer_array_add(orte_debugger_daemon->apps, &app->super);
        orte_debugger_daemon->num_apps = 1;
    }
    return ORTE_SUCCESS;
}
/**
 * Initialization of data structures for running under a debugger
 * using the MPICH/TotalView parallel debugger interface.  This stage
 * of initialization must occur after spawn
 *
 * NOTE: We -always- perform this step to ensure that any debugger
 * that attaches to us post-launch of the application can get a
 * completed proctable
 *
 * The table is built only once; later calls return immediately.  Each
 * process is stored at the index equal to its vpid.  Entries for which
 * no proc or app_context is found stay zeroed (NULL names, pid 0).
 *
 * If MPIR_being_debugged is set, this also waits until every process
 * has reported in, calls MPIR_Breakpoint(), and then sends the
 * debugger-release message to rank 0 of the job.
 *
 * @param jdata  the job that was just spawned
 */
void orte_debugger_init_after_spawn(orte_job_t *jdata)
{
    orte_proc_t *proc;
    orte_app_context_t *appctx;
    orte_vpid_t i, j;
    opal_buffer_t buf;
    orte_process_name_t rank0;
    int rc;

    if (MPIR_proctable) {
        /* already initialized */
        return;
    }

    /* fill in the proc table for the application processes */
    if (orte_debug_flag) {
        opal_output(0, "Info: Setting up debugger process table for applications\n");
    }
    MPIR_debug_state = 1;

    /* allocate MPIR_proctable zero-filled, so that entries skipped
     * below never expose uninitialized pointers to dump() or to the
     * debugger
     */
    MPIR_proctable = (struct MPIR_PROCDESC *) calloc(jdata->num_procs,
                                                     sizeof(struct MPIR_PROCDESC));
    if (MPIR_proctable == NULL) {
        /* never advertise a size without a table behind it */
        MPIR_proctable_size = 0;
        opal_output(0, "Error: Out of memory\n");
        return;
    }
    /* set the total number of processes in the job */
    MPIR_proctable_size = jdata->num_procs;

    if (orte_output_debugger_proctable) {
        opal_output(orte_clean_output, "MPIR Proctable for job %s", ORTE_JOBID_PRINT(jdata->jobid));
    }

    /* initialize MPIR_proctable */
    for (j=0; j < jdata->num_procs; j++) {
        if (NULL == (proc = (orte_proc_t*)opal_pointer_array_get_item(jdata->procs, j))) {
            continue;
        }
        /* store this data in the location whose index
         * corresponds to the proc's rank
         */
        i = proc->name.vpid;
        if (i >= jdata->num_procs) {
            /* a rank outside the table must not be written */
            continue;
        }
        if (NULL == (appctx = (orte_app_context_t*)opal_pointer_array_get_item(jdata->apps, proc->app_idx))) {
            continue;
        }

        MPIR_proctable[i].host_name = strdup(proc->node->name);
        /* an app given with a leading path separator is used as is;
         * anything else is taken relative to the app's cwd
         */
        if ( 0 == strncmp(appctx->app, OPAL_PATH_SEP, 1 )) {
            MPIR_proctable[i].executable_name =
                opal_os_path( false, appctx->app, NULL );
        } else {
            MPIR_proctable[i].executable_name =
                opal_os_path( false, appctx->cwd, appctx->app, NULL );
        }
        MPIR_proctable[i].pid = proc->pid;
        if (orte_output_debugger_proctable) {
            opal_output(orte_clean_output, "%s: Host %s Exe %s Pid %d",
                        ORTE_VPID_PRINT(i), MPIR_proctable[i].host_name,
                        MPIR_proctable[i].executable_name, MPIR_proctable[i].pid);
        }
    }

    if (orte_debug_flag) {
        dump();
    }

    /* if we are being launched under a debugger, then we must wait
     * for it to be ready to go and do some things to start the job
     */
    if (MPIR_being_debugged) {
        /* wait for all procs to have reported their contact info - this
         * ensures that (a) they are all into mpi_init, and (b) the system
         * has the contact info to successfully send a message to rank=0
         */
        ORTE_PROGRESSED_WAIT(false, jdata->num_reported, jdata->num_procs);

        (void) MPIR_Breakpoint();

        /* send a message to rank=0 to release it */
        OBJ_CONSTRUCT(&buf, opal_buffer_t); /* don't need anything in this */
        rank0.jobid = jdata->jobid;
        rank0.vpid = 0;
        if (0 > (rc = orte_rml.send_buffer(&rank0, &buf, ORTE_RML_TAG_DEBUGGER_RELEASE, 0))) {
            opal_output(0, "Error: could not send debugger release to MPI procs - error %s", ORTE_ERROR_NAME(rc));
        }
        OBJ_DESTRUCT(&buf);
    }
}
/**
* Release resources associated with data structures for running under
* a debugger using the MPICH/TotalView parallel debugger interface.
*/
void orte_debugger_finalize(void)
{
if (MPIR_proctable) {
free(MPIR_proctable);
MPIR_proctable = NULL;
}
}
/**
 * Rendezvous point for parallel debuggers.  The function does no work
 * of its own: per the notes at the top of this file, the debugger
 * places a breakpoint on it and the launcher calls it once the
 * process table is ready.
 *
 * @return always NULL
 */
void *MPIR_Breakpoint(void)
{
    void *const no_result = NULL;

    return no_result;
}

Просмотреть файл

@ -76,6 +76,8 @@
#include "orte/util/session_dir.h"
#include "orte/util/hnp_contact.h"
#include "orte/mca/debugger/debugger.h"
#include "orte/mca/debugger/base/base.h"
#include "orte/mca/odls/odls.h"
#include "orte/mca/plm/plm.h"
#include "orte/mca/rml/rml.h"
@ -94,7 +96,6 @@
/* ensure I can behave like a daemon */
#include "orte/orted/orted.h"
#include "debuggers.h"
#include "orterun.h"
/*
@ -468,6 +469,8 @@ static int parse_appfile(char *filename, char ***env);
static void dump_aborted_procs(void);
static void just_quit(int fd, short ign, void *arg);
static void run_debugger(char *basename, opal_cmd_line_t *cmd_line,
int argc, char *argv[], int num_procs);
int orterun(int argc, char *argv[])
{
@ -814,15 +817,13 @@ int orterun(int argc, char *argv[])
}
/* setup for debugging */
if (ORTE_SUCCESS != orte_debugger_init_before_spawn(jdata)) {
goto DONE;
}
orte_debugger.init_before_spawn(jdata);
/* Spawn the job */
rc = orte_plm.spawn(jdata);
/* complete debugger interface */
orte_debugger_init_after_spawn(jdata);
orte_debugger.init_after_spawn(jdata);
/* now wait until the termination event fires */
opal_event_dispatch();
@ -896,7 +897,7 @@ static void job_completed(int trigpipe, short event, void *arg)
}
/* if the debuggers were run, clean up */
orte_debugger_finalize();
orte_debugger.finalize();
if (ORTE_SUCCESS != (rc = orte_plm.terminate_orteds())) {
/* since we know that the sends didn't completely go out,
@ -1229,7 +1230,7 @@ static void abort_exit_callback(int fd, short ign, void *arg)
jdata->jobid != ORTE_JOBID_INVALID &&
!orte_never_launched) {
/* if the debuggers were run, clean up */
orte_debugger_finalize();
orte_debugger.finalize();
/*
* Turn off the process recovery functionality, if it was enabled.
@ -1469,7 +1470,7 @@ static int parse_globals(int argc, char* argv[], opal_cmd_line_t *cmd_line)
/* Do we want a user-level debugger? */
if (orterun_globals.debugger) {
orte_run_debugger(orterun_basename, cmd_line, argc, argv, orterun_globals.num_procs);
run_debugger(orterun_basename, cmd_line, argc, argv, orterun_globals.num_procs);
}
/* extract any rank assignment policy directives */
@ -2337,3 +2338,268 @@ static int parse_appfile(char *filename, char ***env)
free(filename);
return ORTE_SUCCESS;
}
/*
 * Build a copy of "line" in which the token_len characters starting at
 * offset pos are replaced by "replacement".
 *
 * Returns a newly allocated string that the caller must free(), or
 * NULL if the allocation failed.  "line" itself is not modified.
 */
static char *splice_token(const char *line, int pos, int token_len,
                          const char *replacement)
{
    char *spliced = NULL;

    if (0 > asprintf(&spliced, "%.*s%s%s", pos, line, replacement,
                     line + pos + token_len)) {
        /* the output pointer is not meaningful after a failed asprintf */
        return NULL;
    }
    return spliced;
}

/*
 * Process one line from the orte_base_user_debugger MCA param and
 * look for that debugger in the path.  If we find it, fill in
 * new_argv.
 *
 * The line is trimmed of surrounding whitespace and the tokens
 * @mpirun@, @orterun@, @mpirun_args@, @orterun_args@, @np@,
 * @single_app@, @executable@ and @executable_argv@ are expanded.  The
 * result is split on spaces and its first word is looked up in the
 * path.
 *
 * orig_line  one debugger command template (not modified)
 * basename   unused here; kept for interface compatibility
 * cmd_line   parsed command line; its tail supplies the user executable
 * argc/argv  the original command line
 * new_argv   out: on ORTE_SUCCESS, the argv to exec (owned by the
 *            caller); set to NULL when the debugger is not usable
 * num_procs  value substituted for @np@
 *
 * Returns ORTE_SUCCESS if a usable debugger command was built,
 * ORTE_ERROR if the line is empty, ORTE_ERR_OUT_OF_RESOURCE if an
 * allocation failed, and ORTE_ERR_NOT_FOUND otherwise.
 */
static int process(char *orig_line, char *basename, opal_cmd_line_t *cmd_line,
                   int argc, char **argv, char ***new_argv, int num_procs)
{
    int i, tail_argc, token_len;
    int rc = ORTE_ERR_NOT_FOUND;
    char *line, *full_line;
    char *user_argv = NULL, *tmp, *tmp2, **tmp_argv, **executable = NULL;
    const char *replacement;
    char cwd[OPAL_PATH_MAX];
    bool used_num_procs = false;
    bool single_app = false;
    bool fail_needed_executable = false;
    bool have_executable;

    full_line = strdup(orig_line);
    if (NULL == full_line) {
        return ORTE_ERR_OUT_OF_RESOURCE;
    }
    line = full_line;

    /* Trim off whitespace at the beginning and ending of line.  The
       casts keep isspace() defined for chars with the high bit set. */
    while (isspace((unsigned char) *line)) {
        ++line;
    }
    for (i = (int) strlen(line) - 1;
         i >= 0 && isspace((unsigned char) line[i]); --i) {
        line[i] = '\0';
    }
    if ('\0' == line[0]) {
        rc = ORTE_ERROR;
        goto CLEANUP;
    }

    /* Get the tail of the command line (i.e., the user executable /
       argv).  The tail is released again in CLEANUP; it stays NULL if
       the lookup fails. */
    if (ORTE_SUCCESS != opal_cmd_line_get_tail(cmd_line, &tail_argc,
                                               &executable)) {
        executable = NULL;
    }
    have_executable = (NULL != executable && NULL != executable[0]);

    /* Remove --debug, --debugger, and -tv from the user command line
       params */
    if (argc <= 1) {
        user_argv = strdup("");
    } else {
        tmp_argv = opal_argv_copy(argv);
        if (NULL == tmp_argv) {
            rc = ORTE_ERR_OUT_OF_RESOURCE;
            goto CLEANUP;
        }
        for (i = 0; NULL != tmp_argv[i]; ++i) {
            /* tmp_argv holds private copies, so an option is blanked
               by truncating it in place */
            if (0 == strcmp(tmp_argv[i], "-debug") ||
                0 == strcmp(tmp_argv[i], "--debug") ||
                0 == strcmp(tmp_argv[i], "-tv") ||
                0 == strcmp(tmp_argv[i], "--tv")) {
                tmp_argv[i][0] = '\0';
            } else if (0 == strcmp(tmp_argv[i], "--debugger") ||
                       0 == strcmp(tmp_argv[i], "-debugger")) {
                tmp_argv[i][0] = '\0';
                /* this option takes a value; blank that as well */
                if (NULL != tmp_argv[i + 1]) {
                    ++i;
                    tmp_argv[i][0] = '\0';
                }
            }
        }
        user_argv = opal_argv_join(tmp_argv + 1, ' ');
        opal_argv_free(tmp_argv);
    }
    if (NULL == user_argv) {
        rc = ORTE_ERR_OUT_OF_RESOURCE;
        goto CLEANUP;
    }

    /* Replace @@ tokens */
    for (i = 0; '\0' != line[i]; ++i) {
        if ('@' != line[i]) {
            continue;
        }
        replacement = NULL;
        token_len = 0;
        tmp2 = NULL;

        if (0 == strncmp(line + i, "@mpirun@", 8)) {
            token_len = 8;
            replacement = argv[0];
        } else if (0 == strncmp(line + i, "@orterun@", 9)) {
            token_len = 9;
            replacement = argv[0];
        } else if (0 == strncmp(line + i, "@mpirun_args@", 13)) {
            token_len = 13;
            replacement = user_argv;
        } else if (0 == strncmp(line + i, "@orterun_args@", 14)) {
            token_len = 14;
            replacement = user_argv;
        } else if (0 == strncmp(line + i, "@np@", 4)) {
            token_len = 4;
            if (0 > asprintf(&tmp2, "%d", num_procs)) {
                rc = ORTE_ERR_OUT_OF_RESOURCE;
                goto CLEANUP;
            }
            replacement = tmp2;
            used_num_procs = true;
        } else if (0 == strncmp(line + i, "@single_app@", 12)) {
            /* This token is only a flag; it is not replaced with any
               alternate text */
            token_len = 12;
            replacement = "";
            single_app = true;
        } else if (0 == strncmp(line + i, "@executable@", 12)) {
            /* If we found the executable, paste it in.  Otherwise,
               this is a possible error: cut the line at the token and
               stop expanding. */
            if (!have_executable) {
                fail_needed_executable = true;
                line[i] = '\0';
                break;
            }
            token_len = 12;
            replacement = executable[0];
        } else if (0 == strncmp(line + i, "@executable_argv@", 17)) {
            /* If we found the tail, paste in the argv.  Otherwise,
               this is a possible error: cut the line at the token and
               stop expanding. */
            if (!have_executable) {
                fail_needed_executable = true;
                line[i] = '\0';
                break;
            }
            token_len = 17;
            if (NULL != executable[1]) {
                /* Put in the argv */
                tmp2 = opal_argv_join(executable + 1, ' ');
                if (NULL == tmp2) {
                    rc = ORTE_ERR_OUT_OF_RESOURCE;
                    goto CLEANUP;
                }
                replacement = tmp2;
            } else {
                /* There is no argv; just paste the front and back
                   together, removing the @token@ */
                replacement = "";
            }
        } else {
            /* an '@' that does not start a known token */
            continue;
        }

        tmp = splice_token(line, i, token_len, replacement);
        free(tmp2);
        if (NULL == tmp) {
            rc = ORTE_ERR_OUT_OF_RESOURCE;
            goto CLEANUP;
        }
        free(full_line);
        full_line = line = tmp;
        /* Step back so the next iteration rescans from the start of
           the inserted text.
           NOTE(review): inserted text that itself contains a token is
           expanded again; confirm that this is intended. */
        --i;
    }

    /* Split up into argv */
    *new_argv = opal_argv_split(line, ' ');
    if (NULL == *new_argv || NULL == (*new_argv)[0]) {
        /* expansion left nothing to execute */
        goto NOT_FOUND;
    }

    /* Can we find argv[0] in the path? */
    if (NULL == getcwd(cwd, sizeof(cwd))) {
        /* never hand an unset buffer to the path search */
        cwd[0] = '.';
        cwd[1] = '\0';
    }
    tmp = opal_path_findv((*new_argv)[0], X_OK, environ, cwd);
    if (NULL == tmp) {
        goto NOT_FOUND;
    }
    free(tmp);

    /* Ok, we found a good debugger.  Check for some error
       conditions. */
    tmp = opal_argv_join(argv, ' ');

    /* We do not support launching a debugger that requires the
       -np value if the user did not specify -np on the command
       line. */
    if (used_num_procs && 0 == num_procs) {
        orte_show_help("help-orterun.txt", "debugger requires -np",
                       true, (*new_argv)[0], argv[0], user_argv,
                       (*new_argv)[0]);
        /* Fall through to free / fail, below */
    }
    /* Some debuggers do not support launching MPMD */
    else if (single_app && NULL != tmp && NULL != strchr(tmp, ':')) {
        orte_show_help("help-orterun.txt",
                       "debugger only accepts single app", true,
                       (*new_argv)[0], (*new_argv)[0]);
        /* Fall through to free / fail, below */
    }
    /* Some debuggers do not use orterun/mpirun, and therefore
       must have an executable to run (e.g., cannot use mpirun's
       app context file feature). */
    else if (fail_needed_executable) {
        orte_show_help("help-orterun.txt",
                       "debugger requires executable", true,
                       (*new_argv)[0], argv[0], (*new_argv)[0], argv[0],
                       (*new_argv)[0]);
        /* Fall through to free / fail, below */
    }
    /* Otherwise, we succeeded.  Return happiness. */
    else {
        free(tmp);
        rc = ORTE_SUCCESS;
        goto CLEANUP;
    }
    free(tmp);

NOT_FOUND:
    /* All done -- didn't find it */
    if (NULL != *new_argv) {
        opal_argv_free(*new_argv);
    }
    *new_argv = NULL;
    rc = ORTE_ERR_NOT_FOUND;

CLEANUP:
    free(full_line);
    free(user_argv);
    if (NULL != executable) {
        opal_argv_free(executable);
    }
    return rc;
}
/**
 * Run a user-level debugger.
 *
 * Reads the orte_base_user_debugger MCA parameter, which holds a
 * ':'-separated list of debugger command templates, and hands each
 * entry to process() until one succeeds.  The resulting command is
 * then executed with execvp().  Every failure path prints a help
 * message and calls exit(1), so this function never returns to its
 * caller.
 *
 * @param basename   name passed to process() and used in the
 *                   "debugger-exec-failed" message
 * @param cmd_line   parsed command line, forwarded to process()
 * @param argc       original argument count
 * @param argv       original argument vector
 * @param num_procs  process count, forwarded to process()
 */
static void run_debugger(char *basename, opal_cmd_line_t *cmd_line,
                         int argc, char *argv[], int num_procs)
{
    int i, id;
    char **new_argv = NULL;
    char *value, **lines, *env_name;

    /* Get the orte_base_user_debugger MCA parameter and search for a
       debugger that can run */
    id = mca_base_param_find("orte", NULL, "base_user_debugger");
    if (id < 0) {
        orte_show_help("help-orterun.txt", "debugger-mca-param-not-found",
                       true);
        exit(1);
    }
    value = NULL;
    mca_base_param_lookup_string(id, &value);
    if (NULL == value) {
        orte_show_help("help-orterun.txt", "debugger-orte_base_user_debugger-empty",
                       true);
        exit(1);
    }

    /* Look through all the values in the MCA param */
    lines = opal_argv_split(value, ':');
    free(value);
    if (NULL == lines) {
        /* the parameter held no entries at all; there is nothing to
           search and lines[] must not be dereferenced */
        orte_show_help("help-orterun.txt", "debugger-orte_base_user_debugger-empty",
                       true);
        exit(1);
    }
    for (i = 0; NULL != lines[i]; ++i) {
        if (ORTE_SUCCESS == process(lines[i], basename, cmd_line, argc, argv,
                                    &new_argv, num_procs)) {
            break;
        }
    }

    /* If we didn't find one, abort */
    if (NULL == lines[i]) {
        orte_show_help("help-orterun.txt", "debugger-not-found", true);
        exit(1);
    }
    opal_argv_free(lines);

    /* We found one */

    /* cleanup the MPIR arrays in case the debugger doesn't set them */
    memset((char*)MPIR_executable_path, 0, MPIR_MAX_PATH_LENGTH);
    memset((char*)MPIR_server_arguments, 0, MPIR_MAX_ARG_LENGTH);

    /* Set an MCA param so that everyone knows that they are being
       launched under a debugger; not all debuggers are consistent
       about setting MPIR_being_debugged in both the launcher and the
       MPI processes */
    env_name = mca_base_param_environ_variable("orte",
                                               "in_parallel_debugger", NULL);
    if (NULL != env_name) {
        opal_setenv(env_name, "1", true, &environ);
        free(env_name);
    }

    /* Launch the debugger.  execvp() only returns if the exec failed,
       so everything below is the error path. */
    execvp(new_argv[0], new_argv);
    value = opal_argv_join(new_argv, ' ');
    orte_show_help("help-orterun.txt", "debugger-exec-failed",
                   true, basename, value, new_argv[0]);
    free(value);
    opal_argv_free(new_argv);
    exit(1);
}