1
1

My apologies for doing this outside of the usual time restrictions, but we need to get this in so we can make progress.

Move the ORTE-level debugger code back into orterun and out of the ORTE library to resolve symbol conflicts.

This commit was SVN r25713.
Этот коммит содержится в:
Ralph Castain 2012-01-11 15:53:09 +00:00
родитель 686ee387c8
Коммит bf103de66c
35 изменённых файлов: 587 добавлений и 1557 удалений

Просмотреть файл

@ -27,7 +27,6 @@
#define OMPI_DEBUGGERS_H
#include "ompi_config.h"
#include "orte/mca/debugger/base/base.h"
BEGIN_C_DECLS
@ -43,8 +42,10 @@ extern void ompi_debugger_notify_abort(char *string);
/**
* Breakpoint function for parallel debuggers.
* This function is also defined in orterun for the starter.
* It should never conflict with this one
*/
OMPI_DECLSPEC extern void MPIR_Breakpoint(void);
OMPI_DECLSPEC void* MPIR_Breakpoint(void);
END_C_DECLS

Просмотреть файл

@ -74,7 +74,6 @@
#include "orte/mca/errmgr/errmgr.h"
#include "orte/mca/rml/rml.h"
#include "orte/mca/debugger/base/base.h"
#include "orte/runtime/orte_globals.h"
#if defined(OMPI_MSGQ_DLL)
@ -126,8 +125,8 @@ OMPI_DECLSPEC opal_datatype_t* opal_datatype_t_type_force_inclusion = NULL;
OMPI_DECLSPEC ompi_datatype_t* ompi_datatype_t_type_force_inclusion = NULL;
OMPI_DECLSPEC volatile int MPIR_debug_gate = 0;
OMPI_DECLSPEC extern volatile int MPIR_being_debugged;
OMPI_DECLSPEC extern volatile int MPIR_debug_state;
OMPI_DECLSPEC volatile int MPIR_being_debugged = 0;
OMPI_DECLSPEC volatile int MPIR_debug_state = 0;
OMPI_DECLSPEC char *MPIR_debug_abort_string = "";
/* Check for a file in few direct ways for portability */
@ -283,3 +282,13 @@ void ompi_debugger_notify_abort(char *reason)
/* Now tell the debugger */
MPIR_Breakpoint();
}
/*
* Breakpoint function for parallel debuggers. This function is also
* defined in orterun for the starter. It should never conflict with
* this one.
*/
void* MPIR_Breakpoint(void)
{
    /* Does no work on purpose: the function only exists as a named
     * location for a parallel debugger, and always hands back NULL. */
    void *nothing = NULL;

    return nothing;
}

Просмотреть файл

@ -110,8 +110,6 @@
#if !ORTE_DISABLE_FULL_SUPPORT
#include "orte/mca/notifier/notifier.h"
#include "orte/mca/notifier/base/base.h"
#include "orte/mca/debugger/debugger.h"
#include "orte/mca/debugger/base/base.h"
#include "orte/mca/iof/iof.h"
#include "orte/mca/iof/base/base.h"
#include "orte/mca/oob/oob.h"
@ -431,14 +429,6 @@ void ompi_info_open_components(void)
map->components = &orte_notifier_base_components_available;
opal_pointer_array_add(&component_map, map);
if (ORTE_SUCCESS != orte_debugger_base_open()) {
goto error;
}
map = OBJ_NEW(ompi_info_component_map_t);
map->type = strdup("debugger");
map->components = &orte_debugger_base_components_available;
opal_pointer_array_add(&component_map, map);
if (ORTE_SUCCESS != mca_oob_base_open()) {
goto error;
}

Просмотреть файл

@ -233,7 +233,6 @@ int main(int argc, char *argv[])
#endif
#if !ORTE_DISABLE_FULL_SUPPORT
opal_pointer_array_add(&mca_types, "debugger");
opal_pointer_array_add(&mca_types, "iof");
opal_pointer_array_add(&mca_types, "oob");
opal_pointer_array_add(&mca_types, "odls");

Просмотреть файл

@ -1,28 +0,0 @@
#
# Copyright (c) 2010 Cisco Systems, Inc.
# $COPYRIGHT$
#
# Additional copyrights may follow
#
# $HEADER$
#
# main library setup
# The framework is built as a single libtool library that is not installed
# on its own (noinst). Its source list starts empty and is appended to below
# and by the included base/Makefile.am fragment.
noinst_LTLIBRARIES = libmca_debugger.la
libmca_debugger_la_SOURCES =
# local files
headers = debugger.h
libmca_debugger_la_SOURCES += $(headers)
# Conditionally install the header files
if WANT_INSTALL_HEADERS
ortedir = $(includedir)/openmpi/$(subdir)
nobase_orte_HEADERS = $(headers)
endif
# Pull in the base sources; that fragment appends to "headers" and to
# libmca_debugger_la_SOURCES defined above.
include base/Makefile.am
# base/static-components.h is created by configure, so it is removed here
# rather than being tracked as a source file.
distclean-local:
rm -f base/static-components.h

Просмотреть файл

@ -1,18 +0,0 @@
#
# Copyright (c) 2010 Cisco Systems, Inc. All rights reserved.
# $COPYRIGHT$
#
# Additional copyrights may follow
#
# $HEADER$
#
# This fragment only appends (+=): "headers" and libmca_debugger_la_SOURCES
# must already have been defined by the Makefile.am that includes it.
headers += \
base/base.h
libmca_debugger_la_SOURCES += \
base/debugger_base_close.c \
base/debugger_base_select.c \
base/debugger_base_open.c \
base/debugger_base_fns.c

Просмотреть файл

@ -1,82 +0,0 @@
/*
* Copyright (c) 2010 Cisco Systems, Inc. All rights reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
*
* $HEADER$
*/
/** @file:
*/
#ifndef MCA_DEBUGGER_BASE_H
#define MCA_DEBUGGER_BASE_H
/*
* includes
*/
#include "orte_config.h"
#include "opal/class/opal_list.h"
#include "orte/mca/debugger/debugger.h"
BEGIN_C_DECLS
/*
 * Framework-wide state of the debugger base. The fields are filled in by
 * orte_debugger_base_open() and read by the base functions and components.
 */
typedef struct {
/* opal_output stream used for the framework's verbose messages */
int output;
/* print the MPIR proctable after it has been built (set from the
 * "output_debugger_proctable" MCA parameter) */
bool dump_proctable;
/* executable used to simulate a debugger colaunch (set from the
 * "debugger_test_daemon" MCA parameter, whose default is NULL) */
char *test_daemon;
/* test debugger colaunch after debugger attachment (set from the
 * "debugger_test_attach" MCA parameter) */
bool test_attach;
} orte_debugger_base_t;
/* the single global instance of the framework state */
ORTE_DECLSPEC extern orte_debugger_base_t orte_debugger_base;
/*
 * function declarations
 */
/* Open the framework. In full-support builds this registers the framework's
 * MCA parameters and opens the available components; otherwise it only
 * returns ORTE_SUCCESS. */
ORTE_DECLSPEC int orte_debugger_base_open(void);
/* Call the selected module's finalize hook (when set) and close the
 * remaining components. Always returns ORTE_SUCCESS. */
ORTE_DECLSPEC int orte_debugger_base_close(void);
/* Select the best component, copy its module into orte_debugger and return
 * the result of that module's init(). Returns ORTE_ERR_NOT_IMPLEMENTED when
 * full support is disabled. */
ORTE_DECLSPEC int orte_debugger_base_select(void);
/* NOTE(review): no definition of this function is visible in the base
 * sources reviewed here -- confirm where (or whether) it is implemented. */
ORTE_DECLSPEC void orte_debugger_base_run_debugger(char *basename, opal_cmd_line_t *cmd_line,
int argc, char *argv[], int num_procs);
/* Build MPIR_proctable for the given job once it has been spawned and, when
 * MPIR_being_debugged is set, call MPIR_Breakpoint() and release rank 0. */
ORTE_DECLSPEC void orte_debugger_base_init_after_spawn(orte_job_t *jdata);
/* Print the MPIR interface variables and the proctable to stderr. */
ORTE_DECLSPEC void orte_debugger_base_dump(void);
/* list of components opened by orte_debugger_base_open() */
ORTE_DECLSPEC extern opal_list_t orte_debugger_base_components_available;
/* +++ begin MPICH/TotalView std debugger interface definitions */
/* sizes of the fixed character buffers declared below */
#define MPIR_MAX_PATH_LENGTH 512
#define MPIR_MAX_ARG_LENGTH 1024
/* One entry per application process; the launcher fills an array of these
 * (MPIR_proctable) for the debugger to read. */
struct MPIR_PROCDESC {
char *host_name; /* something that can be passed to inet_addr */
char *executable_name; /* name of binary */
int pid; /* process pid */
};
/* process table and its number of entries; NULL / 0 until it has been built */
ORTE_DECLSPEC extern struct MPIR_PROCDESC *MPIR_proctable;
ORTE_DECLSPEC extern int MPIR_proctable_size;
/* non-zero when a debugger is involved; checked before and after spawn */
ORTE_DECLSPEC extern volatile int MPIR_being_debugged;
/* set to 1 by the launcher once it starts building the proctable */
ORTE_DECLSPEC extern volatile int MPIR_debug_state;
/* NOTE(review): the variables below are only defined/initialized in the code
 * reviewed here; their exact meaning comes from the MPIR interface -- confirm
 * against that specification. */
ORTE_DECLSPEC extern int MPIR_i_am_starter;
ORTE_DECLSPEC extern int MPIR_partial_attach_ok;
/* a non-empty MPIR_executable_path makes the mpirx component consider
 * co-spawning debugger daemons */
ORTE_DECLSPEC extern char MPIR_executable_path[MPIR_MAX_PATH_LENGTH];
ORTE_DECLSPEC extern char MPIR_server_arguments[MPIR_MAX_ARG_LENGTH];
ORTE_DECLSPEC extern volatile int MPIR_forward_output;
ORTE_DECLSPEC extern volatile int MPIR_forward_comm;
/* path of the attach FIFO that the mpirx component creates in the job
 * session directory */
ORTE_DECLSPEC extern char MPIR_attach_fifo[MPIR_MAX_PATH_LENGTH];
ORTE_DECLSPEC extern int MPIR_force_to_main;
/* signature shared by the breakpoint-style functions below */
typedef void (*orte_debugger_breakpoint_fn_t)(void);
/* empty function the launcher calls after the proctable is ready; the
 * debugger is expected to have a breakpoint on it */
ORTE_DECLSPEC void MPIR_Breakpoint(void);
/* dummy function that lets the linker pull in the file defining MPIR_Breakpoint */
ORTE_DECLSPEC void orte_debugger_base_pull_mpir_breakpoint(void);
/* --- end MPICH/TotalView std debugger interface definitions */
END_C_DECLS
#endif

Просмотреть файл

@ -1,33 +0,0 @@
/*
* Copyright (c) 2010 Cisco Systems, Inc. All rights reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
*
* $HEADER$
*/
#include "orte_config.h"
#include "orte/constants.h"
#include "opal/mca/mca.h"
#include "opal/mca/base/base.h"
#include "orte/mca/debugger/base/base.h"
#include "orte/mca/debugger/debugger.h"
int orte_debugger_base_close(void)
{
#if !ORTE_DISABLE_FULL_SUPPORT
    /* Let the active module tear itself down, provided it supplied a hook. */
    if (orte_debugger.finalize != NULL) {
        orte_debugger.finalize();
    }

    /* Then shut down whatever components are still on the available list. */
    mca_base_components_close(orte_debugger_base.output,
                              &orte_debugger_base_components_available,
                              NULL);
#endif

    /* Closing never reports a failure to the caller. */
    return ORTE_SUCCESS;
}

Просмотреть файл

@ -1,201 +0,0 @@
/*
* Copyright (c) 2010 Cisco Systems, Inc. All rights reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
*
* $HEADER$
*/
#include "orte_config.h"
#include "orte/constants.h"
#include "opal/util/output.h"
#ifdef HAVE_STRING_H
#include <string.h>
#endif
#include <stdio.h>
#ifdef HAVE_STDLIB_H
#include <stdlib.h>
#endif /* HAVE_STDLIB_H */
#ifdef HAVE_STRINGS_H
#include <strings.h>
#endif /* HAVE_STRINGS_H */
#ifdef HAVE_UNISTD_H
#include <unistd.h>
#endif /* HAVE_UNISTD_H */
#include <ctype.h>
#include "opal/util/argv.h"
#include "opal/util/os_path.h"
#include "opal/util/path.h"
#include "opal/util/opal_environ.h"
#include "orte/util/name_fns.h"
#include "orte/util/show_help.h"
#include "orte/mca/rml/rml.h"
#include "orte/mca/errmgr/errmgr.h"
#include "orte/runtime/orte_globals.h"
#include "orte/mca/debugger/base/base.h"
#define DUMP_INT(X) fprintf(stderr, " %s = %d\n", # X, X);
#if !ORTE_DISABLE_FULL_SUPPORT
/*
 * Print the MPIR interface state to stderr: the scalar MPIR_* variables,
 * every proctable entry, and the executable path / server argument buffers.
 *
 * Missing data is printed as "NULL" instead of being dereferenced, so the
 * function can be called safely at any time (for example before the proctable
 * was built, or after its allocation failed).
 */
void orte_debugger_base_dump(void)
{
    int i;

    DUMP_INT(MPIR_being_debugged);
    DUMP_INT(MPIR_debug_state);
    DUMP_INT(MPIR_partial_attach_ok);
    DUMP_INT(MPIR_i_am_starter);
    DUMP_INT(MPIR_forward_output);
    DUMP_INT(MPIR_proctable_size);
    fprintf(stderr, " MPIR_proctable:\n");
    /* The table pointer may be NULL even when the size is non-zero, so never
     * index through it without checking first. */
    if (NULL != MPIR_proctable) {
        for (i = 0; i < MPIR_proctable_size; i++) {
            /* Passing a NULL pointer to %s is undefined behaviour; print
             * the same "NULL" placeholder used for the buffers below. */
            char *host = MPIR_proctable[i].host_name;
            char *exe = MPIR_proctable[i].executable_name;

            fprintf(stderr,
                    " (i, host, exe, pid) = (%d, %s, %s, %d)\n",
                    i,
                    (NULL == host) ? "NULL" : host,
                    (NULL == exe) ? "NULL" : exe,
                    MPIR_proctable[i].pid);
        }
    }
    fprintf(stderr, "MPIR_executable_path: %s\n",
            ('\0' == MPIR_executable_path[0]) ?
            "NULL" : (char*) MPIR_executable_path);
    fprintf(stderr, "MPIR_server_arguments: %s\n",
            ('\0' == MPIR_server_arguments[0]) ?
            "NULL" : (char*) MPIR_server_arguments);
}
/*
 * Build the MPIR process table for a job after it has been spawned, using
 * the MPICH/TotalView parallel debugger interface.
 *
 * NOTE: this step is -always- performed so that a debugger attaching to us
 * after the application was launched still finds a completed proctable.
 *
 * jdata - the job whose processes are described; a job with zero procs (for
 *         example when the mapper stage failed) is ignored.
 *
 * The table is allocated zero-filled: entries that cannot be filled in (no
 * proc object or no app context) are left as NULL/0 rather than containing
 * uninitialized pointers that a debugger would then read. If the allocation
 * fails, MPIR_proctable_size is reset to 0 so that the size never describes
 * a table that does not exist.
 *
 * When MPIR_being_debugged is set, the function additionally waits until all
 * procs have reported, calls MPIR_Breakpoint() and then sends the release
 * message to rank 0 of the job.
 */
void orte_debugger_base_init_after_spawn(orte_job_t *jdata)
{
    orte_proc_t *proc;
    orte_app_context_t *appctx;
    orte_vpid_t i, j;
    opal_buffer_t buf;
    orte_process_name_t rank0;
    int rc;

    /* if we couldn't get thru the mapper stage, we might
     * enter here with no procs. Avoid the "zero byte malloc"
     * message by checking here
     */
    if (MPIR_proctable || 0 == jdata->num_procs) {
        /* already initialized */
        opal_output_verbose(5, orte_debugger_base.output,
                            "%s: debugger already initialized or zero procs",
                            ORTE_NAME_PRINT(ORTE_PROC_MY_NAME));
        return;
    }

    /* fill in the proc table for the application processes */
    opal_output_verbose(5, orte_debugger_base.output,
                        "%s: Setting up debugger process table for applications",
                        ORTE_NAME_PRINT(ORTE_PROC_MY_NAME));

    MPIR_debug_state = 1;

    /* allocate MPIR_proctable, zero-filled so unfilled entries are well defined */
    MPIR_proctable = (struct MPIR_PROCDESC *) calloc(jdata->num_procs,
                                                     sizeof(struct MPIR_PROCDESC));
    if (MPIR_proctable == NULL) {
        /* never advertise entries that were not allocated */
        MPIR_proctable_size = 0;
        ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE);
        return;
    }

    /* set the total number of processes in the job */
    MPIR_proctable_size = jdata->num_procs;

    if (orte_debugger_base.dump_proctable) {
        opal_output(orte_clean_output, "MPIR Proctable for job %s", ORTE_JOBID_PRINT(jdata->jobid));
    }

    /* initialize MPIR_proctable */
    for (j=0; j < jdata->num_procs; j++) {
        if (NULL == (proc = (orte_proc_t*)opal_pointer_array_get_item(jdata->procs, j))) {
            continue;
        }
        /* store this data in the location whose index
         * corresponds to the proc's rank
         */
        i = proc->name.vpid;
        if (i >= jdata->num_procs) {
            /* a rank outside the table would corrupt the heap - skip it */
            continue;
        }
        if (NULL == (appctx = (orte_app_context_t*)opal_pointer_array_get_item(jdata->apps, proc->app_idx))) {
            continue;
        }

        MPIR_proctable[i].host_name = strdup(proc->node->name);
        /* an app given with a leading path separator is used as is;
         * otherwise it is combined with the app's working directory */
        if ( 0 == strncmp(appctx->app, OPAL_PATH_SEP, 1 )) {
            MPIR_proctable[i].executable_name =
                opal_os_path( false, appctx->app, NULL );
        } else {
            MPIR_proctable[i].executable_name =
                opal_os_path( false, appctx->cwd, appctx->app, NULL );
        }
        MPIR_proctable[i].pid = proc->pid;
        if (orte_debugger_base.dump_proctable) {
            opal_output(orte_clean_output, "%s: Host %s Exe %s Pid %d",
                        ORTE_VPID_PRINT(i), MPIR_proctable[i].host_name,
                        MPIR_proctable[i].executable_name, MPIR_proctable[i].pid);
        }
    }

    if (0 < opal_output_get_verbosity(orte_debugger_base.output)) {
        orte_debugger_base_dump();
    }

    /* if we are being launched under a debugger, then we must wait
     * for it to be ready to go and do some things to start the job
     */
    if (MPIR_being_debugged) {
        /* wait for all procs to have reported their contact info - this
         * ensures that (a) they are all into mpi_init, and (b) the system
         * has the contact info to successfully send a message to rank=0
         */
        ORTE_PROGRESSED_WAIT(false, jdata->num_reported, jdata->num_procs);

        MPIR_Breakpoint();

        /* send a message to rank=0 to release it */
        OBJ_CONSTRUCT(&buf, opal_buffer_t); /* don't need anything in this */
        rank0.jobid = jdata->jobid;
        rank0.vpid = 0;
        if (0 > (rc = orte_rml.send_buffer(&rank0, &buf, ORTE_RML_TAG_DEBUGGER_RELEASE, 0))) {
            opal_output(0, "Error: could not send debugger release to MPI procs - error %s", ORTE_ERROR_NAME(rc));
        }
        OBJ_DESTRUCT(&buf);
    }
}
#endif
/*
 * Placeholder with no behaviour of its own. Referencing it from elsewhere
 * lets the linker pull in every symbol defined in this file.
 */
void orte_debugger_base_pull_mpir_breakpoint(void)
{
    /* nothing to do */
}
/*
 * Breakpoint function for parallel debuggers. It performs no work; the
 * launcher simply calls it once the process table is ready.
 */
void MPIR_Breakpoint(void)
{
    /* intentionally empty */
}

Просмотреть файл

@ -1,105 +0,0 @@
/*
* Copyright (c) 2010 Cisco Systems, Inc. All rights reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
*
* $HEADER$
*/
#include "orte_config.h"
#include "orte/constants.h"
#include "opal/mca/mca.h"
#include "opal/util/output.h"
#include "opal/mca/base/base.h"
#include "opal/mca/base/mca_base_param.h"
#ifdef HAVE_STRING_H
#include <string.h>
#endif
#include "orte/mca/debugger/base/base.h"
/*
* The following file was created by configure. It contains extern
* statements and the definition of an array of pointers to each
* component's public mca_base_component_t struct.
*/
#include "orte/mca/debugger/base/static-components.h"
/*
* Global variables
*/
orte_debugger_base_t orte_debugger_base;
opal_list_t orte_debugger_base_components_available;
orte_debugger_base_module_t orte_debugger;
/* instance the standard MPIR interfaces */
struct MPIR_PROCDESC *MPIR_proctable = NULL;
int MPIR_proctable_size = 0;
volatile int MPIR_being_debugged = 0;
volatile int MPIR_debug_state = 0;
int MPIR_i_am_starter = 0;
int MPIR_partial_attach_ok = 1;
char MPIR_executable_path[MPIR_MAX_PATH_LENGTH];
char MPIR_server_arguments[MPIR_MAX_ARG_LENGTH];
volatile int MPIR_forward_output = 0;
volatile int MPIR_forward_comm = 0;
char MPIR_attach_fifo[MPIR_MAX_PATH_LENGTH];
int MPIR_force_to_main = 0;
/**
 * Open the debugger framework.
 *
 * With full ORTE support this opens the framework's output stream, registers
 * the three framework MCA parameters (storing their values in
 * orte_debugger_base) and opens all available components, or the one that
 * was specifically requested via an MCA parameter. Without full support
 * there is nothing to open and the call simply succeeds.
 *
 * @return ORTE_SUCCESS, or ORTE_ERROR when the components cannot be opened.
 */
int orte_debugger_base_open(void)
{
#if !ORTE_DISABLE_FULL_SUPPORT
    int param;
    int rc;

    /* Debugging / verbose output. The stream is always opened; its
       verbosity is set by the mca open system. */
    orte_debugger_base.output = opal_output_open(NULL);

    /* should the proctable be printed after launch? */
    mca_base_param_reg_int_name("orte",
                                "output_debugger_proctable",
                                "Whether or not to output the debugger proctable after launch (default: false)",
                                true, false, 0, &param);
    orte_debugger_base.dump_proctable = OPAL_INT_TO_BOOL(param);

    /* optional stand-in executable for a debugger daemon */
    mca_base_param_reg_string_name("orte", "debugger_test_daemon",
                                   "Name of the executable to be used to simulate a debugger colaunch (relative or absolute path)",
                                   false, false, NULL, &orte_debugger_base.test_daemon);

    /* should the colaunch be tested after attachment? */
    mca_base_param_reg_int_name("orte",
                                "debugger_test_attach",
                                "Test debugger colaunch after debugger attachment",
                                false, false, 0, &param);
    orte_debugger_base.test_attach = OPAL_INT_TO_BOOL(param);

    /* Open up all available components */
    rc = mca_base_components_open("debugger", orte_debugger_base.output,
                                  mca_debugger_base_static_components,
                                  &orte_debugger_base_components_available,
                                  true);
    if (ORTE_SUCCESS != rc) {
        return ORTE_ERROR;
    }
#endif

    /* All done */
    return ORTE_SUCCESS;
}

Просмотреть файл

@ -1,54 +0,0 @@
/*
* Copyright (c) 2010 Cisco Systems, Inc. All rights reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
*
* $HEADER$
*/
#include "orte_config.h"
#include "orte/constants.h"
#ifdef HAVE_STRING_H
#include <string.h>
#endif
#include "opal/mca/mca.h"
#include "opal/mca/base/base.h"
#include "opal/util/output.h"
#include "orte/mca/debugger/base/base.h"
int orte_debugger_base_select(void)
{
#if ORTE_DISABLE_FULL_SUPPORT
    /* no debugger components exist in this configuration */
    return ORTE_ERR_NOT_IMPLEMENTED;
#else
    orte_debugger_base_module_t *winner = NULL;
    orte_debugger_base_component_t *winner_component = NULL;

    /* Ask the MCA base for the best of the opened components. */
    if (OPAL_SUCCESS != mca_base_select("debugger", orte_debugger_base.output,
                                        &orte_debugger_base_components_available,
                                        (mca_base_module_t **) &winner,
                                        (mca_base_component_t **) &winner_component)) {
        /* This will only happen if no component was selected */
        return ORTE_ERR_NOT_FOUND;
    }

    /* Keep a copy of the winning module (there is no global component
     * structure) and report whatever its init function returns. */
    orte_debugger = *winner;
    return orte_debugger.init();
#endif
}

Просмотреть файл

@ -1,77 +0,0 @@
/*
* Copyright (c) 2010 Cisco Systems, Inc. All Rights Reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
*
* $HEADER$
*/
#ifndef MCA_DEBUGGER_H
#define MCA_DEBUGGER_H
/*
* includes
*/
#include "orte_config.h"
#include "opal/mca/mca.h"
#include "orte/runtime/orte_globals.h"
BEGIN_C_DECLS
/*
* Component functions - all MUST be provided!
*/
/* initialize the selected module; the return value is handed back by
 * orte_debugger_base_select() */
typedef int (*orte_debugger_base_module_init_fn_t)(void);
/* finalize the selected module; called from orte_debugger_base_close() */
typedef void (*orte_debugger_base_module_finalize_fn_t)(void);
/* init debuggers before spawn; receives the job about to be launched */
typedef void (*orte_debugger_base_module_init_before_spawn_fn_t)(orte_job_t *jdata);
/* init debuggers after spawn; receives the job that was launched */
typedef void (*orte_debugger_base_module_init_after_spawn_fn_t)(orte_job_t *jdata);
/*
* Ver 1.0
*
* Table of entry points a debugger module provides. Module definitions
* initialize it positionally, so the member order is part of the contract.
*/
struct orte_debugger_base_module_1_0_0_t {
orte_debugger_base_module_init_fn_t init;
orte_debugger_base_module_finalize_fn_t finalize;
orte_debugger_base_module_init_before_spawn_fn_t init_before_spawn;
orte_debugger_base_module_init_after_spawn_fn_t init_after_spawn;
};
typedef struct orte_debugger_base_module_1_0_0_t orte_debugger_base_module_1_0_0_t;
typedef orte_debugger_base_module_1_0_0_t orte_debugger_base_module_t;
/* the active module; orte_debugger_base_select() copies the winner into it */
ORTE_DECLSPEC extern orte_debugger_base_module_t orte_debugger;
/*
* the standard component data structure
*
* It carries only the generic MCA component description and metadata; the
* debugger framework adds no fields of its own.
*/
struct orte_debugger_base_component_1_0_0_t {
mca_base_component_t base_version;
mca_base_component_data_t base_data;
};
typedef struct orte_debugger_base_component_1_0_0_t orte_debugger_base_component_1_0_0_t;
typedef orte_debugger_base_component_1_0_0_t orte_debugger_base_component_t;
/*
* Macro for use in components that are of type debugger v1.0.0
*
* It expands to the leading initializers of a component's base_version
* member: the MCA base version followed by the framework name "debugger"
* and the framework version 1.0.0.
*/
#define ORTE_DEBUGGER_BASE_VERSION_1_0_0 \
/* debugger v1.0 is chained to MCA v2.0 */ \
MCA_BASE_VERSION_2_0_0, \
/* debugger v1.0 */ \
"debugger", 1, 0, 0
END_C_DECLS
#endif /* MCA_DEBUGGER_H */

Просмотреть файл

@ -1,12 +0,0 @@
#
# Copyright (c) 2008-2010 High Performance Computing Center Stuttgart,
# University of Stuttgart. All rights reserved.
# $COPYRIGHT$
#
# Additional copyrights may follow
#
# $HEADER$
#
# Specific to this module
# NOTE(review): presumably the libraries this component is linked against in
# the Windows build (the file is shipped as ".windows") -- confirm with the
# Windows build scripts that read it.
mca_link_libraries=libopen-rte Ws2_32.lib

Просмотреть файл

@ -1,36 +0,0 @@
#
# Copyright (c) 2010 Cisco Systems, Inc. All rights reserved.
# $COPYRIGHT$
#
# Additional copyrights may follow
#
# $HEADER$
#
# ship the .windows settings file with the distribution
EXTRA_DIST = .windows
# sources shared by the DSO and the static flavour of the component
sources = \
mpir.h \
mpir.c \
mpir_component.c
# Make the output library in this directory, and name it either
# mca_<type>_<name>.la (for DSO builds) or libmca_<type>_<name>.la
# (for static builds).
# Exactly one of component_install / component_noinst is non-empty.
if MCA_BUILD_orte_debugger_mpir_DSO
component_noinst =
component_install = mca_debugger_mpir.la
else
component_noinst = libmca_debugger_mpir.la
component_install =
endif
# DSO flavour: installed into $(pkglibdir)
mcacomponentdir = $(pkglibdir)
mcacomponent_LTLIBRARIES = $(component_install)
mca_debugger_mpir_la_SOURCES = $(sources)
mca_debugger_mpir_la_LDFLAGS = -module -avoid-version
# static flavour: built but not installed
noinst_LTLIBRARIES = $(component_noinst)
libmca_debugger_mpir_la_SOURCES =$(sources)
libmca_debugger_mpir_la_LDFLAGS = -module -avoid-version

Просмотреть файл

@ -1,19 +0,0 @@
# -*- shell-script -*-
#
# Copyright (c) 2011 Los Alamos National Security, LLC.
# All rights reserved.
# $COPYRIGHT$
#
# Additional copyrights may follow
#
# $HEADER$
#
# MCA_orte_debugger_mpir_CONFIG([action-if-found], [action-if-not-found])
# -----------------------------------------------------------
# Registers the component's Makefile for generation, then runs
# action-if-found ($1) when $orte_without_full_support is 0 and
# action-if-not-found ($2) otherwise.
AC_DEFUN([MCA_orte_debugger_mpir_CONFIG], [
AC_CONFIG_FILES([orte/mca/debugger/mpir/Makefile])
AS_IF([test "$orte_without_full_support" = 0],
[$1],
[$2])
])

Просмотреть файл

@ -1,196 +0,0 @@
/* -*- C -*-
*
* Copyright (c) 2004-2006 The Trustees of Indiana University and Indiana
* University Research and Technology
* Corporation. All rights reserved.
* Copyright (c) 2004-2006 The University of Tennessee and The University
* of Tennessee Research Foundation. All rights
* reserved.
* Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
* University of Stuttgart. All rights reserved.
* Copyright (c) 2004-2005 The Regents of the University of California.
* All rights reserved.
* Copyright (c) 2007 Sun Microsystems, Inc. All rights reserved.
* Copyright (c) 2007 Los Alamos National Security, LLC. All rights
* reserved.
* Copyright (c) 2007-2010 Cisco Systems, Inc. All rights reserved.
* Copyright (c) 2010 Oracle and/or its affiliates. All rights reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
*
* $HEADER$
*/
/*
* Debugger support for orterun
*
* We interpret the MPICH debugger interface as follows:
*
* a) The launcher
* - spawns the other processes,
* - fills in the table MPIR_proctable, and sets MPIR_proctable_size
* - sets MPIR_debug_state to MPIR_DEBUG_SPAWNED ( = 1)
* - calls MPIR_Breakpoint() which the debugger will have a
* breakpoint on.
*
* b) Applications start and then spin until MPIR_debug_gate is set
* non-zero by the debugger.
*
* This file implements (a).
*
**************************************************************************
*
* Note that we have presently tested both TotalView and DDT parallel
* debuggers. They both nominally subscribe to the Etnus attaching
* interface, but there are differences between the two.
*
* TotalView: user launches "totalview mpirun -a ...<mpirun args>...".
* TV launches mpirun. mpirun launches the application and then calls
* MPIR_Breakpoint(). This is the signal to TV that it's a parallel
* MPI job. TV then reads the proctable in mpirun and attaches itself
* to all the processes (it takes care of launching itself on the
* remote nodes). Upon attaching to all the MPI processes, the
* variable MPIR_being_debugged is set to 1. When it has finished
* attaching itself to all the MPI processes that it wants to,
* MPIR_Breakpoint() returns.
*
* DDT: user launches "ddt bin -np X <mpi app name>". DDT fork/exec's
* mpirun to launch ddt-debugger on the back-end nodes via "mpirun -np
* X ddt-debugger" (note the lack of other arguments -- we can't pass
* anything to mpirun). This app will eventually fork/exec the MPI
* app. DDT does not currently set MPIR_being_debugged in the MPI app.
*
**************************************************************************
*
* We support two ways of waiting for attaching debuggers. The
* implementation spans this file and ompi/debuggers/ompi_debuggers.c.
*
* 1. If using orterun: MPI processes will have the
* orte_in_parallel_debugger MCA param set to true (because not all
* debuggers consistently set MPIR_being_debugged in both the launcher
* and in the MPI procs). The HNP will call MPIR_Breakpoint() and
* then RML send a message to VPID 0 (MCW rank 0) when it returns
* (MPIR_Breakpoint() doesn't return until the debugger has attached
* to all relevant processes). Meanwhile, VPID 0 blocks waiting for
* the RML message. All other VPIDs immediately call the grpcomm
* barrier (and therefore block until the debugger attaches). Once
* VPID 0 receives the RML message, we know that the debugger has
* attached to all processes that it cares about, and VPID 0 then
* joins the grpcomm barrier, allowing the job to continue. This
* scheme has the side effect of nicely supporting partial attaches by
* parallel debuggers (i.e., attaching to only some of the MPI
* processes; not necessarily all of them).
*
* 2. If not using orterun: in this case, ORTE_DISABLE_FULL_SUPPORT
* will be true, and we know that there will not be an RML message
* sent to VPID 0. So we have to look for a magic environment
* variable from the launcher to know if the jobs will be attached by
* a debugger (e.g., set by yod, srun, ...etc.), and if so, spin on
* MPIR_debug_gate. These environment variable names must be
* hard-coded in the OMPI layer (see ompi/debuggers/ompi_debuggers.c).
*/
#include "orte_config.h"
#ifdef HAVE_STRING_H
#include <string.h>
#endif
#include <stdio.h>
#ifdef HAVE_STDLIB_H
#include <stdlib.h>
#endif /* HAVE_STDLIB_H */
#ifdef HAVE_STRINGS_H
#include <strings.h>
#endif /* HAVE_STRINGS_H */
#ifdef HAVE_UNISTD_H
#include <unistd.h>
#endif /* HAVE_UNISTD_H */
#include <ctype.h>
#include "opal/util/argv.h"
#include "opal/util/output.h"
#include "opal/util/path.h"
#include "opal/util/os_path.h"
#include "opal/class/opal_pointer_array.h"
#include "opal/mca/base/mca_base_param.h"
#include "opal/util/opal_getcwd.h"
#include "orte/mca/errmgr/errmgr.h"
#include "orte/mca/rml/rml.h"
#include "orte/mca/rml/rml_types.h"
#include "orte/mca/plm/plm.h"
#include "orte/mca/plm/base/plm_private.h"
#include "orte/mca/rmaps/rmaps_types.h"
#include "orte/runtime/orte_globals.h"
#include "orte/runtime/orte_wait.h"
#include "orte/util/show_help.h"
#include "orte/util/name_fns.h"
#include "orte/mca/debugger/base/base.h"
#include "mpir.h"
#include "mpir.h"
/* Static API's */
/* these three are implemented in this file; the fourth module entry point
 * is supplied by the debugger base */
static int init(void);
static void finalize(void);
static void init_before_spawn(orte_job_t *jdata);
/* Module definition */
/* Positional initializer: the order must match the members of
 * orte_debugger_base_module_t (init, finalize, init_before_spawn,
 * init_after_spawn). */
orte_debugger_base_module_t orte_debugger_mpir_module = {
init,
finalize,
init_before_spawn,
orte_debugger_base_init_after_spawn
};
/* local globals */
static int init(void)
{
    /* This module needs no setup of its own, so initialization always
     * succeeds. */
    return ORTE_SUCCESS;
}
/**
 * Module finalize hook: give back the MPIR process table that was built for
 * the MPICH/TotalView parallel debugger interface, if there is one.
 */
void finalize(void)
{
    /* no table was ever built - nothing to release */
    if (NULL == MPIR_proctable) {
        return;
    }

    free(MPIR_proctable);
    /* clear the pointer so the table is not seen as still present */
    MPIR_proctable = NULL;
}
/**
 * Pre-spawn hook for the MPICH/TotalView parallel debugger interface.
 *
 * If we are running under a TotalView-like debugger (MPIR_being_debugged or
 * orte_in_parallel_debugger is set), every app context of the job gets the
 * "in_parallel_debugger" MCA parameter set to "1" in its environment so the
 * launched processes know about it. Otherwise nothing is done.
 *
 * @param jdata  job whose app contexts are about to be spawned
 */
void init_before_spawn(orte_job_t *jdata)
{
    orte_app_context_t *context;
    char *param_env;
    int idx;

    /* not under a debugger: leave the job untouched */
    if (!(MPIR_being_debugged || orte_in_parallel_debugger)) {
        return;
    }

    opal_output_verbose(1, orte_debugger_base.output, "Info: Spawned by a debugger");

    /* name of the environment variable that carries the MCA parameter */
    param_env = mca_base_param_environ_variable("orte",
                                                "in_parallel_debugger", NULL);

    /* tell the procs they are being debugged; unused array slots are skipped */
    for (idx = 0; idx < jdata->apps->size; idx++) {
        context = (orte_app_context_t*)opal_pointer_array_get_item(jdata->apps, idx);
        if (NULL != context) {
            opal_setenv(param_env, "1", true, &context->env);
        }
    }

    free(param_env);
}

Просмотреть файл

@ -1,34 +0,0 @@
/*
* Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
* University Research and Technology
* Corporation. All rights reserved.
* Copyright (c) 2004-2009 The University of Tennessee and The University
* of Tennessee Research Foundation. All rights
* reserved.
* Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
* University of Stuttgart. All rights reserved.
* Copyright (c) 2004-2005 The Regents of the University of California.
* All rights reserved.
* Copyright (c) 2007-2010 Cisco Systems, Inc. All rights reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
*
* $HEADER$
*/
#ifndef DEBUGGER_MPIR_H
#define DEBUGGER_MPIR_H
#include "orte_config.h"
#include "orte/mca/debugger/debugger.h"
BEGIN_C_DECLS
/* component descriptor of the "mpir" debugger component */
ORTE_MODULE_DECLSPEC extern orte_debugger_base_component_t mca_debugger_mpir_component;
/* module (function table) handed out by the component's query function */
extern orte_debugger_base_module_t orte_debugger_mpir_module;
END_C_DECLS
#endif /* DEBUGGER_MPIR_H */

Просмотреть файл

@ -1,47 +0,0 @@
/* -*- C -*-
*
* Copyright (c) 2010 Cisco Systems, Inc. All rights reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
*
* $HEADER$
*/
#include "orte_config.h"
#include "orte/constants.h"
#include "mpir.h"
/* query function referenced by the component descriptor below */
static int component_query(mca_base_module_t **module, int *priority);
/*
* Struct of function pointers that need to be initialized
*
* Positional initializer: first the generic MCA component description
* (base_version), then the component metadata (base_data).
*/
orte_debugger_base_component_t mca_debugger_mpir_component = {
{
ORTE_DEBUGGER_BASE_VERSION_1_0_0,
"mpir", /* MCA module name */
ORTE_MAJOR_VERSION, /* MCA module major version */
ORTE_MINOR_VERSION, /* MCA module minor version */
ORTE_RELEASE_VERSION, /* MCA module release version */
/* NOTE(review): presumably the optional component open/close hooks,
 * left unset -- confirm against mca_base_component_t */
NULL,
NULL,
component_query /* module query */
},
{
/* The component is checkpoint ready */
MCA_BASE_METADATA_PARAM_CHECKPOINT
}
};
static int component_query(mca_base_module_t **module, int *priority)
{
    /* Always offer the mpir module, at a fixed priority of 100. */
    *module = (mca_base_module_t *) &orte_debugger_mpir_module;
    *priority = 100;

    return ORTE_SUCCESS;
}

Просмотреть файл

@ -1,34 +0,0 @@
#
# Copyright (c) 2010 Cisco Systems, Inc. All rights reserved.
# $COPYRIGHT$
#
# Additional copyrights may follow
#
# $HEADER$
#
# sources shared by the DSO and the static flavour of the component
sources = \
mpirx.h \
mpirx.c \
mpirx_component.c
# Make the output library in this directory, and name it either
# mca_<type>_<name>.la (for DSO builds) or libmca_<type>_<name>.la
# (for static builds).
# Exactly one of component_install / component_noinst is non-empty.
if MCA_BUILD_orte_debugger_mpirx_DSO
component_noinst =
component_install = mca_debugger_mpirx.la
else
component_noinst = libmca_debugger_mpirx.la
component_install =
endif
# DSO flavour: installed into $(pkglibdir)
mcacomponentdir = $(pkglibdir)
mcacomponent_LTLIBRARIES = $(component_install)
mca_debugger_mpirx_la_SOURCES = $(sources)
mca_debugger_mpirx_la_LDFLAGS = -module -avoid-version
# static flavour: built but not installed
noinst_LTLIBRARIES = $(component_noinst)
libmca_debugger_mpirx_la_SOURCES =$(sources)
libmca_debugger_mpirx_la_LDFLAGS = -module -avoid-version

Просмотреть файл

@ -1,19 +0,0 @@
# -*- shell-script -*-
#
# Copyright (c) 2011 Los Alamos National Security, LLC.
# All rights reserved.
# $COPYRIGHT$
#
# Additional copyrights may follow
#
# $HEADER$
#
# MCA_orte_debugger_mpirx_CONFIG([action-if-found], [action-if-not-found])
# -----------------------------------------------------------
# Registers the component's Makefile for generation, then runs
# action-if-found ($1) when $orte_without_full_support is 0 and
# action-if-not-found ($2) otherwise.
AC_DEFUN([MCA_orte_debugger_mpirx_CONFIG], [
AC_CONFIG_FILES([orte/mca/debugger/mpirx/Makefile])
AS_IF([test "$orte_without_full_support" = 0],
[$1],
[$2])
])

Просмотреть файл

@ -1,366 +0,0 @@
/*
* Copyright (c) 2010-2011 Cisco Systems, Inc. All rights reserved.
* Copyright (c) 2010 Oracle and/or its affiliates. All rights reserved.
* Copyright (c) 2010-2011 Los Alamos National Security, LLC.
* All rights reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
*
* $HEADER$
*/
#include "orte_config.h"
#ifdef HAVE_STRING_H
#include <string.h>
#endif
#include <stdio.h>
#ifdef HAVE_STDLIB_H
#include <stdlib.h>
#endif /* HAVE_STDLIB_H */
#ifdef HAVE_STRINGS_H
#include <strings.h>
#endif /* HAVE_STRINGS_H */
#ifdef HAVE_UNISTD_H
#include <unistd.h>
#endif /* HAVE_UNISTD_H */
#ifdef HAVE_SYS_TYPES_H
#include <sys/types.h>
#endif
#include <sys/stat.h>
#include <ctype.h>
#include <fcntl.h>
#include <errno.h>
#include "opal/util/argv.h"
#include "opal/util/output.h"
#include "opal/util/path.h"
#include "opal/util/os_path.h"
#include "opal/class/opal_pointer_array.h"
#include "opal/mca/base/mca_base_param.h"
#include "opal/util/opal_getcwd.h"
#include "opal/mca/event/event.h"
#include "orte/mca/errmgr/errmgr.h"
#include "orte/mca/rml/rml.h"
#include "orte/mca/rml/rml_types.h"
#include "orte/mca/plm/plm.h"
#include "orte/mca/plm/base/plm_private.h"
#include "orte/mca/rmaps/rmaps_types.h"
#include "orte/runtime/orte_globals.h"
#include "orte/runtime/orte_wait.h"
#include "orte/util/show_help.h"
#include "orte/util/name_fns.h"
#include "orte/mca/debugger/base/base.h"
#include "mpirx.h"
#define FILE_MODE (S_IRUSR | S_IWUSR | S_IRGRP | S_IROTH)
/* Static API's */
/* these three are implemented in this file; the fourth module entry point
 * is supplied by the debugger base */
static int init(void);
static void finalize(void);
static void init_before_spawn(orte_job_t *jdata);
/* Module definition */
/* Positional initializer: the order must match the members of
 * orte_debugger_base_module_t (init, finalize, init_before_spawn,
 * init_after_spawn). */
orte_debugger_base_module_t orte_debugger_mpirx_module = {
init,
finalize,
init_before_spawn,
orte_debugger_base_init_after_spawn
};
/* local globals and functions */
/* event-style callback; also scheduled as a timer event when an attach
 * check rate is configured */
static void attach_debugger(int fd, short event, void *arg);
static void build_debugger_args(orte_app_context_t *debugger);
/* called once the attach FIFO has been created and recorded in MPIR_attach_fifo */
static void open_fifo(void);
/* event removed in finalize() while fifo_active is set */
static opal_event_t attach;
/* descriptor closed in finalize() while fifo_active is set; -1 until opened
 * (presumably by open_fifo -- confirm) */
static int attach_fd = -1;
/* true while the attach event/descriptor need to be released by finalize() */
static bool fifo_active=false;
static int init(void)
{
    /* This module needs no setup of its own, so initialization always
     * succeeds. */
    return ORTE_SUCCESS;
}
/**
 * Release resources associated with data structures for running under
 * a debugger using the MPICH/TotalView parallel debugger interface.
 *
 * When the attach FIFO is active, its event is removed and its descriptor
 * closed. The bookkeeping is then reset so that calling finalize again does
 * not delete the event or close the (possibly reused) descriptor a second
 * time. The MPIR process table is freed if it exists.
 */
void finalize(void)
{
    if (fifo_active) {
        opal_event_del(&attach);
        close(attach_fd);
        /* mark the FIFO as gone: a stale descriptor number may be handed out
         * again by the system, so it must not be closed twice */
        attach_fd = -1;
        fifo_active = false;
    }

    if (MPIR_proctable) {
        free(MPIR_proctable);
        MPIR_proctable = NULL;
    }
}
/**
 * Initialization of data structures for running under a debugger
 * using an extended MPICH/TotalView parallel debugger interface. Before the
 * spawn we need to check if we are being run under a TotalView-like
 * debugger; if so then inform applications via an MCA parameter.
 *
 * When no debugger is detected and no test daemon was given, this only
 * arms a later attach check (periodic timer when a check rate is set,
 * otherwise the attach FIFO) and returns. Otherwise every app context
 * of the job gets the "in_parallel_debugger" MCA environment variable
 * set to "1" and, if a debugger daemon executable is known, a daemon
 * job for co-launch is created and stored in orte_debugger_daemon.
 *
 * @param jdata  job that is about to be spawned
 */
void init_before_spawn(orte_job_t *jdata)
{
    char *env_name;
    orte_app_context_t *app;
    int i;
    int32_t ljob;
    char *attach_fifo;

    if (!MPIR_being_debugged && !orte_in_parallel_debugger) {
        /* if we were given a test debugger, then we still want to
         * colaunch it
         */
        if (NULL != orte_debugger_base.test_daemon) {
            /* this branch runs when a test daemon WAS given, so say so */
            opal_output_verbose(2, orte_debugger_base.output,
                                "%s Debugger test daemon specified: %s",
                                ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                                orte_debugger_base.test_daemon);
            goto launchit;
        }
        /* if we were given an auto-detect rate, then we want to setup
         * an event so we periodically do the check
         */
        if (0 < orte_debugger_mpirx_check_rate) {
            opal_output_verbose(2, orte_debugger_base.output,
                                "%s Setting debugger attach check rate for %d seconds",
                                ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                                orte_debugger_mpirx_check_rate);
            ORTE_TIMER_EVENT(orte_debugger_mpirx_check_rate, 0, attach_debugger);
        } else {
            /* create the attachment FIFO and put it into MPIR, setup readevent */
            /* create a FIFO name in the session dir */
            attach_fifo = opal_os_path(false, orte_process_info.job_session_dir, "debugger_attach_fifo", NULL);
            /* an already existing FIFO is fine; any other failure is not */
            if ((mkfifo(attach_fifo, FILE_MODE) < 0) && errno != EEXIST) {
                opal_output(0, "CANNOT CREATE FIFO %s: errno %d", attach_fifo, errno);
                free(attach_fifo);
                return;
            }
            strncpy(MPIR_attach_fifo, attach_fifo, MPIR_MAX_PATH_LENGTH - 1);
            /* strncpy does not terminate a truncated copy; do it explicitly */
            MPIR_attach_fifo[MPIR_MAX_PATH_LENGTH - 1] = '\0';
            free(attach_fifo);
            open_fifo();
        }
        return;
    }

 launchit:
    opal_output_verbose(2, orte_debugger_base.output,
                        "%s: Spawned by a debugger",
                        ORTE_NAME_PRINT(ORTE_PROC_MY_NAME));

    /* tell the procs they are being debugged */
    env_name = mca_base_param_environ_variable("orte",
                                               "in_parallel_debugger", NULL);
    for (i = 0; i < jdata->apps->size; i++) {
        /* the app array may contain empty slots */
        if (NULL == (app = (orte_app_context_t*)opal_pointer_array_get_item(jdata->apps, i))) {
            continue;
        }
        opal_setenv(env_name, "1", true, &app->env);
    }
    free(env_name);

    /* check if we need to co-spawn the debugger daemons */
    if ('\0' != MPIR_executable_path[0] || NULL != orte_debugger_base.test_daemon) {
        /* can only have one debugger */
        if (NULL != orte_debugger_daemon) {
            opal_output(0, "-------------------------------------------\n"
                        "Only one debugger can be used on a job.\n"
                        "-------------------------------------------\n");
            ORTE_UPDATE_EXIT_STATUS(ORTE_ERROR_DEFAULT_EXIT_CODE);
            return;
        }
        opal_output_verbose(2, orte_debugger_base.output,
                            "%s Cospawning debugger daemons %s",
                            ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                            (NULL == orte_debugger_base.test_daemon) ?
                            MPIR_executable_path : orte_debugger_base.test_daemon);
        /* add debugger info to launch message */
        orte_debugger_daemon = OBJ_NEW(orte_job_t);
        /* create a jobid for these daemons - this is done solely
         * to avoid confusing the rest of the system's bookkeeping
         */
        orte_plm_base_create_jobid(orte_debugger_daemon);
        /* flag the job as being debugger daemons */
        orte_debugger_daemon->controls |= ORTE_JOB_CONTROL_DEBUGGER_DAEMON;
        /* unless directed, we do not forward output */
        if (!MPIR_forward_output) {
            orte_debugger_daemon->controls &= ~ORTE_JOB_CONTROL_FORWARD_OUTPUT;
        }
        /* add it to the global job pool */
        ljob = ORTE_LOCAL_JOBID(orte_debugger_daemon->jobid);
        opal_pointer_array_set_item(orte_job_data, ljob, orte_debugger_daemon);
        /* create an app_context for the debugger daemon; the test
         * daemon takes precedence over the debugger-supplied path
         */
        app = OBJ_NEW(orte_app_context_t);
        if (NULL != orte_debugger_base.test_daemon) {
            app->app = strdup(orte_debugger_base.test_daemon);
        } else {
            app->app = strdup((char*)MPIR_executable_path);
        }
        /* argv[0] is the executable, followed by the debugger's own args */
        opal_argv_append_nosize(&app->argv, app->app);
        build_debugger_args(app);
        opal_pointer_array_add(orte_debugger_daemon->apps, app);
        orte_debugger_daemon->num_apps = 1;
    }
    return;
}
/**
 * (Re)open the debugger attach FIFO named in MPIR_attach_fifo for
 * non-blocking reads and register a read event on it that invokes
 * attach_debugger().
 *
 * A previously opened descriptor is closed first. If the open fails, an
 * error is printed and no event is registered.
 */
static void open_fifo (void)
{
    /* -1 marks "not open"; 0 is a valid descriptor and must be closed too */
    if (attach_fd >= 0) {
        close(attach_fd);
    }

    attach_fd = open(MPIR_attach_fifo, O_RDONLY | O_NONBLOCK, 0);
    if (attach_fd < 0) {
        opal_output(0, "%s unable to open debugger attach fifo",
                    ORTE_NAME_PRINT(ORTE_PROC_MY_NAME));
        return;
    }

    opal_output_verbose(2, orte_debugger_base.output,
                        "%s Monitoring debugger attach fifo %s",
                        ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                        MPIR_attach_fifo);

    /* watch the FIFO for incoming attach commands */
    opal_event_set(opal_event_base, &attach, attach_fd, OPAL_EV_READ, attach_debugger, NULL);
    fifo_active = true;
    opal_event_add(&attach, 0);
}
static void attach_debugger(int fd, short event, void *arg)
{
orte_app_context_t *app;
unsigned char fifo_cmd;
int rc;
int32_t ljob;
orte_job_t *jdata;
/* read the file descriptor to clear that event, if necessary */
if (fifo_active) {
opal_event_del(&attach);
fifo_active = false;
rc = read(attach_fd, &fifo_cmd, sizeof(fifo_cmd));
if (!rc) {
/* reopen device to clear hangup */
open_fifo();
return;
}
if (1 != fifo_cmd) {
/* ignore the cmd */
goto RELEASE;
}
}
if (!MPIR_being_debugged && !orte_debugger_base.test_attach) {
/* false alarm */
goto RELEASE;
}
opal_output_verbose(1, orte_debugger_base.output,
"%s Attaching debugger %s", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
(NULL == orte_debugger_base.test_daemon) ? MPIR_executable_path : orte_debugger_base.test_daemon);
/* a debugger has attached! All the MPIR_Proctable
* data is already available, so we only need to
* check to see if we should spawn any daemons
*/
if ('\0' != MPIR_executable_path[0] || NULL != orte_debugger_base.test_daemon) {
/* can only have one debugger */
if (NULL != orte_debugger_daemon) {
opal_output(0, "-------------------------------------------\n"
"Only one debugger can be used on a job.\n"
"-------------------------------------------\n");
goto RELEASE;
}
opal_output_verbose(2, orte_debugger_base.output,
"%s Spawning debugger daemons %s",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
(NULL == orte_debugger_base.test_daemon) ?
MPIR_executable_path : orte_debugger_base.test_daemon);
/* this will be launched just like a regular job,
* so we do not use the global orte_debugger_daemon
* as this is reserved for co-location upon startup
*/
jdata = OBJ_NEW(orte_job_t);
/* create a jobid for these daemons - this is done solely
* to avoid confusing the rest of the system's bookkeeping
*/
orte_plm_base_create_jobid(jdata);
/* flag the job as being debugger daemons */
jdata->controls |= ORTE_JOB_CONTROL_DEBUGGER_DAEMON;
/* unless directed, we do not forward output */
if (!MPIR_forward_output) {
jdata->controls &= ~ORTE_JOB_CONTROL_FORWARD_OUTPUT;
}
/* add it to the global job pool */
ljob = ORTE_LOCAL_JOBID(jdata->jobid);
opal_pointer_array_set_item(orte_job_data, ljob, jdata);
/* create an app_context for the debugger daemon */
app = OBJ_NEW(orte_app_context_t);
if (NULL != orte_debugger_base.test_daemon) {
app->app = strdup(orte_debugger_base.test_daemon);
} else {
app->app = strdup((char*)MPIR_executable_path);
}
jdata->state = ORTE_JOB_STATE_INIT;
opal_argv_append_nosize(&app->argv, app->app);
build_debugger_args(app);
opal_pointer_array_add(jdata->apps, app);
jdata->num_apps = 1;
/* setup the mapping policy to pernode so we get one
* daemon on each node
*/
jdata->map = OBJ_NEW(orte_job_map_t);
jdata->map->mapping = ORTE_MAPPING_PPR;
jdata->map->ppr = strdup("1:n");
/* now go ahead and spawn this job */
if (ORTE_SUCCESS != (rc = orte_plm.spawn(jdata))) {
ORTE_ERROR_LOG(rc);
}
}
RELEASE:
/* reset the read or timer event */
if (0 == orte_debugger_mpirx_check_rate) {
fifo_active = true;
opal_event_add(&attach, 0);
} else if (!MPIR_being_debugged) {
ORTE_TIMER_EVENT(orte_debugger_mpirx_check_rate, 0, attach_debugger);
}
/* notify the debugger that all is ready */
MPIR_Breakpoint();
}
/**
 * Append the debugger-supplied daemon arguments to an app context.
 *
 * MPIR_server_arguments is scanned over its full MPIR_MAX_ARG_LENGTH
 * bytes. Each run of non-NUL characters that is followed by a NUL is
 * appended to debugger->argv as one argument. Nothing is appended when
 * the buffer starts with a NUL.
 *
 * @param debugger  app context whose argv receives the arguments
 */
static void build_debugger_args(orte_app_context_t *debugger)
{
    char token[MPIR_MAX_ARG_LENGTH];
    int pos;
    int len = 0;

    /* no arguments were provided at all */
    if ('\0' == MPIR_server_arguments[0]) {
        return;
    }

    memset(token, 0, MPIR_MAX_ARG_LENGTH);
    for (pos = 0; pos < MPIR_MAX_ARG_LENGTH; pos++) {
        const char ch = MPIR_server_arguments[pos];

        if ('\0' != ch) {
            /* still inside an argument: collect the character */
            token[len] = ch;
            len++;
            continue;
        }
        if (0 == len) {
            /* separator with nothing collected: skip it */
            continue;
        }
        /* a separator ends the collected argument */
        opal_argv_append_nosize(&debugger->argv, token);
        memset(token, 0, MPIR_MAX_ARG_LENGTH);
        len = 0;
    }
}

Просмотреть файл

@ -1,26 +0,0 @@
/* -*- C -*-
*
* Copyright (c) 2010 Cisco Systems, Inc. All rights reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
*
* $HEADER$
*
*/
/* Public declarations of the "mpirx" debugger component. */
#ifndef DEBUGGER_MPIRX_H
#define DEBUGGER_MPIRX_H
#include "orte_config.h"
#include "orte/mca/debugger/debugger.h"
BEGIN_C_DECLS
/* Component descriptor; defined in the component source file. */
ORTE_MODULE_DECLSPEC extern orte_debugger_base_component_t mca_debugger_mpirx_component;
/* Rate (in seconds) for auto-detecting debugger attachment; 0 means do
 * not check. Set through the component's "check_rate" MCA parameter. */
extern int orte_debugger_mpirx_check_rate;
/* Function table of the module, returned by the component's query hook. */
extern orte_debugger_base_module_t orte_debugger_mpirx_module;
END_C_DECLS
#endif

Просмотреть файл

@ -1,58 +0,0 @@
/* -*- C -*-
*
* Copyright (c) 2010 Cisco Systems, Inc. All rights reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
*
* $HEADER$
*/
#include "orte_config.h"
#include "orte/constants.h"
#include "mpirx.h"
int orte_debugger_mpirx_check_rate=0;
static int component_open(void);
static int component_query(mca_base_module_t **module, int *priority);
/*
* Component descriptor for the "mpirx" debugger component: version
* information plus the function pointers that need to be initialized.
* The initializer is positional, so the entries must stay in the member
* order of orte_debugger_base_component_t (declared in another file).
*/
orte_debugger_base_component_t mca_debugger_mpirx_component = {
{
ORTE_DEBUGGER_BASE_VERSION_1_0_0,
"mpirx", /* MCA module name */
ORTE_MAJOR_VERSION, /* MCA module major version */
ORTE_MINOR_VERSION, /* MCA module minor version */
ORTE_RELEASE_VERSION, /* MCA module release version */
/* open hook: registers the "check_rate" MCA parameter */
component_open,
/* NOTE(review): presumably the close hook slot, left empty -- confirm
 * against the mca_base_component_t definition */
NULL,
component_query /* module query */
},
{
/* The component is checkpoint ready */
MCA_BASE_METADATA_PARAM_CHECKPOINT
}
};
/**
 * Component open hook.
 *
 * Registers the integer MCA parameter "check_rate" for this component
 * (default 0) and binds it to orte_debugger_mpirx_check_rate.
 *
 * @return ORTE_SUCCESS, unconditionally; the registration result is
 *         not inspected.
 */
static int component_open(void)
{
    mca_base_param_reg_int(&mca_debugger_mpirx_component.base_version,
                           "check_rate",
                           "Set rate (in secs) for auto-detect of debugger attachment (0 => do not check)",
                           false, false, 0,
                           &orte_debugger_mpirx_check_rate);

    return ORTE_SUCCESS;
}
/**
 * Component query hook: offers the mpirx module with a fixed priority.
 *
 * @param module    receives a pointer to orte_debugger_mpirx_module
 * @param priority  receives the priority of this component (10)
 * @return ORTE_SUCCESS, unconditionally.
 */
static int component_query(mca_base_module_t **module, int *priority)
{
    mca_base_module_t *const offered =
        (mca_base_module_t *)&orte_debugger_mpirx_module;

    *module = offered;
    *priority = 10;

    return ORTE_SUCCESS;
}

Просмотреть файл

@ -41,7 +41,6 @@
#include "orte/mca/rmaps/rmaps_types.h"
#include "orte/mca/sensor/sensor.h"
#include "orte/mca/routed/routed.h"
#include "orte/mca/debugger/base/base.h"
#include "orte/mca/notifier/notifier.h"
#include "orte/mca/grpcomm/grpcomm.h"
#include "orte/mca/ess/ess.h"
@ -526,9 +525,6 @@ static void default_hnp_abort(orte_jobid_t job, orte_exit_code_t exit_code)
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
ORTE_JOBID_PRINT(job), exit_code));
/* if debuggers are running, clean up */
orte_debugger.finalize();
/* set control params to indicate we are terminating */
orte_job_term_ordered = true;
orte_abnormal_term_ordered = true;

Просмотреть файл

@ -58,8 +58,6 @@
#include "orte/mca/notifier/base/base.h"
#include "orte/mca/sensor/base/base.h"
#include "orte/mca/sensor/sensor.h"
#include "orte/mca/debugger/base/base.h"
#include "orte/mca/debugger/debugger.h"
#include "orte/mca/rmaps/base/base.h"
#if OPAL_ENABLE_FT_CR == 1
#include "orte/mca/snapc/base/base.h"
@ -622,18 +620,6 @@ static int rte_init(void)
/* start the local sensors */
orte_sensor.start(ORTE_PROC_MY_NAME->jobid);
/* start the debuggers */
if (ORTE_SUCCESS != (ret = orte_debugger_base_open())) {
ORTE_ERROR_LOG(ret);
error = "orte_debugger_open";
goto error;
}
if (ORTE_SUCCESS != (ret = orte_debugger_base_select())) {
ORTE_ERROR_LOG(ret);
error = "orte_debugger_select";
goto error;
}
/* if a tool has launched us and is requesting event reports,
* then set its contact info into the comm system
*/
@ -701,9 +687,6 @@ static int rte_finalize(void)
signals_set = false;
}
/* stop the debuggers */
orte_debugger_base_close();
/* stop the local sensors */
orte_sensor.stop(ORTE_PROC_MY_NAME->jobid);
@ -1003,9 +986,6 @@ static void abort_exit_callback(int fd, short ign, void *arg)
* to terminate!
*/
if (!orte_never_launched) {
/* if the debuggers were run, clean up */
orte_debugger.finalize();
/*
* Turn off the process recovery functionality, if it was enabled.
* This keeps the errmgr from trying to recover from the shutdown

Просмотреть файл

@ -41,7 +41,6 @@
#include "opal/mca/hwloc/hwloc.h"
#include "orte/util/show_help.h"
#include "orte/mca/debugger/debugger.h"
#include "orte/mca/errmgr/errmgr.h"
#include "orte/mca/ess/ess.h"
#include "orte/mca/iof/iof.h"
@ -230,8 +229,6 @@ int orte_plm_base_launch_apps(orte_jobid_t job)
rc = ORTE_ERR_BAD_PARAM;
goto WAKEUP;
}
/* setup for debugging */
orte_debugger.init_before_spawn(jdata);
}
/* setup the buffer */
@ -310,10 +307,7 @@ int orte_plm_base_launch_apps(orte_jobid_t job)
ORTE_ERROR_LOG(rc);
goto WAKEUP;
}
/* complete debugger interface */
orte_debugger.init_after_spawn(jdata);
OPAL_OUTPUT_VERBOSE((5, orte_plm_globals.output,
"%s plm:base:launch completed for job %s",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),

Просмотреть файл

@ -41,7 +41,6 @@
#include "orte/types.h"
#include "orte/util/proc_info.h"
#include "orte/util/error_strings.h"
#include "orte/mca/debugger/base/base.h"
#include "orte/mca/errmgr/errmgr.h"
#include "orte/mca/ess/ess.h"
#include "orte/mca/rml/rml.h"
@ -263,7 +262,7 @@ static void process_msg(int fd, short event, void *data)
job = jdata->jobid;
/* output debugger proctable, if requested */
if (orte_debugger_base.dump_proctable) {
if (orte_debugger_dump_proctable && !jdata->map->display_map) {
char *output;
opal_dss.print(&output, NULL, jdata->map, ORTE_JOB_MAP);
if (orte_xml_output) {

Просмотреть файл

@ -80,7 +80,6 @@
#include "orte/util/nidmap.h"
#include "orte/util/proc_info.h"
#include "orte/mca/debugger/debugger.h"
#include "orte/mca/rml/rml.h"
#include "orte/mca/rml/rml_types.h"
#include "orte/mca/ess/ess.h"

Просмотреть файл

@ -91,6 +91,11 @@ char **orte_fork_agent=NULL;
/* debugger job */
orte_job_t *orte_debugger_daemon=NULL;
bool orte_debugger_dump_proctable;
char *orte_debugger_test_daemon;
bool orte_debugger_test_attach;
bool orte_debugger_enable_fifo_attach;
int orte_debugger_check_rate;
/* exit flags */
int orte_exit_status = 0;

Просмотреть файл

@ -78,8 +78,6 @@ ORTE_DECLSPEC extern orte_process_name_t orte_name_invalid; /** instantiated in
/* define the name of my daemon */
#define ORTE_PROC_MY_DAEMON (&orte_process_info.my_daemon)
/* See comment in orte/tools/orterun/debuggers.c about this MCA
param */
ORTE_DECLSPEC extern bool orte_in_parallel_debugger;
/* error manager callback function */
@ -559,6 +557,11 @@ ORTE_DECLSPEC extern char **orte_fork_agent;
/* debugger job */
ORTE_DECLSPEC extern orte_job_t *orte_debugger_daemon;
ORTE_DECLSPEC extern bool orte_debugger_dump_proctable;
ORTE_DECLSPEC extern char *orte_debugger_test_daemon;
ORTE_DECLSPEC extern bool orte_debugger_test_attach;
ORTE_DECLSPEC extern bool orte_debugger_enable_fifo_attach;
ORTE_DECLSPEC extern int orte_debugger_check_rate;
/* exit flags */
ORTE_DECLSPEC extern bool orte_abnormal_term_ordered;

Просмотреть файл

@ -154,6 +154,33 @@ int orte_register_params(void)
true, false, 0, &value);
orte_in_parallel_debugger = OPAL_INT_TO_BOOL(value);
mca_base_param_reg_int_name("orte",
"output_debugger_proctable",
"Whether or not to output the debugger proctable after launch (default: false)",
false, false, 0, &value);
orte_debugger_dump_proctable = OPAL_INT_TO_BOOL(value);
mca_base_param_reg_string_name("orte", "debugger_test_daemon",
"Name of the executable to be used to simulate a debugger colaunch (relative or absolute path)",
false, false, NULL, &orte_debugger_test_daemon);
mca_base_param_reg_int_name("orte",
"debugger_test_attach",
"Test debugger colaunch after debugger attachment",
false, false, 0, &value);
orte_debugger_test_attach = OPAL_INT_TO_BOOL(value);
mca_base_param_reg_int_name("orte",
"debugger_fifo_attach",
"Create a fifo to support debugger attachment",
false, false, 0, &value);
orte_debugger_enable_fifo_attach = OPAL_INT_TO_BOOL(value);
mca_base_param_reg_int_name("orte",
"debugger_check_rate",
"Set rate (in secs) for auto-detect of debugger attachment (0 => do not check)",
false, false, 0, &orte_debugger_check_rate);
mca_base_param_reg_int_name("orte", "do_not_launch",
"Perform all necessary operations to prepare to launch the application, but do not actually launch it",
false, false, (int)false, &value);

Просмотреть файл

@ -49,7 +49,6 @@
#include "orte/mca/plm/plm.h"
#include "orte/mca/errmgr/errmgr.h"
#include "orte/mca/debugger/debugger.h"
#include "orte/mca/routed/routed.h"
#include "orte/util/session_dir.h"
@ -128,9 +127,6 @@ void orte_jobs_complete(void)
}
}
/* if the debuggers were run, clean up */
orte_debugger.finalize();
if (0 < orte_routed.num_routes()) {
orte_plm.terminate_orteds();
}

Просмотреть файл

@ -69,8 +69,6 @@
#if !ORTE_DISABLE_FULL_SUPPORT
#include "orte/mca/notifier/notifier.h"
#include "orte/mca/notifier/base/base.h"
#include "orte/mca/debugger/debugger.h"
#include "orte/mca/debugger/base/base.h"
#include "orte/mca/iof/iof.h"
#include "orte/mca/iof/base/base.h"
#include "orte/mca/oob/oob.h"
@ -373,14 +371,6 @@ void orte_info_open_components(void)
map->components = &orte_notifier_base_components_available;
opal_pointer_array_add(&component_map, map);
if (ORTE_SUCCESS != orte_debugger_base_open()) {
goto error;
}
map = OBJ_NEW(orte_info_component_map_t);
map->type = strdup("debugger");
map->components = &orte_debugger_base_components_available;
opal_pointer_array_add(&component_map, map);
if (ORTE_SUCCESS != mca_oob_base_open()) {
goto error;
}

Просмотреть файл

@ -205,7 +205,6 @@ int main(int argc, char *argv[])
opal_pointer_array_add(&mca_types, "event");
#if !ORTE_DISABLE_FULL_SUPPORT
opal_pointer_array_add(&mca_types, "debugger");
opal_pointer_array_add(&mca_types, "iof");
opal_pointer_array_add(&mca_types, "oob");
opal_pointer_array_add(&mca_types, "odls");

Просмотреть файл

@ -12,7 +12,7 @@
* All rights reserved.
* Copyright (c) 2006-2011 Cisco Systems, Inc. All rights reserved.
* Copyright (c) 2007-2009 Sun Microsystems, Inc. All rights reserved.
* Copyright (c) 2007 Los Alamos National Security, LLC. All rights
* Copyright (c) 2007-2011 Los Alamos National Security, LLC. All rights
* reserved.
* $COPYRIGHT$
*
@ -28,6 +28,12 @@
#include <string.h>
#endif
#include <stdio.h>
#ifdef HAVE_STDLIB_H
#include <stdlib.h>
#endif /* HAVE_STDLIB_H */
#ifdef HAVE_STRINGS_H
#include <strings.h>
#endif /* HAVE_STRINGS_H */
#ifdef HAVE_UNISTD_H
#include <unistd.h>
#endif
@ -46,6 +52,10 @@
#ifdef HAVE_SYS_TIME_H
#include <sys/time.h>
#endif /* HAVE_SYS_TIME_H */
#include <fcntl.h>
#ifdef HAVE_SYS_STAT_H
#include <sys/stat.h>
#endif
#include "opal/mca/event/event.h"
#include "opal/mca/installdirs/installdirs.h"
@ -75,7 +85,6 @@
#include "orte/util/session_dir.h"
#include "orte/util/hnp_contact.h"
#include "orte/mca/debugger/base/base.h"
#include "orte/mca/odls/odls.h"
#include "orte/mca/plm/plm.h"
#include "orte/mca/plm/base/plm_private.h"
@ -99,6 +108,37 @@
#include "orterun.h"
/* instance the standard MPIR interfaces */
#define MPIR_MAX_PATH_LENGTH 512
#define MPIR_MAX_ARG_LENGTH 1024
struct MPIR_PROCDESC *MPIR_proctable = NULL;
int MPIR_proctable_size = 0;
volatile int MPIR_being_debugged = 0;
volatile int MPIR_debug_state = 0;
int MPIR_i_am_starter = 0;
int MPIR_partial_attach_ok = 1;
char MPIR_executable_path[MPIR_MAX_PATH_LENGTH];
char MPIR_server_arguments[MPIR_MAX_ARG_LENGTH];
volatile int MPIR_forward_output = 0;
volatile int MPIR_forward_comm = 0;
char MPIR_attach_fifo[MPIR_MAX_PATH_LENGTH];
int MPIR_force_to_main = 0;
static void orte_debugger_dump(void);
static void orte_debugger_init_before_spawn(orte_job_t *jdata);
static void orte_debugger_init_after_spawn(orte_job_t *jdata);
static void attach_debugger(int fd, short event, void *arg);
static void build_debugger_args(orte_app_context_t *debugger);
static void open_fifo (void);
ORTE_DECLSPEC void* MPIR_Breakpoint(void);
/*
 * Breakpoint function for parallel debuggers.
 *
 * The launcher calls this function and the debugger is expected to have
 * a breakpoint on it. The function itself does no work and always
 * returns NULL.
 */
void *MPIR_Breakpoint(void)
{
    void *const nothing = NULL;

    return nothing;
}
/*
* Globals
*/
@ -549,49 +589,6 @@ int orterun(int argc, char *argv[])
true);
}
/* force the debugger symbols to be included in orterun.
* this is required since the symbols are instantiated in
* the orte library, yet they need to be accessed
* prior to orte_init when a debugger wants to launch
* us
*/
if (NULL == MPIR_proctable) {
rc = ORTE_SUCCESS;
}
if (0 == MPIR_proctable_size) {
rc = ORTE_SUCCESS;
}
if (0 == MPIR_being_debugged) {
rc = ORTE_SUCCESS;
}
if (0 == MPIR_debug_state) {
rc = ORTE_SUCCESS;
}
if (0 == MPIR_i_am_starter) {
rc = ORTE_SUCCESS;
}
if (1 == MPIR_partial_attach_ok) {
rc = ORTE_SUCCESS;
}
if (NULL == MPIR_executable_path) {
rc = ORTE_SUCCESS;
}
if (NULL == MPIR_server_arguments) {
rc = ORTE_SUCCESS;
}
if (0 == MPIR_forward_output) {
rc = ORTE_SUCCESS;
}
if (0 == MPIR_forward_comm) {
rc = ORTE_SUCCESS;
}
MPIR_force_to_main = 0;
memset(MPIR_attach_fifo, 0, MPIR_MAX_PATH_LENGTH);
/* This function call simply ensures that all the symbols --
including MPIR_Breakpoint -- are pulled in via the linker from
orte/mca/debugger/base/debugger_base_fns.c. */
orte_debugger_base_pull_mpir_breakpoint();
/* Check for some "global" command line params */
parse_globals(argc, argv, &cmd_line);
OBJ_DESTRUCT(&cmd_line);
@ -848,9 +845,15 @@ int orterun(int argc, char *argv[])
ljob = ORTE_LOCAL_JOBID(jdata->jobid);
opal_pointer_array_set_item(orte_job_data, ljob, jdata);
/* setup for debugging */
orte_debugger_init_before_spawn(jdata);
/* spawn the job and its daemons */
rc = orte_plm.spawn(jdata);
/* complete debugger interface */
orte_debugger_init_after_spawn(jdata);
/* now wait until the termination event fires */
opal_event_dispatch(opal_event_base);
@ -2110,3 +2113,488 @@ static void run_debugger(char *basename, opal_cmd_line_t *cmd_line,
opal_argv_free(new_argv);
exit(1);
}
/**** DEBUGGER CODE ****/
/*
* Debugger support for orterun
*
* We interpret the MPICH debugger interface as follows:
*
* a) The launcher
* - spawns the other processes,
* - fills in the table MPIR_proctable, and sets MPIR_proctable_size
* - sets MPIR_debug_state to MPIR_DEBUG_SPAWNED ( = 1)
* - calls MPIR_Breakpoint() which the debugger will have a
* breakpoint on.
*
* b) Applications start and then spin until MPIR_debug_gate is set
* non-zero by the debugger.
*
* This file implements (a).
*
**************************************************************************
*
* Note that we have presently tested both TotalView and DDT parallel
* debuggers. They both nominally subscribe to the Etnus attaching
* interface, but there are differences between the two.
*
* TotalView: user launches "totalview mpirun -a ...<mpirun args>...".
* TV launches mpirun. mpirun launches the application and then calls
* MPIR_Breakpoint(). This is the signal to TV that it's a parallel
* MPI job. TV then reads the proctable in mpirun and attaches itself
* to all the processes (it takes care of launching itself on the
* remote nodes). Upon attaching to all the MPI processes, the
* variable MPIR_being_debugged is set to 1. When it has finished
* attaching itself to all the MPI processes that it wants to,
* MPIR_Breakpoint() returns.
*
* DDT: user launches "ddt bin -np X <mpi app name>". DDT fork/exec's
* mpirun to launch ddt-debugger on the back-end nodes via "mpirun -np
* X ddt-debugger" (note the lack of other arguments -- we can't pass
* anything to mpirun). This app will eventually fork/exec the MPI
* app. DDT does not currently set MPIR_being_debugged in the MPI app.
*
**************************************************************************
*
* We support two ways of waiting for attaching debuggers. The
* implementation spans this file and ompi/debuggers/ompi_debuggers.c.
*
* 1. If using orterun: MPI processes will have the
* orte_in_parallel_debugger MCA param set to true (because not all
* debuggers consistently set MPIR_being_debugged in both the launcher
* and in the MPI procs). The HNP will call MPIR_Breakpoint() and
* then RML send a message to VPID 0 (MCW rank 0) when it returns
* (MPIR_Breakpoint() doesn't return until the debugger has attached
* to all relevant processes). Meanwhile, VPID 0 blocks waiting for
* the RML message. All other VPIDs immediately call the grpcomm
* barrier (and therefore block until the debugger attaches). Once
* VPID 0 receives the RML message, we know that the debugger has
* attached to all processes that it cares about, and VPID 0 then
* joins the grpcomm barrier, allowing the job to continue. This
* scheme has the side effect of nicely supporting partial attaches by
* parallel debuggers (i.e., attaching to only some of the MPI
* processes; not necessarily all of them).
*
* 2. If not using orterun: in this case, ORTE_DISABLE_FULL_SUPPORT
* will be true, and we know that there will not be an RML message
* sent to VPID 0. So we have to look for a magic environment
* variable from the launcher to know if the jobs will be attached by
* a debugger (e.g., set by yod, srun, ...etc.), and if so, spin on
* MPIR_debug_gate. These environment variable names must be
* hard-coded in the OMPI layer (see ompi/debuggers/ompi_debuggers.c).
*/
/* local globals and functions */
static void attach_debugger(int fd, short event, void *arg);
static void build_debugger_args(orte_app_context_t *debugger);
static void open_fifo(void);
static opal_event_t attach;
static int attach_fd = -1;
static bool fifo_active=false;
/* Print "<expression text> = <value>" for an int-valued expression X on
 * stderr. Expands to a single expression without a trailing semicolon,
 * so "DUMP_INT(x);" is exactly one statement and can be used as an
 * unbraced if/else branch. */
#define DUMP_INT(X) fprintf(stderr, " %s = %d\n", # X, X)
#define FILE_MODE (S_IRUSR | S_IWUSR | S_IRGRP | S_IROTH)
/*
 * One row of MPIR_proctable, describing a single launched process.
 * NOTE(review): the member order and types are presumably fixed by the
 * MPIR interface that external debuggers read -- confirm before
 * changing the layout.
 */
struct MPIR_PROCDESC {
char *host_name; /* something that can be passed to inet_addr */
char *executable_name; /* name of binary */
int pid; /* process pid */
};
/**
 * Print the current MPIR debugger interface state to stderr: the scalar
 * flags, one line per MPIR_proctable row, and the daemon executable
 * path and arguments ("NULL" is printed for an empty string).
 */
static void orte_debugger_dump(void)
{
    const char *exe_path;
    const char *server_args;
    int row = 0;

    DUMP_INT(MPIR_being_debugged);
    DUMP_INT(MPIR_debug_state);
    DUMP_INT(MPIR_partial_attach_ok);
    DUMP_INT(MPIR_i_am_starter);
    DUMP_INT(MPIR_forward_output);
    DUMP_INT(MPIR_proctable_size);

    fprintf(stderr, " MPIR_proctable:\n");
    while (row < MPIR_proctable_size) {
        fprintf(stderr,
                " (i, host, exe, pid) = (%d, %s, %s, %d)\n",
                row,
                MPIR_proctable[row].host_name,
                MPIR_proctable[row].executable_name,
                MPIR_proctable[row].pid);
        row++;
    }

    /* an empty buffer is reported as the text "NULL" */
    if ('\0' == MPIR_executable_path[0]) {
        exe_path = "NULL";
    } else {
        exe_path = (char*) MPIR_executable_path;
    }
    fprintf(stderr, "MPIR_executable_path: %s\n", exe_path);

    if ('\0' == MPIR_server_arguments[0]) {
        server_args = "NULL";
    } else {
        server_args = (char*) MPIR_server_arguments;
    }
    fprintf(stderr, "MPIR_server_arguments: %s\n", server_args);
}
/**
 * Initialization of data structures for running under a debugger
 * using the MPICH/TotalView parallel debugger interface. Before the
 * spawn we need to check if we are being run under a TotalView-like
 * debugger; if so then inform applications via an MCA parameter.
 *
 * When no debugger is detected and no test daemon was given, this only
 * arms a later attach check (periodic timer when a check rate is set,
 * otherwise the attach FIFO if FIFO attach is enabled) and returns.
 * Otherwise every app context of the job gets the
 * "in_parallel_debugger" MCA environment variable set to "1" and, if a
 * debugger daemon executable is known, a daemon job for co-launch is
 * created and stored in orte_debugger_daemon.
 *
 * @param jdata  job that is about to be spawned
 */
static void orte_debugger_init_before_spawn(orte_job_t *jdata)
{
    char *env_name;
    orte_app_context_t *app;
    int i;
    int32_t ljob;
    char *attach_fifo;

    if (!MPIR_being_debugged && !orte_in_parallel_debugger) {
        /* if we were given a test debugger, then we still want to
         * colaunch it
         */
        if (NULL != orte_debugger_test_daemon) {
            /* this branch runs when a test daemon WAS given, so say so */
            opal_output_verbose(2, orte_debug_output,
                                "%s Debugger test daemon specified: %s",
                                ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                                orte_debugger_test_daemon);
            goto launchit;
        }
        /* if we were given an auto-detect rate, then we want to setup
         * an event so we periodically do the check
         */
        if (0 < orte_debugger_check_rate) {
            opal_output_verbose(2, orte_debug_output,
                                "%s Setting debugger attach check rate for %d seconds",
                                ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                                orte_debugger_check_rate);
            ORTE_TIMER_EVENT(orte_debugger_check_rate, 0, attach_debugger);
        } else if (orte_debugger_enable_fifo_attach) {
            /* create the attachment FIFO and put it into MPIR, setup readevent */
            /* create a FIFO name in the session dir */
            attach_fifo = opal_os_path(false, orte_process_info.job_session_dir, "debugger_attach_fifo", NULL);
            /* an already existing FIFO is fine; any other failure is not */
            if ((mkfifo(attach_fifo, FILE_MODE) < 0) && errno != EEXIST) {
                opal_output(0, "CANNOT CREATE FIFO %s: errno %d", attach_fifo, errno);
                free(attach_fifo);
                return;
            }
            strncpy(MPIR_attach_fifo, attach_fifo, MPIR_MAX_PATH_LENGTH - 1);
            /* strncpy does not terminate a truncated copy; do it explicitly */
            MPIR_attach_fifo[MPIR_MAX_PATH_LENGTH - 1] = '\0';
            free(attach_fifo);
            open_fifo();
        }
        return;
    }

 launchit:
    opal_output_verbose(1, orte_debug_output, "Info: Spawned by a debugger");

    /* tell the procs they are being debugged */
    env_name = mca_base_param_environ_variable("orte",
                                               "in_parallel_debugger", NULL);
    for (i = 0; i < jdata->apps->size; i++) {
        /* the app array may contain empty slots */
        if (NULL == (app = (orte_app_context_t*)opal_pointer_array_get_item(jdata->apps, i))) {
            continue;
        }
        opal_setenv(env_name, "1", true, &app->env);
    }
    free(env_name);

    /* check if we need to co-spawn the debugger daemons */
    if ('\0' != MPIR_executable_path[0] || NULL != orte_debugger_test_daemon) {
        /* can only have one debugger */
        if (NULL != orte_debugger_daemon) {
            opal_output(0, "-------------------------------------------\n"
                        "Only one debugger can be used on a job.\n"
                        "-------------------------------------------\n");
            ORTE_UPDATE_EXIT_STATUS(ORTE_ERROR_DEFAULT_EXIT_CODE);
            return;
        }
        opal_output_verbose(2, orte_debug_output,
                            "%s Cospawning debugger daemons %s",
                            ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                            (NULL == orte_debugger_test_daemon) ?
                            MPIR_executable_path : orte_debugger_test_daemon);
        /* add debugger info to launch message */
        orte_debugger_daemon = OBJ_NEW(orte_job_t);
        /* create a jobid for these daemons - this is done solely
         * to avoid confusing the rest of the system's bookkeeping
         */
        orte_plm_base_create_jobid(orte_debugger_daemon);
        /* flag the job as being debugger daemons */
        orte_debugger_daemon->controls |= ORTE_JOB_CONTROL_DEBUGGER_DAEMON;
        /* unless directed, we do not forward output */
        if (!MPIR_forward_output) {
            orte_debugger_daemon->controls &= ~ORTE_JOB_CONTROL_FORWARD_OUTPUT;
        }
        /* add it to the global job pool */
        ljob = ORTE_LOCAL_JOBID(orte_debugger_daemon->jobid);
        opal_pointer_array_set_item(orte_job_data, ljob, orte_debugger_daemon);
        /* create an app_context for the debugger daemon; the test
         * daemon takes precedence over the debugger-supplied path
         */
        app = OBJ_NEW(orte_app_context_t);
        if (NULL != orte_debugger_test_daemon) {
            app->app = strdup(orte_debugger_test_daemon);
        } else {
            app->app = strdup((char*)MPIR_executable_path);
        }
        /* argv[0] is the executable, followed by the debugger's own args */
        opal_argv_append_nosize(&app->argv, app->app);
        build_debugger_args(app);
        opal_pointer_array_add(orte_debugger_daemon->apps, app);
        orte_debugger_daemon->num_apps = 1;
    }
}
/*
 * Initialization of data structures for running under a debugger
 * using the MPICH/TotalView parallel debugger interface. This stage
 * of initialization must occur after spawn
 *
 * NOTE: We -always- perform this step to ensure that any debugger
 * that attaches to us post-launch of the application can get a
 * completed proctable
 *
 * Builds MPIR_proctable with one row per process of the job, indexed
 * by the process vpid. If MPIR_being_debugged is set, it then waits
 * until all procs have reported, calls MPIR_Breakpoint() and sends an
 * empty release message to vpid 0 of the job.
 *
 * jdata is the job that was just spawned.
 */
static void orte_debugger_init_after_spawn(orte_job_t *jdata)
{
    orte_proc_t *proc;
    orte_app_context_t *appctx;
    orte_vpid_t i, j;
    opal_buffer_t buf;
    orte_process_name_t rank0;
    int rc;

    /* if we couldn't get thru the mapper stage, we might
     * enter here with no procs. Avoid the "zero byte malloc"
     * message by checking here
     */
    if (MPIR_proctable || 0 == jdata->num_procs) {
        /* already initialized */
        opal_output_verbose(5, orte_debug_output,
                            "%s: debugger already initialized or zero procs",
                            ORTE_NAME_PRINT(ORTE_PROC_MY_NAME));
        return;
    }

    /* fill in the proc table for the application processes */
    opal_output_verbose(5, orte_debug_output,
                        "%s: Setting up debugger process table for applications",
                        ORTE_NAME_PRINT(ORTE_PROC_MY_NAME));

    MPIR_debug_state = 1;

    /* set the total number of processes in the job */
    MPIR_proctable_size = jdata->num_procs;

    /* allocate MPIR_proctable zero-filled: rows that are skipped below
     * then hold NULL names and pid 0 instead of indeterminate values
     */
    MPIR_proctable = (struct MPIR_PROCDESC *)calloc(MPIR_proctable_size,
                                                    sizeof(struct MPIR_PROCDESC));
    if (MPIR_proctable == NULL) {
        /* never advertise a row count for a table that does not exist */
        MPIR_proctable_size = 0;
        ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE);
        return;
    }

    if (orte_debugger_dump_proctable) {
        opal_output(orte_clean_output, "MPIR Proctable for job %s", ORTE_JOBID_PRINT(jdata->jobid));
    }

    /* initialize MPIR_proctable */
    for (j = 0; j < jdata->num_procs; j++) {
        if (NULL == (proc = (orte_proc_t*)opal_pointer_array_get_item(jdata->procs, j))) {
            continue;
        }
        /* store this data in the location whose index
         * corresponds to the proc's rank
         */
        i = proc->name.vpid;
        if (NULL == (appctx = (orte_app_context_t*)opal_pointer_array_get_item(jdata->apps, proc->app_idx))) {
            continue;
        }

        MPIR_proctable[i].host_name = strdup(proc->node->name);
        /* an app that starts with the path separator is used as given;
         * anything else is joined onto the app's working directory
         */
        if ( 0 == strncmp(appctx->app, OPAL_PATH_SEP, 1 )) {
            MPIR_proctable[i].executable_name =
                opal_os_path( false, appctx->app, NULL );
        } else {
            MPIR_proctable[i].executable_name =
                opal_os_path( false, appctx->cwd, appctx->app, NULL );
        }
        MPIR_proctable[i].pid = proc->pid;
        if (orte_debugger_dump_proctable) {
            opal_output(orte_clean_output, "%s: Host %s Exe %s Pid %d",
                        ORTE_VPID_PRINT(i), MPIR_proctable[i].host_name,
                        MPIR_proctable[i].executable_name, MPIR_proctable[i].pid);
        }
    }

    if (0 < opal_output_get_verbosity(orte_debug_output)) {
        orte_debugger_dump();
    }

    /* if we are being launched under a debugger, then we must wait
     * for it to be ready to go and do some things to start the job
     */
    if (MPIR_being_debugged) {
        /* wait for all procs to have reported their contact info - this
         * ensures that (a) they are all into mpi_init, and (b) the system
         * has the contact info to successfully send a message to rank=0
         */
        ORTE_PROGRESSED_WAIT(false, jdata->num_reported, jdata->num_procs);

        MPIR_Breakpoint();

        /* send a message to rank=0 to release it */
        OBJ_CONSTRUCT(&buf, opal_buffer_t); /* don't need anything in this */
        rank0.jobid = jdata->jobid;
        rank0.vpid = 0;
        if (0 > (rc = orte_rml.send_buffer(&rank0, &buf, ORTE_RML_TAG_DEBUGGER_RELEASE, 0))) {
            opal_output(0, "Error: could not send debugger release to MPI procs - error %s", ORTE_ERROR_NAME(rc));
        }
        OBJ_DESTRUCT(&buf);
    }
}
/*
 * (Re)open the debugger attach fifo named by MPIR_attach_fifo and arm a
 * read event on it.
 *
 * Any descriptor currently held in attach_fd (when greater than zero) is
 * closed first.  The fifo is opened read-only and non-blocking.  If the
 * open fails, an error is reported and no event is registered.  Otherwise
 * a read event with attach_debugger as its callback is set up on the new
 * descriptor, fifo_active is set, and the event is added.
 */
static void open_fifo (void)
{
    /* release the descriptor left over from an earlier open, if any */
    if (0 < attach_fd) {
        close(attach_fd);
    }

    attach_fd = open(MPIR_attach_fifo, O_RDONLY | O_NONBLOCK, 0);

    if (0 <= attach_fd) {
        opal_output_verbose(2, orte_debug_output,
                            "%s Monitoring debugger attach fifo %s",
                            ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                            MPIR_attach_fifo);

        /* have attach_debugger called whenever the fifo becomes readable */
        opal_event_set(opal_event_base, &attach, attach_fd,
                       OPAL_EV_READ, attach_debugger, NULL);
        fifo_active = true;
        opal_event_add(&attach, 0);
        return;
    }

    /* could not open the fifo - report it and leave the event unarmed */
    opal_output(0, "%s unable to open debugger attach fifo",
                ORTE_NAME_PRINT(ORTE_PROC_MY_NAME));
}
/*
 * Event callback invoked when the attach fifo becomes readable (see
 * open_fifo) or when the timer scheduled at the bottom of this function
 * via ORTE_TIMER_EVENT fires.
 *
 * When the fifo is the active trigger, one command byte is read from it:
 *   - a read of zero bytes means the writer hung up, so the fifo is
 *     reopened and the function returns;
 *   - a failed read, or any byte other than 1, is ignored.
 *
 * If a debugger has attached (MPIR_being_debugged) or the test-attach
 * option is set, and a debugger daemon executable has been named, a new
 * job holding one debugger daemon per node is created and handed to
 * orte_plm.spawn().
 *
 * On every path except the fifo reopen, the read or timer event is
 * re-armed and MPIR_Breakpoint() is called.
 *
 * @param fd     unused
 * @param event  unused
 * @param arg    unused
 */
static void attach_debugger(int fd, short event, void *arg)
{
    orte_app_context_t *app;
    /* initialized so that a failed read can never be mistaken for a command */
    unsigned char fifo_cmd = 0;
    int rc;
    int32_t ljob;
    orte_job_t *jdata;

    /* read the file descriptor to clear that event, if necessary */
    if (fifo_active) {
        opal_event_del(&attach);
        fifo_active = false;

        rc = read(attach_fd, &fifo_cmd, sizeof(fifo_cmd));
        if (0 == rc) {
            /* reopen device to clear hangup */
            open_fifo();
            return;
        }
        if (rc < 0) {
            /* the read failed (the fifo is opened non-blocking, so this
             * can simply mean no data was available) - no command byte
             * was delivered, so there is nothing to act on
             */
            goto RELEASE;
        }
        if (1 != fifo_cmd) {
            /* ignore the cmd */
            goto RELEASE;
        }
    }

    if (!MPIR_being_debugged && !orte_debugger_test_attach) {
        /* false alarm */
        goto RELEASE;
    }

    opal_output_verbose(1, orte_debug_output,
                        "%s Attaching debugger %s", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                        (NULL == orte_debugger_test_daemon) ? MPIR_executable_path : orte_debugger_test_daemon);

    /* a debugger has attached! All the MPIR_Proctable
     * data is already available, so we only need to
     * check to see if we should spawn any daemons
     */
    if ('\0' != MPIR_executable_path[0] || NULL != orte_debugger_test_daemon) {
        /* can only have one debugger */
        if (NULL != orte_debugger_daemon) {
            opal_output(0, "-------------------------------------------\n"
                        "Only one debugger can be used on a job.\n"
                        "-------------------------------------------\n");
            goto RELEASE;
        }
        opal_output_verbose(2, orte_debug_output,
                            "%s Spawning debugger daemons %s",
                            ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                            (NULL == orte_debugger_test_daemon) ?
                            MPIR_executable_path : orte_debugger_test_daemon);

        /* this will be launched just like a regular job,
         * so we do not use the global orte_debugger_daemon
         * as this is reserved for co-location upon startup
         */
        jdata = OBJ_NEW(orte_job_t);
        /* create a jobid for these daemons - this is done solely
         * to avoid confusing the rest of the system's bookkeeping
         */
        orte_plm_base_create_jobid(jdata);
        /* flag the job as being debugger daemons */
        jdata->controls |= ORTE_JOB_CONTROL_DEBUGGER_DAEMON;
        /* unless directed, we do not forward output */
        if (!MPIR_forward_output) {
            jdata->controls &= ~ORTE_JOB_CONTROL_FORWARD_OUTPUT;
        }
        /* add it to the global job pool */
        ljob = ORTE_LOCAL_JOBID(jdata->jobid);
        opal_pointer_array_set_item(orte_job_data, ljob, jdata);

        /* create an app_context for the debugger daemon */
        app = OBJ_NEW(orte_app_context_t);
        if (NULL != orte_debugger_test_daemon) {
            app->app = strdup(orte_debugger_test_daemon);
        } else {
            app->app = strdup((char*)MPIR_executable_path);
        }
        jdata->state = ORTE_JOB_STATE_INIT;

        /* argv[0] is the daemon executable itself, followed by any
         * arguments the debugger supplied
         */
        opal_argv_append_nosize(&app->argv, app->app);
        build_debugger_args(app);
        opal_pointer_array_add(jdata->apps, app);
        jdata->num_apps = 1;

        /* setup the mapping policy to pernode so we get one
         * daemon on each node
         */
        jdata->map = OBJ_NEW(orte_job_map_t);
        jdata->map->mapping = ORTE_MAPPING_PPR;
        jdata->map->ppr = strdup("1:n");

        /* now go ahead and spawn this job */
        if (ORTE_SUCCESS != (rc = orte_plm.spawn(jdata))) {
            ORTE_ERROR_LOG(rc);
        }
    }

 RELEASE:
    /* reset the read or timer event */
    if (0 == orte_debugger_check_rate) {
        fifo_active = true;
        opal_event_add(&attach, 0);
    } else if (!MPIR_being_debugged) {
        ORTE_TIMER_EVENT(orte_debugger_check_rate, 0, attach_debugger);
    }

    /* notify the debugger that all is ready */
    MPIR_Breakpoint();
}
/*
 * Append the debugger-supplied server arguments to an app context's argv.
 *
 * MPIR_server_arguments is scanned across its full MPIR_MAX_ARG_LENGTH
 * bytes as a sequence of NUL-separated strings.  Each non-empty string is
 * copied into a scratch buffer and appended to debugger->argv; empty runs
 * (consecutive NULs) are skipped.  Characters at the very end of the
 * buffer that are not followed by a NUL are not appended.
 *
 * @param debugger  app context whose argv receives the arguments
 */
static void build_debugger_args(orte_app_context_t *debugger)
{
    char token[MPIR_MAX_ARG_LENGTH];
    int pos, len;

    /* nothing to add when no server arguments were provided */
    if ('\0' == MPIR_server_arguments[0]) {
        return;
    }

    memset(token, 0, MPIR_MAX_ARG_LENGTH);
    len = 0;

    for (pos = 0; pos < MPIR_MAX_ARG_LENGTH; pos++) {
        if ('\0' != MPIR_server_arguments[pos]) {
            /* still inside an argument - accumulate it */
            token[len] = MPIR_server_arguments[pos];
            len++;
            continue;
        }

        /* hit a separator: skip it if no argument was collected */
        if (0 >= len) {
            continue;
        }

        /* a complete argument has been collected - hand it over and
         * reset the scratch buffer for the next one
         */
        opal_argv_append_nosize(&debugger->argv, token);
        memset(token, 0, MPIR_MAX_ARG_LENGTH);
        len = 0;
    }
}