1
1
openmpi/ompi/debuggers/ompi_debuggers.c
Jeff Squyres 5fd742e769 Add in the standardized way to notify a debugger if the MPI job is
about to abort.  Fixes trac:1509.

This commit was SVN r19596.

The following Trac tickets were found above:
  Ticket 1509 --> https://svn.open-mpi.org/trac/ompi/ticket/1509
2008-09-20 11:34:37 +00:00

277 строки
8.8 KiB
C

/* -*- Mode: C; c-basic-offset:4 ; -*- */
/*
* Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
* University Research and Technology
* Corporation. All rights reserved.
* Copyright (c) 2004-2007 The University of Tennessee and The University
* of Tennessee Research Foundation. All rights
* reserved.
* Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
* University of Stuttgart. All rights reserved.
* Copyright (c) 2004-2005 The Regents of the University of California.
* All rights reserved.
* Copyright (c) 2007-2008 Cisco Systems, Inc. All rights resereved.
* $COPYRIGHT$
*
* Additional copyrights may follow
*
* $HEADER$
*/
/*
* MPI portion of debugger support: initially based on the
* TotalView/Etnus API for debuggers to attach to MPI jobs.
*
* There is a lengthy explanation of how OMPI handles parallel
* debuggers attaching to MPI jobs in orte/tools/orterun/debuggers.c.
*/
#include "ompi_config.h"
#ifdef HAVE_UNISTD_H
#include <unistd.h>
#endif /* HAVE_UNISTD_H */
#ifdef HAVE_DIRENT_H
#include <dirent.h>
#endif
#ifdef HAVE_SYS_TYPES_H
#include <sys/types.h>
#endif
#ifdef HAVE_SYS_STAT_H
#include <sys/stat.h>
#endif
#ifdef HAVE_UNISTD_H
#include <unistd.h>
#endif
#include "opal/mca/base/base.h"
#include "opal/util/argv.h"
#include "opal/mca/installdirs/installdirs.h"
#include "debuggers.h"
/**
* A lot of include files that are required by all optimized builds in order
* to get access to the type information. Beware, this file has to always
* be compiled with the -g flag, otherwise the type information will be
* missing and the parallel debuggers will be unable to initialize the
* Open MPI debug library.
*/
#include "opal/class/opal_list.h"
#include "ompi/class/ompi_free_list.h"
#include "ompi/request/request.h"
#include "ompi/mca/pml/base/pml_base_request.h"
#include "ompi/mca/pml/base/pml_base_sendreq.h"
#include "ompi/mca/pml/base/pml_base_recvreq.h"
#include "opal/class/opal_pointer_array.h"
#include "ompi/communicator/communicator.h"
#include "ompi/group/group.h"
#include "ompi/datatype/datatype.h"
#include "ompi/include/mpi.h"
#include "orte/mca/errmgr/errmgr.h"
#include "orte/mca/rml/rml.h"
#include "orte/runtime/orte_globals.h"
#if defined(OMPI_MSGQ_DLL)
/* This variable is old/deprecated -- the mpimsgq_dll_locations[]
method is preferred because it's more flexible.  It is only defined
when the build supplies OMPI_MSGQ_DLL, whose value initializes it. */
OMPI_DECLSPEC char MPIR_dll_name[] = OMPI_MSGQ_DLL;
#endif /* defined(OMPI_MSGQ_DLL) */
/* Lists of debugger plugin DLLs.  Both start out NULL;
ompi_wait_for_debugger() hands them to check() together with
OMPI_MPIHANDLES_DLL_PREFIX (mpidbg) and OMPI_MSGQ_DLL_PREFIX (mpimsgq)
for every directory on the ompi_debugger_dll_path search path.
NOTE(review): presumably read by an attaching debugger -- nothing in
this file reads them back. */
OMPI_DECLSPEC char **mpidbg_dll_locations = NULL;
OMPI_DECLSPEC char **mpimsgq_dll_locations = NULL;
/*
* Table of sizeof() results for basic C types, evaluated by the
* compiler that builds this file and exported as data.  Entries are,
* in order: short, int, long, long long, void*, bool, size_t.
* NOTE(review): this looks like the table the "sizes of the types on
* the debugged program" comment further down in this file refers to;
* if so, the entry order is part of the contract with the debugger
* and must not be changed -- confirm before editing.
*/
OMPI_DECLSPEC int MPIR_debug_typedefs_sizeof[] = {
sizeof(short),
sizeof(int),
sizeof(long),
sizeof(long long),
sizeof(void*),
sizeof(bool),
sizeof(size_t)
};
/*
* Values defined by the standardized interface; do not change these
* values.
*
* MPIR_DEBUG_ABORTING is stored into MPIR_debug_state by
* ompi_debugger_notify_abort().  MPIR_DEBUG_SPAWNED is not referenced
* anywhere else in the visible part of this file.
*/
#define MPIR_DEBUG_SPAWNED 1
#define MPIR_DEBUG_ABORTING 2
/**
* There is an issue with the debugger running on a different architecture
* than the debugged program. We need to know the sizes of the types
* in the debugged program. The problem is that the sizes depend on the
* compilation options (32 or 64 bits) and on the compiler. Therefore,
* the simplest and most accurate way is to export these sizes directly from
* the debugged program.
*
* NOTE(review): the paragraph above describes exporting type sizes, which
* matches the MPIR_debug_typedefs_sizeof[] table rather than the pointers
* declared below.  The pointers below are all initialized to NULL and are
* not used anywhere else in the visible part of this file; going by their
* "_type_inclusion" names they presumably exist only so that the debug
* information for each pointed-to type is present in this object file --
* confirm before removing or changing any of them.
*/
OMPI_DECLSPEC opal_list_item_t* opal_list_item_t_type_inclusion = NULL;
OMPI_DECLSPEC opal_list_t* opal_list_t_type_inclusion = NULL;
OMPI_DECLSPEC ompi_free_list_item_t* ompi_free_list_item_t_type_inclusion = NULL;
OMPI_DECLSPEC ompi_free_list_t* ompi_free_list_t_type_inclusion = NULL;
OMPI_DECLSPEC ompi_request_t* ompi_request_t_type_inclusion = NULL;
OMPI_DECLSPEC mca_pml_base_request_t* mca_pml_base_request_t_type_inclusion = NULL;
OMPI_DECLSPEC mca_pml_base_send_request_t* mca_pml_base_send_request_t_type_inclusion = NULL;
OMPI_DECLSPEC mca_pml_base_recv_request_t* mca_pml_base_recv_request_t_type_inclusion = NULL;
OMPI_DECLSPEC opal_pointer_array_t* opal_pointer_array_t_type_inclusion = NULL;
OMPI_DECLSPEC ompi_communicator_t* ompi_communicator_t_type_inclusion = NULL;
OMPI_DECLSPEC ompi_group_t* ompi_group_t_type_inclusion = NULL;
OMPI_DECLSPEC ompi_status_public_t* ompi_status_public_t_type_inclusion = NULL;
OMPI_DECLSPEC ompi_datatype_t* ompi_datatype_t_type_inclusion = NULL;
/* Polled by ompi_wait_for_debugger(): in the spin-wait path the process
sleeps in a loop for as long as this is 0.  Nothing in this file sets
it; presumably the attaching debugger writes it -- confirm. */
OMPI_DECLSPEC volatile int MPIR_debug_gate = 0;
/* ompi_wait_for_debugger() treats the value 1 as "a debugger is
attached".  Nothing in this file sets it; presumably written from
outside (debugger/launcher) -- confirm. */
OMPI_DECLSPEC volatile int MPIR_being_debugged = 0;
/* Set to MPIR_DEBUG_ABORTING by ompi_debugger_notify_abort() */
OMPI_DECLSPEC volatile int MPIR_debug_state = 0;
/* Abort reason: ompi_debugger_notify_abort() points this at the
caller's reason string, or at "Unknown" when the reason is NULL or
empty.  The pointer is stored, not copied. */
OMPI_DECLSPEC char *MPIR_debug_abort_string = "";
/*
 * Check for a file in a few direct ways for portability.
 *
 * Probes for "<dir>/<file>.so" and, if it exists, appends <file> to
 * the argv-style list that *locations refers to.
 *
 * dir        directory to look in
 * file       DLL base name, without the ".so" suffix
 * locations  ADDRESS of the list to extend.  The list pointer has to be
 *            passed by reference: opal_argv_append_nosize() replaces
 *            the array pointer as the list grows, and the caller's
 *            variable must see that update.  (Taking a plain char **
 *            here would only modify a local copy and leave the
 *            caller's list untouched.)
 *
 * NOTE(review): only the base name is recorded, not the full path that
 * was probed -- confirm this is what consumers of the list expect.
 */
static void check(char *dir, char *file, char ***locations)
{
    char *str = NULL;

    /* On failure asprintf() leaves str undefined, so bail out rather
       than hand a garbage pointer to stat()/fopen()/free() */
    if (asprintf(&str, "%s/%s.so", dir, file) < 0) {
        return;
    }

#if defined(HAVE_SYS_STAT_H)
    {
        struct stat buf;

        /* Use stat() */
        if (0 == stat(str, &buf)) {
            opal_argv_append_nosize(locations, file);
        }
    }
#else
    {
        FILE *fp;

        /* Just try to open the file */
        if (NULL != (fp = fopen(str, "r"))) {
            fclose(fp);
            opal_argv_append_nosize(locations, file);
        }
    }
#endif /* defined(HAVE_SYS_STAT_H) */

    free(str);
}

/*
 * Wait for a debugger if asked.  We support two ways of waiting for
 * attaching debuggers -- see big comment in
 * orte/tools/orterun/debuggers.c explaining the two scenarios.
 *
 * Returns immediately when no debugger is detected.  Otherwise the
 * debugger plugin lists (mpidbg_dll_locations / mpimsgq_dll_locations)
 * are filled in from the "ompi_debugger_dll_path" MCA parameter, and
 * then the process either spins on MPIR_debug_gate or (vpid 0 only)
 * blocks on a release message received with the
 * ORTE_RML_TAG_DEBUGGER_RELEASE tag.
 */
void ompi_wait_for_debugger(void)
{
    int i, debugger, rc;
    char *a, *b = NULL, **dirs;
    opal_buffer_t buf;

    /* See lengthy comment in orte/tools/orterun/debuggers.c about
       orte_in_parallel_debugger */
#if ORTE_DISABLE_FULL_SUPPORT
    debugger = 0;
#else
    debugger = orte_in_parallel_debugger;
#endif

    /* Add in environment variables for other launchers, such as yod,
       srun, ...etc. */
    if (1 == MPIR_being_debugged) {
        debugger = 1;
    } else if (NULL != getenv("yod_you_are_being_debugged")) {
        debugger = 1;
    }

    if (!debugger) {
        /* if not, just return */
        return;
    }

    /* if we are being debugged, then we need to find
     * the correct plug-ins
     */
    a = strdup(opal_install_dirs.pkglibdir);
    mca_base_param_reg_string_name("ompi",
                                   "debugger_dll_path",
                                   "List of directories where MPI_INIT should search for debugger plugins",
                                   false, false, a, &b);
    free(a);

    /* Search the directory for MPI debugger DLLs */
    if (NULL != b) {
        dirs = opal_argv_split(b, ':');
        if (NULL != dirs) {
            for (i = 0; dirs[i] != NULL; ++i) {
                check(dirs[i], OMPI_MPIHANDLES_DLL_PREFIX, &mpidbg_dll_locations);
                check(dirs[i], OMPI_MSGQ_DLL_PREFIX, &mpimsgq_dll_locations);
            }
            /* The split array is ours; check() records the prefix
               string, never a pointer into dirs[], so it is safe to
               release here */
            opal_argv_free(dirs);
        }
    }

    if (ORTE_DISABLE_FULL_SUPPORT) {
        /* spin until debugger attaches and releases us */
        while (MPIR_debug_gate == 0) {
#if defined(__WINDOWS__)
            Sleep(100);     /* milliseconds */
#elif defined(HAVE_USLEEP)
            usleep(100000); /* microseconds */
#else
            sleep(1);       /* seconds */
#endif
        }
    } else {
        /* only the rank=0 proc waits for either a message from the
         * HNP or for the debugger to attach - everyone else will just
         * spin in the grpcomm barrier in ompi_mpi_init until rank=0
         * joins them.
         */
        if (0 != ORTE_PROC_MY_NAME->vpid) {
            return;
        }

        /* VPID 0 waits for a message from the HNP */
        OBJ_CONSTRUCT(&buf, opal_buffer_t);
        rc = orte_rml.recv_buffer(ORTE_NAME_WILDCARD, &buf,
                                  ORTE_RML_TAG_DEBUGGER_RELEASE, 0);
        OBJ_DESTRUCT(&buf);  /* don't care about contents of message */
        if (rc < 0) {
            /* if it failed for some reason, then we are in trouble -
             * for now, just report the problem and give up waiting
             */
            opal_output(0, "Debugger_attach[rank=%ld]: could not wait for debugger - error %s!",
                        (long)ORTE_PROC_MY_NAME->vpid, ORTE_ERROR_NAME(rc));
        }
    }
}
/*
 * Hook on which parallel debuggers set a breakpoint.  The body does
 * no work and always yields NULL.  orterun defines a function of the
 * same name for the starter; the two should never clash, but this one
 * is kept file-local (static) to be certain.
 */
static void *MPIR_Breakpoint(void)
{
    void *nothing = NULL;

    return nothing;
}
/*
 * Tell the debugger that we are about to abort.
 *
 * reason  text describing why the job is aborting; may be NULL or
 *         empty, in which case "Unknown" is reported.  The pointer is
 *         published as-is (no copy is made).
 *
 * Publishes MPIR_DEBUG_ABORTING in MPIR_debug_state, then the reason
 * in MPIR_debug_abort_string, and finally calls MPIR_Breakpoint().
 */
void ompi_debugger_notify_abort(char *reason)
{
    char *text = "Unknown";

    /* A non-NULL reason with at least one character wins over the
       fallback text */
    if (NULL != reason && '\0' != reason[0]) {
        text = reason;
    }

    MPIR_debug_state = MPIR_DEBUG_ABORTING;
    MPIR_debug_abort_string = text;

    /* Now tell the debugger */
    MPIR_Breakpoint();
}