
Fixes trac:1361: mainly adds a new internal MCA parameter that orterun will set when it launches under debuggers using the --debug option.

This commit was SVN r19116.

The following Trac tickets were found above:
  Ticket 1361 --> https://svn.open-mpi.org/trac/ompi/ticket/1361
This commit is contained in:
Jeff Squyres 2008-07-31 22:11:46 +00:00
parent b45d59ea2e
commit 4bdc093746
6 changed files with 107 additions and 50 deletions

View file

@@ -10,7 +10,7 @@
* University of Stuttgart. All rights reserved.
* Copyright (c) 2004-2005 The Regents of the University of California.
* All rights reserved.
* Copyright (c) 2007 Cisco, Inc. All rights reserved.
* Copyright (c) 2007-2008 Cisco Systems, Inc. All rights reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
@@ -18,9 +18,12 @@
* $HEADER$
*/
/**
/*
* MPI portion of debugger support: initially based on the
* TotalView/Etnus API for debuggers to attach to MPI jobs.
*
* There is a lengthy explanation of how OMPI handles parallel
* debuggers attaching to MPI jobs in orte/tools/orterun/debuggers.c.
*/
#include "ompi_config.h"
@@ -67,6 +70,7 @@
#include "orte/mca/errmgr/errmgr.h"
#include "orte/mca/rml/rml.h"
#include "orte/runtime/orte_globals.h"
#if defined(OMPI_MSGQ_DLL)
/* This variable is old/deprecated -- the mpimsgq_dll_locations[]
@@ -76,7 +80,6 @@ OMPI_DECLSPEC char MPIR_dll_name[] = OMPI_MSGQ_DLL;
OMPI_DECLSPEC char **mpidbg_dll_locations = NULL;
OMPI_DECLSPEC char **mpimsgq_dll_locations = NULL;
OMPI_DECLSPEC int MPIR_being_debugged = 0;
OMPI_DECLSPEC int MPIR_debug_typedefs_sizeof[] = {
sizeof(short),
sizeof(int),
@@ -109,7 +112,8 @@ OMPI_DECLSPEC ompi_group_t* ompi_group_t_type_inclusion = NULL;
OMPI_DECLSPEC ompi_status_public_t* ompi_status_public_t_type_inclusion = NULL;
OMPI_DECLSPEC ompi_datatype_t* ompi_datatype_t_type_inclusion = NULL;
OMPI_DECLSPEC volatile int MPIR_debug_gate=0;
OMPI_DECLSPEC volatile int MPIR_debug_gate = 0;
OMPI_DECLSPEC volatile int MPIR_being_debugged = 0;
/* Check for a file in a few direct ways for portability */
static void check(char *dir, char *file, char **locations)
@@ -145,32 +149,8 @@ static void check(char *dir, char *file, char **locations)
/*
* Wait for a debugger if asked. We support two ways of waiting for
* attaching debuggers:
*
* 1. If using orterun: MPI processes will have the
* ompi_mpi_being_debugged MCA param set to true. The HNP will call
* MPIR_Breakpoint() and then RML send a message to VPID 0 (MCW rank
* 0) when it returns (MPIR_Breakpoint() doesn't return until the
* debugger has attached to all relevant processes). Meanwhile, VPID
* 0 blocks waiting for the RML message. All other VPIDs immediately
* call the grpcomm barrier (and therefore block until the debugger
* attaches). Once VPID 0 receives the RML message, we know that the
* debugger has attached to all processes that it cares about, and
* VPID 0 then joins the grpcomm barrier, allowing the job to
* continue. This scheme has the side effect of nicely supporting
* partial attaches by parallel debuggers (i.e., attaching to only
* some of the MPI processes; not necessarily all of them).
*
* 2. If not using orterun: in this case, ORTE_DISABLE_FULL_SUPPORT
* will be true, and we know that there will not be an RML message
* sent to VPID 0. So we have to look for a magic environment
* variable from the launcher to know if the jobs will be attached by
* a debugger (e.g., set by yod, srun, ...etc.), and if so, spin on
* MPIR_debug_gate.
*
* Note that neither of these schemes use MPIR_being_debugged; it
* doesn't seem useful to us. --> JMS this may change
* attaching debuggers -- see big comment in
* orte/tools/orterun/debuggers.c explaining the two scenarios.
*/
void ompi_wait_for_debugger(void)
{
@@ -178,13 +158,9 @@ void ompi_wait_for_debugger(void)
char *a, *b, **dirs;
opal_buffer_t buf;
/* are we being debugged by a TotalView-like debugger? */
mca_base_param_reg_int_name("ompi",
"mpi_being_debugged",
"Whether the MPI application "
"is being debugged (default: false)",
false, false, (int) false,
&debugger);
/* See lengthy comment in orte/tools/orterun/debuggers.c about
orte_in_parallel_debugger */
debugger = orte_in_parallel_debugger;
/* Add in environment variables for other launchers, such as yod,
srun, ...etc. */
@@ -193,6 +169,9 @@ void ompi_wait_for_debugger(void)
} else if (NULL != getenv("yod_you_are_being_debugged")) {
debugger = 1;
}
if (1 == MPIR_being_debugged) {
debugger = 1;
}
if (!debugger) {
/* if not, just return */
@@ -254,5 +233,3 @@ void ompi_wait_for_debugger(void)
}
}
}
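
For orientation, the "spin on MPIR_debug_gate" path that ompi_wait_for_debugger() takes for non-orterun launches boils down to roughly the following sketch (the yod environment variable is the one shown in the diff above; treat everything else as illustrative rather than OMPI's literal source):

#include <stdlib.h>
#include <unistd.h>

/* Set to nonzero by the attaching debugger once it is ready. */
volatile int MPIR_debug_gate = 0;

static void wait_for_attach_sketch(void)
{
    /* Launcher-specific marker; a real implementation checks one
       such magic variable per supported launcher. */
    if (NULL == getenv("yod_you_are_being_debugged")) {
        return;    /* no debugger expected: proceed immediately */
    }
    /* Block until the debugger flips the gate. */
    while (0 == MPIR_debug_gate) {
        usleep(100000);    /* poll every 100 ms to avoid a hot spin */
    }
}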

View file

@@ -565,7 +565,7 @@ int ompi_mpi_init(int argc, char **argv, int requested, int *provided)
error = "orte_grpcomm_modex failed";
goto error;
}
if (timing && 0 == ORTE_PROC_MY_NAME->vpid) {
gettimeofday(&ompistop, NULL);
opal_output(0, "ompi_mpi_init[%ld]: time to execute modex %ld usec",

View file

@@ -85,6 +85,10 @@ opal_buffer_t *orte_tree_launch_cmd = NULL;
opal_pointer_array_t *orte_job_data;
opal_pointer_array_t *orte_node_pool;
/* See comment in orte/tools/orterun/debuggers.c about this MCA
param */
bool orte_in_parallel_debugger = false;
int orte_dt_init(void)
{
int rc;

View file

@@ -371,6 +371,10 @@ ORTE_DECLSPEC extern opal_buffer_t *orte_tree_launch_cmd;
ORTE_DECLSPEC extern opal_pointer_array_t *orte_job_data;
ORTE_DECLSPEC extern opal_pointer_array_t *orte_node_pool;
/* See comment in orte/tools/orterun/debuggers.c about this MCA
param */
ORTE_DECLSPEC extern bool orte_in_parallel_debugger;
#endif /* ORTE_DISABLE_FULL_SUPPORT */
END_C_DECLS

View file

@@ -75,7 +75,16 @@ int orte_register_params(void)
if (orte_debug_daemons_file_flag) {
orte_debug_daemons_flag = true;
}
/* See comment in orte/tools/orterun/debuggers.c about this MCA
param (this param is internal) */
mca_base_param_reg_int_name("orte",
"in_parallel_debugger",
"Whether the application is being debugged "
"in a parallel debugger (default: false)",
true, false, 0, &value);
orte_in_parallel_debugger = OPAL_INT_TO_BOOL(value);
mca_base_param_reg_int_name("orte", "do_not_launch",
"Perform all necessary operations to prepare to launch the application, but do not actually launch it",
false, false, (int)false, &value);
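
As an aside on the in_parallel_debugger registration above: in the mca_base of this era, mca_base_param_reg_int_name() takes (type, param_name, help_msg, internal, read_only, default_value, &current_value). A commented sketch of that call, with the argument meanings spelled out (a best-effort reading, not authoritative documentation):

int value;

mca_base_param_reg_int_name("orte",                  /* framework prefix */
                            "in_parallel_debugger",  /* parameter name */
                            "Whether the application is being debugged "
                            "in a parallel debugger (default: false)",
                            true,    /* internal: hidden from normal
                                        ompi_info listings; meant to be
                                        set only by orterun itself */
                            false,   /* not read-only */
                            0,       /* default: not under a debugger */
                            &value); /* receives the effective value */
orte_in_parallel_debugger = OPAL_INT_TO_BOOL(value);

Because the parameter is internal, users never see it in parameter listings; orterun communicates it to the launched processes purely through the corresponding environment variable (see the opal_setenv() call in orte_run_debugger() below).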

View file

@@ -13,7 +13,7 @@
* Copyright (c) 2007 Sun Microsystems, Inc. All rights reserved.
* Copyright (c) 2007 Los Alamos National Security, LLC. All rights
* reserved.
* Copyright (c) 2007 Cisco, Inc. All rights reserved.
* Copyright (c) 2007-2008 Cisco, Inc. All rights reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
@@ -39,6 +39,57 @@
* non-zero by the debugger.
*
* This file implements (a).
*
**************************************************************************
*
* Note that we have presently tested both TotalView and DDT parallel
* debuggers. They both nominally subscribe to the Etnus attaching
* interface, but there are differences between the two.
*
* TotalView: user launches "totalview mpirun -a ...<mpirun args>...".
* TV launches mpirun. mpirun launches the application and then calls
* MPIR_Breakpoint(). This is the signal to TV that it's a parallel
* MPI job. TV then reads the proctable in mpirun and attaches itself
* to all the processes (it takes care of launching itself on the
* remote nodes). Upon attaching to all the MPI processes, the
* variable MPIR_being_debugged is set to 1. When it has finished
* attaching itself to all the MPI processes that it wants to,
* MPIR_Breakpoint() returns.
*
* DDT: user launches "ddt bin -np X <mpi app name>". DDT fork/exec's
* mpirun to launch ddt-debugger on the back-end nodes via "mpirun -np
* X ddt-debugger" (note the lack of other arguments -- we can't pass
* anything to mpirun). This app will eventually fork/exec the MPI
* app. DDT does not currently set MPIR_being_debugged in the MPI app.
*
**************************************************************************
*
* We support two ways of waiting for attaching debuggers. The
* implementation spans this file and ompi/debuggers/ompi_debuggers.c.
*
* 1. If using orterun: MPI processes will have the
* orte_in_parallel_debugger MCA param set to true (because not all
* debuggers consistently set MPIR_being_debugged in both the launcher
* and in the MPI procs). The HNP will call MPIR_Breakpoint() and
* then RML send a message to VPID 0 (MCW rank 0) when it returns
* (MPIR_Breakpoint() doesn't return until the debugger has attached
* to all relevant processes). Meanwhile, VPID 0 blocks waiting for
* the RML message. All other VPIDs immediately call the grpcomm
* barrier (and therefore block until the debugger attaches). Once
* VPID 0 receives the RML message, we know that the debugger has
* attached to all processes that it cares about, and VPID 0 then
* joins the grpcomm barrier, allowing the job to continue. This
* scheme has the side effect of nicely supporting partial attaches by
* parallel debuggers (i.e., attaching to only some of the MPI
* processes; not necessarily all of them).
*
* 2. If not using orterun: in this case, ORTE_DISABLE_FULL_SUPPORT
* will be true, and we know that there will not be an RML message
* sent to VPID 0. So we have to look for a magic environment
* variable from the launcher to know if the jobs will be attached by
* a debugger (e.g., set by yod, srun, ...etc.), and if so, spin on
* MPIR_debug_gate. These environment variable names must be
* hard-coded in the OMPI layer (see ompi/debuggers/ompi_debuggers.c).
*/
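
For orientation, the starter-process symbols that this attach protocol revolves around are the classic Etnus/MPIR interface. A minimal sketch of those declarations (standard MPIR names; not a verbatim excerpt of this file):

/* One entry per MPI process, filled in by the starter (orterun)
   and read out of its address space by the debugger. */
typedef struct {
    char *host_name;         /* node the process is running on */
    char *executable_name;   /* image the process is executing */
    int   pid;               /* pid of the process on that node */
} MPIR_PROCDESC;

MPIR_PROCDESC *MPIR_proctable      = NULL;
int            MPIR_proctable_size = 0;
volatile int   MPIR_being_debugged = 0;   /* set by some debuggers */
volatile int   MPIR_debug_state    = 0;   /* e.g., a "spawned" flag */

void MPIR_Breakpoint(void)
{
    /* The debugger plants a breakpoint here; by the time this call
       returns, it has attached to every process it cares about. */
}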
#include <stdio.h>
@@ -307,7 +358,7 @@ void orte_run_debugger(char *basename, opal_cmd_line_t *cmd_line,
{
int i, id;
char **new_argv = NULL;
char *value, **lines;
char *value, **lines, *env_name;
/* Get the orte_base_debug MCA parameter and search for a debugger
that can run */
@@ -346,7 +397,18 @@ void orte_run_debugger(char *basename, opal_cmd_line_t *cmd_line,
opal_argv_free(lines);
/* We found one */
/* Set an MCA param so that everyone knows that they are being
launched under a debugger; not all debuggers are consistent
about setting MPIR_being_debugged in both the launcher and the
MPI processes */
env_name = mca_base_param_environ_variable("orte",
"in_parallel_debugger", NULL);
if (NULL != env_name) {
opal_setenv(env_name, "1", true, &environ);
free(env_name);
}
/* Launch the debugger */
execvp(new_argv[0], new_argv);
value = opal_argv_join(new_argv, ' ');
orte_show_help("help-orterun.txt", "debugger-exec-failed",
@@ -365,11 +427,11 @@ void orte_run_debugger(char *basename, opal_cmd_line_t *cmd_line,
*/
void orte_debugger_init_before_spawn(orte_job_t *jdata)
{
char *s;
char *env_name;
orte_app_context_t **apps;
orte_std_cntr_t i;
if (!MPIR_being_debugged) {
if (!MPIR_being_debugged && !orte_in_parallel_debugger) {
/* not being debugged */
return;
}
@@ -378,14 +440,15 @@ void orte_debugger_init_before_spawn(orte_job_t *jdata)
opal_output(0, "Info: Spawned by a debugger");
}
apps = (orte_app_context_t**)jdata->apps->addr;
/* tell the procs they are being debugged */
s = mca_base_param_environ_variable("ompi", "mpi_being_debugged", NULL);
apps = (orte_app_context_t**)jdata->apps->addr;
env_name = mca_base_param_environ_variable("orte",
"in_parallel_debugger", NULL);
for (i=0; i < jdata->num_apps; i++) {
opal_setenv(s, "1", true, &apps[i]->env);
opal_setenv(env_name, "1", true, &apps[i]->env);
}
free(s);
free(env_name);
}
@@ -406,7 +469,7 @@ void orte_debugger_init_after_spawn(orte_job_t *jdata)
opal_buffer_t buf;
orte_process_name_t rank0;
int rc;
if (MPIR_proctable) {
/* already initialized */
return;