Fixes trac:1361: mainly add new internal MCA parameter that orterun will
set when it launches under debuggers using the --debug option. This commit was SVN r19116. The following Trac tickets were found above: Ticket 1361 --> https://svn.open-mpi.org/trac/ompi/ticket/1361
Этот коммит содержится в:
родитель
b45d59ea2e
Коммит
4bdc093746
@ -10,7 +10,7 @@
|
||||
* University of Stuttgart. All rights reserved.
|
||||
* Copyright (c) 2004-2005 The Regents of the University of California.
|
||||
* All rights reserved.
|
||||
* Copyright (c) 2007 Cisco, Inc. All rights resereved.
|
||||
* Copyright (c) 2007-2008 Cisco Systems, Inc. All rights reserved.
|
||||
* $COPYRIGHT$
|
||||
*
|
||||
* Additional copyrights may follow
|
||||
@ -18,9 +18,12 @@
|
||||
* $HEADER$
|
||||
*/
|
||||
|
||||
/**
|
||||
/*
|
||||
* MPI portion of debugger support: initially based on the
|
||||
* TotalView/Etnus API for debuggers to attach to MPI jobs.
|
||||
*
|
||||
* There is a lengthy explanation of how OMPI handles parallel
|
||||
* debuggers attaching to MPI jobs in orte/tools/orterun/debuggers.c.
|
||||
*/
|
||||
|
||||
#include "ompi_config.h"
|
||||
@ -67,6 +70,7 @@
|
||||
|
||||
#include "orte/mca/errmgr/errmgr.h"
|
||||
#include "orte/mca/rml/rml.h"
|
||||
#include "orte/runtime/orte_globals.h"
|
||||
|
||||
#if defined(OMPI_MSGQ_DLL)
|
||||
/* This variable is old/deprecated -- the mpimsgq_dll_locations[]
|
||||
@ -76,7 +80,6 @@ OMPI_DECLSPEC char MPIR_dll_name[] = OMPI_MSGQ_DLL;
|
||||
OMPI_DECLSPEC char **mpidbg_dll_locations = NULL;
|
||||
OMPI_DECLSPEC char **mpimsgq_dll_locations = NULL;
|
||||
|
||||
OMPI_DECLSPEC int MPIR_being_debugged = 0;
|
||||
OMPI_DECLSPEC int MPIR_debug_typedefs_sizeof[] = {
|
||||
sizeof(short),
|
||||
sizeof(int),
|
||||
@ -109,7 +112,8 @@ OMPI_DECLSPEC ompi_group_t* ompi_group_t_type_inclusion = NULL;
|
||||
OMPI_DECLSPEC ompi_status_public_t* ompi_status_public_t_type_inclusion = NULL;
|
||||
OMPI_DECLSPEC ompi_datatype_t* ompi_datatype_t_type_inclusion = NULL;
|
||||
|
||||
OMPI_DECLSPEC volatile int MPIR_debug_gate=0;
|
||||
OMPI_DECLSPEC volatile int MPIR_debug_gate = 0;
|
||||
OMPI_DECLSPEC volatile int MPIR_being_debugged = 0;
|
||||
|
||||
/* Check for a file in a few direct ways for portability */
|
||||
static void check(char *dir, char *file, char **locations)
|
||||
@ -145,32 +149,8 @@ static void check(char *dir, char *file, char **locations)
|
||||
|
||||
/*
|
||||
* Wait for a debugger if asked. We support two ways of waiting for
|
||||
* attaching debuggers:
|
||||
*
|
||||
|
||||
* 1. If using orterun: MPI processes will have the
|
||||
* ompi_mpi_being_debugged MCA param set to true. The HNP will call
|
||||
* MPIR_Breakpoint() and then RML send a message to VPID 0 (MCW rank
|
||||
* 0) when it returns (MPIR_Breakpoint() doesn't return until the
|
||||
* debugger has attached to all relevant processes). Meanwhile, VPID
|
||||
* 0 blocks waiting for the RML message. All other VPIDs immediately
|
||||
* call the grpcomm barrier (and therefore block until the debugger
|
||||
* attaches). Once VPID 0 receives the RML message, we know that the
|
||||
* debugger has attached to all processes that it cares about, and
|
||||
* VPID 0 then joins the grpcomm barrier, allowing the job to
|
||||
* continue. This scheme has the side effect of nicely supporting
|
||||
* partial attaches by parallel debuggers (i.e., attaching to only
|
||||
* some of the MPI processes; not necessarily all of them).
|
||||
*
|
||||
* 2. If not using orterun: in this case, ORTE_DISABLE_FULL_SUPPORT
|
||||
* will be true, and we know that there will not be an RML message
|
||||
* sent to VPID 0. So we have to look for a magic environment
|
||||
* variable from the launcher to know if the jobs will be attached by
|
||||
* a debugger (e.g., set by yod, srun, ...etc.), and if so, spin on
|
||||
* MPIR_debug_gate.
|
||||
*
|
||||
* Note that neither of these schemes use MPIR_being_debugged; it
|
||||
* doesn't seem useful to us. --> JMS this may change
|
||||
* attaching debuggers -- see big comment in
|
||||
* orte/tools/orterun/debuggers.c explaining the two scenarios.
|
||||
*/
|
||||
void ompi_wait_for_debugger(void)
|
||||
{
|
||||
@ -178,13 +158,9 @@ void ompi_wait_for_debugger(void)
|
||||
char *a, *b, **dirs;
|
||||
opal_buffer_t buf;
|
||||
|
||||
/* are we being debugged by a TotalView-like debugger? */
|
||||
mca_base_param_reg_int_name("ompi",
|
||||
"mpi_being_debugged",
|
||||
"Whether the MPI application "
|
||||
"is being debugged (default: false)",
|
||||
false, false, (int) false,
|
||||
&debugger);
|
||||
/* See lengthy comment in orte/tools/orterun/debuggers.c about
|
||||
orte_in_parallel_debugger */
|
||||
debugger = orte_in_parallel_debugger;
|
||||
|
||||
/* Add in environment variables for other launchers, such as yod,
|
||||
srun, ...etc. */
|
||||
@ -193,6 +169,9 @@ void ompi_wait_for_debugger(void)
|
||||
} else if (NULL != getenv("yod_you_are_being_debugged")) {
|
||||
debugger = 1;
|
||||
}
|
||||
if (1 == MPIR_being_debugged) {
|
||||
debugger = 1;
|
||||
}
|
||||
|
||||
if (!debugger) {
|
||||
/* if not, just return */
|
||||
@ -254,5 +233,3 @@ void ompi_wait_for_debugger(void)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
|
@ -565,7 +565,7 @@ int ompi_mpi_init(int argc, char **argv, int requested, int *provided)
|
||||
error = "orte_grpcomm_modex failed";
|
||||
goto error;
|
||||
}
|
||||
|
||||
|
||||
if (timing && 0 == ORTE_PROC_MY_NAME->vpid) {
|
||||
gettimeofday(&ompistop, NULL);
|
||||
opal_output(0, "ompi_mpi_init[%ld]: time to execute modex %ld usec",
|
||||
|
@ -85,6 +85,10 @@ opal_buffer_t *orte_tree_launch_cmd = NULL;
|
||||
opal_pointer_array_t *orte_job_data;
|
||||
opal_pointer_array_t *orte_node_pool;
|
||||
|
||||
/* See comment in orte/tools/orterun/debuggers.c about this MCA
|
||||
param */
|
||||
bool orte_in_parallel_debugger = false;
|
||||
|
||||
int orte_dt_init(void)
|
||||
{
|
||||
int rc;
|
||||
|
@ -371,6 +371,10 @@ ORTE_DECLSPEC extern opal_buffer_t *orte_tree_launch_cmd;
|
||||
ORTE_DECLSPEC extern opal_pointer_array_t *orte_job_data;
|
||||
ORTE_DECLSPEC extern opal_pointer_array_t *orte_node_pool;
|
||||
|
||||
/* See comment in orte/tools/orterun/debuggers.c about this MCA
|
||||
param */
|
||||
ORTE_DECLSPEC extern bool orte_in_parallel_debugger;
|
||||
|
||||
#endif /* ORTE_DISABLE_FULL_SUPPORT */
|
||||
|
||||
END_C_DECLS
|
||||
|
@ -75,7 +75,16 @@ int orte_register_params(void)
|
||||
if (orte_debug_daemons_file_flag) {
|
||||
orte_debug_daemons_flag = true;
|
||||
}
|
||||
|
||||
|
||||
/* See comment in orte/tools/orterun/debuggers.c about this MCA
|
||||
param (this param is internal) */
|
||||
mca_base_param_reg_int_name("orte",
|
||||
"in_parallel_debugger",
|
||||
"Whether the application is being debugged "
|
||||
"in a parallel debugger (default: false)",
|
||||
true, false, 0, &value);
|
||||
orte_in_parallel_debugger = OPAL_INT_TO_BOOL(value);
|
||||
|
||||
mca_base_param_reg_int_name("orte", "do_not_launch",
|
||||
"Perform all necessary operations to prepare to launch the application, but do not actually launch it",
|
||||
false, false, (int)false, &value);
|
||||
|
@ -13,7 +13,7 @@
|
||||
* Copyright (c) 2007 Sun Microsystems, Inc. All rights reserved.
|
||||
* Copyright (c) 2007 Los Alamos National Security, LLC. All rights
|
||||
* reserved.
|
||||
* Copyright (c) 2007 Cisco, Inc. All rights reserved.
|
||||
* Copyright (c) 2007-2008 Cisco, Inc. All rights reserved.
|
||||
* $COPYRIGHT$
|
||||
*
|
||||
* Additional copyrights may follow
|
||||
@ -39,6 +39,57 @@
|
||||
* non-zero by the debugger.
|
||||
*
|
||||
* This file implements (a).
|
||||
*
|
||||
**************************************************************************
|
||||
*
|
||||
* Note that we have presently tested both TotalView and DDT parallel
|
||||
* debuggers. They both nominally subscribe to the Etnus attaching
|
||||
* interface, but there are differences between the two.
|
||||
*
|
||||
* TotalView: user launches "totalview mpirun -a ...<mpirun args>...".
|
||||
* TV launches mpirun. mpirun launches the application and then calls
|
||||
* MPIR_Breakpoint(). This is the signal to TV that it's a parallel
|
||||
* MPI job. TV then reads the proctable in mpirun and attaches itself
|
||||
* to all the processes (it takes care of launching itself on the
|
||||
* remote nodes). Upon attaching to all the MPI processes, the
|
||||
* variable MPIR_being_debugged is set to 1. When it has finished
|
||||
* attaching itself to all the MPI processes that it wants to,
|
||||
* MPIR_Breakpoint() returns.
|
||||
*
|
||||
* DDT: user launches "ddt bin -np X <mpi app name>". DDT fork/exec's
|
||||
* mpirun to launch ddt-debugger on the back-end nodes via "mpirun -np
|
||||
* X ddt-debugger" (note the lack of other arguments -- we can't pass
|
||||
* anything to mpirun). This app will eventually fork/exec the MPI
|
||||
* app. DDT does not currently set MPIR_being_debugged in the MPI app.
|
||||
*
|
||||
**************************************************************************
|
||||
*
|
||||
* We support two ways of waiting for attaching debuggers. The
|
||||
* implementation spans this file and ompi/debuggers/ompi_debuggers.c.
|
||||
*
|
||||
* 1. If using orterun: MPI processes will have the
|
||||
* orte_in_parallel_debugger MCA param set to true (because not all
|
||||
* debuggers consistently set MPIR_being_debugged in both the launcher
|
||||
* and in the MPI procs). The HNP will call MPIR_Breakpoint() and
|
||||
* then RML send a message to VPID 0 (MCW rank 0) when it returns
|
||||
* (MPIR_Breakpoint() doesn't return until the debugger has attached
|
||||
* to all relevant processes). Meanwhile, VPID 0 blocks waiting for
|
||||
* the RML message. All other VPIDs immediately call the grpcomm
|
||||
* barrier (and therefore block until the debugger attaches). Once
|
||||
* VPID 0 receives the RML message, we know that the debugger has
|
||||
* attached to all processes that it cares about, and VPID 0 then
|
||||
* joins the grpcomm barrier, allowing the job to continue. This
|
||||
* scheme has the side effect of nicely supporting partial attaches by
|
||||
* parallel debuggers (i.e., attaching to only some of the MPI
|
||||
* processes; not necessarily all of them).
|
||||
*
|
||||
* 2. If not using orterun: in this case, ORTE_DISABLE_FULL_SUPPORT
|
||||
* will be true, and we know that there will not be an RML message
|
||||
* sent to VPID 0. So we have to look for a magic environment
|
||||
* variable from the launcher to know if the jobs will be attached by
|
||||
* a debugger (e.g., set by yod, srun, ...etc.), and if so, spin on
|
||||
* MPIR_debug_gate. These environment variable names must be
|
||||
* hard-coded in the OMPI layer (see ompi/debuggers/ompi_debuggers.c).
|
||||
*/
|
||||
|
||||
#include <stdio.h>
|
||||
@ -307,7 +358,7 @@ void orte_run_debugger(char *basename, opal_cmd_line_t *cmd_line,
|
||||
{
|
||||
int i, id;
|
||||
char **new_argv = NULL;
|
||||
char *value, **lines;
|
||||
char *value, **lines, *env_name;
|
||||
|
||||
/* Get the orte_base_debug MCA parameter and search for a debugger
|
||||
that can run */
|
||||
@ -346,7 +397,18 @@ void orte_run_debugger(char *basename, opal_cmd_line_t *cmd_line,
|
||||
opal_argv_free(lines);
|
||||
|
||||
/* We found one */
|
||||
/* Set an MCA param so that everyone knows that they are being
|
||||
launched under a debugger; not all debuggers are consistent
|
||||
about setting MPIR_being_debugged in both the launcher and the
|
||||
MPI processes */
|
||||
env_name = mca_base_param_environ_variable("orte",
|
||||
"in_parallel_debugger", NULL);
|
||||
if (NULL != env_name) {
|
||||
opal_setenv(env_name, "1", true, &environ);
|
||||
free(env_name);
|
||||
}
|
||||
|
||||
/* Launch the debugger */
|
||||
execvp(new_argv[0], new_argv);
|
||||
value = opal_argv_join(new_argv, ' ');
|
||||
orte_show_help("help-orterun.txt", "debugger-exec-failed",
|
||||
@ -365,11 +427,11 @@ void orte_run_debugger(char *basename, opal_cmd_line_t *cmd_line,
|
||||
*/
|
||||
void orte_debugger_init_before_spawn(orte_job_t *jdata)
|
||||
{
|
||||
char *s;
|
||||
char *env_name;
|
||||
orte_app_context_t **apps;
|
||||
orte_std_cntr_t i;
|
||||
|
||||
if (!MPIR_being_debugged) {
|
||||
if (!MPIR_being_debugged && !orte_in_parallel_debugger) {
|
||||
/* not being debugged */
|
||||
return;
|
||||
}
|
||||
@ -378,14 +440,15 @@ void orte_debugger_init_before_spawn(orte_job_t *jdata)
|
||||
opal_output(0, "Info: Spawned by a debugger");
|
||||
}
|
||||
|
||||
apps = (orte_app_context_t**)jdata->apps->addr;
|
||||
/* tell the procs they are being debugged */
|
||||
s = mca_base_param_environ_variable("ompi", "mpi_being_debugged", NULL);
|
||||
apps = (orte_app_context_t**)jdata->apps->addr;
|
||||
env_name = mca_base_param_environ_variable("orte",
|
||||
"in_parallel_debugger", NULL);
|
||||
|
||||
for (i=0; i < jdata->num_apps; i++) {
|
||||
opal_setenv(s, "1", true, &apps[i]->env);
|
||||
opal_setenv(env_name, "1", true, &apps[i]->env);
|
||||
}
|
||||
free(s);
|
||||
free(env_name);
|
||||
}
|
||||
|
||||
|
||||
@ -406,7 +469,7 @@ void orte_debugger_init_after_spawn(orte_job_t *jdata)
|
||||
opal_buffer_t buf;
|
||||
orte_process_name_t rank0;
|
||||
int rc;
|
||||
|
||||
|
||||
if (MPIR_proctable) {
|
||||
/* already initialized */
|
||||
return;
|
||||
|
Загрузка…
x
Ссылка в новой задаче
Block a user