1
1

Fixes trac:1361: mainly add a new internal MCA parameter that orterun will

set when it launches under debuggers using the --debug option.

This commit was SVN r19116.

The following Trac tickets were found above:
  Ticket 1361 --> https://svn.open-mpi.org/trac/ompi/ticket/1361
Этот коммит содержится в:
Jeff Squyres 2008-07-31 22:11:46 +00:00
родитель b45d59ea2e
Коммит 4bdc093746
6 изменённых файлов: 107 добавлений и 50 удалений

Просмотреть файл

@ -10,7 +10,7 @@
* University of Stuttgart. All rights reserved. * University of Stuttgart. All rights reserved.
* Copyright (c) 2004-2005 The Regents of the University of California. * Copyright (c) 2004-2005 The Regents of the University of California.
* All rights reserved. * All rights reserved.
* Copyright (c) 2007 Cisco, Inc. All rights reserved. * Copyright (c) 2007-2008 Cisco Systems, Inc. All rights reserved.
* $COPYRIGHT$ * $COPYRIGHT$
* *
* Additional copyrights may follow * Additional copyrights may follow
@ -18,9 +18,12 @@
* $HEADER$ * $HEADER$
*/ */
/** /*
* MPI portion of debugger support: initially based on the * MPI portion of debugger support: initially based on the
* TotalView/Etnus API for debuggers to attach to MPI jobs. * TotalView/Etnus API for debuggers to attach to MPI jobs.
*
* There is a lengthy explanation of how OMPI handles parallel
* debuggers attaching to MPI jobs in orte/tools/orterun/debuggers.c.
*/ */
#include "ompi_config.h" #include "ompi_config.h"
@ -67,6 +70,7 @@
#include "orte/mca/errmgr/errmgr.h" #include "orte/mca/errmgr/errmgr.h"
#include "orte/mca/rml/rml.h" #include "orte/mca/rml/rml.h"
#include "orte/runtime/orte_globals.h"
#if defined(OMPI_MSGQ_DLL) #if defined(OMPI_MSGQ_DLL)
/* This variable is old/deprecated -- the mpimsgq_dll_locations[] /* This variable is old/deprecated -- the mpimsgq_dll_locations[]
@ -76,7 +80,6 @@ OMPI_DECLSPEC char MPIR_dll_name[] = OMPI_MSGQ_DLL;
OMPI_DECLSPEC char **mpidbg_dll_locations = NULL; OMPI_DECLSPEC char **mpidbg_dll_locations = NULL;
OMPI_DECLSPEC char **mpimsgq_dll_locations = NULL; OMPI_DECLSPEC char **mpimsgq_dll_locations = NULL;
OMPI_DECLSPEC int MPIR_being_debugged = 0;
OMPI_DECLSPEC int MPIR_debug_typedefs_sizeof[] = { OMPI_DECLSPEC int MPIR_debug_typedefs_sizeof[] = {
sizeof(short), sizeof(short),
sizeof(int), sizeof(int),
@ -109,7 +112,8 @@ OMPI_DECLSPEC ompi_group_t* ompi_group_t_type_inclusion = NULL;
OMPI_DECLSPEC ompi_status_public_t* ompi_status_public_t_type_inclusion = NULL; OMPI_DECLSPEC ompi_status_public_t* ompi_status_public_t_type_inclusion = NULL;
OMPI_DECLSPEC ompi_datatype_t* ompi_datatype_t_type_inclusion = NULL; OMPI_DECLSPEC ompi_datatype_t* ompi_datatype_t_type_inclusion = NULL;
OMPI_DECLSPEC volatile int MPIR_debug_gate=0; OMPI_DECLSPEC volatile int MPIR_debug_gate = 0;
OMPI_DECLSPEC volatile int MPIR_being_debugged = 0;
/* Check for a file in few direct ways for portability */ /* Check for a file in few direct ways for portability */
static void check(char *dir, char *file, char **locations) static void check(char *dir, char *file, char **locations)
@ -145,32 +149,8 @@ static void check(char *dir, char *file, char **locations)
/* /*
* Wait for a debugger if asked. We support two ways of waiting for * Wait for a debugger if asked. We support two ways of waiting for
* attaching debuggers: * attaching debuggers -- see big comment in
* * orte/tools/orterun/debuggers.c explaining the two scenarios.
* 1. If using orterun: MPI processes will have the
* ompi_mpi_being_debugged MCA param set to true. The HNP will call
* MPIR_Breakpoint() and then RML send a message to VPID 0 (MCW rank
* 0) when it returns (MPIR_Breakpoint() doesn't return until the
* debugger has attached to all relevant processes). Meanwhile, VPID
* 0 blocks waiting for the RML message. All other VPIDs immediately
* call the grpcomm barrier (and therefore block until the debugger
* attaches). Once VPID 0 receives the RML message, we know that the
* debugger has attached to all processes that it cares about, and
* VPID 0 then joins the grpcomm barrier, allowing the job to
* continue. This scheme has the side effect of nicely supporting
* partial attaches by parallel debuggers (i.e., attaching to only
* some of the MPI processes; not necessarily all of them).
*
* 2. If not using orterun: in this case, ORTE_DISABLE_FULL_SUPPORT
* will be true, and we know that there will not be an RML message
* sent to VPID 0. So we have to look for a magic environment
* variable from the launcher to know if the jobs will be attached by
* a debugger (e.g., set by yod, srun, ...etc.), and if so, spin on
* MPIR_debug_gate.
*
* Note that neither of these schemes use MPIR_being_debugged; it
* doesn't seem useful to us. --> JMS this may change
*/ */
void ompi_wait_for_debugger(void) void ompi_wait_for_debugger(void)
{ {
@ -178,13 +158,9 @@ void ompi_wait_for_debugger(void)
char *a, *b, **dirs; char *a, *b, **dirs;
opal_buffer_t buf; opal_buffer_t buf;
/* are we being debugged by a TotalView-like debugger? */ /* See lengthy comment in orte/tools/orterun/debuggers.c about
mca_base_param_reg_int_name("ompi", orte_in_parallel_debugger */
"mpi_being_debugged", debugger = orte_in_parallel_debugger;
"Whether the MPI application "
"is being debugged (default: false)",
false, false, (int) false,
&debugger);
/* Add in environment variables for other launchers, such as yod, /* Add in environment variables for other launchers, such as yod,
srun, ...etc. */ srun, ...etc. */
@ -193,6 +169,9 @@ void ompi_wait_for_debugger(void)
} else if (NULL != getenv("yod_you_are_being_debugged")) { } else if (NULL != getenv("yod_you_are_being_debugged")) {
debugger = 1; debugger = 1;
} }
if (1 == MPIR_being_debugged) {
debugger = 1;
}
if (!debugger) { if (!debugger) {
/* if not, just return */ /* if not, just return */
@ -254,5 +233,3 @@ void ompi_wait_for_debugger(void)
} }
} }
} }

Просмотреть файл

@ -565,7 +565,7 @@ int ompi_mpi_init(int argc, char **argv, int requested, int *provided)
error = "orte_grpcomm_modex failed"; error = "orte_grpcomm_modex failed";
goto error; goto error;
} }
if (timing && 0 == ORTE_PROC_MY_NAME->vpid) { if (timing && 0 == ORTE_PROC_MY_NAME->vpid) {
gettimeofday(&ompistop, NULL); gettimeofday(&ompistop, NULL);
opal_output(0, "ompi_mpi_init[%ld]: time to execute modex %ld usec", opal_output(0, "ompi_mpi_init[%ld]: time to execute modex %ld usec",

Просмотреть файл

@ -85,6 +85,10 @@ opal_buffer_t *orte_tree_launch_cmd = NULL;
opal_pointer_array_t *orte_job_data; opal_pointer_array_t *orte_job_data;
opal_pointer_array_t *orte_node_pool; opal_pointer_array_t *orte_node_pool;
/* See comment in orte/tools/orterun/debuggers.c about this MCA
param */
bool orte_in_parallel_debugger = false;
int orte_dt_init(void) int orte_dt_init(void)
{ {
int rc; int rc;

Просмотреть файл

@ -371,6 +371,10 @@ ORTE_DECLSPEC extern opal_buffer_t *orte_tree_launch_cmd;
ORTE_DECLSPEC extern opal_pointer_array_t *orte_job_data; ORTE_DECLSPEC extern opal_pointer_array_t *orte_job_data;
ORTE_DECLSPEC extern opal_pointer_array_t *orte_node_pool; ORTE_DECLSPEC extern opal_pointer_array_t *orte_node_pool;
/* See comment in orte/tools/orterun/debuggers.c about this MCA
param */
ORTE_DECLSPEC extern bool orte_in_parallel_debugger;
#endif /* ORTE_DISABLE_FULL_SUPPORT */ #endif /* ORTE_DISABLE_FULL_SUPPORT */
END_C_DECLS END_C_DECLS

Просмотреть файл

@ -75,7 +75,16 @@ int orte_register_params(void)
if (orte_debug_daemons_file_flag) { if (orte_debug_daemons_file_flag) {
orte_debug_daemons_flag = true; orte_debug_daemons_flag = true;
} }
/* See comment in orte/tools/orterun/debuggers.c about this MCA
param (this param is internal) */
mca_base_param_reg_int_name("orte",
"in_parallel_debugger",
"Whether the application is being debugged "
"in a parallel debugger (default: false)",
true, false, 0, &value);
orte_in_parallel_debugger = OPAL_INT_TO_BOOL(value);
mca_base_param_reg_int_name("orte", "do_not_launch", mca_base_param_reg_int_name("orte", "do_not_launch",
"Perform all necessary operations to prepare to launch the application, but do not actually launch it", "Perform all necessary operations to prepare to launch the application, but do not actually launch it",
false, false, (int)false, &value); false, false, (int)false, &value);

Просмотреть файл

@ -13,7 +13,7 @@
* Copyright (c) 2007 Sun Microsystems, Inc. All rights reserved. * Copyright (c) 2007 Sun Microsystems, Inc. All rights reserved.
* Copyright (c) 2007 Los Alamos National Security, LLC. All rights * Copyright (c) 2007 Los Alamos National Security, LLC. All rights
* reserved. * reserved.
* Copyright (c) 2007 Cisco, Inc. All rights reserved. * Copyright (c) 2007-2008 Cisco, Inc. All rights reserved.
* $COPYRIGHT$ * $COPYRIGHT$
* *
* Additional copyrights may follow * Additional copyrights may follow
@ -39,6 +39,57 @@
* non-zero by the debugger. * non-zero by the debugger.
* *
* This file implements (a). * This file implements (a).
*
**************************************************************************
*
* Note that we have presently tested both TotalView and DDT parallel
* debuggers. They both nominally subscribe to the Etnus attaching
* interface, but there are differences between the two.
*
* TotalView: user launches "totalview mpirun -a ...<mpirun args>...".
* TV launches mpirun. mpirun launches the application and then calls
* MPIR_Breakpoint(). This is the signal to TV that it's a parallel
* MPI job. TV then reads the proctable in mpirun and attaches itself
* to all the processes (it takes care of launching itself on the
* remote nodes). Upon attaching to all the MPI processes, the
* variable MPIR_being_debugged is set to 1. When it has finished
* attaching itself to all the MPI processes that it wants to,
* MPIR_Breakpoint() returns.
*
* DDT: user launches "ddt bin -np X <mpi app name>". DDT fork/exec's
* mpirun to launch ddt-debugger on the back-end nodes via "mpirun -np
* X ddt-debugger" (note the lack of other arguments -- we can't pass
* anything to mpirun). This app will eventually fork/exec the MPI
* app. DDT does not currently set MPIR_being_debugged in the MPI app.
*
**************************************************************************
*
* We support two ways of waiting for attaching debuggers. The
* implementation spans this file and ompi/debuggers/ompi_debuggers.c.
*
* 1. If using orterun: MPI processes will have the
* orte_in_parallel_debugger MCA param set to true (because not all
* debuggers consistently set MPIR_being_debugged in both the launcher
* and in the MPI procs). The HNP will call MPIR_Breakpoint() and
* then RML send a message to VPID 0 (MCW rank 0) when it returns
* (MPIR_Breakpoint() doesn't return until the debugger has attached
* to all relevant processes). Meanwhile, VPID 0 blocks waiting for
* the RML message. All other VPIDs immediately call the grpcomm
* barrier (and therefore block until the debugger attaches). Once
* VPID 0 receives the RML message, we know that the debugger has
* attached to all processes that it cares about, and VPID 0 then
* joins the grpcomm barrier, allowing the job to continue. This
* scheme has the side effect of nicely supporting partial attaches by
* parallel debuggers (i.e., attaching to only some of the MPI
* processes; not necessarily all of them).
*
* 2. If not using orterun: in this case, ORTE_DISABLE_FULL_SUPPORT
* will be true, and we know that there will not be an RML message
* sent to VPID 0. So we have to look for a magic environment
* variable from the launcher to know if the jobs will be attached by
* a debugger (e.g., set by yod, srun, ...etc.), and if so, spin on
* MPIR_debug_gate. These environment variable names must be
* hard-coded in the OMPI layer (see ompi/debuggers/ompi_debuggers.c).
*/ */
#include <stdio.h> #include <stdio.h>
@ -307,7 +358,7 @@ void orte_run_debugger(char *basename, opal_cmd_line_t *cmd_line,
{ {
int i, id; int i, id;
char **new_argv = NULL; char **new_argv = NULL;
char *value, **lines; char *value, **lines, *env_name;
/* Get the orte_base_debug MCA parameter and search for a debugger /* Get the orte_base_debug MCA parameter and search for a debugger
that can run */ that can run */
@ -346,7 +397,18 @@ void orte_run_debugger(char *basename, opal_cmd_line_t *cmd_line,
opal_argv_free(lines); opal_argv_free(lines);
/* We found one */ /* We found one */
/* Set an MCA param so that everyone knows that they are being
launched under a debugger; not all debuggers are consistent
about setting MPIR_being_debugged in both the launcher and the
MPI processes */
env_name = mca_base_param_environ_variable("orte",
"in_parallel_debugger", NULL);
if (NULL != env_name) {
opal_setenv(env_name, "1", true, &environ);
free(env_name);
}
/* Launch the debugger */
execvp(new_argv[0], new_argv); execvp(new_argv[0], new_argv);
value = opal_argv_join(new_argv, ' '); value = opal_argv_join(new_argv, ' ');
orte_show_help("help-orterun.txt", "debugger-exec-failed", orte_show_help("help-orterun.txt", "debugger-exec-failed",
@ -365,11 +427,11 @@ void orte_run_debugger(char *basename, opal_cmd_line_t *cmd_line,
*/ */
void orte_debugger_init_before_spawn(orte_job_t *jdata) void orte_debugger_init_before_spawn(orte_job_t *jdata)
{ {
char *s; char *env_name;
orte_app_context_t **apps; orte_app_context_t **apps;
orte_std_cntr_t i; orte_std_cntr_t i;
if (!MPIR_being_debugged) { if (!MPIR_being_debugged && !orte_in_parallel_debugger) {
/* not being debugged */ /* not being debugged */
return; return;
} }
@ -378,14 +440,15 @@ void orte_debugger_init_before_spawn(orte_job_t *jdata)
opal_output(0, "Info: Spawned by a debugger"); opal_output(0, "Info: Spawned by a debugger");
} }
apps = (orte_app_context_t**)jdata->apps->addr;
/* tell the procs they are being debugged */ /* tell the procs they are being debugged */
s = mca_base_param_environ_variable("ompi", "mpi_being_debugged", NULL); apps = (orte_app_context_t**)jdata->apps->addr;
env_name = mca_base_param_environ_variable("orte",
"in_parallel_debugger", NULL);
for (i=0; i < jdata->num_apps; i++) { for (i=0; i < jdata->num_apps; i++) {
opal_setenv(s, "1", true, &apps[i]->env); opal_setenv(env_name, "1", true, &apps[i]->env);
} }
free(s); free(env_name);
} }
@ -406,7 +469,7 @@ void orte_debugger_init_after_spawn(orte_job_t *jdata)
opal_buffer_t buf; opal_buffer_t buf;
orte_process_name_t rank0; orte_process_name_t rank0;
int rc; int rc;
if (MPIR_proctable) { if (MPIR_proctable) {
/* already initialized */ /* already initialized */
return; return;