2c1a658232
This commit was SVN r23921.
189 строки
5.8 KiB
C
189 строки
5.8 KiB
C
/*
|
|
* Copyright (c) 2010 Cisco Systems, Inc. All rights reserved.
|
|
* $COPYRIGHT$
|
|
*
|
|
* Additional copyrights may follow
|
|
*
|
|
* $HEADER$
|
|
*/
|
|
|
|
|
|
#include "orte_config.h"
|
|
#include "orte/constants.h"
|
|
|
|
#include "opal/util/output.h"
|
|
|
|
#ifdef HAVE_STRING_H
|
|
#include <string.h>
|
|
#endif
|
|
#include <stdio.h>
|
|
#ifdef HAVE_STDLIB_H
|
|
#include <stdlib.h>
|
|
#endif /* HAVE_STDLIB_H */
|
|
#ifdef HAVE_STRINGS_H
|
|
#include <strings.h>
|
|
#endif /* HAVE_STRINGS_H */
|
|
#ifdef HAVE_UNISTD_H
|
|
#include <unistd.h>
|
|
#endif /* HAVE_UNISTD_H */
|
|
#include <ctype.h>
|
|
|
|
#include "opal/util/argv.h"
|
|
#include "opal/util/os_path.h"
|
|
#include "opal/util/path.h"
|
|
#include "opal/util/opal_environ.h"
|
|
|
|
#include "orte/util/name_fns.h"
|
|
#include "orte/util/show_help.h"
|
|
#include "orte/mca/rml/rml.h"
|
|
#include "orte/mca/errmgr/errmgr.h"
|
|
#include "orte/runtime/orte_globals.h"
|
|
|
|
#include "orte/mca/debugger/base/base.h"
|
|
|
|
#define DUMP_INT(X) fprintf(stderr, " %s = %d\n", # X, X);
|
|
|
|
#if !ORTE_DISABLE_FULL_SUPPORT
|
|
|
|
void orte_debugger_base_dump(void)
|
|
{
|
|
int i;
|
|
|
|
DUMP_INT(MPIR_being_debugged);
|
|
DUMP_INT(MPIR_debug_state);
|
|
DUMP_INT(MPIR_partial_attach_ok);
|
|
DUMP_INT(MPIR_i_am_starter);
|
|
DUMP_INT(MPIR_forward_output);
|
|
DUMP_INT(MPIR_proctable_size);
|
|
fprintf(stderr, " MPIR_proctable:\n");
|
|
for (i = 0; i < MPIR_proctable_size; i++) {
|
|
fprintf(stderr,
|
|
" (i, host, exe, pid) = (%d, %s, %s, %d)\n",
|
|
i,
|
|
MPIR_proctable[i].host_name,
|
|
MPIR_proctable[i].executable_name,
|
|
MPIR_proctable[i].pid);
|
|
}
|
|
fprintf(stderr, "MPIR_executable_path: %s\n",
|
|
('\0' == MPIR_executable_path[0]) ?
|
|
"NULL" : (char*) MPIR_executable_path);
|
|
fprintf(stderr, "MPIR_server_arguments: %s\n",
|
|
('\0' == MPIR_server_arguments[0]) ?
|
|
"NULL" : (char*) MPIR_server_arguments);
|
|
}
|
|
|
|
/*
|
|
* Initialization of data structures for running under a debugger
|
|
* using the MPICH/TotalView parallel debugger interface. This stage
|
|
* of initialization must occur after spawn
|
|
*
|
|
* NOTE: We -always- perform this step to ensure that any debugger
|
|
* that attaches to us post-launch of the application can get a
|
|
* completed proctable
|
|
*/
|
|
void orte_debugger_base_init_after_spawn(orte_job_t *jdata)
|
|
{
|
|
orte_proc_t *proc;
|
|
orte_app_context_t *appctx;
|
|
orte_vpid_t i, j;
|
|
opal_buffer_t buf;
|
|
orte_process_name_t rank0;
|
|
int rc;
|
|
|
|
if (MPIR_proctable) {
|
|
/* already initialized */
|
|
opal_output_verbose(5, orte_debugger_base.output,
|
|
"%s: debugger already initialized",
|
|
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME));
|
|
return;
|
|
}
|
|
|
|
/* fill in the proc table for the application processes */
|
|
|
|
opal_output_verbose(5, orte_debugger_base.output,
|
|
"%s: Setting up debugger process table for applications",
|
|
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME));
|
|
|
|
MPIR_debug_state = 1;
|
|
|
|
/* set the total number of processes in the job */
|
|
MPIR_proctable_size = jdata->num_procs;
|
|
|
|
/* allocate MPIR_proctable */
|
|
MPIR_proctable = (struct MPIR_PROCDESC *) malloc(sizeof(struct MPIR_PROCDESC) *
|
|
MPIR_proctable_size);
|
|
if (MPIR_proctable == NULL) {
|
|
ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE);
|
|
return;
|
|
}
|
|
|
|
if (orte_debugger_base.dump_proctable) {
|
|
opal_output(orte_clean_output, "MPIR Proctable for job %s", ORTE_JOBID_PRINT(jdata->jobid));
|
|
}
|
|
|
|
/* initialize MPIR_proctable */
|
|
for (j=0; j < jdata->num_procs; j++) {
|
|
if (NULL == (proc = (orte_proc_t*)opal_pointer_array_get_item(jdata->procs, j))) {
|
|
continue;
|
|
}
|
|
/* store this data in the location whose index
|
|
* corresponds to the proc's rank
|
|
*/
|
|
i = proc->name.vpid;
|
|
if (NULL == (appctx = (orte_app_context_t*)opal_pointer_array_get_item(jdata->apps, proc->app_idx))) {
|
|
continue;
|
|
}
|
|
|
|
MPIR_proctable[i].host_name = strdup(proc->node->name);
|
|
if ( 0 == strncmp(appctx->app, OPAL_PATH_SEP, 1 )) {
|
|
MPIR_proctable[i].executable_name =
|
|
opal_os_path( false, appctx->app, NULL );
|
|
} else {
|
|
MPIR_proctable[i].executable_name =
|
|
opal_os_path( false, appctx->cwd, appctx->app, NULL );
|
|
}
|
|
MPIR_proctable[i].pid = proc->pid;
|
|
if (orte_debugger_base.dump_proctable) {
|
|
opal_output(orte_clean_output, "%s: Host %s Exe %s Pid %d",
|
|
ORTE_VPID_PRINT(i), MPIR_proctable[i].host_name,
|
|
MPIR_proctable[i].executable_name, MPIR_proctable[i].pid);
|
|
}
|
|
}
|
|
|
|
if (0 < opal_output_get_verbosity(orte_debugger_base.output)) {
|
|
orte_debugger_base_dump();
|
|
}
|
|
|
|
/* if we are being launched under a debugger, then we must wait
|
|
* for it to be ready to go and do some things to start the job
|
|
*/
|
|
if (MPIR_being_debugged) {
|
|
/* wait for all procs to have reported their contact info - this
|
|
* ensures that (a) they are all into mpi_init, and (b) the system
|
|
* has the contact info to successfully send a message to rank=0
|
|
*/
|
|
ORTE_PROGRESSED_WAIT(false, jdata->num_reported, jdata->num_procs);
|
|
|
|
(void) MPIR_Breakpoint();
|
|
|
|
/* send a message to rank=0 to release it */
|
|
OBJ_CONSTRUCT(&buf, opal_buffer_t); /* don't need anything in this */
|
|
rank0.jobid = jdata->jobid;
|
|
rank0.vpid = 0;
|
|
if (0 > (rc = orte_rml.send_buffer(&rank0, &buf, ORTE_RML_TAG_DEBUGGER_RELEASE, 0))) {
|
|
opal_output(0, "Error: could not send debugger release to MPI procs - error %s", ORTE_ERROR_NAME(rc));
|
|
}
|
|
OBJ_DESTRUCT(&buf);
|
|
}
|
|
}
|
|
|
|
#endif
|
|
|
|
/*
|
|
* Breakpoint function for parallel debuggers
|
|
*/
|
|
void *MPIR_Breakpoint(void)
|
|
{
|
|
return NULL;
|
|
}
|