1
1

Basic TotalView support for orterun. Close to working, but we still need to
check that hostnames are obtained correctly.

This commit was SVN r7096.
Этот коммит содержится в:
David Daniel 2005-08-30 17:29:43 +00:00
родитель 00e0ff729d
Коммит ced11250e4
4 изменённых файлов: 74 добавлений и 21 удалений

Просмотреть файл

@ -26,7 +26,8 @@ libs = \
bin_PROGRAMS = orterun bin_PROGRAMS = orterun
orterun_SOURCES = \ orterun_SOURCES = \
orterun.c orterun.c \
totalview.c
orterun_LDADD = $(libs) orterun_LDADD = $(libs)
orterun_DEPENDENCIES = $(libs) orterun_DEPENDENCIES = $(libs)

Просмотреть файл

@ -355,6 +355,8 @@ int main(int argc, char *argv[])
signal_callback, NULL); signal_callback, NULL);
opal_event_add(&int_handler, NULL); opal_event_add(&int_handler, NULL);
orte_totalview_init_before_spawn();
/* Spawn the job */ /* Spawn the job */
rc = orte_rmgr.spawn(apps, num_apps, &jobid, job_state_callback); rc = orte_rmgr.spawn(apps, num_apps, &jobid, job_state_callback);
@ -362,6 +364,7 @@ int main(int argc, char *argv[])
/* JMS show_help */ /* JMS show_help */
opal_output(0, "%s: spawn failed with errno=%d\n", orterun_basename, rc); opal_output(0, "%s: spawn failed with errno=%d\n", orterun_basename, rc);
} else { } else {
/* Wait for the app to complete */ /* Wait for the app to complete */
if (wait_for_job_completion) { if (wait_for_job_completion) {
@ -541,6 +544,11 @@ static void job_state_callback(orte_jobid_t jobid, orte_proc_state_t state)
ABORTED state and call the pls.terminate_job, which will result ABORTED state and call the pls.terminate_job, which will result
in killing all the other processes. */ in killing all the other processes. */
if (orte_debug_flag) {
opal_output(0, "spawn: in job_state_callback(jobid = %d, state = 0x%x)\n",
jobid, state);
}
switch(state) { switch(state) {
case ORTE_PROC_STATE_ABORTED: case ORTE_PROC_STATE_ABORTED:
dump_aborted_procs(jobid); dump_aborted_procs(jobid);
@ -552,6 +560,10 @@ static void job_state_callback(orte_jobid_t jobid, orte_proc_state_t state)
orterun_globals.exit = true; orterun_globals.exit = true;
opal_condition_signal(&orterun_globals.cond); opal_condition_signal(&orterun_globals.cond);
break; break;
case ORTE_PROC_STATE_AT_STG1:
orte_totalview_init_after_spawn(jobid);
break;
} }
OPAL_THREAD_UNLOCK(&orterun_globals.lock); OPAL_THREAD_UNLOCK(&orterun_globals.lock);
} }

Просмотреть файл

@ -39,6 +39,12 @@
#include <stdlib.h> #include <stdlib.h>
#include <strings.h> #include <strings.h>
/*
* The environment
*/
extern char** environ;
/* +++ begin MPICH/TotalView interface definitions */ /* +++ begin MPICH/TotalView interface definitions */
#define MPIR_DEBUG_SPAWNED 1 #define MPIR_DEBUG_SPAWNED 1
@ -67,9 +73,11 @@ void *MPIR_Breakpoint(void);
#include "opal/util/output.h" #include "opal/util/output.h"
#include "opal/class/opal_list.h" #include "opal/class/opal_list.h"
#include "mca/base/base.h"
#include "mca/errmgr/errmgr.h" #include "mca/errmgr/errmgr.h"
#include "mca/rmgr/rmgr_types.h" #include "mca/rmgr/rmgr_types.h"
#include "mca/rmaps/base/rmaps_base_map.h" #include "mca/rmaps/base/rmaps_base_map.h"
#include "runtime/runtime.h"
/* /*
* NOTE: The job description in the registry will likely evolve to use * NOTE: The job description in the registry will likely evolve to use
@ -92,7 +100,7 @@ static void dump(void)
DUMP_INT(MPIR_acquired_pre_main); DUMP_INT(MPIR_acquired_pre_main);
DUMP_INT(MPIR_i_am_starter); DUMP_INT(MPIR_i_am_starter);
DUMP_INT(MPIR_proctable_size); DUMP_INT(MPIR_proctable_size);
fprintf(stderr, "MPIR_proctable:\n"); fprintf(stderr, " MPIR_proctable:\n");
for (i = 0; i < MPIR_proctable_size; i++) { for (i = 0; i < MPIR_proctable_size; i++) {
fprintf(stderr, fprintf(stderr,
" (i, host, exe, pid) = (%d, %s, %s, %d)\n", " (i, host, exe, pid) = (%d, %s, %s, %d)\n",
@ -106,20 +114,58 @@ static void dump(void)
/** /**
* Initialization of data structures for running under a debugger * Initialization of data structures for running under a debugger
* using the MPICH/TotalView parallel debugger interface. * using the MPICH/TotalView parallel debugger interface. Before the
* spawn we need to check if we have been run under a TotalView-like
* debugger; if so then inform applications via an MCA parameter.
*/ */
int orte_totalview_init(orte_jobid_t jobid) void orte_totalview_init_before_spawn(void)
{
if (MPIR_DEBUG_SPAWNED == MPIR_debug_state) {
int value;
char *s;
if (orte_debug_flag) {
opal_output(0, "Info: Spawned by a debugger");
}
if (mca_base_param_reg_int_name("orte", "mpi_wait_for_totalview",
"Whether the MPI application should wait for a debugger or not",
false, false, (int)false, &value) < 0) {
opal_output(0, "Error: mca_base_param_reg_int_name\n");
}
/* push mca parameter into the environment (not done automatically?) */
s = mca_base_param_environ_variable("orte", "mpi_wait_for_totalview", NULL);
if (ORTE_SUCCESS != opal_setenv(s, "1", true, &environ)) {
opal_output(0, "Error: Can't setenv %s\n", s);
}
free(s);
}
}
/**
* Initialization of data structures for running under a debugger
* using the MPICH/TotalView parallel debugger interface. This stage
* of initialization must occur after stage2 of spawn and is invoked
* via a callback.
*
* @param jobid The jobid returned by spawn.
*/
void orte_totalview_init_after_spawn(orte_jobid_t jobid)
{ {
opal_list_t list_of_resource_maps; opal_list_t list_of_resource_maps;
opal_list_item_t *item; opal_list_item_t *item;
int i; int i;
int rc; int rc;
if (0) { /* debugging deamons <<-- needs work */ if (0) { /* debugging daemons <<-- needs work */
opal_output_verbose(10, 0, if (orte_debug_flag) {
"Info: Setting up debugger " opal_output(0, "Info: Setting up debugger process table for daemons\n");
"process table for daemons\n"); }
} else { } else {
@ -130,9 +176,9 @@ int orte_totalview_init(orte_jobid_t jobid)
* processes in case someone attaches later. * processes in case someone attaches later.
*/ */
opal_output_verbose(10, 0, if (orte_debug_flag) {
"Info: Setting up debugger " opal_output(0, "Info: Setting up debugger process table for applications\n");
"process table for applications\n"); }
OBJ_CONSTRUCT(&list_of_resource_maps, opal_list_t); OBJ_CONSTRUCT(&list_of_resource_maps, opal_list_t);
@ -140,8 +186,8 @@ int orte_totalview_init(orte_jobid_t jobid)
rc = orte_rmaps_base_get_map(jobid, &list_of_resource_maps); rc = orte_rmaps_base_get_map(jobid, &list_of_resource_maps);
if (ORTE_SUCCESS != rc) { if (ORTE_SUCCESS != rc) {
opal_output(0, "Error: Can't get list of resource maps\n");
ORTE_ERROR_LOG(rc); ORTE_ERROR_LOG(rc);
return rc;
} }
/* find the total number of processes in the job */ /* find the total number of processes in the job */
@ -160,11 +206,8 @@ int orte_totalview_init(orte_jobid_t jobid)
if (MPIR_proctable == NULL) { if (MPIR_proctable == NULL) {
opal_output(0, "Error: Out of memory\n"); opal_output(0, "Error: Out of memory\n");
OBJ_DESTRUCT(&list_of_resource_maps); OBJ_DESTRUCT(&list_of_resource_maps);
return -1;
} }
MPIR_being_debugged = 1;
/* initialize MPIR_proctable */ /* initialize MPIR_proctable */
for (item = opal_list_get_first(&list_of_resource_maps); for (item = opal_list_get_first(&list_of_resource_maps);
@ -183,11 +226,7 @@ int orte_totalview_init(orte_jobid_t jobid)
} }
if (1 /* some MCA parameter indicating spawned by debugger */) { if (orte_debug_flag) {
MPIR_debug_state = MPIR_DEBUG_SPAWNED;
}
if (1 /* verbose */) {
dump(); dump();
} }

Просмотреть файл

@ -19,7 +19,8 @@
#include "orte_config.h" #include "orte_config.h"
int orte_totalview_init(orte_jobid_t jobid); void orte_totalview_init_before_spawn(void);
void orte_totalview_init_after_spawn(orte_jobid_t jobid);
void orte_totalview_finalize(void); void orte_totalview_finalize(void);
void *MPIR_Breakpoint(void); void *MPIR_Breakpoint(void);