1
1

Basic TotalView support for orterun. Close to working, but we still need to

check that hostnames are obtained correctly.

This commit was SVN r7096.
Этот коммит содержится в:
David Daniel 2005-08-30 17:29:43 +00:00
родитель 00e0ff729d
Коммит ced11250e4
4 изменённых файлов: 74 добавлений и 21 удалений

Просмотреть файл

@ -26,7 +26,8 @@ libs = \
bin_PROGRAMS = orterun
orterun_SOURCES = \
orterun.c
orterun.c \
totalview.c
orterun_LDADD = $(libs)
orterun_DEPENDENCIES = $(libs)

Просмотреть файл

@ -355,6 +355,8 @@ int main(int argc, char *argv[])
signal_callback, NULL);
opal_event_add(&int_handler, NULL);
orte_totalview_init_before_spawn();
/* Spawn the job */
rc = orte_rmgr.spawn(apps, num_apps, &jobid, job_state_callback);
@ -362,6 +364,7 @@ int main(int argc, char *argv[])
/* JMS show_help */
opal_output(0, "%s: spawn failed with errno=%d\n", orterun_basename, rc);
} else {
/* Wait for the app to complete */
if (wait_for_job_completion) {
@ -541,6 +544,11 @@ static void job_state_callback(orte_jobid_t jobid, orte_proc_state_t state)
ABORTED state and call the pls.terminate_job, which will result
in killing all the other processes. */
if (orte_debug_flag) {
opal_output(0, "spawn: in job_state_callback(jobid = %d, state = 0x%x)\n",
jobid, state);
}
switch(state) {
case ORTE_PROC_STATE_ABORTED:
dump_aborted_procs(jobid);
@ -552,6 +560,10 @@ static void job_state_callback(orte_jobid_t jobid, orte_proc_state_t state)
orterun_globals.exit = true;
opal_condition_signal(&orterun_globals.cond);
break;
case ORTE_PROC_STATE_AT_STG1:
orte_totalview_init_after_spawn(jobid);
break;
}
OPAL_THREAD_UNLOCK(&orterun_globals.lock);
}

Просмотреть файл

@ -39,6 +39,12 @@
#include <stdlib.h>
#include <strings.h>
/*
* The environment
*/
extern char** environ;
/* +++ begin MPICH/TotalView interface definitions */
#define MPIR_DEBUG_SPAWNED 1
@ -67,9 +73,11 @@ void *MPIR_Breakpoint(void);
#include "opal/util/output.h"
#include "opal/class/opal_list.h"
#include "mca/base/base.h"
#include "mca/errmgr/errmgr.h"
#include "mca/rmgr/rmgr_types.h"
#include "mca/rmaps/base/rmaps_base_map.h"
#include "runtime/runtime.h"
/*
* NOTE: The job description in the registry will likely evolve to use
@ -106,20 +114,58 @@ static void dump(void)
/**
* Initialization of data structures for running under a debugger
* using the MPICH/TotalView parallel debugger interface.
* using the MPICH/TotalView parallel debugger interface. Before the
* spawn we need to check whether we are being run under a TotalView-like
* debugger; if so, inform the applications via an MCA parameter.
*/
int orte_totalview_init(orte_jobid_t jobid)
void orte_totalview_init_before_spawn(void)
{
    int value;
    char *s;

    /* Only act when the debugger interface reports that we were
     * spawned by a debugger; otherwise there is nothing to set up. */
    if (MPIR_DEBUG_SPAWNED != MPIR_debug_state) {
        return;
    }

    if (orte_debug_flag) {
        opal_output(0, "Info: Spawned by a debugger");
    }

    /* Register the parameter; a failure is reported but is not fatal,
     * the environment is still updated below on a best-effort basis. */
    if (mca_base_param_reg_int_name("orte", "mpi_wait_for_totalview",
                                    "Whether the MPI application should wait for a debugger or not",
                                    false, false, (int)false, &value) < 0) {
        opal_output(0, "Error: mca_base_param_reg_int_name\n");
    }

    /* push mca parameter into the environment (not done automatically?) */
    s = mca_base_param_environ_variable("orte", "mpi_wait_for_totalview", NULL);
    if (NULL == s) {
        /* Without a variable name there is nothing to export; avoid
         * handing a NULL pointer to opal_setenv() or to a %s format. */
        opal_output(0, "Error: Can't build environment variable name for mpi_wait_for_totalview\n");
        return;
    }

    if (ORTE_SUCCESS != opal_setenv(s, "1", true, &environ)) {
        opal_output(0, "Error: Can't setenv %s\n", s);
    }

    /* the name string is released here once it is no longer needed */
    free(s);
}
/**
* Initialization of data structures for running under a debugger
* using the MPICH/TotalView parallel debugger interface. This stage
* of initialization must occur after stage2 of spawn and is invoked
* via a callback.
*
* @param jobid The jobid returned by spawn.
*/
void orte_totalview_init_after_spawn(orte_jobid_t jobid)
{
opal_list_t list_of_resource_maps;
opal_list_item_t *item;
int i;
int rc;
if (0) { /* debugging deamons <<-- needs work */
if (0) { /* debugging daemons <<-- needs work */
opal_output_verbose(10, 0,
"Info: Setting up debugger "
"process table for daemons\n");
if (orte_debug_flag) {
opal_output(0, "Info: Setting up debugger process table for daemons\n");
}
} else {
@ -130,9 +176,9 @@ int orte_totalview_init(orte_jobid_t jobid)
* processes in case someone attaches later.
*/
opal_output_verbose(10, 0,
"Info: Setting up debugger "
"process table for applications\n");
if (orte_debug_flag) {
opal_output(0, "Info: Setting up debugger process table for applications\n");
}
OBJ_CONSTRUCT(&list_of_resource_maps, opal_list_t);
@ -140,8 +186,8 @@ int orte_totalview_init(orte_jobid_t jobid)
rc = orte_rmaps_base_get_map(jobid, &list_of_resource_maps);
if (ORTE_SUCCESS != rc) {
opal_output(0, "Error: Can't get list of resource maps\n");
ORTE_ERROR_LOG(rc);
return rc;
}
/* find the total number of processes in the job */
@ -160,11 +206,8 @@ int orte_totalview_init(orte_jobid_t jobid)
if (MPIR_proctable == NULL) {
opal_output(0, "Error: Out of memory\n");
OBJ_DESTRUCT(&list_of_resource_maps);
return -1;
}
MPIR_being_debugged = 1;
/* initialize MPIR_proctable */
for (item = opal_list_get_first(&list_of_resource_maps);
@ -183,11 +226,7 @@ int orte_totalview_init(orte_jobid_t jobid)
}
if (1 /* some MCA parameter indicating spawned by debugger */) {
MPIR_debug_state = MPIR_DEBUG_SPAWNED;
}
if (1 /* verbose */) {
if (orte_debug_flag) {
dump();
}

Просмотреть файл

@ -19,7 +19,8 @@
#include "orte_config.h"
int orte_totalview_init(orte_jobid_t jobid);
void orte_totalview_init_before_spawn(void);
void orte_totalview_init_after_spawn(orte_jobid_t jobid);
void orte_totalview_finalize(void);
void *MPIR_Breakpoint(void);