Basic TotalView support for orterun. Close to working, but we still need to
check that hostnames are obtained correctly. This commit was SVN r7096.
Этот коммит содержится в:
родитель
00e0ff729d
Коммит
ced11250e4
@ -26,7 +26,8 @@ libs = \
|
||||
bin_PROGRAMS = orterun
|
||||
|
||||
orterun_SOURCES = \
|
||||
orterun.c
|
||||
orterun.c \
|
||||
totalview.c
|
||||
|
||||
orterun_LDADD = $(libs)
|
||||
orterun_DEPENDENCIES = $(libs)
|
||||
|
@ -355,6 +355,8 @@ int main(int argc, char *argv[])
|
||||
signal_callback, NULL);
|
||||
opal_event_add(&int_handler, NULL);
|
||||
|
||||
orte_totalview_init_before_spawn();
|
||||
|
||||
/* Spawn the job */
|
||||
|
||||
rc = orte_rmgr.spawn(apps, num_apps, &jobid, job_state_callback);
|
||||
@ -362,6 +364,7 @@ int main(int argc, char *argv[])
|
||||
/* JMS show_help */
|
||||
opal_output(0, "%s: spawn failed with errno=%d\n", orterun_basename, rc);
|
||||
} else {
|
||||
|
||||
/* Wait for the app to complete */
|
||||
|
||||
if (wait_for_job_completion) {
|
||||
@ -541,6 +544,11 @@ static void job_state_callback(orte_jobid_t jobid, orte_proc_state_t state)
|
||||
ABORTED state and call the pls.terminate_job, which will result
|
||||
in killing all the other processes. */
|
||||
|
||||
if (orte_debug_flag) {
|
||||
opal_output(0, "spawn: in job_state_callback(jobid = %d, state = 0x%x)\n",
|
||||
jobid, state);
|
||||
}
|
||||
|
||||
switch(state) {
|
||||
case ORTE_PROC_STATE_ABORTED:
|
||||
dump_aborted_procs(jobid);
|
||||
@ -552,6 +560,10 @@ static void job_state_callback(orte_jobid_t jobid, orte_proc_state_t state)
|
||||
orterun_globals.exit = true;
|
||||
opal_condition_signal(&orterun_globals.cond);
|
||||
break;
|
||||
|
||||
case ORTE_PROC_STATE_AT_STG1:
|
||||
orte_totalview_init_after_spawn(jobid);
|
||||
break;
|
||||
}
|
||||
OPAL_THREAD_UNLOCK(&orterun_globals.lock);
|
||||
}
|
||||
|
@ -39,6 +39,12 @@
|
||||
#include <stdlib.h>
|
||||
#include <strings.h>
|
||||
|
||||
/*
|
||||
* The environment
|
||||
*/
|
||||
extern char** environ;
|
||||
|
||||
|
||||
/* +++ begin MPICH/TotalView interface definitions */
|
||||
|
||||
#define MPIR_DEBUG_SPAWNED 1
|
||||
@ -67,9 +73,11 @@ void *MPIR_Breakpoint(void);
|
||||
|
||||
#include "opal/util/output.h"
|
||||
#include "opal/class/opal_list.h"
|
||||
#include "mca/base/base.h"
|
||||
#include "mca/errmgr/errmgr.h"
|
||||
#include "mca/rmgr/rmgr_types.h"
|
||||
#include "mca/rmaps/base/rmaps_base_map.h"
|
||||
#include "runtime/runtime.h"
|
||||
|
||||
/*
|
||||
* NOTE: The job description in the registry will likely evolve to use
|
||||
@ -106,20 +114,58 @@ static void dump(void)
|
||||
|
||||
/**
|
||||
* Initialization of data structures for running under a debugger
|
||||
* using the MPICH/TotalView parallel debugger interface.
|
||||
* using the MPICH/TotalView parallel debugger interface. Before the
|
||||
* spawn we need to check if we are being run under a TotalView-like
|
||||
* debugger; if so then inform applications via an MCA parameter.
|
||||
*/
|
||||
int orte_totalview_init(orte_jobid_t jobid)
|
||||
void orte_totalview_init_before_spawn(void)
|
||||
{
|
||||
if (MPIR_DEBUG_SPAWNED == MPIR_debug_state) {
|
||||
|
||||
int value;
|
||||
char *s;
|
||||
|
||||
if (orte_debug_flag) {
|
||||
opal_output(0, "Info: Spawned by a debugger");
|
||||
}
|
||||
|
||||
if (mca_base_param_reg_int_name("orte", "mpi_wait_for_totalview",
|
||||
"Whether the MPI application should wait for a debugger or not",
|
||||
false, false, (int)false, &value) < 0) {
|
||||
opal_output(0, "Error: mca_base_param_reg_int_name\n");
|
||||
}
|
||||
|
||||
/* push mca parameter into the environment (not done automatically?) */
|
||||
|
||||
s = mca_base_param_environ_variable("orte", "mpi_wait_for_totalview", NULL);
|
||||
if (ORTE_SUCCESS != opal_setenv(s, "1", true, &environ)) {
|
||||
opal_output(0, "Error: Can't setenv %s\n", s);
|
||||
}
|
||||
free(s);
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
/**
|
||||
* Initialization of data structures for running under a debugger
|
||||
* using the MPICH/TotalView parallel debugger interface. This stage
|
||||
* of initialization must occur after stage2 of spawn and is invoked
|
||||
* via a callback.
|
||||
*
|
||||
* @param jobid The jobid returned by spawn.
|
||||
*/
|
||||
void orte_totalview_init_after_spawn(orte_jobid_t jobid)
|
||||
{
|
||||
opal_list_t list_of_resource_maps;
|
||||
opal_list_item_t *item;
|
||||
int i;
|
||||
int rc;
|
||||
|
||||
if (0) { /* debugging deamons <<-- needs work */
|
||||
if (0) { /* debugging daemons <<-- needs work */
|
||||
|
||||
opal_output_verbose(10, 0,
|
||||
"Info: Setting up debugger "
|
||||
"process table for daemons\n");
|
||||
if (orte_debug_flag) {
|
||||
opal_output(0, "Info: Setting up debugger process table for daemons\n");
|
||||
}
|
||||
|
||||
} else {
|
||||
|
||||
@ -130,9 +176,9 @@ int orte_totalview_init(orte_jobid_t jobid)
|
||||
* processes in case someone attaches later.
|
||||
*/
|
||||
|
||||
opal_output_verbose(10, 0,
|
||||
"Info: Setting up debugger "
|
||||
"process table for applications\n");
|
||||
if (orte_debug_flag) {
|
||||
opal_output(0, "Info: Setting up debugger process table for applications\n");
|
||||
}
|
||||
|
||||
OBJ_CONSTRUCT(&list_of_resource_maps, opal_list_t);
|
||||
|
||||
@ -140,8 +186,8 @@ int orte_totalview_init(orte_jobid_t jobid)
|
||||
|
||||
rc = orte_rmaps_base_get_map(jobid, &list_of_resource_maps);
|
||||
if (ORTE_SUCCESS != rc) {
|
||||
opal_output(0, "Error: Can't get list of resource maps\n");
|
||||
ORTE_ERROR_LOG(rc);
|
||||
return rc;
|
||||
}
|
||||
|
||||
/* find the total number of processes in the job */
|
||||
@ -160,11 +206,8 @@ int orte_totalview_init(orte_jobid_t jobid)
|
||||
if (MPIR_proctable == NULL) {
|
||||
opal_output(0, "Error: Out of memory\n");
|
||||
OBJ_DESTRUCT(&list_of_resource_maps);
|
||||
return -1;
|
||||
}
|
||||
|
||||
MPIR_being_debugged = 1;
|
||||
|
||||
/* initialize MPIR_proctable */
|
||||
|
||||
for (item = opal_list_get_first(&list_of_resource_maps);
|
||||
@ -183,11 +226,7 @@ int orte_totalview_init(orte_jobid_t jobid)
|
||||
|
||||
}
|
||||
|
||||
if (1 /* some MCA parameter indicating spawned by debugger */) {
|
||||
MPIR_debug_state = MPIR_DEBUG_SPAWNED;
|
||||
}
|
||||
|
||||
if (1 /* verbose */) {
|
||||
if (orte_debug_flag) {
|
||||
dump();
|
||||
}
|
||||
|
||||
|
@ -19,7 +19,8 @@
|
||||
|
||||
#include "orte_config.h"
|
||||
|
||||
int orte_totalview_init(orte_jobid_t jobid);
|
||||
void orte_totalview_init_before_spawn(void);
|
||||
void orte_totalview_init_after_spawn(orte_jobid_t jobid);
|
||||
void orte_totalview_finalize(void);
|
||||
void *MPIR_Breakpoint(void);
|
||||
|
||||
|
Загрузка…
x
Ссылка в новой задаче
Block a user