Basic TotalView support for orterun. Close to working, but we still need to
check that hostnames are obtained correctly. This commit was SVN r7096.
Этот коммит содержится в:
родитель
00e0ff729d
Коммит
ced11250e4
@ -26,7 +26,8 @@ libs = \
|
|||||||
bin_PROGRAMS = orterun
|
bin_PROGRAMS = orterun
|
||||||
|
|
||||||
orterun_SOURCES = \
|
orterun_SOURCES = \
|
||||||
orterun.c
|
orterun.c \
|
||||||
|
totalview.c
|
||||||
|
|
||||||
orterun_LDADD = $(libs)
|
orterun_LDADD = $(libs)
|
||||||
orterun_DEPENDENCIES = $(libs)
|
orterun_DEPENDENCIES = $(libs)
|
||||||
|
@ -355,6 +355,8 @@ int main(int argc, char *argv[])
|
|||||||
signal_callback, NULL);
|
signal_callback, NULL);
|
||||||
opal_event_add(&int_handler, NULL);
|
opal_event_add(&int_handler, NULL);
|
||||||
|
|
||||||
|
orte_totalview_init_before_spawn();
|
||||||
|
|
||||||
/* Spawn the job */
|
/* Spawn the job */
|
||||||
|
|
||||||
rc = orte_rmgr.spawn(apps, num_apps, &jobid, job_state_callback);
|
rc = orte_rmgr.spawn(apps, num_apps, &jobid, job_state_callback);
|
||||||
@ -362,6 +364,7 @@ int main(int argc, char *argv[])
|
|||||||
/* JMS show_help */
|
/* JMS show_help */
|
||||||
opal_output(0, "%s: spawn failed with errno=%d\n", orterun_basename, rc);
|
opal_output(0, "%s: spawn failed with errno=%d\n", orterun_basename, rc);
|
||||||
} else {
|
} else {
|
||||||
|
|
||||||
/* Wait for the app to complete */
|
/* Wait for the app to complete */
|
||||||
|
|
||||||
if (wait_for_job_completion) {
|
if (wait_for_job_completion) {
|
||||||
@ -541,6 +544,11 @@ static void job_state_callback(orte_jobid_t jobid, orte_proc_state_t state)
|
|||||||
ABORTED state and call the pls.terminate_job, which will result
|
ABORTED state and call the pls.terminate_job, which will result
|
||||||
in killing all the other processes. */
|
in killing all the other processes. */
|
||||||
|
|
||||||
|
if (orte_debug_flag) {
|
||||||
|
opal_output(0, "spawn: in job_state_callback(jobid = %d, state = 0x%x)\n",
|
||||||
|
jobid, state);
|
||||||
|
}
|
||||||
|
|
||||||
switch(state) {
|
switch(state) {
|
||||||
case ORTE_PROC_STATE_ABORTED:
|
case ORTE_PROC_STATE_ABORTED:
|
||||||
dump_aborted_procs(jobid);
|
dump_aborted_procs(jobid);
|
||||||
@ -552,6 +560,10 @@ static void job_state_callback(orte_jobid_t jobid, orte_proc_state_t state)
|
|||||||
orterun_globals.exit = true;
|
orterun_globals.exit = true;
|
||||||
opal_condition_signal(&orterun_globals.cond);
|
opal_condition_signal(&orterun_globals.cond);
|
||||||
break;
|
break;
|
||||||
|
|
||||||
|
case ORTE_PROC_STATE_AT_STG1:
|
||||||
|
orte_totalview_init_after_spawn(jobid);
|
||||||
|
break;
|
||||||
}
|
}
|
||||||
OPAL_THREAD_UNLOCK(&orterun_globals.lock);
|
OPAL_THREAD_UNLOCK(&orterun_globals.lock);
|
||||||
}
|
}
|
||||||
|
@ -39,6 +39,12 @@
|
|||||||
#include <stdlib.h>
|
#include <stdlib.h>
|
||||||
#include <strings.h>
|
#include <strings.h>
|
||||||
|
|
||||||
|
/*
|
||||||
|
* The environment
|
||||||
|
*/
|
||||||
|
extern char** environ;
|
||||||
|
|
||||||
|
|
||||||
/* +++ begin MPICH/TotalView interface definitions */
|
/* +++ begin MPICH/TotalView interface definitions */
|
||||||
|
|
||||||
#define MPIR_DEBUG_SPAWNED 1
|
#define MPIR_DEBUG_SPAWNED 1
|
||||||
@ -67,9 +73,11 @@ void *MPIR_Breakpoint(void);
|
|||||||
|
|
||||||
#include "opal/util/output.h"
|
#include "opal/util/output.h"
|
||||||
#include "opal/class/opal_list.h"
|
#include "opal/class/opal_list.h"
|
||||||
|
#include "mca/base/base.h"
|
||||||
#include "mca/errmgr/errmgr.h"
|
#include "mca/errmgr/errmgr.h"
|
||||||
#include "mca/rmgr/rmgr_types.h"
|
#include "mca/rmgr/rmgr_types.h"
|
||||||
#include "mca/rmaps/base/rmaps_base_map.h"
|
#include "mca/rmaps/base/rmaps_base_map.h"
|
||||||
|
#include "runtime/runtime.h"
|
||||||
|
|
||||||
/*
|
/*
|
||||||
* NOTE: The job description in the registry will likely evolve to use
|
* NOTE: The job description in the registry will likely evolve to use
|
||||||
@ -92,7 +100,7 @@ static void dump(void)
|
|||||||
DUMP_INT(MPIR_acquired_pre_main);
|
DUMP_INT(MPIR_acquired_pre_main);
|
||||||
DUMP_INT(MPIR_i_am_starter);
|
DUMP_INT(MPIR_i_am_starter);
|
||||||
DUMP_INT(MPIR_proctable_size);
|
DUMP_INT(MPIR_proctable_size);
|
||||||
fprintf(stderr, "MPIR_proctable:\n");
|
fprintf(stderr, " MPIR_proctable:\n");
|
||||||
for (i = 0; i < MPIR_proctable_size; i++) {
|
for (i = 0; i < MPIR_proctable_size; i++) {
|
||||||
fprintf(stderr,
|
fprintf(stderr,
|
||||||
" (i, host, exe, pid) = (%d, %s, %s, %d)\n",
|
" (i, host, exe, pid) = (%d, %s, %s, %d)\n",
|
||||||
@ -106,20 +114,58 @@ static void dump(void)
|
|||||||
|
|
||||||
/**
|
/**
|
||||||
* Initialization of data structures for running under a debugger
|
* Initialization of data structures for running under a debugger
|
||||||
* using the MPICH/TotalView parallel debugger interface.
|
* using the MPICH/TotalView parallel debugger interface. Before the
|
||||||
|
* spawn we need to check if we are being run under a TotalView-like
|
||||||
|
* debugger; if so then inform applications via an MCA parameter.
|
||||||
*/
|
*/
|
||||||
int orte_totalview_init(orte_jobid_t jobid)
|
void orte_totalview_init_before_spawn(void)
|
||||||
|
{
|
||||||
|
if (MPIR_DEBUG_SPAWNED == MPIR_debug_state) {
|
||||||
|
|
||||||
|
int value;
|
||||||
|
char *s;
|
||||||
|
|
||||||
|
if (orte_debug_flag) {
|
||||||
|
opal_output(0, "Info: Spawned by a debugger");
|
||||||
|
}
|
||||||
|
|
||||||
|
if (mca_base_param_reg_int_name("orte", "mpi_wait_for_totalview",
|
||||||
|
"Whether the MPI application should wait for a debugger or not",
|
||||||
|
false, false, (int)false, &value) < 0) {
|
||||||
|
opal_output(0, "Error: mca_base_param_reg_int_name\n");
|
||||||
|
}
|
||||||
|
|
||||||
|
/* push mca parameter into the environment (not done automatically?) */
|
||||||
|
|
||||||
|
s = mca_base_param_environ_variable("orte", "mpi_wait_for_totalview", NULL);
|
||||||
|
if (ORTE_SUCCESS != opal_setenv(s, "1", true, &environ)) {
|
||||||
|
opal_output(0, "Error: Can't setenv %s\n", s);
|
||||||
|
}
|
||||||
|
free(s);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Initialization of data structures for running under a debugger
|
||||||
|
* using the MPICH/TotalView parallel debugger interface. This stage
|
||||||
|
* of initialization must occur after stage2 of spawn and is invoked
|
||||||
|
* via a callback.
|
||||||
|
*
|
||||||
|
* @param jobid The jobid returned by spawn.
|
||||||
|
*/
|
||||||
|
void orte_totalview_init_after_spawn(orte_jobid_t jobid)
|
||||||
{
|
{
|
||||||
opal_list_t list_of_resource_maps;
|
opal_list_t list_of_resource_maps;
|
||||||
opal_list_item_t *item;
|
opal_list_item_t *item;
|
||||||
int i;
|
int i;
|
||||||
int rc;
|
int rc;
|
||||||
|
|
||||||
if (0) { /* debugging deamons <<-- needs work */
|
if (0) { /* debugging daemons <<-- needs work */
|
||||||
|
|
||||||
opal_output_verbose(10, 0,
|
if (orte_debug_flag) {
|
||||||
"Info: Setting up debugger "
|
opal_output(0, "Info: Setting up debugger process table for daemons\n");
|
||||||
"process table for daemons\n");
|
}
|
||||||
|
|
||||||
} else {
|
} else {
|
||||||
|
|
||||||
@ -130,9 +176,9 @@ int orte_totalview_init(orte_jobid_t jobid)
|
|||||||
* processes in case someone attaches later.
|
* processes in case someone attaches later.
|
||||||
*/
|
*/
|
||||||
|
|
||||||
opal_output_verbose(10, 0,
|
if (orte_debug_flag) {
|
||||||
"Info: Setting up debugger "
|
opal_output(0, "Info: Setting up debugger process table for applications\n");
|
||||||
"process table for applications\n");
|
}
|
||||||
|
|
||||||
OBJ_CONSTRUCT(&list_of_resource_maps, opal_list_t);
|
OBJ_CONSTRUCT(&list_of_resource_maps, opal_list_t);
|
||||||
|
|
||||||
@ -140,8 +186,8 @@ int orte_totalview_init(orte_jobid_t jobid)
|
|||||||
|
|
||||||
rc = orte_rmaps_base_get_map(jobid, &list_of_resource_maps);
|
rc = orte_rmaps_base_get_map(jobid, &list_of_resource_maps);
|
||||||
if (ORTE_SUCCESS != rc) {
|
if (ORTE_SUCCESS != rc) {
|
||||||
|
opal_output(0, "Error: Can't get list of resource maps\n");
|
||||||
ORTE_ERROR_LOG(rc);
|
ORTE_ERROR_LOG(rc);
|
||||||
return rc;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
/* find the total number of processes in the job */
|
/* find the total number of processes in the job */
|
||||||
@ -160,11 +206,8 @@ int orte_totalview_init(orte_jobid_t jobid)
|
|||||||
if (MPIR_proctable == NULL) {
|
if (MPIR_proctable == NULL) {
|
||||||
opal_output(0, "Error: Out of memory\n");
|
opal_output(0, "Error: Out of memory\n");
|
||||||
OBJ_DESTRUCT(&list_of_resource_maps);
|
OBJ_DESTRUCT(&list_of_resource_maps);
|
||||||
return -1;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
MPIR_being_debugged = 1;
|
|
||||||
|
|
||||||
/* initialize MPIR_proctable */
|
/* initialize MPIR_proctable */
|
||||||
|
|
||||||
for (item = opal_list_get_first(&list_of_resource_maps);
|
for (item = opal_list_get_first(&list_of_resource_maps);
|
||||||
@ -183,11 +226,7 @@ int orte_totalview_init(orte_jobid_t jobid)
|
|||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
if (1 /* some MCA parameter indicating spawned by debugger */) {
|
if (orte_debug_flag) {
|
||||||
MPIR_debug_state = MPIR_DEBUG_SPAWNED;
|
|
||||||
}
|
|
||||||
|
|
||||||
if (1 /* verbose */) {
|
|
||||||
dump();
|
dump();
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -19,7 +19,8 @@
|
|||||||
|
|
||||||
#include "orte_config.h"
|
#include "orte_config.h"
|
||||||
|
|
||||||
int orte_totalview_init(orte_jobid_t jobid);
|
void orte_totalview_init_before_spawn(void);
|
||||||
|
void orte_totalview_init_after_spawn(orte_jobid_t jobid);
|
||||||
void orte_totalview_finalize(void);
|
void orte_totalview_finalize(void);
|
||||||
void *MPIR_Breakpoint(void);
|
void *MPIR_Breakpoint(void);
|
||||||
|
|
||||||
|
Загрузка…
x
Ссылка в новой задаче
Block a user