Bring forward the debugger-related changes
Refs https://github.com/open-mpi/ompi/pull/2425 Signed-off-by: Ralph Castain <rhc@open-mpi.org>
Этот коммит содержится в:
родитель
a6d390fe7b
Коммит
d5fd635efe
1
.gitignore
поставляемый
1
.gitignore
поставляемый
@ -353,6 +353,7 @@ orte/mca/sstore/orte_sstore.7
|
|||||||
|
|
||||||
orte/test/mpi/abort
|
orte/test/mpi/abort
|
||||||
orte/test/mpi/accept
|
orte/test/mpi/accept
|
||||||
|
orte/test/mpi/attach
|
||||||
orte/test/mpi/bad_exit
|
orte/test/mpi/bad_exit
|
||||||
orte/test/mpi/bcast_loop
|
orte/test/mpi/bcast_loop
|
||||||
orte/test/mpi/concurrent_spawn
|
orte/test/mpi/concurrent_spawn
|
||||||
|
@ -1,7 +1,7 @@
|
|||||||
/*
|
/*
|
||||||
* Copyright (c) 2012-2013 Los Alamos National Security, LLC.
|
* Copyright (c) 2012-2013 Los Alamos National Security, LLC.
|
||||||
* All rights reserved.
|
* All rights reserved.
|
||||||
* Copyright (c) 2013-2015 Intel, Inc. All rights reserved
|
* Copyright (c) 2013-2016 Intel, Inc. All rights reserved.
|
||||||
* Copyright (c) 2012-2014 The University of Tennessee and The University
|
* Copyright (c) 2012-2014 The University of Tennessee and The University
|
||||||
* of Tennessee Research Foundation. All rights
|
* of Tennessee Research Foundation. All rights
|
||||||
* reserved.
|
* reserved.
|
||||||
@ -133,6 +133,8 @@ void ompi_rte_wait_for_debugger(void)
|
|||||||
int debugger;
|
int debugger;
|
||||||
opal_list_t *codes;
|
opal_list_t *codes;
|
||||||
opal_value_t *kv;
|
opal_value_t *kv;
|
||||||
|
char *evar;
|
||||||
|
int time;
|
||||||
|
|
||||||
/* See lengthy comment in orte/tools/orterun/debuggers.c about
|
/* See lengthy comment in orte/tools/orterun/debuggers.c about
|
||||||
orte_in_parallel_debugger */
|
orte_in_parallel_debugger */
|
||||||
@ -152,6 +154,12 @@ void ompi_rte_wait_for_debugger(void)
|
|||||||
*/
|
*/
|
||||||
ompi_debugger_setup_dlls();
|
ompi_debugger_setup_dlls();
|
||||||
|
|
||||||
|
if (NULL != (evar = getenv("ORTE_TEST_DEBUGGER_SLEEP"))) {
|
||||||
|
time = strtol(evar, NULL, 10);
|
||||||
|
sleep(time);
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
|
||||||
if (orte_standalone_operation) {
|
if (orte_standalone_operation) {
|
||||||
/* spin until debugger attaches and releases us */
|
/* spin until debugger attaches and releases us */
|
||||||
while (MPIR_debug_gate == 0) {
|
while (MPIR_debug_gate == 0) {
|
||||||
|
@ -120,6 +120,11 @@ void orte_oob_base_send_nb(int fd, short args, void *cbdata)
|
|||||||
* this is a local proc we just haven't heard from
|
* this is a local proc we just haven't heard from
|
||||||
* yet due to a race condition. Check that situation */
|
* yet due to a race condition. Check that situation */
|
||||||
if (ORTE_PROC_IS_DAEMON || ORTE_PROC_IS_HNP) {
|
if (ORTE_PROC_IS_DAEMON || ORTE_PROC_IS_HNP) {
|
||||||
|
++msg->retries;
|
||||||
|
if (msg->retries < orte_rml_base.max_retries) {
|
||||||
|
ORTE_OOB_SEND(msg);
|
||||||
|
return;
|
||||||
|
}
|
||||||
ORTE_OOB_SEND(msg);
|
ORTE_OOB_SEND(msg);
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
|
@ -839,8 +839,10 @@ void orte_plm_base_registered(int fd, short args, void *cbdata)
|
|||||||
}
|
}
|
||||||
|
|
||||||
cleanup:
|
cleanup:
|
||||||
/* need to init_after_spawn for debuggers */
|
/* if this wasn't a debugger job, then need to init_after_spawn for debuggers */
|
||||||
ORTE_ACTIVATE_JOB_STATE(jdata, ORTE_JOB_STATE_READY_FOR_DEBUGGERS);
|
if (!ORTE_FLAG_TEST(jdata, ORTE_JOB_FLAG_DEBUGGER_DAEMON)) {
|
||||||
|
ORTE_ACTIVATE_JOB_STATE(jdata, ORTE_JOB_STATE_READY_FOR_DEBUGGERS);
|
||||||
|
}
|
||||||
|
|
||||||
OBJ_RELEASE(caddy);
|
OBJ_RELEASE(caddy);
|
||||||
}
|
}
|
||||||
|
@ -102,6 +102,7 @@ typedef struct {
|
|||||||
opal_pointer_array_t conduits; /* array to hold the open conduits */
|
opal_pointer_array_t conduits; /* array to hold the open conduits */
|
||||||
opal_list_t posted_recvs;
|
opal_list_t posted_recvs;
|
||||||
opal_list_t unmatched_msgs;
|
opal_list_t unmatched_msgs;
|
||||||
|
int max_retries;
|
||||||
#if OPAL_ENABLE_TIMING
|
#if OPAL_ENABLE_TIMING
|
||||||
bool timing;
|
bool timing;
|
||||||
#endif
|
#endif
|
||||||
@ -116,6 +117,7 @@ typedef struct {
|
|||||||
orte_process_name_t origin;
|
orte_process_name_t origin;
|
||||||
int status; // returned status on send
|
int status; // returned status on send
|
||||||
orte_rml_tag_t tag; // targeted tag
|
orte_rml_tag_t tag; // targeted tag
|
||||||
|
int retries; // #times we have tried to send it
|
||||||
|
|
||||||
/* user's send callback functions and data */
|
/* user's send callback functions and data */
|
||||||
union {
|
union {
|
||||||
|
@ -63,6 +63,14 @@ static bool selected = false;
|
|||||||
|
|
||||||
static int orte_rml_base_register(mca_base_register_flag_t flags)
|
static int orte_rml_base_register(mca_base_register_flag_t flags)
|
||||||
{
|
{
|
||||||
|
orte_rml_base.max_retries = 3;
|
||||||
|
mca_base_var_register("orte", "rml", "base", "max_retries",
|
||||||
|
"Max #times to retry sending a message",
|
||||||
|
MCA_BASE_VAR_TYPE_INT, NULL, 0, 0,
|
||||||
|
OPAL_INFO_LVL_9,
|
||||||
|
MCA_BASE_VAR_SCOPE_READONLY,
|
||||||
|
&orte_rml_base.max_retries);
|
||||||
|
|
||||||
#if OPAL_ENABLE_TIMING
|
#if OPAL_ENABLE_TIMING
|
||||||
orte_rml_base.timing = false;
|
orte_rml_base.timing = false;
|
||||||
(void) mca_base_var_register ("orte", "rml", "base", "timing",
|
(void) mca_base_var_register ("orte", "rml", "base", "timing",
|
||||||
@ -240,6 +248,7 @@ void orte_rml_recv_callback(int status, orte_process_name_t* sender,
|
|||||||
/*** RML CLASS INSTANCES ***/
|
/*** RML CLASS INSTANCES ***/
|
||||||
static void send_cons(orte_rml_send_t *ptr)
|
static void send_cons(orte_rml_send_t *ptr)
|
||||||
{
|
{
|
||||||
|
ptr->retries = 0;
|
||||||
ptr->cbdata = NULL;
|
ptr->cbdata = NULL;
|
||||||
ptr->iov = NULL;
|
ptr->iov = NULL;
|
||||||
ptr->buffer = NULL;
|
ptr->buffer = NULL;
|
||||||
|
@ -41,7 +41,8 @@ static int setup_fork(orte_job_t *jdata, orte_app_context_t *app)
|
|||||||
bool takeus = false;
|
bool takeus = false;
|
||||||
char *t2, *pth, *newenv;
|
char *t2, *pth, *newenv;
|
||||||
|
|
||||||
if (NULL != orte_schizo_base.personalities) {
|
if (NULL != orte_schizo_base.personalities &&
|
||||||
|
NULL != jdata->personality) {
|
||||||
/* see if we are included */
|
/* see if we are included */
|
||||||
for (i=0; NULL != jdata->personality[i]; i++) {
|
for (i=0; NULL != jdata->personality[i]; i++) {
|
||||||
if (0 == strcmp(jdata->personality[i], "singularity")) {
|
if (0 == strcmp(jdata->personality[i], "singularity")) {
|
||||||
@ -106,4 +107,3 @@ static int setup_fork(orte_job_t *jdata, orte_app_context_t *app)
|
|||||||
|
|
||||||
return ORTE_SUCCESS;
|
return ORTE_SUCCESS;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -896,6 +896,20 @@ int orte_submit_job(char *argv[], int *index,
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/* check for debugger test envars and forward them if necessary */
|
||||||
|
if (NULL != getenv("ORTE_TEST_DEBUGGER_ATTACH")) {
|
||||||
|
char *evar;
|
||||||
|
evar = getenv("ORTE_TEST_DEBUGGER_SLEEP");
|
||||||
|
for (i=0; i < (int)jdata->num_apps; i++) {
|
||||||
|
if (NULL != (app = (orte_app_context_t*)opal_pointer_array_get_item(jdata->apps, i))) {
|
||||||
|
opal_setenv("ORTE_TEST_DEBUGGER_ATTACH", "1", true, &app->env);
|
||||||
|
if (NULL != evar) {
|
||||||
|
opal_setenv("ORTE_TEST_DEBUGGER_SLEEP", evar, true, &app->env);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
/* check for suicide test directives */
|
/* check for suicide test directives */
|
||||||
if (NULL != getenv("ORTE_TEST_HNP_SUICIDE") ||
|
if (NULL != getenv("ORTE_TEST_HNP_SUICIDE") ||
|
||||||
NULL != getenv("ORTE_TEST_ORTED_SUICIDE")) {
|
NULL != getenv("ORTE_TEST_ORTED_SUICIDE")) {
|
||||||
@ -2149,8 +2163,9 @@ static void orte_debugger_init_before_spawn(orte_job_t *jdata)
|
|||||||
*/
|
*/
|
||||||
if (NULL != orte_debugger_test_daemon && !orte_debugger_test_attach) {
|
if (NULL != orte_debugger_test_daemon && !orte_debugger_test_attach) {
|
||||||
opal_output_verbose(2, orte_debug_output,
|
opal_output_verbose(2, orte_debug_output,
|
||||||
"%s No debugger test daemon specified",
|
"%s Debugger test daemon specified: %s",
|
||||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME));
|
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
|
||||||
|
orte_debugger_test_daemon);
|
||||||
goto launchit;
|
goto launchit;
|
||||||
}
|
}
|
||||||
/* if we were given an auto-detect rate, then we want to setup
|
/* if we were given an auto-detect rate, then we want to setup
|
||||||
@ -2362,6 +2377,8 @@ static void setup_debugger_job(void)
|
|||||||
proc = OBJ_NEW(orte_proc_t);
|
proc = OBJ_NEW(orte_proc_t);
|
||||||
proc->name.jobid = debugger->jobid;
|
proc->name.jobid = debugger->jobid;
|
||||||
proc->name.vpid = vpid++;
|
proc->name.vpid = vpid++;
|
||||||
|
/* point the proc at the local ORTE daemon as its parent */
|
||||||
|
proc->parent = node->daemon->name.vpid;
|
||||||
/* set the local/node ranks - we don't actually care
|
/* set the local/node ranks - we don't actually care
|
||||||
* what these are, but the odls needs them
|
* what these are, but the odls needs them
|
||||||
*/
|
*/
|
||||||
@ -2741,7 +2758,7 @@ static int process(char *orig_line, char *basename, opal_cmd_line_t *cmd_line,
|
|||||||
static void open_fifo(void)
|
static void open_fifo(void)
|
||||||
{
|
{
|
||||||
if (orte_debugger_attach_fd > 0) {
|
if (orte_debugger_attach_fd > 0) {
|
||||||
close(orte_debugger_attach_fd);
|
close(orte_debugger_attach_fd);
|
||||||
}
|
}
|
||||||
|
|
||||||
orte_debugger_attach_fd = open(MPIR_attach_fifo, O_RDONLY | O_NONBLOCK, 0);
|
orte_debugger_attach_fd = open(MPIR_attach_fifo, O_RDONLY | O_NONBLOCK, 0);
|
||||||
@ -2760,10 +2777,16 @@ static void open_fifo(void)
|
|||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
|
|
||||||
opal_output_verbose(2, orte_debug_output,
|
if (orte_debugger_test_attach) {
|
||||||
"%s Monitoring debugger attach fifo %s",
|
opal_output(0, "%s Monitoring debugger attach fifo %s",
|
||||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
|
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
|
||||||
MPIR_attach_fifo);
|
MPIR_attach_fifo);
|
||||||
|
} else {
|
||||||
|
opal_output_verbose(2, orte_debug_output,
|
||||||
|
"%s Monitoring debugger attach fifo %s",
|
||||||
|
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
|
||||||
|
MPIR_attach_fifo);
|
||||||
|
}
|
||||||
orte_debugger_attach = (opal_event_t*)malloc(sizeof(opal_event_t));
|
orte_debugger_attach = (opal_event_t*)malloc(sizeof(opal_event_t));
|
||||||
opal_event_set(orte_event_base, orte_debugger_attach, orte_debugger_attach_fd,
|
opal_event_set(orte_event_base, orte_debugger_attach, orte_debugger_attach_fd,
|
||||||
OPAL_EV_READ, attach_debugger, orte_debugger_attach);
|
OPAL_EV_READ, attach_debugger, orte_debugger_attach);
|
||||||
@ -3232,4 +3255,3 @@ void orte_profile_wakeup(int sd, short args, void *cbdata)
|
|||||||
/* abort the job */
|
/* abort the job */
|
||||||
ORTE_ACTIVATE_JOB_STATE(NULL, ORTE_JOB_STATE_ALL_JOBS_COMPLETE);
|
ORTE_ACTIVATE_JOB_STATE(NULL, ORTE_JOB_STATE_ALL_JOBS_COMPLETE);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -1,4 +1,4 @@
|
|||||||
PROGS = mpi_no_op mpi_barrier hello hello_nodename abort multi_abort simple_spawn concurrent_spawn spawn_multiple mpi_spin delayed_abort loop_spawn loop_child bad_exit pubsub hello_barrier segv accept connect hello_output hello_show_help crisscross read_write ziatest slave reduce-hang ziaprobe ziatest bcast_loop parallel_w8 parallel_w64 parallel_r8 parallel_r64 sio sendrecv_blaster early_abort debugger singleton_client_server intercomm_create spawn_tree init-exit77 mpi_info info_spawn server client paccept pconnect ring hello.sapp binding badcoll iof
|
PROGS = mpi_no_op mpi_barrier hello hello_nodename abort multi_abort simple_spawn concurrent_spawn spawn_multiple mpi_spin delayed_abort loop_spawn loop_child bad_exit pubsub hello_barrier segv accept connect hello_output hello_show_help crisscross read_write ziatest slave reduce-hang ziaprobe ziatest bcast_loop parallel_w8 parallel_w64 parallel_r8 parallel_r64 sio sendrecv_blaster early_abort debugger singleton_client_server intercomm_create spawn_tree init-exit77 mpi_info info_spawn server client paccept pconnect ring hello.sapp binding badcoll attach
|
||||||
|
|
||||||
all: $(PROGS)
|
all: $(PROGS)
|
||||||
|
|
||||||
@ -10,11 +10,6 @@ hello_output: hello_output.c
|
|||||||
hello_show_help: hello_show_help.c
|
hello_show_help: hello_show_help.c
|
||||||
$(CC) $(CFLAGS) $(CFLAGS_INTERNAL) $^ -o $@
|
$(CC) $(CFLAGS) $(CFLAGS_INTERNAL) $^ -o $@
|
||||||
|
|
||||||
hello.sapp: hello.c myhello.spec
|
|
||||||
$(CC) $(CFLAGS) $(CLAGS_INTERNAL) hello.c -o hello
|
|
||||||
singularity build myhello.spec
|
|
||||||
singularity install hello.sapp
|
|
||||||
|
|
||||||
CC = mpicc
|
CC = mpicc
|
||||||
CFLAGS = -g --openmpi:linkall
|
CFLAGS = -g --openmpi:linkall
|
||||||
CFLAGS_INTERNAL = -I../../.. -I../../../orte/include -I../../../opal/include
|
CFLAGS_INTERNAL = -I../../.. -I../../../orte/include -I../../../opal/include
|
||||||
|
30
orte/test/mpi/attach.c
Обычный файл
30
orte/test/mpi/attach.c
Обычный файл
@ -0,0 +1,30 @@
|
|||||||
|
/* -*- C -*-
|
||||||
|
*
|
||||||
|
* $HEADER$
|
||||||
|
*
|
||||||
|
* Test helper: writes a one-byte attach command to the debugger attach FIFO
|
||||||
|
*/
|
||||||
|
|
||||||
|
#include <stdio.h>
|
||||||
|
#include <stdlib.h>
|
||||||
|
#include <sys/types.h>
|
||||||
|
#include <sys/stat.h>
|
||||||
|
#include <fcntl.h>
|
||||||
|
#include <unistd.h>
|
||||||
|
|
||||||
|
/*
 * Write a single command byte (value 1) to the file named by argv[1].
 * Per the usage text, that file is the debugger attach FIFO.
 *
 * Returns 0 when the byte was written; exits/returns 1 on a usage error
 * or when the FIFO cannot be opened or written.
 */
int main(int argc, char* argv[])
{
    unsigned char fifo_cmd = 1;
    int fd;
    int rc = 0;

    /* argc counts the program name, so a FIFO path requires argc >= 2 */
    if (argc < 2) {
        fprintf(stderr, "usage: attach <full-path-to-debugger-fifo-file>\n");
        exit(1);
    }

    fd = open(argv[1], O_WRONLY);
    if (fd < 0) {
        perror("attach: open");
        return 1;
    }

    /* a failed or short write means the attach request was not delivered */
    if (write(fd, &fifo_cmd, sizeof(fifo_cmd)) != (ssize_t)sizeof(fifo_cmd)) {
        perror("attach: write");
        rc = 1;
    }

    close(fd);

    return rc;
}
|
@ -354,6 +354,8 @@ const char *orte_job_state_to_str(orte_job_state_t state)
|
|||||||
return "FAULT TOLERANCE RESTART";
|
return "FAULT TOLERANCE RESTART";
|
||||||
case ORTE_JOB_STATE_ANY:
|
case ORTE_JOB_STATE_ANY:
|
||||||
return "ANY";
|
return "ANY";
|
||||||
|
case ORTE_JOB_STATE_DEBUGGER_DETACH:
|
||||||
|
return "DEBUGGER DETACH";
|
||||||
default:
|
default:
|
||||||
return "UNKNOWN STATE!";
|
return "UNKNOWN STATE!";
|
||||||
}
|
}
|
||||||
|
Загрузка…
x
Ссылка в новой задаче
Block a user