1
1

Bring forward the debugger-related changes

Refs https://github.com/open-mpi/ompi/pull/2425

Signed-off-by: Ralph Castain <rhc@open-mpi.org>
Этот коммит содержится в:
Ralph Castain 2016-11-29 13:15:20 -08:00
родитель a6d390fe7b
Коммит d5fd635efe
11 изменённых файлов: 95 добавлений и 19 удалений

1
.gitignore поставляемый
Просмотреть файл

@ -353,6 +353,7 @@ orte/mca/sstore/orte_sstore.7
orte/test/mpi/abort
orte/test/mpi/accept
orte/test/mpi/attach
orte/test/mpi/bad_exit
orte/test/mpi/bcast_loop
orte/test/mpi/concurrent_spawn

Просмотреть файл

@ -1,7 +1,7 @@
/*
* Copyright (c) 2012-2013 Los Alamos National Security, LLC.
* All rights reserved.
* Copyright (c) 2013-2015 Intel, Inc. All rights reserved
* Copyright (c) 2013-2016 Intel, Inc. All rights reserved.
* Copyright (c) 2012-2014 The University of Tennessee and The University
* of Tennessee Research Foundation. All rights
* reserved.
@ -133,6 +133,8 @@ void ompi_rte_wait_for_debugger(void)
int debugger;
opal_list_t *codes;
opal_value_t *kv;
char *evar;
int time;
/* See lengthy comment in orte/tools/orterun/debuggers.c about
orte_in_parallel_debugger */
@ -152,6 +154,12 @@ void ompi_rte_wait_for_debugger(void)
*/
ompi_debugger_setup_dlls();
if (NULL != (evar = getenv("ORTE_TEST_DEBUGGER_SLEEP"))) {
time = strtol(evar, NULL, 10);
sleep(time);
return;
}
if (orte_standalone_operation) {
/* spin until debugger attaches and releases us */
while (MPIR_debug_gate == 0) {

Просмотреть файл

@ -120,6 +120,11 @@ void orte_oob_base_send_nb(int fd, short args, void *cbdata)
* this is a local proc we just haven't heard from
* yet due to a race condition. Check that situation */
if (ORTE_PROC_IS_DAEMON || ORTE_PROC_IS_HNP) {
++msg->retries;
if (msg->retries < orte_rml_base.max_retries) {
ORTE_OOB_SEND(msg);
return;
}
ORTE_OOB_SEND(msg);
return;
}

Просмотреть файл

@ -839,8 +839,10 @@ void orte_plm_base_registered(int fd, short args, void *cbdata)
}
cleanup:
/* need to init_after_spawn for debuggers */
ORTE_ACTIVATE_JOB_STATE(jdata, ORTE_JOB_STATE_READY_FOR_DEBUGGERS);
/* if this wasn't a debugger job, then need to init_after_spawn for debuggers */
if (!ORTE_FLAG_TEST(jdata, ORTE_JOB_FLAG_DEBUGGER_DAEMON)) {
ORTE_ACTIVATE_JOB_STATE(jdata, ORTE_JOB_STATE_READY_FOR_DEBUGGERS);
}
OBJ_RELEASE(caddy);
}

Просмотреть файл

@ -102,6 +102,7 @@ typedef struct {
opal_pointer_array_t conduits; /* array to hold the open conduits */
opal_list_t posted_recvs;
opal_list_t unmatched_msgs;
int max_retries;
#if OPAL_ENABLE_TIMING
bool timing;
#endif
@ -116,6 +117,7 @@ typedef struct {
orte_process_name_t origin;
int status; // returned status on send
orte_rml_tag_t tag; // targeted tag
int retries; // #times we have tried to send it
/* user's send callback functions and data */
union {

Просмотреть файл

@ -63,6 +63,14 @@ static bool selected = false;
static int orte_rml_base_register(mca_base_register_flag_t flags)
{
orte_rml_base.max_retries = 3;
mca_base_var_register("orte", "rml", "base", "max_retries",
"Max #times to retry sending a message",
MCA_BASE_VAR_TYPE_INT, NULL, 0, 0,
OPAL_INFO_LVL_9,
MCA_BASE_VAR_SCOPE_READONLY,
&orte_rml_base.max_retries);
#if OPAL_ENABLE_TIMING
orte_rml_base.timing = false;
(void) mca_base_var_register ("orte", "rml", "base", "timing",
@ -240,6 +248,7 @@ void orte_rml_recv_callback(int status, orte_process_name_t* sender,
/*** RML CLASS INSTANCES ***/
static void send_cons(orte_rml_send_t *ptr)
{
ptr->retries = 0;
ptr->cbdata = NULL;
ptr->iov = NULL;
ptr->buffer = NULL;

Просмотреть файл

@ -41,7 +41,8 @@ static int setup_fork(orte_job_t *jdata, orte_app_context_t *app)
bool takeus = false;
char *t2, *pth, *newenv;
if (NULL != orte_schizo_base.personalities) {
if (NULL != orte_schizo_base.personalities &&
NULL != jdata->personality) {
/* see if we are included */
for (i=0; NULL != jdata->personality[i]; i++) {
if (0 == strcmp(jdata->personality[i], "singularity")) {
@ -106,4 +107,3 @@ static int setup_fork(orte_job_t *jdata, orte_app_context_t *app)
return ORTE_SUCCESS;
}

Просмотреть файл

@ -896,6 +896,20 @@ int orte_submit_job(char *argv[], int *index,
}
}
/* check for debugger test envars and forward them if necessary */
if (NULL != getenv("ORTE_TEST_DEBUGGER_ATTACH")) {
char *evar;
evar = getenv("ORTE_TEST_DEBUGGER_SLEEP");
for (i=0; i < (int)jdata->num_apps; i++) {
if (NULL != (app = (orte_app_context_t*)opal_pointer_array_get_item(jdata->apps, i))) {
opal_setenv("ORTE_TEST_DEBUGGER_ATTACH", "1", true, &app->env);
if (NULL != evar) {
opal_setenv("ORTE_TEST_DEBUGGER_SLEEP", evar, true, &app->env);
}
}
}
}
/* check for suicide test directives */
if (NULL != getenv("ORTE_TEST_HNP_SUICIDE") ||
NULL != getenv("ORTE_TEST_ORTED_SUICIDE")) {
@ -2149,8 +2163,9 @@ static void orte_debugger_init_before_spawn(orte_job_t *jdata)
*/
if (NULL != orte_debugger_test_daemon && !orte_debugger_test_attach) {
opal_output_verbose(2, orte_debug_output,
"%s No debugger test daemon specified",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME));
"%s Debugger test daemon specified: %s",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
orte_debugger_test_daemon);
goto launchit;
}
/* if we were given an auto-detect rate, then we want to setup
@ -2362,6 +2377,8 @@ static void setup_debugger_job(void)
proc = OBJ_NEW(orte_proc_t);
proc->name.jobid = debugger->jobid;
proc->name.vpid = vpid++;
/* point the proc at the local ORTE daemon as its parent */
proc->parent = node->daemon->name.vpid;
/* set the local/node ranks - we don't actually care
* what these are, but the odls needs them
*/
@ -2741,7 +2758,7 @@ static int process(char *orig_line, char *basename, opal_cmd_line_t *cmd_line,
static void open_fifo(void)
{
if (orte_debugger_attach_fd > 0) {
close(orte_debugger_attach_fd);
close(orte_debugger_attach_fd);
}
orte_debugger_attach_fd = open(MPIR_attach_fifo, O_RDONLY | O_NONBLOCK, 0);
@ -2760,10 +2777,16 @@ static void open_fifo(void)
return;
}
opal_output_verbose(2, orte_debug_output,
"%s Monitoring debugger attach fifo %s",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
MPIR_attach_fifo);
if (orte_debugger_test_attach) {
opal_output(0, "%s Monitoring debugger attach fifo %s",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
MPIR_attach_fifo);
} else {
opal_output_verbose(2, orte_debug_output,
"%s Monitoring debugger attach fifo %s",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
MPIR_attach_fifo);
}
orte_debugger_attach = (opal_event_t*)malloc(sizeof(opal_event_t));
opal_event_set(orte_event_base, orte_debugger_attach, orte_debugger_attach_fd,
OPAL_EV_READ, attach_debugger, orte_debugger_attach);
@ -3232,4 +3255,3 @@ void orte_profile_wakeup(int sd, short args, void *cbdata)
/* abort the job */
ORTE_ACTIVATE_JOB_STATE(NULL, ORTE_JOB_STATE_ALL_JOBS_COMPLETE);
}

Просмотреть файл

@ -1,4 +1,4 @@
PROGS = mpi_no_op mpi_barrier hello hello_nodename abort multi_abort simple_spawn concurrent_spawn spawn_multiple mpi_spin delayed_abort loop_spawn loop_child bad_exit pubsub hello_barrier segv accept connect hello_output hello_show_help crisscross read_write ziatest slave reduce-hang ziaprobe ziatest bcast_loop parallel_w8 parallel_w64 parallel_r8 parallel_r64 sio sendrecv_blaster early_abort debugger singleton_client_server intercomm_create spawn_tree init-exit77 mpi_info info_spawn server client paccept pconnect ring hello.sapp binding badcoll iof
PROGS = mpi_no_op mpi_barrier hello hello_nodename abort multi_abort simple_spawn concurrent_spawn spawn_multiple mpi_spin delayed_abort loop_spawn loop_child bad_exit pubsub hello_barrier segv accept connect hello_output hello_show_help crisscross read_write ziatest slave reduce-hang ziaprobe ziatest bcast_loop parallel_w8 parallel_w64 parallel_r8 parallel_r64 sio sendrecv_blaster early_abort debugger singleton_client_server intercomm_create spawn_tree init-exit77 mpi_info info_spawn server client paccept pconnect ring hello.sapp binding badcoll attach
all: $(PROGS)
@ -10,11 +10,6 @@ hello_output: hello_output.c
hello_show_help: hello_show_help.c
$(CC) $(CFLAGS) $(CFLAGS_INTERNAL) $^ -o $@
hello.sapp: hello.c myhello.spec
$(CC) $(CFLAGS) $(CLAGS_INTERNAL) hello.c -o hello
singularity build myhello.spec
singularity install hello.sapp
CC = mpicc
CFLAGS = -g --openmpi:linkall
CFLAGS_INTERNAL = -I../../.. -I../../../orte/include -I../../../opal/include

30
orte/test/mpi/attach.c Обычный файл
Просмотреть файл

@ -0,0 +1,30 @@
/* -*- C -*-
*
* $HEADER$
*
* Test helper: writes a single command byte to the debugger attach FIFO
*/
#include <stdio.h>
#include <stdlib.h>
#include <sys/types.h>
#include <sys/stat.h>
#include <fcntl.h>
#include <unistd.h>
/*
 * Write a single command byte (value 1) to the file named by argv[1].
 *
 * Per the usage message, argv[1] is expected to be the full path to the
 * debugger FIFO file.
 *
 * Exits with status 1 if the path argument is missing or the file cannot
 * be opened. Returns 1 if the byte cannot be written, 0 on success.
 */
int main(int argc, char* argv[])
{
    unsigned char fifo_cmd = 1;
    int fd;
    int rc = 0;

    /* argv[0] is the program name, so the path argument requires argc >= 2 */
    if (2 > argc) {
        fprintf(stderr, "usage: attach <full-path-to-debugger-fifo-file>\n");
        exit(1);
    }

    fd = open(argv[1], O_WRONLY);
    if (fd < 0) {
        perror("attach: open");
        exit(1);
    }

    /* a one-byte write either fully succeeds or fails; report the failure */
    if (write(fd, &fifo_cmd, sizeof(fifo_cmd)) != (ssize_t)sizeof(fifo_cmd)) {
        perror("attach: write");
        rc = 1;
    }

    close(fd);
    return rc;
}

Просмотреть файл

@ -354,6 +354,8 @@ const char *orte_job_state_to_str(orte_job_state_t state)
return "FAULT TOLERANCE RESTART";
case ORTE_JOB_STATE_ANY:
return "ANY";
case ORTE_JOB_STATE_DEBUGGER_DETACH:
return "DEBUGGER DETACH";
default:
return "UNKNOWN STATE!";
}