Bring forward the debugger-related changes
Refs https://github.com/open-mpi/ompi/pull/2425
Signed-off-by: Ralph Castain <rhc@open-mpi.org>
Этот коммит содержится в:
родитель
a6d390fe7b
Коммит
d5fd635efe
1
.gitignore
поставляемый
1
.gitignore
поставляемый
@ -353,6 +353,7 @@ orte/mca/sstore/orte_sstore.7
|
||||
|
||||
orte/test/mpi/abort
|
||||
orte/test/mpi/accept
|
||||
orte/test/mpi/attach
|
||||
orte/test/mpi/bad_exit
|
||||
orte/test/mpi/bcast_loop
|
||||
orte/test/mpi/concurrent_spawn
|
||||
|
@ -1,7 +1,7 @@
|
||||
/*
|
||||
* Copyright (c) 2012-2013 Los Alamos National Security, LLC.
|
||||
* All rights reserved.
|
||||
* Copyright (c) 2013-2015 Intel, Inc. All rights reserved
|
||||
* Copyright (c) 2013-2016 Intel, Inc. All rights reserved.
|
||||
* Copyright (c) 2012-2014 The University of Tennessee and The University
|
||||
* of Tennessee Research Foundation. All rights
|
||||
* reserved.
|
||||
@ -133,6 +133,8 @@ void ompi_rte_wait_for_debugger(void)
|
||||
int debugger;
|
||||
opal_list_t *codes;
|
||||
opal_value_t *kv;
|
||||
char *evar;
|
||||
int time;
|
||||
|
||||
/* See lengthy comment in orte/tools/orterun/debuggers.c about
|
||||
orte_in_parallel_debugger */
|
||||
@ -152,6 +154,12 @@ void ompi_rte_wait_for_debugger(void)
|
||||
*/
|
||||
ompi_debugger_setup_dlls();
|
||||
|
||||
if (NULL != (evar = getenv("ORTE_TEST_DEBUGGER_SLEEP"))) {
|
||||
time = strtol(evar, NULL, 10);
|
||||
sleep(time);
|
||||
return;
|
||||
}
|
||||
|
||||
if (orte_standalone_operation) {
|
||||
/* spin until debugger attaches and releases us */
|
||||
while (MPIR_debug_gate == 0) {
|
||||
|
@ -120,6 +120,11 @@ void orte_oob_base_send_nb(int fd, short args, void *cbdata)
|
||||
* this is a local proc we just haven't heard from
|
||||
* yet due to a race condition. Check that situation */
|
||||
if (ORTE_PROC_IS_DAEMON || ORTE_PROC_IS_HNP) {
|
||||
++msg->retries;
|
||||
if (msg->retries < orte_rml_base.max_retries) {
|
||||
ORTE_OOB_SEND(msg);
|
||||
return;
|
||||
}
|
||||
ORTE_OOB_SEND(msg);
|
||||
return;
|
||||
}
|
||||
|
@ -839,8 +839,10 @@ void orte_plm_base_registered(int fd, short args, void *cbdata)
|
||||
}
|
||||
|
||||
cleanup:
|
||||
/* need to init_after_spawn for debuggers */
|
||||
ORTE_ACTIVATE_JOB_STATE(jdata, ORTE_JOB_STATE_READY_FOR_DEBUGGERS);
|
||||
/* if this wasn't a debugger job, then need to init_after_spawn for debuggers */
|
||||
if (!ORTE_FLAG_TEST(jdata, ORTE_JOB_FLAG_DEBUGGER_DAEMON)) {
|
||||
ORTE_ACTIVATE_JOB_STATE(jdata, ORTE_JOB_STATE_READY_FOR_DEBUGGERS);
|
||||
}
|
||||
|
||||
OBJ_RELEASE(caddy);
|
||||
}
|
||||
|
@ -102,6 +102,7 @@ typedef struct {
|
||||
opal_pointer_array_t conduits; /* array to hold the open conduits */
|
||||
opal_list_t posted_recvs;
|
||||
opal_list_t unmatched_msgs;
|
||||
int max_retries;
|
||||
#if OPAL_ENABLE_TIMING
|
||||
bool timing;
|
||||
#endif
|
||||
@ -116,6 +117,7 @@ typedef struct {
|
||||
orte_process_name_t origin;
|
||||
int status; // returned status on send
|
||||
orte_rml_tag_t tag; // targeted tag
|
||||
int retries; // #times we have tried to send it
|
||||
|
||||
/* user's send callback functions and data */
|
||||
union {
|
||||
|
@ -63,6 +63,14 @@ static bool selected = false;
|
||||
|
||||
static int orte_rml_base_register(mca_base_register_flag_t flags)
|
||||
{
|
||||
orte_rml_base.max_retries = 3;
|
||||
mca_base_var_register("orte", "rml", "base", "max_retries",
|
||||
"Max #times to retry sending a message",
|
||||
MCA_BASE_VAR_TYPE_INT, NULL, 0, 0,
|
||||
OPAL_INFO_LVL_9,
|
||||
MCA_BASE_VAR_SCOPE_READONLY,
|
||||
&orte_rml_base.max_retries);
|
||||
|
||||
#if OPAL_ENABLE_TIMING
|
||||
orte_rml_base.timing = false;
|
||||
(void) mca_base_var_register ("orte", "rml", "base", "timing",
|
||||
@ -240,6 +248,7 @@ void orte_rml_recv_callback(int status, orte_process_name_t* sender,
|
||||
/*** RML CLASS INSTANCES ***/
|
||||
static void send_cons(orte_rml_send_t *ptr)
|
||||
{
|
||||
ptr->retries = 0;
|
||||
ptr->cbdata = NULL;
|
||||
ptr->iov = NULL;
|
||||
ptr->buffer = NULL;
|
||||
|
@ -41,7 +41,8 @@ static int setup_fork(orte_job_t *jdata, orte_app_context_t *app)
|
||||
bool takeus = false;
|
||||
char *t2, *pth, *newenv;
|
||||
|
||||
if (NULL != orte_schizo_base.personalities) {
|
||||
if (NULL != orte_schizo_base.personalities &&
|
||||
NULL != jdata->personality) {
|
||||
/* see if we are included */
|
||||
for (i=0; NULL != jdata->personality[i]; i++) {
|
||||
if (0 == strcmp(jdata->personality[i], "singularity")) {
|
||||
@ -106,4 +107,3 @@ static int setup_fork(orte_job_t *jdata, orte_app_context_t *app)
|
||||
|
||||
return ORTE_SUCCESS;
|
||||
}
|
||||
|
||||
|
@ -896,6 +896,20 @@ int orte_submit_job(char *argv[], int *index,
|
||||
}
|
||||
}
|
||||
|
||||
/* check for debugger test envars and forward them if necessary */
|
||||
if (NULL != getenv("ORTE_TEST_DEBUGGER_ATTACH")) {
|
||||
char *evar;
|
||||
evar = getenv("ORTE_TEST_DEBUGGER_SLEEP");
|
||||
for (i=0; i < (int)jdata->num_apps; i++) {
|
||||
if (NULL != (app = (orte_app_context_t*)opal_pointer_array_get_item(jdata->apps, i))) {
|
||||
opal_setenv("ORTE_TEST_DEBUGGER_ATTACH", "1", true, &app->env);
|
||||
if (NULL != evar) {
|
||||
opal_setenv("ORTE_TEST_DEBUGGER_SLEEP", evar, true, &app->env);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/* check for suicide test directives */
|
||||
if (NULL != getenv("ORTE_TEST_HNP_SUICIDE") ||
|
||||
NULL != getenv("ORTE_TEST_ORTED_SUICIDE")) {
|
||||
@ -2149,8 +2163,9 @@ static void orte_debugger_init_before_spawn(orte_job_t *jdata)
|
||||
*/
|
||||
if (NULL != orte_debugger_test_daemon && !orte_debugger_test_attach) {
|
||||
opal_output_verbose(2, orte_debug_output,
|
||||
"%s No debugger test daemon specified",
|
||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME));
|
||||
"%s Debugger test daemon specified: %s",
|
||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
|
||||
orte_debugger_test_daemon);
|
||||
goto launchit;
|
||||
}
|
||||
/* if we were given an auto-detect rate, then we want to setup
|
||||
@ -2362,6 +2377,8 @@ static void setup_debugger_job(void)
|
||||
proc = OBJ_NEW(orte_proc_t);
|
||||
proc->name.jobid = debugger->jobid;
|
||||
proc->name.vpid = vpid++;
|
||||
/* point the proc at the local ORTE daemon as its parent */
|
||||
proc->parent = node->daemon->name.vpid;
|
||||
/* set the local/node ranks - we don't actually care
|
||||
* what these are, but the odls needs them
|
||||
*/
|
||||
@ -2741,7 +2758,7 @@ static int process(char *orig_line, char *basename, opal_cmd_line_t *cmd_line,
|
||||
static void open_fifo(void)
|
||||
{
|
||||
if (orte_debugger_attach_fd > 0) {
|
||||
close(orte_debugger_attach_fd);
|
||||
close(orte_debugger_attach_fd);
|
||||
}
|
||||
|
||||
orte_debugger_attach_fd = open(MPIR_attach_fifo, O_RDONLY | O_NONBLOCK, 0);
|
||||
@ -2760,10 +2777,16 @@ static void open_fifo(void)
|
||||
return;
|
||||
}
|
||||
|
||||
opal_output_verbose(2, orte_debug_output,
|
||||
"%s Monitoring debugger attach fifo %s",
|
||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
|
||||
MPIR_attach_fifo);
|
||||
if (orte_debugger_test_attach) {
|
||||
opal_output(0, "%s Monitoring debugger attach fifo %s",
|
||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
|
||||
MPIR_attach_fifo);
|
||||
} else {
|
||||
opal_output_verbose(2, orte_debug_output,
|
||||
"%s Monitoring debugger attach fifo %s",
|
||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
|
||||
MPIR_attach_fifo);
|
||||
}
|
||||
orte_debugger_attach = (opal_event_t*)malloc(sizeof(opal_event_t));
|
||||
opal_event_set(orte_event_base, orte_debugger_attach, orte_debugger_attach_fd,
|
||||
OPAL_EV_READ, attach_debugger, orte_debugger_attach);
|
||||
@ -3232,4 +3255,3 @@ void orte_profile_wakeup(int sd, short args, void *cbdata)
|
||||
/* abort the job */
|
||||
ORTE_ACTIVATE_JOB_STATE(NULL, ORTE_JOB_STATE_ALL_JOBS_COMPLETE);
|
||||
}
|
||||
|
||||
|
@ -1,4 +1,4 @@
|
||||
PROGS = mpi_no_op mpi_barrier hello hello_nodename abort multi_abort simple_spawn concurrent_spawn spawn_multiple mpi_spin delayed_abort loop_spawn loop_child bad_exit pubsub hello_barrier segv accept connect hello_output hello_show_help crisscross read_write ziatest slave reduce-hang ziaprobe ziatest bcast_loop parallel_w8 parallel_w64 parallel_r8 parallel_r64 sio sendrecv_blaster early_abort debugger singleton_client_server intercomm_create spawn_tree init-exit77 mpi_info info_spawn server client paccept pconnect ring hello.sapp binding badcoll iof
|
||||
PROGS = mpi_no_op mpi_barrier hello hello_nodename abort multi_abort simple_spawn concurrent_spawn spawn_multiple mpi_spin delayed_abort loop_spawn loop_child bad_exit pubsub hello_barrier segv accept connect hello_output hello_show_help crisscross read_write ziatest slave reduce-hang ziaprobe ziatest bcast_loop parallel_w8 parallel_w64 parallel_r8 parallel_r64 sio sendrecv_blaster early_abort debugger singleton_client_server intercomm_create spawn_tree init-exit77 mpi_info info_spawn server client paccept pconnect ring hello.sapp binding badcoll attach
|
||||
|
||||
all: $(PROGS)
|
||||
|
||||
@ -10,11 +10,6 @@ hello_output: hello_output.c
|
||||
hello_show_help: hello_show_help.c
|
||||
$(CC) $(CFLAGS) $(CFLAGS_INTERNAL) $^ -o $@
|
||||
|
||||
hello.sapp: hello.c myhello.spec
|
||||
$(CC) $(CFLAGS) $(CLAGS_INTERNAL) hello.c -o hello
|
||||
singularity build myhello.spec
|
||||
singularity install hello.sapp
|
||||
|
||||
CC = mpicc
|
||||
CFLAGS = -g --openmpi:linkall
|
||||
CFLAGS_INTERNAL = -I../../.. -I../../../orte/include -I../../../opal/include
|
||||
|
30
orte/test/mpi/attach.c
Обычный файл
30
orte/test/mpi/attach.c
Обычный файл
@ -0,0 +1,30 @@
|
||||
/* -*- C -*-
|
||||
*
|
||||
* $HEADER$
|
||||
*
|
||||
* Test helper: write a one-byte command to the debugger attach FIFO
* whose path is given as the first command-line argument
|
||||
*/
|
||||
|
||||
#include <stdio.h>
|
||||
#include <stdlib.h>
|
||||
#include <sys/types.h>
|
||||
#include <sys/stat.h>
|
||||
#include <fcntl.h>
|
||||
#include <unistd.h>
|
||||
|
||||
/**
 * Write a single command byte to the FIFO named on the command line.
 *
 * @param argc  argument count; must be at least 2
 * @param argv  argv[1] is the full path to the debugger FIFO file
 *
 * @return 0 on success; 1 on a usage error or if the FIFO cannot be
 *         opened, written to, or closed
 */
int main(int argc, char* argv[])
{
    unsigned char fifo_cmd = 1;
    ssize_t nwritten;
    int fd;

    /* argv[1] is dereferenced below, so at least two entries are required
     * (argc is never less than 1 in practice, so "1 > argc" guarded nothing) */
    if (argc < 2) {
        fprintf(stderr, "usage: attach <full-path-to-debugger-fifo-file>\n");
        exit(1);
    }

    fd = open(argv[1], O_WRONLY);
    if (fd < 0) {
        perror("attach: open");
        return 1;
    }

    /* send the one-byte command; anything other than a full write is a failure */
    nwritten = write(fd, &fifo_cmd, sizeof(fifo_cmd));
    if (nwritten != (ssize_t)sizeof(fifo_cmd)) {
        if (nwritten < 0) {
            perror("attach: write");
        } else {
            fprintf(stderr, "attach: short write to %s\n", argv[1]);
        }
        close(fd);
        return 1;
    }

    if (0 != close(fd)) {
        perror("attach: close");
        return 1;
    }

    return 0;
}
|
@ -354,6 +354,8 @@ const char *orte_job_state_to_str(orte_job_state_t state)
|
||||
return "FAULT TOLERANCE RESTART";
|
||||
case ORTE_JOB_STATE_ANY:
|
||||
return "ANY";
|
||||
case ORTE_JOB_STATE_DEBUGGER_DETACH:
|
||||
return "DEBUGGER DETACH";
|
||||
default:
|
||||
return "UNKNOWN STATE!";
|
||||
}
|
||||
|
Загрузка…
x
Ссылка в новой задаче
Block a user