Cleanup mpirx co-spawn of debugger daemons. Resolve block-until-both-ends-are-opened behavior for fifos. Add some debugging output. Make the fifo event be non-persistent and read the value in the fifo to avoid spinning on the event.
This commit was SVN r23922.
Этот коммит содержится в:
родитель
2c1a658232
Коммит
8a37b3071a
@ -72,6 +72,7 @@ static void attach_debugger(int fd, short event, void *arg);
|
|||||||
static void build_debugger_args(orte_app_context_t *debugger);
|
static void build_debugger_args(orte_app_context_t *debugger);
|
||||||
static opal_event_t attach;
|
static opal_event_t attach;
|
||||||
static int attach_fd;
|
static int attach_fd;
|
||||||
|
static bool fifo_active=false;
|
||||||
|
|
||||||
static int init(void)
|
static int init(void)
|
||||||
{
|
{
|
||||||
@ -84,6 +85,11 @@ static int init(void)
|
|||||||
*/
|
*/
|
||||||
void finalize(void)
|
void finalize(void)
|
||||||
{
|
{
|
||||||
|
if (fifo_active) {
|
||||||
|
opal_event_del(&attach);
|
||||||
|
close(attach_fd);
|
||||||
|
}
|
||||||
|
|
||||||
if (MPIR_proctable) {
|
if (MPIR_proctable) {
|
||||||
free(MPIR_proctable);
|
free(MPIR_proctable);
|
||||||
MPIR_proctable = NULL;
|
MPIR_proctable = NULL;
|
||||||
@ -109,12 +115,19 @@ void init_before_spawn(orte_job_t *jdata)
|
|||||||
* colaunch it
|
* colaunch it
|
||||||
*/
|
*/
|
||||||
if (NULL != orte_debugger_base.test_daemon) {
|
if (NULL != orte_debugger_base.test_daemon) {
|
||||||
|
opal_output_verbose(2, orte_debugger_base.output,
|
||||||
|
"%s No debugger test daemon specified",
|
||||||
|
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME));
|
||||||
goto launchit;
|
goto launchit;
|
||||||
}
|
}
|
||||||
/* if we were given an auto-detect rate, then we want to setup
|
/* if we were given an auto-detect rate, then we want to setup
|
||||||
* an event so we periodically do the check
|
* an event so we periodically do the check
|
||||||
*/
|
*/
|
||||||
if (0 < orte_debugger_mpirx_check_rate) {
|
if (0 < orte_debugger_mpirx_check_rate) {
|
||||||
|
opal_output_verbose(2, orte_debugger_base.output,
|
||||||
|
"%s Setting debugger attach check rate for %d seconds",
|
||||||
|
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
|
||||||
|
orte_debugger_mpirx_check_rate);
|
||||||
ORTE_TIMER_EVENT(orte_debugger_mpirx_check_rate, 0, attach_debugger);
|
ORTE_TIMER_EVENT(orte_debugger_mpirx_check_rate, 0, attach_debugger);
|
||||||
} else {
|
} else {
|
||||||
/* create the attachment FIFO and put it into MPIR, setup readevent */
|
/* create the attachment FIFO and put it into MPIR, setup readevent */
|
||||||
@ -122,23 +135,34 @@ void init_before_spawn(orte_job_t *jdata)
|
|||||||
/* create a FIFO name in the session dir */
|
/* create a FIFO name in the session dir */
|
||||||
attach_fifo = opal_os_path(false, orte_process_info.job_session_dir, "debugger_attach_fifo", NULL);
|
attach_fifo = opal_os_path(false, orte_process_info.job_session_dir, "debugger_attach_fifo", NULL);
|
||||||
if ((mkfifo(attach_fifo, FILE_MODE) < 0) && errno != EEXIST) {
|
if ((mkfifo(attach_fifo, FILE_MODE) < 0) && errno != EEXIST) {
|
||||||
opal_output(0, "CANNOT CREATE FIFO");
|
opal_output(0, "CANNOT CREATE FIFO %s: errno %d", attach_fifo, errno);
|
||||||
free(attach_fifo);
|
free(attach_fifo);
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
strncpy(MPIR_attach_fifo, attach_fifo, MPIR_MAX_PATH_LENGTH);
|
strncpy(MPIR_attach_fifo, attach_fifo, MPIR_MAX_PATH_LENGTH);
|
||||||
attach_fd = open(attach_fifo, O_RDONLY, 0);
|
attach_fd = open(attach_fifo, O_RDONLY | O_NONBLOCK, 0);
|
||||||
|
if (attach_fd < 0) {
|
||||||
|
opal_output(0, "%s unable to open debugger attach fifo",
|
||||||
|
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME));
|
||||||
free(attach_fifo);
|
free(attach_fifo);
|
||||||
opal_event_set(&attach, attach_fd, OPAL_EV_READ|OPAL_EV_PERSIST, attach_debugger, NULL);
|
return;
|
||||||
|
}
|
||||||
|
opal_output_verbose(2, orte_debugger_base.output,
|
||||||
|
"%s Monitoring debugger attach fifo %s",
|
||||||
|
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
|
||||||
|
attach_fifo);
|
||||||
|
free(attach_fifo);
|
||||||
|
fifo_active = true;
|
||||||
|
opal_event_set(&attach, attach_fd, OPAL_EV_READ, attach_debugger, NULL);
|
||||||
opal_event_add(&attach, 0);
|
opal_event_add(&attach, 0);
|
||||||
}
|
}
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
|
|
||||||
launchit:
|
launchit:
|
||||||
if (orte_debug_flag) {
|
opal_output_verbose(2, orte_debugger_base.output,
|
||||||
opal_output(0, "Info: Spawned by a debugger");
|
"%s: Spawned by a debugger",
|
||||||
}
|
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME));
|
||||||
|
|
||||||
/* tell the procs they are being debugged */
|
/* tell the procs they are being debugged */
|
||||||
env_name = mca_base_param_environ_variable("orte",
|
env_name = mca_base_param_environ_variable("orte",
|
||||||
@ -162,6 +186,11 @@ void init_before_spawn(orte_job_t *jdata)
|
|||||||
ORTE_UPDATE_EXIT_STATUS(ORTE_ERROR_DEFAULT_EXIT_CODE);
|
ORTE_UPDATE_EXIT_STATUS(ORTE_ERROR_DEFAULT_EXIT_CODE);
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
|
opal_output_verbose(2, orte_debugger_base.output,
|
||||||
|
"%s Cospawning debugger daemons %s",
|
||||||
|
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
|
||||||
|
(NULL == orte_debugger_base.test_daemon) ?
|
||||||
|
MPIR_executable_path : orte_debugger_base.test_daemon);
|
||||||
/* add debugger info to launch message */
|
/* add debugger info to launch message */
|
||||||
orte_debugger_daemon = OBJ_NEW(orte_job_t);
|
orte_debugger_daemon = OBJ_NEW(orte_job_t);
|
||||||
/* create a jobid for these daemons - this is done solely
|
/* create a jobid for these daemons - this is done solely
|
||||||
@ -219,6 +248,14 @@ static void attach_debugger(int fd, short event, void *arg)
|
|||||||
goto RELEASE;
|
goto RELEASE;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
if (fifo_active) {
|
||||||
|
fifo_active = false;
|
||||||
|
read(attach_fd, &rc, sizeof(rc));
|
||||||
|
if (1 != rc) {
|
||||||
|
/* ignore the cmd */
|
||||||
|
goto RELEASE;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
/* a debugger has attached! All the MPIR_Proctable
|
/* a debugger has attached! All the MPIR_Proctable
|
||||||
* data is already available, so we only need to
|
* data is already available, so we only need to
|
||||||
@ -232,6 +269,11 @@ static void attach_debugger(int fd, short event, void *arg)
|
|||||||
"-------------------------------------------\n");
|
"-------------------------------------------\n");
|
||||||
goto RELEASE;
|
goto RELEASE;
|
||||||
}
|
}
|
||||||
|
opal_output_verbose(2, orte_debugger_base.output,
|
||||||
|
"%s Spawning debugger daemons %s",
|
||||||
|
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
|
||||||
|
(NULL == orte_debugger_base.test_daemon) ?
|
||||||
|
MPIR_executable_path : orte_debugger_base.test_daemon);
|
||||||
/* this will be launched just like a regular job,
|
/* this will be launched just like a regular job,
|
||||||
* so we do not use the global orte_debugger_daemon
|
* so we do not use the global orte_debugger_daemon
|
||||||
* as this is reserved for co-location upon startup
|
* as this is reserved for co-location upon startup
|
||||||
@ -285,6 +327,7 @@ static void attach_debugger(int fd, short event, void *arg)
|
|||||||
now.tv_usec = 0;
|
now.tv_usec = 0;
|
||||||
opal_evtimer_add(check, &now);
|
opal_evtimer_add(check, &now);
|
||||||
} else {
|
} else {
|
||||||
|
fifo_active = true;
|
||||||
opal_event_add(&attach, 0);
|
opal_event_add(&attach, 0);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
Загрузка…
x
Ссылка в новой задаче
Block a user