1
1

Modify the wakeup system to make it more Windows-friendly. This allows Shiqing to consolidate the Windows-specific modifications into one location, and generalizes the wakeup procedure in case we hit other system-specific requirements.

This needs some soak time to ensure we haven't opened any race conditions. I tried to loop everything in the shutdown procedure through that trigger event call to ensure it all goes through the one-time locks as it did before so that someone hitting ctrl-c when we are already shutting down shouldn't cause problems. Just want to let people use it for awhile to verify.

This commit was SVN r19159.
Этот коммит содержится в:
Ralph Castain 2008-08-05 15:09:29 +00:00
родитель 2bd9ddfc28
Коммит be02211b4f
26 изменённых файлов: 131 добавлений и 217 удалений

Просмотреть файл

@ -29,7 +29,7 @@
#include "orte/runtime/runtime.h" #include "orte/runtime/runtime.h"
#include "orte/runtime/orte_globals.h" #include "orte/runtime/orte_globals.h"
#include "orte/runtime/orte_wakeup.h" #include "orte/runtime/orte_wait.h"
#include "orte/runtime/orte_locks.h" #include "orte/runtime/orte_locks.h"
#include "orte/mca/plm/plm.h" #include "orte/mca/plm/plm.h"
#include "orte/util/session_dir.h" #include "orte/util/session_dir.h"
@ -101,9 +101,7 @@ void orte_errmgr_default_proc_aborted(orte_process_name_t *name, int exit_code)
ORTE_UPDATE_EXIT_STATUS(exit_code); ORTE_UPDATE_EXIT_STATUS(exit_code);
/* wakeup orterun so we can exit */ /* wakeup orterun so we can exit */
if (ORTE_SUCCESS != (rc = orte_wakeup())) { orte_trigger_event(&orte_exit);
ORTE_ERROR_LOG(rc);
}
} }
/* /*
@ -144,9 +142,7 @@ void orte_errmgr_default_incomplete_start(orte_jobid_t job, int exit_code)
ORTE_UPDATE_EXIT_STATUS(exit_code); ORTE_UPDATE_EXIT_STATUS(exit_code);
/* wakeup orterun so we can exit */ /* wakeup orterun so we can exit */
if (ORTE_SUCCESS != (rc = orte_wakeup())) { orte_trigger_event(&orte_exit);
ORTE_ERROR_LOG(rc);
}
} }
/* /*

Просмотреть файл

@ -47,7 +47,7 @@
#include "orte/mca/rml/rml.h" #include "orte/mca/rml/rml.h"
#include "orte/util/name_fns.h" #include "orte/util/name_fns.h"
#include "orte/runtime/orte_globals.h" #include "orte/runtime/orte_globals.h"
#include "orte/runtime/orte_wakeup.h" #include "orte/runtime/orte_wait.h"
#include "orte/mca/filem/filem.h" #include "orte/mca/filem/filem.h"
#include "orte/mca/filem/base/base.h" #include "orte/mca/filem/base/base.h"
@ -190,7 +190,7 @@ static void filem_base_process_get_proc_node_name_cmd(orte_process_name_t* sende
if (NULL == (jdata = orte_get_job_data_object(name.jobid))) { if (NULL == (jdata = orte_get_job_data_object(name.jobid))) {
ORTE_ERROR_LOG(ORTE_ERR_NOT_FOUND); ORTE_ERROR_LOG(ORTE_ERR_NOT_FOUND);
ORTE_UPDATE_EXIT_STATUS(1); ORTE_UPDATE_EXIT_STATUS(1);
orte_wakeup(); orte_trigger_event(&orte_exit);
goto CLEANUP; goto CLEANUP;
} }
/* get the proc object for it */ /* get the proc object for it */
@ -198,7 +198,7 @@ static void filem_base_process_get_proc_node_name_cmd(orte_process_name_t* sende
if (NULL == procs[name.vpid] || NULL == procs[name.vpid]->node) { if (NULL == procs[name.vpid] || NULL == procs[name.vpid]->node) {
ORTE_ERROR_LOG(ORTE_ERR_NOT_FOUND); ORTE_ERROR_LOG(ORTE_ERR_NOT_FOUND);
ORTE_UPDATE_EXIT_STATUS(1); ORTE_UPDATE_EXIT_STATUS(1);
orte_wakeup(); orte_trigger_event(&orte_exit);
goto CLEANUP; goto CLEANUP;
} }
@ -208,7 +208,7 @@ static void filem_base_process_get_proc_node_name_cmd(orte_process_name_t* sende
if (ORTE_SUCCESS != (rc = opal_dss.pack(&answer, &(procs[name.vpid]->node->name), 1, OPAL_STRING))) { if (ORTE_SUCCESS != (rc = opal_dss.pack(&answer, &(procs[name.vpid]->node->name), 1, OPAL_STRING))) {
ORTE_ERROR_LOG(rc); ORTE_ERROR_LOG(rc);
ORTE_UPDATE_EXIT_STATUS(1); ORTE_UPDATE_EXIT_STATUS(1);
orte_wakeup(); orte_trigger_event(&orte_exit);
goto CLEANUP; goto CLEANUP;
} }
@ -294,13 +294,13 @@ static void filem_base_process_get_remote_path_cmd(orte_process_name_t* sender,
if (ORTE_SUCCESS != (rc = opal_dss.pack(&answer, &tmp_name, 1, OPAL_STRING))) { if (ORTE_SUCCESS != (rc = opal_dss.pack(&answer, &tmp_name, 1, OPAL_STRING))) {
ORTE_ERROR_LOG(rc); ORTE_ERROR_LOG(rc);
ORTE_UPDATE_EXIT_STATUS(1); ORTE_UPDATE_EXIT_STATUS(1);
orte_wakeup(); orte_trigger_event(&orte_exit);
goto CLEANUP; goto CLEANUP;
} }
if (ORTE_SUCCESS != (rc = opal_dss.pack(&answer, &file_type, 1, OPAL_INT))) { if (ORTE_SUCCESS != (rc = opal_dss.pack(&answer, &file_type, 1, OPAL_INT))) {
ORTE_ERROR_LOG(rc); ORTE_ERROR_LOG(rc);
ORTE_UPDATE_EXIT_STATUS(1); ORTE_UPDATE_EXIT_STATUS(1);
orte_wakeup(); orte_trigger_event(&orte_exit);
goto CLEANUP; goto CLEANUP;
} }

Просмотреть файл

@ -61,7 +61,6 @@
#include "orte/runtime/orte_globals.h" #include "orte/runtime/orte_globals.h"
#include "orte/util/name_fns.h" #include "orte/util/name_fns.h"
#include "orte/runtime/runtime.h" #include "orte/runtime/runtime.h"
#include "orte/runtime/orte_wakeup.h"
#include "orte/runtime/orte_wait.h" #include "orte/runtime/orte_wait.h"
#include "orte/mca/errmgr/errmgr.h" #include "orte/mca/errmgr/errmgr.h"
#include "orte/mca/rmaps/rmaps.h" #include "orte/mca/rmaps/rmaps.h"

Просмотреть файл

@ -31,7 +31,6 @@
#include "orte/mca/errmgr/errmgr.h" #include "orte/mca/errmgr/errmgr.h"
#include "orte/runtime/orte_globals.h" #include "orte/runtime/orte_globals.h"
#include "orte/util/show_help.h" #include "orte/util/show_help.h"
#include "orte/runtime/orte_wakeup.h"
#include "orte/runtime/orte_wait.h" #include "orte/runtime/orte_wait.h"
#include "orte/util/name_fns.h" #include "orte/util/name_fns.h"

Просмотреть файл

@ -43,7 +43,6 @@
#if OPAL_ENABLE_FT == 1 #if OPAL_ENABLE_FT == 1
#include "orte/mca/snapc/snapc.h" #include "orte/mca/snapc/snapc.h"
#endif #endif
#include "orte/runtime/orte_wakeup.h"
#include "orte/runtime/orte_globals.h" #include "orte/runtime/orte_globals.h"
#include "orte/runtime/runtime.h" #include "orte/runtime/runtime.h"
#include "orte/runtime/orte_locks.h" #include "orte/runtime/orte_locks.h"
@ -299,7 +298,7 @@ void orte_plm_base_launch_failed(orte_jobid_t job, pid_t pid,
WAKEUP: WAKEUP:
/* set orterun's exit code and wakeup so it can exit */ /* set orterun's exit code and wakeup so it can exit */
ORTE_UPDATE_EXIT_STATUS(status); ORTE_UPDATE_EXIT_STATUS(status);
orte_wakeup(); orte_trigger_event(&orte_exit);
} }
static void process_orted_launch_report(int fd, short event, void *data) static void process_orted_launch_report(int fd, short event, void *data)
@ -1035,9 +1034,8 @@ CHECK_ALL_JOBS:
jdata = orte_get_job_data_object(ORTE_PROC_MY_NAME->jobid); jdata = orte_get_job_data_object(ORTE_PROC_MY_NAME->jobid);
if (jdata->num_terminated >= jdata->num_procs) { if (jdata->num_terminated >= jdata->num_procs) {
/* orteds are done! */ /* orteds are done! */
int data=1;
jdata->state = ORTE_JOB_STATE_TERMINATED; jdata->state = ORTE_JOB_STATE_TERMINATED;
write(orteds_exit, &data, sizeof(int)); orte_trigger_event(&orteds_exit);
return; return;
} }
} }
@ -1067,7 +1065,7 @@ CHECK_ALL_JOBS:
OPAL_OUTPUT_VERBOSE((5, orte_plm_globals.output, OPAL_OUTPUT_VERBOSE((5, orte_plm_globals.output,
"%s plm:base:check_job_completed all jobs terminated - waking up", "%s plm:base:check_job_completed all jobs terminated - waking up",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME))); ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
orte_wakeup(); orte_trigger_event(&orte_exit);
} }
} }

Просмотреть файл

@ -45,7 +45,6 @@
#include "orte/mca/routed/routed.h" #include "orte/mca/routed/routed.h"
#include "orte/util/name_fns.h" #include "orte/util/name_fns.h"
#include "orte/runtime/orte_globals.h" #include "orte/runtime/orte_globals.h"
#include "orte/runtime/orte_wakeup.h"
#include "orte/runtime/orte_wait.h" #include "orte/runtime/orte_wait.h"
#include "orte/mca/plm/plm_types.h" #include "orte/mca/plm/plm_types.h"
@ -279,7 +278,7 @@ void orte_plm_base_receive_process_msg(int fd, short event, void *data)
/* see if an error occurred - if so, wakeup so we can exit */ /* see if an error occurred - if so, wakeup so we can exit */
if (ORTE_SUCCESS != rc) { if (ORTE_SUCCESS != rc) {
orte_wakeup(); orte_trigger_event(&orte_exit);
} }
} }

Просмотреть файл

@ -64,7 +64,6 @@
#include "orte/util/show_help.h" #include "orte/util/show_help.h"
#include "orte/runtime/runtime.h" #include "orte/runtime/runtime.h"
#include "orte/runtime/orte_wakeup.h"
#include "orte/runtime/orte_wait.h" #include "orte/runtime/orte_wait.h"
#include "orte/mca/rml/rml.h" #include "orte/mca/rml/rml.h"
#include "orte/mca/errmgr/errmgr.h" #include "orte/mca/errmgr/errmgr.h"

Просмотреть файл

@ -73,7 +73,6 @@
#include "orte/util/name_fns.h" #include "orte/util/name_fns.h"
#include "orte/runtime/orte_wait.h" #include "orte/runtime/orte_wait.h"
#include "orte/runtime/orte_wakeup.h"
#include "opal/dss/dss.h" #include "opal/dss/dss.h"
#include "orte/mca/rml/rml.h" #include "orte/mca/rml/rml.h"

Просмотреть файл

@ -74,7 +74,6 @@
#include "orte/util/show_help.h" #include "orte/util/show_help.h"
#include "orte/util/session_dir.h" #include "orte/util/session_dir.h"
#include "orte/runtime/orte_wait.h" #include "orte/runtime/orte_wait.h"
#include "orte/runtime/orte_wakeup.h"
#include "orte/runtime/orte_globals.h" #include "orte/runtime/orte_globals.h"
#include "orte/util/name_fns.h" #include "orte/util/name_fns.h"
#include "orte/util/nidmap.h" #include "orte/util/nidmap.h"

Просмотреть файл

@ -61,7 +61,6 @@
#include "orte/util/name_fns.h" #include "orte/util/name_fns.h"
#include "orte/runtime/orte_globals.h" #include "orte/runtime/orte_globals.h"
#include "orte/runtime/runtime.h" #include "orte/runtime/runtime.h"
#include "orte/runtime/orte_wakeup.h"
#include "orte/runtime/orte_wait.h" #include "orte/runtime/orte_wait.h"
#include "orte/mca/errmgr/errmgr.h" #include "orte/mca/errmgr/errmgr.h"
#include "orte/mca/rmaps/rmaps.h" #include "orte/mca/rmaps/rmaps.h"

Просмотреть файл

@ -66,7 +66,6 @@
#include "orte/util/session_dir.h" #include "orte/util/session_dir.h"
#include "orte/runtime/orte_wait.h" #include "orte/runtime/orte_wait.h"
#include "orte/runtime/orte_wakeup.h"
#include "orte/runtime/params.h" #include "orte/runtime/params.h"
#include "opal/dss/dss.h" #include "opal/dss/dss.h"

Просмотреть файл

@ -65,7 +65,6 @@
#include "orte/util/name_fns.h" #include "orte/util/name_fns.h"
#include "orte/runtime/orte_globals.h" #include "orte/runtime/orte_globals.h"
#include "orte/runtime/orte_wait.h" #include "orte/runtime/orte_wait.h"
#include "orte/runtime/orte_wakeup.h"
#include "orte/mca/errmgr/errmgr.h" #include "orte/mca/errmgr/errmgr.h"
#include "orte/mca/rmaps/rmaps.h" #include "orte/mca/rmaps/rmaps.h"

Просмотреть файл

@ -65,7 +65,6 @@
#include "orte/util/name_fns.h" #include "orte/util/name_fns.h"
#include "orte/runtime/orte_globals.h" #include "orte/runtime/orte_globals.h"
#include "orte/runtime/orte_wait.h" #include "orte/runtime/orte_wait.h"
#include "orte/runtime/orte_wakeup.h"
#include "orte/mca/errmgr/errmgr.h" #include "orte/mca/errmgr/errmgr.h"
#include "orte/mca/rmaps/rmaps.h" #include "orte/mca/rmaps/rmaps.h"

Просмотреть файл

@ -29,7 +29,7 @@
#include "orte/mca/errmgr/errmgr.h" #include "orte/mca/errmgr/errmgr.h"
#include "orte/util/name_fns.h" #include "orte/util/name_fns.h"
#include "orte/runtime/orte_globals.h" #include "orte/runtime/orte_globals.h"
#include "orte/runtime/orte_wakeup.h" #include "orte/runtime/orte_wait.h"
#include "orte/util/hostfile/hostfile.h" #include "orte/util/hostfile/hostfile.h"
#include "orte/util/dash_host/dash_host.h" #include "orte/util/dash_host/dash_host.h"
#include "orte/util/proc_info.h" #include "orte/util/proc_info.h"
@ -118,7 +118,7 @@ int orte_ras_base_allocate(orte_job_t *jdata)
OBJ_DESTRUCT(&nodes); OBJ_DESTRUCT(&nodes);
orte_show_help("help-ras-base.txt", "ras-base:no-allocation", true); orte_show_help("help-ras-base.txt", "ras-base:no-allocation", true);
ORTE_UPDATE_EXIT_STATUS(ORTE_ERROR_DEFAULT_EXIT_CODE); ORTE_UPDATE_EXIT_STATUS(ORTE_ERROR_DEFAULT_EXIT_CODE);
orte_wakeup(); orte_trigger_event(&orte_exit);
return ORTE_ERROR; return ORTE_ERROR;
} }

Просмотреть файл

@ -74,7 +74,6 @@
#include "orte/runtime/runtime.h" #include "orte/runtime/runtime.h"
#include "orte/runtime/orte_globals.h" #include "orte/runtime/orte_globals.h"
#include "orte/runtime/orte_wait.h" #include "orte/runtime/orte_wait.h"
#include "orte/runtime/orte_wakeup.h"
#include "orte/orted/orted.h" #include "orte/orted/orted.h"
@ -640,7 +639,7 @@ static int process_commands(orte_process_name_t* sender,
* NOTE: this event will fire -after- any zero-time events * NOTE: this event will fire -after- any zero-time events
* so any pending relays -do- get sent first * so any pending relays -do- get sent first
*/ */
orte_wakeup(); orte_trigger_event(&orte_exit);
return ORTE_SUCCESS; return ORTE_SUCCESS;
break; break;
@ -654,7 +653,7 @@ static int process_commands(orte_process_name_t* sender,
* NOTE: this event will fire -after- any zero-time events * NOTE: this event will fire -after- any zero-time events
* so any pending relays -do- get sent first * so any pending relays -do- get sent first
*/ */
orte_wakeup(); orte_trigger_event(&orte_exit);
return ORTE_SUCCESS; return ORTE_SUCCESS;
break; break;

Просмотреть файл

@ -94,6 +94,7 @@ char *log_path = NULL;
static opal_event_t *orted_exit_event; static opal_event_t *orted_exit_event;
static void shutdown_callback(int fd, short flags, void *arg); static void shutdown_callback(int fd, short flags, void *arg);
static void shutdown_signal(int fd, short flags, void *arg);
static void signal_callback(int fd, short event, void *arg); static void signal_callback(int fd, short event, void *arg);
static void clean_fail(int fd, short flags, void *arg); static void clean_fail(int fd, short flags, void *arg);
@ -402,10 +403,10 @@ int orte_daemon(int argc, char *argv[])
* after ourselves. * after ourselves.
*/ */
opal_event_set(&term_handler, SIGTERM, OPAL_EV_SIGNAL, opal_event_set(&term_handler, SIGTERM, OPAL_EV_SIGNAL,
shutdown_callback, NULL); shutdown_signal, NULL);
opal_event_add(&term_handler, NULL); opal_event_add(&term_handler, NULL);
opal_event_set(&int_handler, SIGINT, OPAL_EV_SIGNAL, opal_event_set(&int_handler, SIGINT, OPAL_EV_SIGNAL,
shutdown_callback, NULL); shutdown_signal, NULL);
opal_event_add(&int_handler, NULL); opal_event_add(&int_handler, NULL);
#ifndef __WINDOWS__ #ifndef __WINDOWS__
@ -666,15 +667,19 @@ static void clean_fail(int fd, short flags, void *arg)
exit(ORTE_ERROR_DEFAULT_EXIT_CODE); exit(ORTE_ERROR_DEFAULT_EXIT_CODE);
} }
static void shutdown_signal(int fd, short flags, void *arg)
{
/* trigger the call to shutdown callback to protect
* against race conditions - the trigger event will
* check the one-time lock
*/
orte_trigger_event(&orte_exit);
}
static void shutdown_callback(int fd, short flags, void *arg) static void shutdown_callback(int fd, short flags, void *arg)
{ {
int ret; int ret;
/* protect against multiple calls */
if (!opal_atomic_trylock(&orted_exit_lock)) { /* returns 1 if already locked */
return;
}
if (NULL != arg) { if (NULL != arg) {
/* it's the singleton pipe... remove that handler */ /* it's the singleton pipe... remove that handler */
opal_event_del(&pipe_handler); opal_event_del(&pipe_handler);

Просмотреть файл

@ -36,7 +36,6 @@ if !ORTE_DISABLE_FULL_SUPPORT
headers += \ headers += \
runtime/runtime_internals.h \ runtime/runtime_internals.h \
runtime/orte_wait.h \ runtime/orte_wait.h \
runtime/orte_wakeup.h \
runtime/orte_globals.h \ runtime/orte_globals.h \
runtime/orte_globals_class_instances.h \ runtime/orte_globals_class_instances.h \
runtime/orte_cr.h \ runtime/orte_cr.h \
@ -53,7 +52,6 @@ libopen_rte_la_SOURCES += \
runtime/data_type_support/orte_dt_packing_fns.c \ runtime/data_type_support/orte_dt_packing_fns.c \
runtime/data_type_support/orte_dt_unpacking_fns.c \ runtime/data_type_support/orte_dt_unpacking_fns.c \
runtime/orte_wait.c \ runtime/orte_wait.c \
runtime/orte_wakeup.c \
runtime/orte_cr.c \ runtime/orte_cr.c \
runtime/orte_data_server.c runtime/orte_data_server.c
endif endif

Просмотреть файл

@ -68,7 +68,8 @@ bool orte_allocation_required;
char *orte_launch_agent; char *orte_launch_agent;
char **orted_cmd_line=NULL; char **orted_cmd_line=NULL;
int orte_exit, orteds_exit;
orte_trigger_event_t orte_exit, orteds_exit;
int orte_exit_status = 0; int orte_exit_status = 0;
bool orte_abnormal_term_ordered = false; bool orte_abnormal_term_ordered = false;
bool orte_shutdown_in_progress = false; bool orte_shutdown_in_progress = false;

Просмотреть файл

@ -42,6 +42,7 @@
#include "orte/mca/rmaps/rmaps_types.h" #include "orte/mca/rmaps/rmaps_types.h"
#include "orte/util/proc_info.h" #include "orte/util/proc_info.h"
#include "orte/runtime/runtime.h" #include "orte/runtime/runtime.h"
#include "orte/runtime/orte_wait.h"
#define ORTE_GLOBAL_ARRAY_BLOCK_SIZE 64 #define ORTE_GLOBAL_ARRAY_BLOCK_SIZE 64
#define ORTE_GLOBAL_ARRAY_MAX_SIZE INT_MAX #define ORTE_GLOBAL_ARRAY_MAX_SIZE INT_MAX
@ -51,6 +52,36 @@
BEGIN_C_DECLS BEGIN_C_DECLS
/**
* Define a macro for updating the orte_exit_status
* The macro provides a convenient way of doing this
* so that we can add thread locking at some point
* since the orte_exit_status is a global variable.
*
* Ensure that we do not overwrite the exit status if it has
* already been set to some non-zero value. If we don't make
* this check, then different parts of the code could overwrite
* each other's exit status in the case of abnormal termination.
*
* For example, if a process aborts, we would record the initial
* exit code from the aborted process. However, subsequent processes
* will have been aborted by signal as we kill the job. We don't want
* the subsequent processes to overwrite the original exit code so
* we can tell the user the exit code from the process that caused
* the whole thing to happen.
*/
#define ORTE_UPDATE_EXIT_STATUS(newstatus) \
do { \
if (0 == orte_exit_status && 0 != newstatus) { \
OPAL_OUTPUT_VERBOSE((1, orte_debug_output, \
"%s:%s(%d) updating exit status to %d", \
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), \
__FILE__, __LINE__, newstatus)); \
orte_exit_status = newstatus; \
} \
} while(0);
ORTE_DECLSPEC extern bool orte_help_want_aggregate; ORTE_DECLSPEC extern bool orte_help_want_aggregate;
ORTE_DECLSPEC extern char *orte_prohibited_session_dirs; ORTE_DECLSPEC extern char *orte_prohibited_session_dirs;
@ -353,7 +384,9 @@ ORTE_DECLSPEC extern bool orte_allocation_required;
ORTE_DECLSPEC extern char *orte_launch_agent; ORTE_DECLSPEC extern char *orte_launch_agent;
ORTE_DECLSPEC extern char **orted_cmd_line; ORTE_DECLSPEC extern char **orted_cmd_line;
ORTE_DECLSPEC extern int orte_exit, orteds_exit;
/* exit triggers and flags */
ORTE_DECLSPEC extern orte_trigger_event_t orte_exit, orteds_exit;
ORTE_DECLSPEC extern int orte_exit_status; ORTE_DECLSPEC extern int orte_exit_status;
ORTE_DECLSPEC extern bool orte_abnormal_term_ordered; ORTE_DECLSPEC extern bool orte_abnormal_term_ordered;
ORTE_DECLSPEC extern bool orte_shutdown_in_progress; ORTE_DECLSPEC extern bool orte_shutdown_in_progress;

Просмотреть файл

@ -29,7 +29,6 @@ opal_atomic_lock_t orte_finalize_lock;
opal_atomic_lock_t orted_exit_lock; opal_atomic_lock_t orted_exit_lock;
/* for HNPs */ /* for HNPs */
opal_atomic_lock_t orte_wakeup_lock;
opal_atomic_lock_t orte_job_complete_lock; opal_atomic_lock_t orte_job_complete_lock;
opal_atomic_lock_t orte_terminate_lock; opal_atomic_lock_t orte_terminate_lock;
opal_atomic_lock_t orte_abort_inprogress_lock; opal_atomic_lock_t orte_abort_inprogress_lock;
@ -44,7 +43,6 @@ int orte_locks_init(void)
opal_atomic_init(&orted_exit_lock, OPAL_ATOMIC_UNLOCKED); opal_atomic_init(&orted_exit_lock, OPAL_ATOMIC_UNLOCKED);
/* for HNPs */ /* for HNPs */
opal_atomic_init(&orte_wakeup_lock, OPAL_ATOMIC_UNLOCKED);
opal_atomic_init(&orte_job_complete_lock, OPAL_ATOMIC_UNLOCKED); opal_atomic_init(&orte_job_complete_lock, OPAL_ATOMIC_UNLOCKED);
opal_atomic_init(&orte_terminate_lock, OPAL_ATOMIC_UNLOCKED); opal_atomic_init(&orte_terminate_lock, OPAL_ATOMIC_UNLOCKED);
opal_atomic_init(&orte_abort_inprogress_lock, OPAL_ATOMIC_UNLOCKED); opal_atomic_init(&orte_abort_inprogress_lock, OPAL_ATOMIC_UNLOCKED);

Просмотреть файл

@ -38,7 +38,6 @@ ORTE_DECLSPEC extern opal_atomic_lock_t orte_finalize_lock;
ORTE_DECLSPEC extern opal_atomic_lock_t orted_exit_lock; ORTE_DECLSPEC extern opal_atomic_lock_t orted_exit_lock;
/* for HNPs */ /* for HNPs */
ORTE_DECLSPEC extern opal_atomic_lock_t orte_wakeup_lock;
ORTE_DECLSPEC extern opal_atomic_lock_t orte_job_complete_lock; ORTE_DECLSPEC extern opal_atomic_lock_t orte_job_complete_lock;
ORTE_DECLSPEC extern opal_atomic_lock_t orte_terminate_lock; ORTE_DECLSPEC extern opal_atomic_lock_t orte_terminate_lock;
ORTE_DECLSPEC extern opal_atomic_lock_t orte_abort_inprogress_lock; ORTE_DECLSPEC extern opal_atomic_lock_t orte_abort_inprogress_lock;

Просмотреть файл

@ -48,13 +48,14 @@
#endif #endif
#include "opal/dss/dss_types.h" #include "opal/dss/dss_types.h"
#include "orte/util/show_help.h"
#include "opal/class/opal_object.h" #include "opal/class/opal_object.h"
#include "opal/class/opal_list.h" #include "opal/class/opal_list.h"
#include "opal/event/event.h" #include "opal/event/event.h"
#include "opal/threads/mutex.h" #include "opal/threads/mutex.h"
#include "opal/threads/condition.h" #include "opal/threads/condition.h"
#include "opal/sys/atomic.h"
#include "orte/util/show_help.h"
#include "orte/mca/errmgr/errmgr.h" #include "orte/mca/errmgr/errmgr.h"
#include "orte/util/name_fns.h" #include "orte/util/name_fns.h"
#include "orte/runtime/orte_globals.h" #include "orte/runtime/orte_globals.h"
@ -95,6 +96,24 @@ OBJ_CLASS_INSTANCE(orte_message_event_t,
message_event_constructor, message_event_constructor,
message_event_destructor); message_event_destructor);
static void trigger_event_destructor(orte_trigger_event_t *trig)
{
if (0 <= trig->channel) {
close(trig->channel);
}
}
static void trigger_event_constructor(orte_trigger_event_t *trig)
{
trig->channel = -1;
opal_atomic_init(&trig->lock, OPAL_ATOMIC_UNLOCKED);
}
OBJ_CLASS_INSTANCE(orte_trigger_event_t,
opal_object_t,
trigger_event_constructor,
trigger_event_destructor);
#ifdef HAVE_WAITPID #ifdef HAVE_WAITPID
static volatile int cb_enabled = true; static volatile int cb_enabled = true;
@ -458,7 +477,7 @@ orte_wait_cb_enable()
} }
int orte_wait_event(opal_event_t **event, int *trig, int orte_wait_event(opal_event_t **event, orte_trigger_event_t *trig,
void (*cbfunc)(int, short, void*)) void (*cbfunc)(int, short, void*))
{ {
int p[2]; int p[2];
@ -471,8 +490,11 @@ int orte_wait_event(opal_event_t **event, int *trig,
/* create the event */ /* create the event */
*event = (opal_event_t*)malloc(sizeof(opal_event_t)); *event = (opal_event_t*)malloc(sizeof(opal_event_t));
/* setup the trigger and its associated lock */
OBJ_CONSTRUCT(trig, orte_trigger_event_t);
/* pass back the write end of the pipe */ /* pass back the write end of the pipe */
*trig = p[1]; trig->channel = p[1];
/* define the event to fire when someone writes to the pipe */ /* define the event to fire when someone writes to the pipe */
opal_event_set(*event, p[0], OPAL_EV_READ, cbfunc, NULL); opal_event_set(*event, p[0], OPAL_EV_READ, cbfunc, NULL);
@ -485,11 +507,15 @@ int orte_wait_event(opal_event_t **event, int *trig,
} }
void orte_trigger_event(int trig) void orte_trigger_event(orte_trigger_event_t *trig)
{ {
int data=1; int data=1;
write(trig, &data, sizeof(int)); if (!opal_atomic_trylock(&trig->lock)) { /* returns 1 if already locked */
return;
}
write(trig->channel, &data, sizeof(int));
opal_progress(); opal_progress();
} }
@ -792,11 +818,15 @@ orte_wait_finalize(void)
return ORTE_SUCCESS; return ORTE_SUCCESS;
} }
void orte_trigger_event(int trig) void orte_trigger_event(orte_trigger_event_t *trig)
{ {
int data=1; int data=1;
write(trig, &data, sizeof(int)); if (!opal_atomic_trylock(&trig->lock)) { /* returns 1 if already locked */
return;
}
write(trig->channel, &data, sizeof(int));
opal_progress(); opal_progress();
} }
@ -1100,8 +1130,7 @@ orte_wait_cb_enable(void)
return ORTE_ERR_NOT_SUPPORTED; return ORTE_ERR_NOT_SUPPORTED;
} }
void void orte_trigger_event(orte_trigger_event_t *trig)
orte_trigger_event(int trig)
{ {
} }

Просмотреть файл

@ -38,15 +38,22 @@
#include "opal/dss/dss.h" #include "opal/dss/dss.h"
#include "opal/class/opal_list.h" #include "opal/class/opal_list.h"
#include "orte/util/show_help.h" #include "opal/sys/atomic.h"
#include "opal/event/event.h" #include "opal/event/event.h"
#include "opal/runtime/opal_progress.h" #include "opal/runtime/opal_progress.h"
#include "orte/util/show_help.h"
#include "orte/mca/rml/rml_types.h" #include "orte/mca/rml/rml_types.h"
#include "orte/runtime/orte_globals.h"
BEGIN_C_DECLS BEGIN_C_DECLS
typedef struct {
opal_object_t super;
int channel;
opal_atomic_lock_t lock;
} orte_trigger_event_t;
ORTE_DECLSPEC OBJ_CLASS_DECLARATION(orte_trigger_event_t);
/** typedef for callback function used in \c ompi_rte_wait_cb */ /** typedef for callback function used in \c ompi_rte_wait_cb */
typedef void (*orte_wait_fn_t)(pid_t wpid, int status, void *data); typedef void (*orte_wait_fn_t)(pid_t wpid, int status, void *data);
@ -99,17 +106,18 @@ ORTE_DECLSPEC int orte_wait_cb_enable(void);
/** /**
* Setup to wait for an event * Setup to wait for an event
* *
* This function is used to setup a pipe that can be used elsewhere * This function is used to setup a trigger event that can be used elsewhere
* in the code base where we want to wait for some event to * in the code base where we want to wait for some event to
* happen. For example, orterun uses this function to setup an event * happen. For example, orterun uses this function to setup an event
* that is used to notify orterun of abnormal and normal termination * that is used to notify orterun of abnormal and normal termination
* so it can wakeup and exit cleanly. * so it can wakeup and exit cleanly.
* *
* The event will be defined so that a write to the provided trigger * The event will be defined so that firing the provided trigger
* pipe will cause the event to trigger and callback to the provided * will cause the event to trigger and callback to the provided
* function * function
*/ */
ORTE_DECLSPEC int orte_wait_event(opal_event_t **event, int *trig, ORTE_DECLSPEC int orte_wait_event(opal_event_t **event,
orte_trigger_event_t *trig,
void (*cbfunc)(int, short, void*)); void (*cbfunc)(int, short, void*));
/** /**
@ -136,9 +144,9 @@ ORTE_DECLSPEC int orte_wait_event(opal_event_t **event, int *trig,
* Trigger a defined event * Trigger a defined event
* *
* This function will trigger a previously-defined event - as setup * This function will trigger a previously-defined event - as setup
* by orte_wait_event - by sending a message to the provided pipe * by orte_wait_event - by firing the provided trigger
*/ */
ORTE_DECLSPEC void orte_trigger_event(int trig); ORTE_DECLSPEC void orte_trigger_event(orte_trigger_event_t *trig);
/** /**
* Setup an event to process a message * Setup an event to process a message

Просмотреть файл

@ -1,49 +0,0 @@
/*
* Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
* University Research and Technology
* Corporation. All rights reserved.
* Copyright (c) 2004-2005 The University of Tennessee and The University
* of Tennessee Research Foundation. All rights
* reserved.
* Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
* University of Stuttgart. All rights reserved.
* Copyright (c) 2004-2005 The Regents of the University of California.
* All rights reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
*
* $HEADER$
*
*/
#include "orte_config.h"
#include "orte/constants.h"
#ifdef HAVE_UNISTD_H
#include <unistd.h>
#endif
#include "orte/util/show_help.h"
#include "opal/threads/condition.h"
#include "opal/runtime/opal_progress.h"
#include "orte/runtime/orte_globals.h"
#include "orte/util/name_fns.h"
#include "orte/runtime/orte_wait.h"
#include "orte/runtime/orte_locks.h"
#include "orte/runtime/orte_wakeup.h"
int orte_wakeup(void)
{
/* set the exit status and trigger the
* exit procedure
*/
if (!opal_atomic_trylock(&orte_wakeup_lock)) { /* returns 1 if already locked */
return ORTE_SUCCESS;
}
orte_trigger_event(orte_exit);
return ORTE_SUCCESS;
}

Просмотреть файл

@ -1,74 +0,0 @@
/*
* Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
* University Research and Technology
* Corporation. All rights reserved.
* Copyright (c) 2004-2006 The University of Tennessee and The University
* of Tennessee Research Foundation. All rights
* reserved.
* Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
* University of Stuttgart. All rights reserved.
* Copyright (c) 2004-2005 The Regents of the University of California.
* All rights reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
*
* $HEADER$
*/
/**
* @file
*
* Interface for forcibly waking up orterun.
*/
#ifndef ORTE_WAKEUP_H
#define ORTE_WAKEUP_H
#include "orte_config.h"
#include "orte/types.h"
#include "orte/util/show_help.h"
#include "orte/runtime/orte_globals.h"
#include "orte/util/name_fns.h"
BEGIN_C_DECLS
/**
* Define a macro for updating the orte_exit_status
* The macro provides a convenient way of doing this
* so that we can add thread locking at some point
* since the orte_exit_status is a global variable.
*
* Ensure that we do not overwrite the exit status if it has
* already been set to some non-zero value. If we don't make
* this check, then different parts of the code could overwrite
* each other's exit status in the case of abnormal termination.
*
* For example, if a process aborts, we would record the initial
* exit code from the aborted process. However, subsequent processes
* will have been aborted by signal as we kill the job. We don't want
* the subsequent processes to overwrite the original exit code so
* we can tell the user the exit code from the process that caused
* the whole thing to happen.
*/
#define ORTE_UPDATE_EXIT_STATUS(newstatus) \
do { \
if (0 == orte_exit_status && 0 != newstatus) { \
OPAL_OUTPUT_VERBOSE((1, orte_debug_output, \
"%s:%s(%d) updating exit status to %d", \
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), \
__FILE__, __LINE__, newstatus)); \
orte_exit_status = newstatus; \
} \
} while(0);
/**
* Wakeup orterun by reporting the termination of all processes
*/
ORTE_DECLSPEC int orte_wakeup(void);
END_C_DECLS
#endif /* #ifndef ORTE_WAKEUP_H */

Просмотреть файл

@ -82,7 +82,6 @@
#include "orte/runtime/runtime.h" #include "orte/runtime/runtime.h"
#include "orte/runtime/orte_globals.h" #include "orte/runtime/orte_globals.h"
#include "orte/runtime/orte_wait.h" #include "orte/runtime/orte_wait.h"
#include "orte/runtime/orte_wakeup.h"
#include "orte/runtime/orte_data_server.h" #include "orte/runtime/orte_data_server.h"
#include "orte/runtime/orte_locks.h" #include "orte/runtime/orte_locks.h"
@ -317,6 +316,7 @@ static opal_cmd_line_init_t cmd_line_init[] = {
*/ */
static void job_completed(int trigpipe, short event, void *arg); static void job_completed(int trigpipe, short event, void *arg);
static void terminated(int trigpipe, short event, void *arg); static void terminated(int trigpipe, short event, void *arg);
static void timeout_callback(int fd, short ign, void *arg);
static void abort_signal_callback(int fd, short flags, void *arg); static void abort_signal_callback(int fd, short flags, void *arg);
static void abort_exit_callback(int fd, short event, void *arg); static void abort_exit_callback(int fd, short event, void *arg);
static void signal_forward_callback(int fd, short event, void *arg); static void signal_forward_callback(int fd, short event, void *arg);
@ -625,21 +625,14 @@ static void job_completed(int trigpipe, short event, void *arg)
orte_job_state_t exit_state; orte_job_state_t exit_state;
orte_job_t *daemons; orte_job_t *daemons;
/* flag that we are here to avoid doing it twice */
if (!opal_atomic_trylock(&orte_job_complete_lock)) { /* returns 1 if already locked */
return;
}
/* if the abort exit event is set, delete it */ /* if the abort exit event is set, delete it */
if (NULL != abort_exit_event) { if (NULL != abort_exit_event) {
opal_evtimer_del(abort_exit_event); opal_evtimer_del(abort_exit_event);
free(abort_exit_event); free(abort_exit_event);
} }
/* close the trigger pipe */ /* cleanup the trigger */
if (0 <= trigpipe) { OBJ_DESTRUCT(&orte_exit);
close(trigpipe);
}
exit_state = jdata->state; exit_state = jdata->state;
@ -690,7 +683,7 @@ static void job_completed(int trigpipe, short event, void *arg)
} }
ORTE_DETECT_TIMEOUT(&timeout_ev, daemons->num_procs, ORTE_DETECT_TIMEOUT(&timeout_ev, daemons->num_procs,
orte_timeout_usec_per_proc, orte_timeout_usec_per_proc,
orte_max_timeout, terminated); orte_max_timeout, timeout_callback);
} }
/* now wait to hear it has been done */ /* now wait to hear it has been done */
@ -718,15 +711,8 @@ static void terminated(int trigpipe, short event, void *arg)
orte_proc_t **procs; orte_proc_t **procs;
orte_vpid_t i; orte_vpid_t i;
/* flag that we are here to avoid doing it twice */ /* cleanup the trigger */
if (!opal_atomic_trylock(&orte_terminate_lock)) { /* returns 1 if already locked */ OBJ_DESTRUCT(&orteds_exit);
return;
}
/* close the trigger pipe so it cannot be called again */
if (0 <= trigpipe) {
close(trigpipe);
}
/* clear the event timer */ /* clear the event timer */
if (NULL != timeout_ev) { if (NULL != timeout_ev) {
@ -948,10 +934,10 @@ static void dump_aborted_procs(void)
static void timeout_callback(int fd, short ign, void *arg) static void timeout_callback(int fd, short ign, void *arg)
{ {
/* just call terminated so we don't loop back into /* fire the trigger that takes us to terminated so we don't
* trying to kill things * loop back into trying to kill things
*/ */
terminated(-1, 0, NULL); orte_trigger_event(&orteds_exit);
} }
static void abort_exit_callback(int fd, short ign, void *arg) static void abort_exit_callback(int fd, short ign, void *arg)
@ -984,16 +970,13 @@ static void abort_exit_callback(int fd, short ign, void *arg)
* need to explicitly wake ourselves up to exit * need to explicitly wake ourselves up to exit
*/ */
ORTE_UPDATE_EXIT_STATUS(ret); ORTE_UPDATE_EXIT_STATUS(ret);
orte_wakeup(); orte_trigger_event(&orte_exit);
} }
/* give ourselves a time limit on how long to wait /* give ourselves a time limit on how long to wait
* for the job to die, just in case we can't make it go * for the job to die, just in case we can't make it go
* away for some reason. Don't send us directly back * away for some reason. Don't send us directly back
* to job_completed, though, as that function expects * to job_completed, though, as that function may be
* to be triggered via orte_wakeup - we could get into * what has failed
* race conditions, and the timeout won't provide
* that function with the orte_exit pipe fd so it can
* be closed
*/ */
ORTE_DETECT_TIMEOUT(&abort_exit_event, jdata->num_procs, ORTE_DETECT_TIMEOUT(&abort_exit_event, jdata->num_procs,
orte_timeout_usec_per_proc, orte_timeout_usec_per_proc,