* First take at a Ctrl-C handler for mpirun. This does not do everything
that we want, but it will do a reasonable job of cleaning up the job if SIGINT (Ctrl-C) or SIGTERM is received between the spawning of the processes and the death of all the processes. If you see strange errors out of mpirun, please let me know. I'm sure there are a couple of race conditions. I'm going to clean the code up tonight to try to reduce some of them. This commit was SVN r3817.
Этот коммит содержится в:
родитель
f6b8ac67ee
Коммит
1901882225
@ -474,6 +474,7 @@ internal_spawn_proc(mca_pcm_rsh_module_t *me,
|
||||
goto cleanup;
|
||||
} else if (pid == 0) {
|
||||
/* child */
|
||||
setpgid(0, 0);
|
||||
|
||||
if ((dup2(kidstdin[0], 0) < 0)) {
|
||||
/* BWB - XXX - FIX ME to use show help */
|
||||
@ -498,6 +499,7 @@ internal_spawn_proc(mca_pcm_rsh_module_t *me,
|
||||
} else {
|
||||
int comm_fd;
|
||||
|
||||
setpgid(pid, 0);
|
||||
/* parent */
|
||||
close(kidstdin[0]);
|
||||
|
||||
|
@ -50,6 +50,38 @@
|
||||
|
||||
extern char** environ;
|
||||
|
||||
struct ompi_event term_handler;
|
||||
struct ompi_event int_handler;
|
||||
struct ompi_event exit_handler;
|
||||
mca_ns_base_jobid_t new_jobid = MCA_NS_BASE_JOBID_MAX;
|
||||
|
||||
/*
 * Timer callback: report that shutdown did not finish cleanly and leave
 * the process with exit status 1.
 *
 * signal_callback() registers this function on the exit_handler timer
 * event.  None of the three arguments are consulted.
 */
static void
exit_callback(int fd, short event, void *arg)
{
    /* the event-callback signature is fixed; nothing here needs the args */
    (void) fd;
    (void) event;
    (void) arg;

    fputs("we failed to exit cleanly :(\n", stdout);
    exit(1);
}
|
||||
|
||||
static void
|
||||
signal_callback(int fd, short event, void *arg)
|
||||
{
|
||||
int ret;
|
||||
struct timeval tv;
|
||||
|
||||
if (new_jobid != MCA_NS_BASE_JOBID_MAX) {
|
||||
ret = ompi_rte_terminate_job(new_jobid, 0);
|
||||
if (OMPI_SUCCESS != ret) {
|
||||
new_jobid = MCA_NS_BASE_JOBID_MAX;
|
||||
}
|
||||
}
|
||||
|
||||
tv.tv_sec = 3;
|
||||
tv.tv_usec = 0;
|
||||
ompi_evtimer_set(&exit_handler, exit_callback, NULL);
|
||||
ompi_evtimer_add(&exit_handler, &tv);
|
||||
}
|
||||
|
||||
|
||||
|
||||
int
|
||||
main(int argc, char *argv[])
|
||||
@ -60,7 +92,6 @@ main(int argc, char *argv[])
|
||||
ompi_cmd_line_t *cmd_line = NULL;
|
||||
ompi_list_t *nodelist = NULL;
|
||||
ompi_list_t schedlist;
|
||||
mca_ns_base_jobid_t new_jobid;
|
||||
int num_procs = 1;
|
||||
ompi_rte_node_schedule_t *sched;
|
||||
char cwd[MAXPATHLEN];
|
||||
@ -71,6 +102,8 @@ main(int argc, char *argv[])
|
||||
ompi_rte_process_status_t *proc_status;
|
||||
ompi_list_t *status_list;
|
||||
ompi_registry_value_t *value;
|
||||
|
||||
|
||||
/*
|
||||
* Initialize our Open MPI environment
|
||||
*/
|
||||
@ -225,6 +258,12 @@ main(int argc, char *argv[])
|
||||
}
|
||||
|
||||
/***** PREP TO START THE APPLICATION *****/
|
||||
ompi_event_set(&term_handler, SIGTERM, OMPI_EV_SIGNAL,
|
||||
signal_callback, NULL);
|
||||
ompi_event_add(&term_handler, NULL);
|
||||
ompi_event_set(&int_handler, SIGINT, OMPI_EV_SIGNAL,
|
||||
signal_callback, NULL);
|
||||
ompi_event_add(&int_handler, NULL);
|
||||
|
||||
/* get the jobid for the application */
|
||||
new_jobid = ompi_name_server.create_jobid();
|
||||
@ -332,9 +371,10 @@ main(int argc, char *argv[])
|
||||
ompi_rte_job_startup(new_jobid);
|
||||
ompi_rte_monitor_procs_unregistered();
|
||||
}
|
||||
/*
|
||||
* - ompi_rte_kill_job()
|
||||
*/
|
||||
|
||||
/* remove signal handler */
|
||||
ompi_event_del(&term_handler);
|
||||
ompi_event_del(&int_handler);
|
||||
|
||||
/*
|
||||
* Determine if the processes all exited normally - if not, flag the output of mpirun
|
||||
@ -367,11 +407,12 @@ main(int argc, char *argv[])
|
||||
unlink(filenm);
|
||||
}
|
||||
|
||||
OBJ_DESTRUCT(&schedlist);
|
||||
|
||||
ompi_rte_finalize();
|
||||
mca_base_close();
|
||||
ompi_finalize();
|
||||
|
||||
OBJ_DESTRUCT(&schedlist);
|
||||
return ret;
|
||||
}
|
||||
|
||||
|
Загрузка…
x
Ссылка в новой задаче
Block a user