* First take at a Ctrl-C handler for mpirun. This does not do everything
that we want, but will do a reasonable job of cleaning up the job if SIGINT (Ctrl-C) or SIGTERM is received between the spawning of the processes and the death of all the processes. If you see strange errors out of mpirun, please let me know. I'm sure there are a couple of race conditions. I'm going to clean the code up tonight to try to reduce some of them. This commit was SVN r3817.
Этот коммит содержится в:
родитель
f6b8ac67ee
Коммит
1901882225
@ -474,6 +474,7 @@ internal_spawn_proc(mca_pcm_rsh_module_t *me,
|
|||||||
goto cleanup;
|
goto cleanup;
|
||||||
} else if (pid == 0) {
|
} else if (pid == 0) {
|
||||||
/* child */
|
/* child */
|
||||||
|
setpgid(0, 0);
|
||||||
|
|
||||||
if ((dup2(kidstdin[0], 0) < 0)) {
|
if ((dup2(kidstdin[0], 0) < 0)) {
|
||||||
/* BWB - XXX - FIX ME to use show help */
|
/* BWB - XXX - FIX ME to use show help */
|
||||||
@ -498,6 +499,7 @@ internal_spawn_proc(mca_pcm_rsh_module_t *me,
|
|||||||
} else {
|
} else {
|
||||||
int comm_fd;
|
int comm_fd;
|
||||||
|
|
||||||
|
setpgid(pid, 0);
|
||||||
/* parent */
|
/* parent */
|
||||||
close(kidstdin[0]);
|
close(kidstdin[0]);
|
||||||
|
|
||||||
|
@ -50,6 +50,38 @@
|
|||||||
|
|
||||||
extern char** environ;
|
extern char** environ;
|
||||||
|
|
||||||
|
struct ompi_event term_handler;
|
||||||
|
struct ompi_event int_handler;
|
||||||
|
struct ompi_event exit_handler;
|
||||||
|
mca_ns_base_jobid_t new_jobid = MCA_NS_BASE_JOBID_MAX;
|
||||||
|
|
||||||
|
static void
|
||||||
|
exit_callback(int fd, short event, void *arg)
|
||||||
|
{
|
||||||
|
printf("we failed to exit cleanly :(\n");
|
||||||
|
exit(1);
|
||||||
|
}
|
||||||
|
|
||||||
|
static void
|
||||||
|
signal_callback(int fd, short event, void *arg)
|
||||||
|
{
|
||||||
|
int ret;
|
||||||
|
struct timeval tv;
|
||||||
|
|
||||||
|
if (new_jobid != MCA_NS_BASE_JOBID_MAX) {
|
||||||
|
ret = ompi_rte_terminate_job(new_jobid, 0);
|
||||||
|
if (OMPI_SUCCESS != ret) {
|
||||||
|
new_jobid = MCA_NS_BASE_JOBID_MAX;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
tv.tv_sec = 3;
|
||||||
|
tv.tv_usec = 0;
|
||||||
|
ompi_evtimer_set(&exit_handler, exit_callback, NULL);
|
||||||
|
ompi_evtimer_add(&exit_handler, &tv);
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
int
|
int
|
||||||
main(int argc, char *argv[])
|
main(int argc, char *argv[])
|
||||||
@ -60,7 +92,6 @@ main(int argc, char *argv[])
|
|||||||
ompi_cmd_line_t *cmd_line = NULL;
|
ompi_cmd_line_t *cmd_line = NULL;
|
||||||
ompi_list_t *nodelist = NULL;
|
ompi_list_t *nodelist = NULL;
|
||||||
ompi_list_t schedlist;
|
ompi_list_t schedlist;
|
||||||
mca_ns_base_jobid_t new_jobid;
|
|
||||||
int num_procs = 1;
|
int num_procs = 1;
|
||||||
ompi_rte_node_schedule_t *sched;
|
ompi_rte_node_schedule_t *sched;
|
||||||
char cwd[MAXPATHLEN];
|
char cwd[MAXPATHLEN];
|
||||||
@ -71,6 +102,8 @@ main(int argc, char *argv[])
|
|||||||
ompi_rte_process_status_t *proc_status;
|
ompi_rte_process_status_t *proc_status;
|
||||||
ompi_list_t *status_list;
|
ompi_list_t *status_list;
|
||||||
ompi_registry_value_t *value;
|
ompi_registry_value_t *value;
|
||||||
|
|
||||||
|
|
||||||
/*
|
/*
|
||||||
* Intialize our Open MPI environment
|
* Intialize our Open MPI environment
|
||||||
*/
|
*/
|
||||||
@ -225,6 +258,12 @@ main(int argc, char *argv[])
|
|||||||
}
|
}
|
||||||
|
|
||||||
/***** PREP TO START THE APPLICATION *****/
|
/***** PREP TO START THE APPLICATION *****/
|
||||||
|
ompi_event_set(&term_handler, SIGTERM, OMPI_EV_SIGNAL,
|
||||||
|
signal_callback, NULL);
|
||||||
|
ompi_event_add(&term_handler, NULL);
|
||||||
|
ompi_event_set(&int_handler, SIGINT, OMPI_EV_SIGNAL,
|
||||||
|
signal_callback, NULL);
|
||||||
|
ompi_event_add(&int_handler, NULL);
|
||||||
|
|
||||||
/* get the jobid for the application */
|
/* get the jobid for the application */
|
||||||
new_jobid = ompi_name_server.create_jobid();
|
new_jobid = ompi_name_server.create_jobid();
|
||||||
@ -332,9 +371,10 @@ main(int argc, char *argv[])
|
|||||||
ompi_rte_job_startup(new_jobid);
|
ompi_rte_job_startup(new_jobid);
|
||||||
ompi_rte_monitor_procs_unregistered();
|
ompi_rte_monitor_procs_unregistered();
|
||||||
}
|
}
|
||||||
/*
|
|
||||||
* - ompi_rte_kill_job()
|
/* remove signal handler */
|
||||||
*/
|
ompi_event_del(&term_handler);
|
||||||
|
ompi_event_del(&int_handler);
|
||||||
|
|
||||||
/*
|
/*
|
||||||
* Determine if the processes all exited normally - if not, flag the output of mpirun
|
* Determine if the processes all exited normally - if not, flag the output of mpirun
|
||||||
@ -367,11 +407,12 @@ main(int argc, char *argv[])
|
|||||||
unlink(filenm);
|
unlink(filenm);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
OBJ_DESTRUCT(&schedlist);
|
||||||
|
|
||||||
ompi_rte_finalize();
|
ompi_rte_finalize();
|
||||||
mca_base_close();
|
mca_base_close();
|
||||||
ompi_finalize();
|
ompi_finalize();
|
||||||
|
|
||||||
OBJ_DESTRUCT(&schedlist);
|
|
||||||
return ret;
|
return ret;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
Загрузка…
x
Ссылка в новой задаче
Block a user