1
1

Fix a C/R bug where the restart hung due to

dangling fds in the openib btl.

This commit was SVN r26094.
Этот коммит содержится в:
Abhishek Kulkarni 2012-03-04 06:57:33 +00:00
родитель c3cf46af65
Коммит 08ca0f80bc

Просмотреть файл

@ -682,13 +682,24 @@ int ompi_btl_openib_fd_finalize(void)
/* For the threaded version, send a command down the pipe */
cmd_t cmd;
OPAL_OUTPUT((-1, "shutting down openib fd"));
opal_event_del(&main_thread_event);
memset(&cmd, 0, cmd_size);
cmd.pc_cmd = CMD_TIME_TO_QUIT;
opal_fd_write(pipe_to_service_thread[1], cmd_size, &cmd);
/* Check if the thread exists before asking it to quit */
if (ESRCH != pthread_kill(thread, 0)) {
memset(&cmd, 0, cmd_size);
cmd.pc_cmd = CMD_TIME_TO_QUIT;
if (OPAL_SUCCESS != opal_fd_write(pipe_to_service_thread[1],
cmd_size, &cmd)) {
/* We cancel the thread if there's an error
* sending the "quit" cmd. This only ever happens on
* a "restart" which could result in dangling
* fds. OMPI must not rely on the checkpointer to
* save/restore any fds or connections
*/
pthread_cancel(thread);
}
pthread_join(thread, NULL);
opal_atomic_rmb();
pthread_join(thread, NULL);
opal_atomic_rmb();
}
opal_event_del(&main_thread_event);