1
1

Apply some suggestions from Ralph and avoid a pretty nasty race condition on the close of the fd.

The problem was that we closed the same fd twice, and in the meantime the fd could have been
reassigned to some other file or socket.

This commit was SVN r19869.
Этот коммит содержится в:
George Bosilca 2008-10-31 22:23:53 +00:00
родитель 9f17d1d67d
Коммит ebe87d1842
4 изменённых файлов: 39 добавлений и 47 удалений

Просмотреть файл

@ -102,9 +102,11 @@ void orte_iof_hnp_read_local_handler(int fd, short event, void *cbdata)
"%s iof:hnp:read handler %s Error on connection:%d",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
ORTE_NAME_PRINT(&rev->name), fd));
close(fd);
opal_event_del(&rev->ev);
goto CLEAN_RETURN;
/* Un-recoverable error. Allow the code to flow as usual in order
* to send the zero bytes message up the stream, and then close the
* file descriptor and delete the event.
*/
numbytes = 0;
}
/* is this read from our stdin? */
@ -157,10 +159,6 @@ void orte_iof_hnp_read_local_handler(int fd, short event, void *cbdata)
orte_iof_hnp_send_data_to_endpoint(&sink->daemon, &sink->name, ORTE_IOF_STDIN, data, numbytes);
}
}
/* check if stdin was closed */
if (0 == numbytes) {
opal_event_del(&rev->ev);
}
/* nothing more to do */
goto CLEAN_RETURN;
}
@ -186,9 +184,6 @@ void orte_iof_hnp_read_local_handler(int fd, short event, void *cbdata)
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
ORTE_NAME_PRINT(&sink->daemon)));
orte_iof_hnp_send_data_to_endpoint(&sink->daemon, &rev->name, rev->tag, data, numbytes);
if (0 == numbytes) {
opal_event_del(&rev->ev);
}
}
}
@ -198,16 +193,7 @@ void orte_iof_hnp_read_local_handler(int fd, short event, void *cbdata)
(ORTE_IOF_STDOUT & rev->tag) ? "stdout" : ((ORTE_IOF_STDERR & rev->tag) ? "stderr" : "stddiag"),
ORTE_NAME_PRINT(&rev->name)));
/* if we read 0 bytes from the stdout/err/diag, there is
* nothing to output - we do not close these file descriptors,
* but we do terminate the event
*/
if (0 == numbytes) {
opal_event_del(&rev->ev);
goto CLEAN_RETURN;
}
if (ORTE_IOF_STDOUT & rev->tag) {
if ( (0 != numbytes) && (ORTE_IOF_STDOUT & rev->tag) ) {
orte_iof_base_write_output(&rev->name, rev->tag, data, numbytes, &orte_iof_base.iof_write_stdout);
} else {
orte_iof_base_write_output(&rev->name, rev->tag, data, numbytes, &orte_iof_base.iof_write_stderr);
@ -215,7 +201,17 @@ void orte_iof_hnp_read_local_handler(int fd, short event, void *cbdata)
}
CLEAN_RETURN:
OPAL_THREAD_UNLOCK(&mca_iof_hnp_component.lock);
/* if we read 0 bytes from the stdout/err/diag, there is
* nothing to output - we do not close these file descriptors,
* but we do terminate the event
*/
if (0 == numbytes) {
close(fd);
opal_event_del(&rev->ev);
rev->ev.ev_fd = -1;
}
OPAL_THREAD_UNLOCK(&mca_iof_hnp_component.lock);
/* since the event is persistent, we do not need to re-add it */
return;

Просмотреть файл

@ -163,12 +163,13 @@ static int orted_close(const orte_process_name_t* peer,
OPAL_THREAD_LOCK(&mca_iof_orted_component.lock);
/* The STDIN have a read event attached, while everything else
* have a sink. Therefore, we don't have to do anything special
* for them, the sink will empty the output queue.
* have a sink. We don't have to do anything special for sinks,
* they will disappear when the output queue is empty.
*/
if( ORTE_IOF_STDIN == source_tag ) {
if( ORTE_IOF_STDIN & source_tag ) {
opal_list_item_t *item, *next_item;
orte_iof_read_event_t* rev;
int rev_fd;
for( item = opal_list_get_first(&mca_iof_orted_component.read_events);
item != opal_list_get_end(&mca_iof_orted_component.read_events);
@ -182,8 +183,9 @@ static int orted_close(const orte_process_name_t* peer,
/* No need to delete the event, the destructor will automatically
* do it for us.
*/
close(rev->ev.ev_fd);
rev_fd = rev->ev.ev_fd;
OBJ_RELEASE(item);
close(rev_fd);
}
}
}

Просмотреть файл

@ -75,25 +75,19 @@ void orte_iof_orted_read_handler(int fd, short event, void *cbdata)
#endif /* !defined(__WINDOWS__) */
if (numbytes <= 0) {
if (0 == numbytes) {
/* child process closed connection - close the fd */
close(fd);
goto CLEAN_RETURN;
}
/* either we have a connection error or it was a non-blocking read */
/* non-blocking, retry */
if (EAGAIN == errno || EINTR == errno) {
OPAL_THREAD_UNLOCK(&mca_iof_orted_component.lock);
return;
}
if (0 > numbytes) {
/* either we have a connection error or it was a non-blocking read */
if (EAGAIN == errno || EINTR == errno) {
/* non-blocking, retry */
OPAL_THREAD_UNLOCK(&mca_iof_orted_component.lock);
return;
}
OPAL_OUTPUT_VERBOSE((1, orte_iof_base.iof_output,
"%s iof:orted:read handler %s Error on connection:%d",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
ORTE_NAME_PRINT(&rev->name), fd));
close(fd);
OPAL_OUTPUT_VERBOSE((1, orte_iof_base.iof_output,
"%s iof:orted:read handler %s Error on connection:%d",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
ORTE_NAME_PRINT(&rev->name), fd));
}
goto CLEAN_RETURN;
}
@ -142,6 +136,8 @@ void orte_iof_orted_read_handler(int fd, short event, void *cbdata)
CLEAN_RETURN:
/* delete the event from the event library */
opal_event_del(&rev->ev);
close(rev->ev.ev_fd);
rev->ev.ev_fd = -1;
if (NULL != buf) {
OBJ_RELEASE(buf);
}

Просмотреть файл

@ -1853,10 +1853,8 @@ MOVEON:
child->alive = false;
/* Release the IOF resources related to this child */
orte_iof.close(child->name, ORTE_IOF_STDIN);
orte_iof.close(child->name, ORTE_IOF_STDOUT);
orte_iof.close(child->name, ORTE_IOF_STDERR);
orte_iof.close(child->name, ORTE_IOF_STDDIAG);
orte_iof.close(child->name, (ORTE_IOF_STDIN | ORTE_IOF_STDOUT |
ORTE_IOF_STDERR | ORTE_IOF_STDDIAG) );
/* Clean up the session directory as if we were the process
* itself. This covers the case where the process died abnormally