Apply some suggestions from Ralph and avoid a pretty nasty race condition on the close of the fd.
The problem was that we closed the same fd twice, and in the meantime the fd could have been reassigned to some other file or socket. This commit was SVN r19869.
Этот коммит содержится в:
родитель
9f17d1d67d
Коммит
ebe87d1842
@ -102,9 +102,11 @@ void orte_iof_hnp_read_local_handler(int fd, short event, void *cbdata)
|
||||
"%s iof:hnp:read handler %s Error on connection:%d",
|
||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
|
||||
ORTE_NAME_PRINT(&rev->name), fd));
|
||||
close(fd);
|
||||
opal_event_del(&rev->ev);
|
||||
goto CLEAN_RETURN;
|
||||
/* Un-recoverable error. Allow the code to flow as usual in order to
|
||||
* send the zero bytes message up the stream, and then close the
|
||||
* file descriptor and delete the event.
|
||||
*/
|
||||
numbytes = 0;
|
||||
}
|
||||
|
||||
/* is this read from our stdin? */
|
||||
@ -157,10 +159,6 @@ void orte_iof_hnp_read_local_handler(int fd, short event, void *cbdata)
|
||||
orte_iof_hnp_send_data_to_endpoint(&sink->daemon, &sink->name, ORTE_IOF_STDIN, data, numbytes);
|
||||
}
|
||||
}
|
||||
/* check if stdin was closed */
|
||||
if (0 == numbytes) {
|
||||
opal_event_del(&rev->ev);
|
||||
}
|
||||
/* nothing more to do */
|
||||
goto CLEAN_RETURN;
|
||||
}
|
||||
@ -186,9 +184,6 @@ void orte_iof_hnp_read_local_handler(int fd, short event, void *cbdata)
|
||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
|
||||
ORTE_NAME_PRINT(&sink->daemon)));
|
||||
orte_iof_hnp_send_data_to_endpoint(&sink->daemon, &rev->name, rev->tag, data, numbytes);
|
||||
if (0 == numbytes) {
|
||||
opal_event_del(&rev->ev);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@ -198,16 +193,7 @@ void orte_iof_hnp_read_local_handler(int fd, short event, void *cbdata)
|
||||
(ORTE_IOF_STDOUT & rev->tag) ? "stdout" : ((ORTE_IOF_STDERR & rev->tag) ? "stderr" : "stddiag"),
|
||||
ORTE_NAME_PRINT(&rev->name)));
|
||||
|
||||
/* if we read 0 bytes from the stdout/err/diag, there is
|
||||
* nothing to output - we do not close these file descriptors,
|
||||
* but we do terminate the event
|
||||
*/
|
||||
if (0 == numbytes) {
|
||||
opal_event_del(&rev->ev);
|
||||
goto CLEAN_RETURN;
|
||||
}
|
||||
|
||||
if (ORTE_IOF_STDOUT & rev->tag) {
|
||||
if ( (0 != numbytes) && (ORTE_IOF_STDOUT & rev->tag) ) {
|
||||
orte_iof_base_write_output(&rev->name, rev->tag, data, numbytes, &orte_iof_base.iof_write_stdout);
|
||||
} else {
|
||||
orte_iof_base_write_output(&rev->name, rev->tag, data, numbytes, &orte_iof_base.iof_write_stderr);
|
||||
@ -215,6 +201,16 @@ void orte_iof_hnp_read_local_handler(int fd, short event, void *cbdata)
|
||||
}
|
||||
|
||||
CLEAN_RETURN:
|
||||
/* if we read 0 bytes from the stdout/err/diag, there is
|
||||
* nothing to output - we do not close these file descriptors,
|
||||
* but we do terminate the event
|
||||
*/
|
||||
if (0 == numbytes) {
|
||||
close(fd);
|
||||
opal_event_del(&rev->ev);
|
||||
rev->ev.ev_fd = -1;
|
||||
}
|
||||
|
||||
OPAL_THREAD_UNLOCK(&mca_iof_hnp_component.lock);
|
||||
|
||||
/* since the event is persistent, we do not need to re-add it */
|
||||
|
@ -163,12 +163,13 @@ static int orted_close(const orte_process_name_t* peer,
|
||||
OPAL_THREAD_LOCK(&mca_iof_orted_component.lock);
|
||||
|
||||
/* The STDIN have a read event attached, while everything else
|
||||
* have a sink. Therefore, we don't have to do anything special
|
||||
* for them, the sink will empty the output queue.
|
||||
* have a sink. We don't have to do anything special for sinks,
|
||||
* they will disappear when the output queue is empty.
|
||||
*/
|
||||
if( ORTE_IOF_STDIN == source_tag ) {
|
||||
if( ORTE_IOF_STDIN & source_tag ) {
|
||||
opal_list_item_t *item, *next_item;
|
||||
orte_iof_read_event_t* rev;
|
||||
int rev_fd;
|
||||
|
||||
for( item = opal_list_get_first(&mca_iof_orted_component.read_events);
|
||||
item != opal_list_get_end(&mca_iof_orted_component.read_events);
|
||||
@ -182,8 +183,9 @@ static int orted_close(const orte_process_name_t* peer,
|
||||
/* No need to delete the event, the destructor will automatically
|
||||
* do it for us.
|
||||
*/
|
||||
close(rev->ev.ev_fd);
|
||||
rev_fd = rev->ev.ev_fd;
|
||||
OBJ_RELEASE(item);
|
||||
close(rev_fd);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
@ -75,15 +75,10 @@ void orte_iof_orted_read_handler(int fd, short event, void *cbdata)
|
||||
#endif /* !defined(__WINDOWS__) */
|
||||
|
||||
if (numbytes <= 0) {
|
||||
if (0 == numbytes) {
|
||||
/* child process closed connection - close the fd */
|
||||
close(fd);
|
||||
goto CLEAN_RETURN;
|
||||
}
|
||||
if (0 > numbytes) {
|
||||
/* either we have a connection error or it was a non-blocking read */
|
||||
|
||||
/* non-blocking, retry */
|
||||
if (EAGAIN == errno || EINTR == errno) {
|
||||
/* non-blocking, retry */
|
||||
OPAL_THREAD_UNLOCK(&mca_iof_orted_component.lock);
|
||||
return;
|
||||
}
|
||||
@ -92,8 +87,7 @@ void orte_iof_orted_read_handler(int fd, short event, void *cbdata)
|
||||
"%s iof:orted:read handler %s Error on connection:%d",
|
||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
|
||||
ORTE_NAME_PRINT(&rev->name), fd));
|
||||
|
||||
close(fd);
|
||||
}
|
||||
goto CLEAN_RETURN;
|
||||
}
|
||||
|
||||
@ -142,6 +136,8 @@ void orte_iof_orted_read_handler(int fd, short event, void *cbdata)
|
||||
CLEAN_RETURN:
|
||||
/* delete the event from the event library */
|
||||
opal_event_del(&rev->ev);
|
||||
close(rev->ev.ev_fd);
|
||||
rev->ev.ev_fd = -1;
|
||||
if (NULL != buf) {
|
||||
OBJ_RELEASE(buf);
|
||||
}
|
||||
|
@ -1853,10 +1853,8 @@ MOVEON:
|
||||
child->alive = false;
|
||||
|
||||
/* Release the IOF resources related to this child */
|
||||
orte_iof.close(child->name, ORTE_IOF_STDIN);
|
||||
orte_iof.close(child->name, ORTE_IOF_STDOUT);
|
||||
orte_iof.close(child->name, ORTE_IOF_STDERR);
|
||||
orte_iof.close(child->name, ORTE_IOF_STDDIAG);
|
||||
orte_iof.close(child->name, (ORTE_IOF_STDIN | ORTE_IOF_STDOUT |
|
||||
ORTE_IOF_STDERR | ORTE_IOF_STDDIAG) );
|
||||
|
||||
/* Clean up the session directory as if we were the process
|
||||
* itself. This covers the case where the process died abnormally
|
||||
|
Загрузка…
x
Ссылка в новой задаче
Block a user