1
1

Close the file descriptors used to push or pull the data to the children.

Without this patch, doing spawn in a loop ended up by exhausting all
available file descriptors pretty quickly. There were about 5 file
descriptors opened per spawned process. Now the number of file
descriptors managed by the process (orted or HNP)
is a lot smaller.

This commit was SVN r19864.
Этот коммит содержится в:
George Bosilca 2008-10-31 18:05:28 +00:00
родитель a456c057d6
Коммит 0ce76248e8
8 изменённых файлов: 97 добавлений и 30 удалений

Просмотреть файл

@ -67,24 +67,28 @@
int int
orte_iof_base_setup_prefork(orte_iof_base_io_conf_t *opts) orte_iof_base_setup_prefork(orte_iof_base_io_conf_t *opts)
{ {
int ret; int ret = -1;
/* first check to make sure we can do ptys */
#if !OMPI_ENABLE_PTY_SUPPORT
opts->usepty = 0;
#endif
fflush(stdout); fflush(stdout);
/* first check to make sure we can do ptys */
#if OMPI_ENABLE_PTY_SUPPORT #if OMPI_ENABLE_PTY_SUPPORT
if (opts->usepty) { if (opts->usepty) {
/**
* It has been reported that on MAC OS X 10.4 and prior one cannot
* safely close the writing side of a pty before completly reading
* all data inside.
* There seems to be two issues: first all pending data is
* discarded, and second it randomly generate kernel panics.
* Apparently this issue was fixed in 10.5 so by now we use the
* pty exactly as we use the pipes.
* This comment is here as a reminder.
*/
ret = opal_openpty(&(opts->p_stdout[0]), &(opts->p_stdout[1]), ret = opal_openpty(&(opts->p_stdout[0]), &(opts->p_stdout[1]),
(char*)NULL, (struct termios*)NULL, (struct winsize*)NULL); (char*)NULL, (struct termios*)NULL, (struct winsize*)NULL);
} else {
ret = -1;
} }
#else #else
ret = -1; opts->usepty = 0;
#endif #endif
#if defined(__WINDOWS__) #if defined(__WINDOWS__)
@ -124,10 +128,8 @@ orte_iof_base_setup_child(orte_iof_base_io_conf_t *opts, char ***env)
int ret; int ret;
char *str; char *str;
if (!opts->usepty) {
close(opts->p_stdout[0]);
}
close(opts->p_stdin[1]); close(opts->p_stdin[1]);
close(opts->p_stdout[0]);
close(opts->p_stderr[0]); close(opts->p_stderr[0]);
close(opts->p_internal[0]); close(opts->p_internal[0]);
@ -203,10 +205,8 @@ orte_iof_base_setup_parent(const orte_process_name_t* name,
{ {
int ret; int ret;
if (! opts->usepty) {
close(opts->p_stdout[1]);
}
close(opts->p_stdin[0]); close(opts->p_stdin[0]);
close(opts->p_stdout[1]);
close(opts->p_stderr[1]); close(opts->p_stderr[1]);
close(opts->p_internal[1]); close(opts->p_internal[1]);

Просмотреть файл

@ -268,8 +268,36 @@ static int hnp_pull(const orte_process_name_t* dst_name,
* stream(s), thus terminating any potential io to/from it. * stream(s), thus terminating any potential io to/from it.
*/ */
static int hnp_close(const orte_process_name_t* peer, static int hnp_close(const orte_process_name_t* peer,
orte_iof_tag_t source_tag) orte_iof_tag_t source_tag)
{ {
opal_list_item_t *item, *next_item;
if( ORTE_IOF_STDIN == source_tag ) {
orte_iof_read_event_t* rev;
for( item = opal_list_get_first(&mca_iof_hnp_component.read_events);
item != opal_list_get_end(&mca_iof_hnp_component.read_events);
item = next_item ) {
rev = (orte_iof_read_event_t*)item;
next_item = opal_list_get_next(item);
if( (rev->name.jobid == peer->jobid) &&
(rev->name.vpid == peer->vpid) ) {
/* Dont close if it's the main stdin. This will get closed
* in component close.
*/
if( mca_iof_hnp_component.stdinev == rev ) continue;
opal_list_remove_item(&mca_iof_hnp_component.read_events,
item);
/* No need to delete the event, the destructor will automatically
* do it for us.
*/
close(rev->ev.ev_fd);
OBJ_RELEASE(item);
}
}
}
return ORTE_SUCCESS; return ORTE_SUCCESS;
} }

Просмотреть файл

@ -96,6 +96,8 @@ static int orte_iof_hnp_close(void)
OPAL_THREAD_LOCK(&mca_iof_hnp_component.lock); OPAL_THREAD_LOCK(&mca_iof_hnp_component.lock);
/* if the stdin event is active, delete it */ /* if the stdin event is active, delete it */
if (NULL != mca_iof_hnp_component.stdinev && mca_iof_hnp_component.stdinev->active) { if (NULL != mca_iof_hnp_component.stdinev && mca_iof_hnp_component.stdinev->active) {
/* this is being pedantic ... */
close(mca_iof_hnp_component.stdinev->ev.ev_fd);
opal_event_del(&(mca_iof_hnp_component.stdinev->ev)); opal_event_del(&(mca_iof_hnp_component.stdinev->ev));
} }
/* cleanout all registered sinks */ /* cleanout all registered sinks */

Просмотреть файл

@ -102,7 +102,7 @@ void orte_iof_hnp_read_local_handler(int fd, short event, void *cbdata)
"%s iof:hnp:read handler %s Error on connection:%d", "%s iof:hnp:read handler %s Error on connection:%d",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
ORTE_NAME_PRINT(&rev->name), fd)); ORTE_NAME_PRINT(&rev->name), fd));
close(fd);
opal_event_del(&rev->ev); opal_event_del(&rev->ev);
goto CLEAN_RETURN; goto CLEAN_RETURN;
} }

Просмотреть файл

@ -158,8 +158,38 @@ static int orted_pull(const orte_process_name_t* dst_name,
* For the orted, this just means closing the local fd * For the orted, this just means closing the local fd
*/ */
static int orted_close(const orte_process_name_t* peer, static int orted_close(const orte_process_name_t* peer,
orte_iof_tag_t source_tag) orte_iof_tag_t source_tag)
{ {
OPAL_THREAD_LOCK(&mca_iof_orted_component.lock);
/* The STDIN have a read event attached, while everything else
* have a sink. Therefore, we don't have to do anything special
* for them, the sink will empty the output queue.
*/
if( ORTE_IOF_STDIN == source_tag ) {
opal_list_item_t *item, *next_item;
orte_iof_read_event_t* rev;
for( item = opal_list_get_first(&mca_iof_orted_component.read_events);
item != opal_list_get_end(&mca_iof_orted_component.read_events);
item = next_item ) {
rev = (orte_iof_read_event_t*)item;
next_item = opal_list_get_next(item);
if( (rev->name.jobid == peer->jobid) &&
(rev->name.vpid == peer->vpid) ) {
opal_list_remove_item(&mca_iof_orted_component.read_events,
item);
/* No need to delete the event, the destructor will automatically
* do it for us.
*/
close(rev->ev.ev_fd);
OBJ_RELEASE(item);
}
}
}
OPAL_THREAD_UNLOCK(&mca_iof_orted_component.lock);
return ORTE_SUCCESS; return ORTE_SUCCESS;
} }
@ -197,8 +227,6 @@ static void stdin_write_handler(int fd, short event, void *cbdata)
close(wev->fd); close(wev->fd);
/* be sure to delete the write event */ /* be sure to delete the write event */
opal_event_del(&wev->ev); opal_event_del(&wev->ev);
/* set the fd to -1 to indicate that this channel is closed */
wev->fd = -1;
goto DEPART; goto DEPART;
} }
num_written = write(wev->fd, output->data, output->numbytes); num_written = write(wev->fd, output->data, output->numbytes);
@ -228,7 +256,9 @@ static void stdin_write_handler(int fd, short event, void *cbdata)
} }
OBJ_RELEASE(output); OBJ_RELEASE(output);
} }
goto CHECK; /* don't abort yet. Spurious event might happens */
ABORT: ABORT:
close(wev->fd);
opal_event_del(&wev->ev); opal_event_del(&wev->ev);
wev->pending = false; wev->pending = false;

Просмотреть файл

@ -74,7 +74,12 @@ void orte_iof_orted_read_handler(int fd, short event, void *cbdata)
} }
#endif /* !defined(__WINDOWS__) */ #endif /* !defined(__WINDOWS__) */
if (numbytes < 0) { if (numbytes <= 0) {
if (0 == numbytes) {
/* child process closed connection - close the fd */
close(fd);
goto CLEAN_RETURN;
}
/* either we have a connection error or it was a non-blocking read */ /* either we have a connection error or it was a non-blocking read */
/* non-blocking, retry */ /* non-blocking, retry */
@ -88,10 +93,6 @@ void orte_iof_orted_read_handler(int fd, short event, void *cbdata)
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
ORTE_NAME_PRINT(&rev->name), fd)); ORTE_NAME_PRINT(&rev->name), fd));
goto CLEAN_RETURN;
} else if (0 == numbytes) {
/* child process closed connection - close the fd */
close(fd); close(fd);
goto CLEAN_RETURN; goto CLEAN_RETURN;
} }

Просмотреть файл

@ -1784,6 +1784,12 @@ MOVEON:
/* indicate the child is no longer alive */ /* indicate the child is no longer alive */
child->alive = false; child->alive = false;
/* Release the IOF resources related to this child */
orte_iof.close(child->name, ORTE_IOF_STDIN);
orte_iof.close(child->name, ORTE_IOF_STDOUT);
orte_iof.close(child->name, ORTE_IOF_STDERR);
orte_iof.close(child->name, ORTE_IOF_STDDIAG);
/* Clean up the session directory as if we were the process /* Clean up the session directory as if we were the process
* itself. This covers the case where the process died abnormally * itself. This covers the case where the process died abnormally
* and didn't cleanup its own session directory. * and didn't cleanup its own session directory.

Просмотреть файл

@ -265,14 +265,14 @@ static int odls_default_fork_local_proc(orte_app_context_t* context,
fdnull = open("/dev/null", O_RDONLY, 0); fdnull = open("/dev/null", O_RDONLY, 0);
if(fdnull > i) { if(fdnull > i) {
dup2(fdnull, i); dup2(fdnull, i);
close(fdnull);
} }
close(fdnull);
} }
fdnull = open("/dev/null", O_RDONLY, 0); fdnull = open("/dev/null", O_RDONLY, 0);
if(fdnull > opts.p_internal[1]) { if(fdnull > opts.p_internal[1]) {
dup2(fdnull, opts.p_internal[1]); dup2(fdnull, opts.p_internal[1]);
close(fdnull);
} }
close(fdnull);
} }
/* close all file descriptors w/ exception of /* close all file descriptors w/ exception of
@ -347,9 +347,8 @@ static int odls_default_fork_local_proc(orte_app_context_t* context,
OPAL_OUTPUT_VERBOSE((2, orte_odls_globals.output, OPAL_OUTPUT_VERBOSE((2, orte_odls_globals.output,
"%s odls:default:fork got code %d back from child", "%s odls:default:fork got code %d back from child",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), i)); ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), i));
close(p[0]);
return ORTE_ERR_PIPE_READ_FAILURE; return ORTE_ERR_PIPE_READ_FAILURE;
break;
} else if (0 == rc) { } else if (0 == rc) {
/* Child was successful in exec'ing! */ /* Child was successful in exec'ing! */
break; break;
@ -370,7 +369,7 @@ static int odls_default_fork_local_proc(orte_app_context_t* context,
OPAL_OUTPUT_VERBOSE((2, orte_odls_globals.output, OPAL_OUTPUT_VERBOSE((2, orte_odls_globals.output,
"%s odls:default:fork got code %d back from child", "%s odls:default:fork got code %d back from child",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), i)); ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), i));
close(p[0]);
return i; return i;
} }
} }
@ -380,6 +379,7 @@ static int odls_default_fork_local_proc(orte_app_context_t* context,
child->state = ORTE_PROC_STATE_LAUNCHED; child->state = ORTE_PROC_STATE_LAUNCHED;
child->alive = true; child->alive = true;
} }
close(p[0]);
} }
return ORTE_SUCCESS; return ORTE_SUCCESS;