1
1

Add the ability to set a backlog limit on forwarded output waiting at mpirun, which helps avoid crashing systems during debugging. Note that the limit defaults to "unlimited" to preserve the current behavior.

This commit was SVN r27479.
Этот коммит содержится в:
Ralph Castain 2012-10-24 23:21:40 +00:00
родитель e6014bf2e1
Коммит e06c330635
3 изменённых файлов: 23 добавлений и 0 удалений

Просмотреть файл

@ -123,6 +123,7 @@ ORTE_DECLSPEC OBJ_CLASS_DECLARATION(orte_iof_write_output_t);
/* the iof globals struct */
struct orte_iof_base_t {
int iof_output;
size_t output_limit;
char *input_files;
opal_list_t iof_components_opened;
opal_mutex_t iof_write_output_lock;

Просмотреть файл

@ -205,6 +205,7 @@ int orte_iof_base_open(void)
/* Initialize globals */
OBJ_CONSTRUCT(&orte_iof_base.iof_components_opened, opal_list_t);
OBJ_CONSTRUCT(&orte_iof_base.iof_write_output_lock, opal_mutex_t);
orte_iof_base.output_limit = UINT_MAX;
/* did the user request we print output to files? */
if (NULL != orte_output_filename) {
@ -224,6 +225,14 @@ int orte_iof_base_open(void)
}
}
/* check for maximum number of pending output messages */
mca_base_param_reg_int_name("iof", "base_output_limit",
"Maximum backlog of output messages [default: unlimited]",
false, false, -1, &rc);
if (0 < rc) {
orte_iof_base.output_limit = rc;
}
/* check for files to be sent to stdin of procs */
mca_base_param_reg_string_name("iof", "base_input_files",
"Comma-separated list of input files to be read and sent to stdin of procs (default: NULL)",

Просмотреть файл

@ -43,6 +43,7 @@
#include "orte/util/name_fns.h"
#include "orte/runtime/orte_globals.h"
#include "orte/mca/errmgr/errmgr.h"
#include "orte/mca/state/state.h"
#include "orte/mca/iof/base/base.h"
@ -296,6 +297,12 @@ void orte_iof_base_write_handler(int fd, short event, void *cbdata)
if (EAGAIN == errno || EINTR == errno) {
/* push this item back on the front of the list */
opal_list_prepend(&wev->outputs, item);
/* if the list is getting too large, abort */
if (orte_iof_base.output_limit < opal_list_get_size(&wev->outputs)) {
opal_output(0, "IO Forwarding is running too far behind - something is blocking us from writing");
ORTE_TERMINATE(ORTE_ERROR_DEFAULT_EXIT_CODE);
goto ABORT;
}
/* leave the write event running so it will call us again
* when the fd is ready.
*/
@ -311,6 +318,12 @@ void orte_iof_base_write_handler(int fd, short event, void *cbdata)
memmove(output->data, &output->data[num_written], output->numbytes - num_written);
/* push this item back on the front of the list */
opal_list_prepend(&wev->outputs, item);
/* if the list is getting too large, abort */
if (orte_iof_base.output_limit < opal_list_get_size(&wev->outputs)) {
opal_output(0, "IO Forwarding is running too far behind - something is blocking us from writing");
ORTE_TERMINATE(ORTE_ERROR_DEFAULT_EXIT_CODE);
goto ABORT;
}
/* leave the write event running so it will call us again
* when the fd is ready
*/