Add the ability to set a backlog limit on forwarded output waiting at mpirun; this helps avoid crashing systems during debugging. Note that the limit defaults to "unlimited" to maintain the current behavior.
This commit was SVN r27479.
Этот коммит содержится в:
родитель
e6014bf2e1
Коммит
e06c330635
@ -123,6 +123,7 @@ ORTE_DECLSPEC OBJ_CLASS_DECLARATION(orte_iof_write_output_t);
|
||||
/* the iof globals struct */
|
||||
struct orte_iof_base_t {
|
||||
int iof_output;
|
||||
size_t output_limit;
|
||||
char *input_files;
|
||||
opal_list_t iof_components_opened;
|
||||
opal_mutex_t iof_write_output_lock;
|
||||
|
@ -205,6 +205,7 @@ int orte_iof_base_open(void)
|
||||
/* Initialize globals */
|
||||
OBJ_CONSTRUCT(&orte_iof_base.iof_components_opened, opal_list_t);
|
||||
OBJ_CONSTRUCT(&orte_iof_base.iof_write_output_lock, opal_mutex_t);
|
||||
orte_iof_base.output_limit = UINT_MAX;
|
||||
|
||||
/* did the user request we print output to files? */
|
||||
if (NULL != orte_output_filename) {
|
||||
@ -224,6 +225,14 @@ int orte_iof_base_open(void)
|
||||
}
|
||||
}
|
||||
|
||||
/* check for maximum number of pending output messages */
|
||||
mca_base_param_reg_int_name("iof", "base_output_limit",
|
||||
"Maximum backlog of output messages [default: unlimited]",
|
||||
false, false, -1, &rc);
|
||||
if (0 < rc) {
|
||||
orte_iof_base.output_limit = rc;
|
||||
}
|
||||
|
||||
/* check for files to be sent to stdin of procs */
|
||||
mca_base_param_reg_string_name("iof", "base_input_files",
|
||||
"Comma-separated list of input files to be read and sent to stdin of procs (default: NULL)",
|
||||
|
@ -43,6 +43,7 @@
|
||||
#include "orte/util/name_fns.h"
|
||||
#include "orte/runtime/orte_globals.h"
|
||||
#include "orte/mca/errmgr/errmgr.h"
|
||||
#include "orte/mca/state/state.h"
|
||||
|
||||
#include "orte/mca/iof/base/base.h"
|
||||
|
||||
@ -296,6 +297,12 @@ void orte_iof_base_write_handler(int fd, short event, void *cbdata)
|
||||
if (EAGAIN == errno || EINTR == errno) {
|
||||
/* push this item back on the front of the list */
|
||||
opal_list_prepend(&wev->outputs, item);
|
||||
/* if the list is getting too large, abort */
|
||||
if (orte_iof_base.output_limit < opal_list_get_size(&wev->outputs)) {
|
||||
opal_output(0, "IO Forwarding is running too far behind - something is blocking us from writing");
|
||||
ORTE_TERMINATE(ORTE_ERROR_DEFAULT_EXIT_CODE);
|
||||
goto ABORT;
|
||||
}
|
||||
/* leave the write event running so it will call us again
|
||||
* when the fd is ready.
|
||||
*/
|
||||
@ -311,6 +318,12 @@ void orte_iof_base_write_handler(int fd, short event, void *cbdata)
|
||||
memmove(output->data, &output->data[num_written], output->numbytes - num_written);
|
||||
/* push this item back on the front of the list */
|
||||
opal_list_prepend(&wev->outputs, item);
|
||||
/* if the list is getting too large, abort */
|
||||
if (orte_iof_base.output_limit < opal_list_get_size(&wev->outputs)) {
|
||||
opal_output(0, "IO Forwarding is running too far behind - something is blocking us from writing");
|
||||
ORTE_TERMINATE(ORTE_ERROR_DEFAULT_EXIT_CODE);
|
||||
goto ABORT;
|
||||
}
|
||||
/* leave the write event running so it will call us again
|
||||
* when the fd is ready
|
||||
*/
|
||||
|
Загрузка…
x
Ссылка в новой задаче
Block a user