Fixes trac:3258: add the btl_openib_abort_not_enough_reg_mem MCA parameter,
which causes MPI jobs to abort (rather than only print a warning) if there is not enough registered memory available. This commit was SVN r27140. The following Trac ticket was referenced above: Ticket 3258 --> https://svn.open-mpi.org/trac/ompi/ticket/3258
Этот коммит содержится в:
родитель
0e1dbe8711
Коммит
e5babf830a
@ -31,6 +31,7 @@
|
||||
#endif
|
||||
#include "orte/util/show_help.h"
|
||||
#include "orte/runtime/orte_globals.h"
|
||||
#include "orte/mca/errmgr/base/base.h"
|
||||
#include "opal/class/opal_bitmap.h"
|
||||
#include "opal/util/output.h"
|
||||
#include "opal/util/arch.h"
|
||||
@ -650,18 +651,32 @@ static uint64_t calculate_max_reg (void)
|
||||
|
||||
max_reg = (num_mtt - reserved_mtt) * getpagesize () * mtts_per_seg;
|
||||
} else {
|
||||
/* need to update to determine the registration limit for this configuration */
|
||||
/* Need to update to determine the registration limit for this
|
||||
configuration */
|
||||
max_reg = mem_total;
|
||||
}
|
||||
|
||||
/* NTH: print a warning if we can't register more than 75% of physical memory */
|
||||
/* Print a warning if we can't register more than 75% of physical
|
||||
memory. Abort if the abort_not_enough_reg_mem MCA param was
|
||||
set. */
|
||||
if (max_reg < mem_total * 3 / 4) {
|
||||
char *action;
|
||||
|
||||
if (mca_btl_openib_component.abort_not_enough_reg_mem) {
|
||||
action = "Your MPI job will now abort.";
|
||||
} else {
|
||||
action = "Your MPI job will continue, but may be behave poorly and/or hang.";
|
||||
}
|
||||
orte_show_help("help-mpi-btl-openib.txt", "reg mem limit low", true,
|
||||
orte_process_info.nodename, (unsigned long)(max_reg >> 20),
|
||||
(unsigned long)(mem_total >> 20));
|
||||
(unsigned long)(mem_total >> 20), action);
|
||||
if (mca_btl_openib_component.abort_not_enough_reg_mem) {
|
||||
orte_errmgr.abort(1, NULL);
|
||||
}
|
||||
}
|
||||
|
||||
/* limit us to 87.5% of the registered memory (some fluff for QPs, file systems, etc) */
|
||||
/* Limit us to 87.5% of the registered memory (some fluff for QPs,
|
||||
file systems, etc) */
|
||||
return (max_reg * 7) >> 3;
|
||||
}
|
||||
|
||||
|
@ -267,6 +267,10 @@ struct mca_btl_openib_component_t {
|
||||
/** Whether we want a warning if the user specifies a non-existent
|
||||
device and/or port via btl_openib_if_[in|ex]clude MCA params */
|
||||
bool warn_nonexistent_if;
|
||||
/** Whether we want to abort if there's not enough registered
|
||||
memory available */
|
||||
bool abort_not_enough_reg_mem;
|
||||
|
||||
/** Dummy argv-style list; a copy of names from the
|
||||
if_[in|ex]clude list that we use for error checking (to ensure
|
||||
that they all exist) */
|
||||
|
@ -166,6 +166,14 @@ int btl_openib_register_mca_params(void)
|
||||
1, &ival, 0));
|
||||
mca_btl_openib_component.warn_nonexistent_if = (0 != ival);
|
||||
|
||||
/* If we print a warning about not having enough registered memory
|
||||
available, do we want to abort? */
|
||||
CHECK(reg_int("abort_not_enough_reg_mem", NULL,
|
||||
"If there is not enough registered memory available on the system for Open MPI to function properly, Open MPI will issue a warning. If this MCA parameter is set to true, then Open MPI will also abort all MPI jobs "
|
||||
"(0 = warn, but do not abort; any other value = warn and abort)",
|
||||
0, &ival, 0));
|
||||
mca_btl_openib_component.abort_not_enough_reg_mem = (0 != ival);
|
||||
|
||||
if (OMPI_HAVE_IBV_FORK_INIT) {
|
||||
ival2 = -1;
|
||||
} else {
|
||||
|
@ -708,3 +708,5 @@ parameters:
|
||||
Local host: %s
|
||||
Registerable memory: %lu MiB
|
||||
Total memory: %lu MiB
|
||||
|
||||
%s
|
||||
|
Загрузка…
x
Ссылка в новой задаче
Block a user