1
1

Fixes trac:3258: add btl_openib_abort_not_enough_reg_mem MCA parameter

that causes MPI jobs to abort if there is not enough registered memory
available (vs. just warning).

This commit was SVN r27140.

The following Trac tickets were found above:
  Ticket 3258 --> https://svn.open-mpi.org/trac/ompi/ticket/3258
Этот коммит содержится в:
Jeff Squyres 2012-08-25 11:39:06 +00:00
родитель 0e1dbe8711
Коммит e5babf830a
4 изменённых файла: 33 добавления и 4 удаления

Просмотреть файл

@ -31,6 +31,7 @@
#endif
#include "orte/util/show_help.h"
#include "orte/runtime/orte_globals.h"
#include "orte/mca/errmgr/base/base.h"
#include "opal/class/opal_bitmap.h"
#include "opal/util/output.h"
#include "opal/util/arch.h"
@ -650,18 +651,32 @@ static uint64_t calculate_max_reg (void)
max_reg = (num_mtt - reserved_mtt) * getpagesize () * mtts_per_seg;
} else {
/* need to update to determine the registration limit for this configuration */
/* Need to update to determine the registration limit for this
configuration */
max_reg = mem_total;
}
/* NTH: print a warning if we can't register more than 75% of physical memory */
/* Print a warning if we can't register more than 75% of physical
memory. Abort if the abort_not_enough_reg_mem MCA param was
set. */
if (max_reg < mem_total * 3 / 4) {
char *action;
if (mca_btl_openib_component.abort_not_enough_reg_mem) {
action = "Your MPI job will now abort.";
} else {
action = "Your MPI job will continue, but may be behave poorly and/or hang.";
}
orte_show_help("help-mpi-btl-openib.txt", "reg mem limit low", true,
orte_process_info.nodename, (unsigned long)(max_reg >> 20),
(unsigned long)(mem_total >> 20));
(unsigned long)(mem_total >> 20), action);
if (mca_btl_openib_component.abort_not_enough_reg_mem) {
orte_errmgr.abort(1, NULL);
}
}
/* limit us to 87.5% of the registered memory (some fluff for QPs, file systems, etc) */
/* Limit us to 87.5% of the registered memory (some fluff for QPs,
file systems, etc) */
return (max_reg * 7) >> 3;
}

Просмотреть файл

@ -267,6 +267,10 @@ struct mca_btl_openib_component_t {
/** Whether we want a warning if the user specifies a non-existent
device and/or port via btl_openib_if_[in|ex]clude MCA params */
bool warn_nonexistent_if;
/** Whether we want to abort if there's not enough registered
memory available */
bool abort_not_enough_reg_mem;
/** Dummy argv-style list; a copy of names from the
if_[in|ex]clude list that we use for error checking (to ensure
that they all exist) */

Просмотреть файл

@ -166,6 +166,14 @@ int btl_openib_register_mca_params(void)
1, &ival, 0));
mca_btl_openib_component.warn_nonexistent_if = (0 != ival);
/* If we print a warning about not having enough registered memory
available, do we want to abort? */
CHECK(reg_int("abort_not_enough_reg_mem", NULL,
"If there is not enough registered memory available on the system for Open MPI to function properly, Open MPI will issue a warning. If this MCA parameter is set to true, then Open MPI will also abort all MPI jobs "
"(0 = warn, but do not abort; any other value = warn and abort)",
0, &ival, 0));
mca_btl_openib_component.abort_not_enough_reg_mem = (0 != ival);
if (OMPI_HAVE_IBV_FORK_INIT) {
ival2 = -1;
} else {

Просмотреть файл

@ -708,3 +708,5 @@ parameters:
Local host: %s
Registerable memory: %lu MiB
Total memory: %lu MiB
%s