From e5babf830ade47119b8800869c0aa2a8c4a6ea08 Mon Sep 17 00:00:00 2001 From: Jeff Squyres Date: Sat, 25 Aug 2012 11:39:06 +0000 Subject: [PATCH] Fixes trac:3258: add btl_openib_abort_not_enough_reg_mem MCA parameter that causes MPI jobs to abort if there is not enough registered memory available (vs. just warning). This commit was SVN r27140. The following Trac tickets were found above: Ticket 3258 --> https://svn.open-mpi.org/trac/ompi/ticket/3258 --- ompi/mca/btl/openib/btl_openib.c | 23 +++++++++++++++++---- ompi/mca/btl/openib/btl_openib.h | 4 ++++ ompi/mca/btl/openib/btl_openib_mca.c | 8 +++++++ ompi/mca/btl/openib/help-mpi-btl-openib.txt | 2 ++ 4 files changed, 33 insertions(+), 4 deletions(-) diff --git a/ompi/mca/btl/openib/btl_openib.c b/ompi/mca/btl/openib/btl_openib.c index 81b3f324fe..d7606b8843 100644 --- a/ompi/mca/btl/openib/btl_openib.c +++ b/ompi/mca/btl/openib/btl_openib.c @@ -31,6 +31,7 @@ #endif #include "orte/util/show_help.h" #include "orte/runtime/orte_globals.h" +#include "orte/mca/errmgr/base/base.h" #include "opal/class/opal_bitmap.h" #include "opal/util/output.h" #include "opal/util/arch.h" @@ -650,18 +651,32 @@ static uint64_t calculate_max_reg (void) max_reg = (num_mtt - reserved_mtt) * getpagesize () * mtts_per_seg; } else { - /* need to update to determine the registration limit for this configuration */ + /* Need to update to determine the registration limit for this + configuration */ max_reg = mem_total; } - /* NTH: print a warning if we can't register more than 75% of physical memory */ + /* Print a warning if we can't register more than 75% of physical + memory. Abort if the abort_not_enough_reg_mem MCA param was + set. */ if (max_reg < mem_total * 3 / 4) { + char *action; + + if (mca_btl_openib_component.abort_not_enough_reg_mem) { + action = "Your MPI job will now abort."; + } else { + action = "Your MPI job will continue, but may be behave poorly and/or hang."; + } orte_show_help("help-mpi-btl-openib.txt", "reg mem limit low", true, orte_process_info.nodename, (unsigned long)(max_reg >> 20), - (unsigned long)(mem_total >> 20)); + (unsigned long)(mem_total >> 20), action); + if (mca_btl_openib_component.abort_not_enough_reg_mem) { + orte_errmgr.abort(1, NULL); + } } - /* limit us to 87.5% of the registered memory (some fluff for QPs, file systems, etc) */ + /* Limit us to 87.5% of the registered memory (some fluff for QPs, + file systems, etc) */ return (max_reg * 7) >> 3; } diff --git a/ompi/mca/btl/openib/btl_openib.h b/ompi/mca/btl/openib/btl_openib.h index d2ca387d10..baa5ecf87c 100644 --- a/ompi/mca/btl/openib/btl_openib.h +++ b/ompi/mca/btl/openib/btl_openib.h @@ -267,6 +267,10 @@ struct mca_btl_openib_component_t { /** Whether we want a warning if the user specifies a non-existent device and/or port via btl_openib_if_[in|ex]clude MCA params */ bool warn_nonexistent_if; + /** Whether we want to abort if there's not enough registered + memory available */ + bool abort_not_enough_reg_mem; + /** Dummy argv-style list; a copy of names from the if_[in|ex]clude list that we use for error checking (to ensure that they all exist) */ diff --git a/ompi/mca/btl/openib/btl_openib_mca.c b/ompi/mca/btl/openib/btl_openib_mca.c index f84b41c105..8c10aba39e 100644 --- a/ompi/mca/btl/openib/btl_openib_mca.c +++ b/ompi/mca/btl/openib/btl_openib_mca.c @@ -166,6 +166,14 @@ int btl_openib_register_mca_params(void) 1, &ival, 0)); mca_btl_openib_component.warn_nonexistent_if = (0 != ival); + /* If we print a warning about not having enough registered memory + available, do we want to abort? */ + CHECK(reg_int("abort_not_enough_reg_mem", NULL, + "If there is not enough registered memory available on the system for Open MPI to function properly, Open MPI will issue a warning. If this MCA parameter is set to true, then Open MPI will also abort all MPI jobs " + "(0 = warn, but do not abort; any other value = warn and abort)", + 0, &ival, 0)); + mca_btl_openib_component.abort_not_enough_reg_mem = (0 != ival); + if (OMPI_HAVE_IBV_FORK_INIT) { ival2 = -1; } else { diff --git a/ompi/mca/btl/openib/help-mpi-btl-openib.txt b/ompi/mca/btl/openib/help-mpi-btl-openib.txt index db137ab084..5245b2b859 100644 --- a/ompi/mca/btl/openib/help-mpi-btl-openib.txt +++ b/ompi/mca/btl/openib/help-mpi-btl-openib.txt @@ -708,3 +708,5 @@ parameters: Local host: %s Registerable memory: %lu MiB Total memory: %lu MiB + +%s