From 35a86b3347b2de18d7a97790ba3dcb9117c9ef9f Mon Sep 17 00:00:00 2001 From: Ralph Castain Date: Mon, 4 Aug 2008 14:25:19 +0000 Subject: [PATCH] Establish an MCA param "orte_allocation_required" so that a system can require the user have an RM-provided allocation in order to run. This helps prevent the problem where a user forgets to get an allocation on an RM-managed cluster, and then executes mpirun on the head node - thus causing all of their mpi procs to launch on the head node, usually bringing it to its knees. Since OMPI allows mpirun to default to the local node, and since users want to retain the option to co-locate procs with mpirun, we needed another param to block this error case. This commit was SVN r19135. --- orte/mca/ras/Makefile.am | 1 + orte/mca/ras/base/Makefile.am | 2 ++ orte/mca/ras/base/help-ras-base.txt | 35 +++++++++++++++++++++++++++ orte/mca/ras/base/ras_base_allocate.c | 10 ++++++++ orte/runtime/orte_globals.c | 1 + orte/runtime/orte_globals.h | 1 + orte/runtime/orte_mca_params.c | 8 +++++- 7 files changed, 57 insertions(+), 1 deletion(-) create mode 100644 orte/mca/ras/base/help-ras-base.txt diff --git a/orte/mca/ras/Makefile.am b/orte/mca/ras/Makefile.am index 24c7c4e16c..c025174b5f 100644 --- a/orte/mca/ras/Makefile.am +++ b/orte/mca/ras/Makefile.am @@ -22,6 +22,7 @@ libmca_ras_la_SOURCES = # header setup nobase_orte_HEADERS = +dist_pkgdata_DATA = # local files headers = ras.h ras_types.h diff --git a/orte/mca/ras/base/Makefile.am b/orte/mca/ras/base/Makefile.am index 548ef4dcc1..18a90323e5 100644 --- a/orte/mca/ras/base/Makefile.am +++ b/orte/mca/ras/base/Makefile.am @@ -16,6 +16,8 @@ # $HEADER$ # +dist_pkgdata_DATA += base/help-ras-base.txt + headers += \ base/base.h diff --git a/orte/mca/ras/base/help-ras-base.txt b/orte/mca/ras/base/help-ras-base.txt new file mode 100644 index 0000000000..70ed557105 --- /dev/null +++ b/orte/mca/ras/base/help-ras-base.txt @@ -0,0 +1,35 @@ +# -*- text -*- +# +# Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana +# University Research and Technology +# Corporation. All rights reserved. +# Copyright (c) 2004-2005 The University of Tennessee and The University +# of Tennessee Research Foundation. All rights +# reserved. +# Copyright (c) 2004-2005 High Performance Computing Center Stuttgart, +# University of Stuttgart. All rights reserved. +# Copyright (c) 2004-2005 The Regents of the University of California. +# All rights reserved. +# $COPYRIGHT$ +# +# Additional copyrights may follow +# +# $HEADER$ +# +# This is the US/English general help file for the RAS base. +# +[ras-base:no-allocation] +We were unable to find an allocation for this job as required by +setting the "allocation required" flag. Please ensure you have +the necessary allocation before executing again. + +If you wish to execute without a provided allocation (e.g., by +providing a user-specified hostfile), please ensure that the "allocation +required" flag is not set. This flag can be set in several forms, so +please check that none of the following exist: + +MCA param file: orte_allocation_required = 1 +Environment: OMPI_MCA_orte_allocation_required=1 +Cmd line: -mca orte_allocation_required 1 + + diff --git a/orte/mca/ras/base/ras_base_allocate.c b/orte/mca/ras/base/ras_base_allocate.c index 7efd0ed357..50460b2b7c 100644 --- a/orte/mca/ras/base/ras_base_allocate.c +++ b/orte/mca/ras/base/ras_base_allocate.c @@ -29,6 +29,7 @@ #include "orte/mca/errmgr/errmgr.h" #include "orte/util/name_fns.h" #include "orte/runtime/orte_globals.h" +#include "orte/runtime/orte_wakeup.h" #include "orte/util/hostfile/hostfile.h" #include "orte/util/dash_host/dash_host.h" #include "orte/util/proc_info.h" @@ -110,6 +111,15 @@ int orte_ras_base_allocate(orte_job_t *jdata) } OBJ_DESTRUCT(&nodes); goto DISPLAY; + } else if (orte_allocation_required) { + /* if nothing was found, and an allocation is + * required, then error out + */ + OBJ_DESTRUCT(&nodes); + orte_show_help("help-ras-base.txt", "ras-base:no-allocation", true); + ORTE_UPDATE_EXIT_STATUS(ORTE_ERROR_DEFAULT_EXIT_CODE); + orte_wakeup(); + return ORTE_ERROR; } diff --git a/orte/runtime/orte_globals.c b/orte/runtime/orte_globals.c index 7c6e5acbfd..9003cf8064 100644 --- a/orte/runtime/orte_globals.c +++ b/orte/runtime/orte_globals.c @@ -64,6 +64,7 @@ char **orte_launch_environ; opal_pointer_array_t orte_daemonmap; bool orte_hnp_is_allocated = false; +bool orte_allocation_required; char *orte_launch_agent; char **orted_cmd_line=NULL; diff --git a/orte/runtime/orte_globals.h b/orte/runtime/orte_globals.h index 054f6f7014..c05599dd16 100644 --- a/orte/runtime/orte_globals.h +++ b/orte/runtime/orte_globals.h @@ -349,6 +349,7 @@ ORTE_DECLSPEC extern char **orte_launch_environ; ORTE_DECLSPEC extern opal_pointer_array_t orte_daemonmap; ORTE_DECLSPEC extern bool orte_hnp_is_allocated; +ORTE_DECLSPEC extern bool orte_allocation_required; ORTE_DECLSPEC extern char *orte_launch_agent; ORTE_DECLSPEC extern char **orted_cmd_line; diff --git a/orte/runtime/orte_mca_params.c b/orte/runtime/orte_mca_params.c index 8803785361..2c250ac16e 100644 --- a/orte/runtime/orte_mca_params.c +++ b/orte/runtime/orte_mca_params.c @@ -164,7 +164,13 @@ int orte_register_params(void) mca_base_param_reg_string_name("orte", "launch_agent", "Command used to start processes on remote nodes (default: orted)", false, false, "orted", &orte_launch_agent); - + + /* whether or not to require RM allocation */ + mca_base_param_reg_int_name("orte", "allocation_required", + "Whether or not an allocation by a resource manager is required [default: no]", + false, false, (int)false, &value); + orte_allocation_required = OPAL_INT_TO_BOOL(value); + #endif /* ORTE_DISABLE_FULL_SUPPORT */ return ORTE_SUCCESS;