1
1

Establish an MCA param "orte_allocation_required" so that a system can require that the user have an RM-provided allocation in order to run. This helps prevent the problem where a user forgets to get an allocation on an RM-managed cluster and then executes mpirun on the head node, causing all of their MPI procs to launch on the head node and usually bringing it to its knees.

Since OMPI allows mpirun to default to the local node, and since users want to retain the option to co-locate procs with mpirun, we needed another param to block this error case.

This commit was SVN r19135.
Этот коммит содержится в:
Ralph Castain 2008-08-04 14:25:19 +00:00
родитель 0d08866786
Коммит 35a86b3347
7 изменённых файлов: 57 добавлений и 1 удаление

Просмотреть файл

@ -22,6 +22,7 @@ libmca_ras_la_SOURCES =
# header setup # header setup
nobase_orte_HEADERS = nobase_orte_HEADERS =
dist_pkgdata_DATA =
# local files # local files
headers = ras.h ras_types.h headers = ras.h ras_types.h

Просмотреть файл

@ -16,6 +16,8 @@
# $HEADER$ # $HEADER$
# #
dist_pkgdata_DATA += base/help-ras-base.txt
headers += \ headers += \
base/base.h base/base.h

35
orte/mca/ras/base/help-ras-base.txt Обычный файл
Просмотреть файл

@ -0,0 +1,35 @@
# -*- text -*-
#
# Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
# University Research and Technology
# Corporation. All rights reserved.
# Copyright (c) 2004-2005 The University of Tennessee and The University
# of Tennessee Research Foundation. All rights
# reserved.
# Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
# University of Stuttgart. All rights reserved.
# Copyright (c) 2004-2005 The Regents of the University of California.
# All rights reserved.
# $COPYRIGHT$
#
# Additional copyrights may follow
#
# $HEADER$
#
# This is the US/English general help file for the RAS base.
#
[ras-base:no-allocation]
We were unable to find an allocation for this job as required by
setting the "allocation required" flag. Please ensure you have
the necessary allocation before executing again.
If you wish to execute without a provided allocation (e.g., by
providing a user-specified hostfile), please ensure that the "allocation
required" flag is not set. This flag can be set in several forms, so
please check that none of the following exist:
MCA param file: orte_allocation_required = 1
Environment: OMPI_MCA_orte_allocation_required=1
Cmd line: -mca orte_allocation_required 1

Просмотреть файл

@ -29,6 +29,7 @@
#include "orte/mca/errmgr/errmgr.h" #include "orte/mca/errmgr/errmgr.h"
#include "orte/util/name_fns.h" #include "orte/util/name_fns.h"
#include "orte/runtime/orte_globals.h" #include "orte/runtime/orte_globals.h"
#include "orte/runtime/orte_wakeup.h"
#include "orte/util/hostfile/hostfile.h" #include "orte/util/hostfile/hostfile.h"
#include "orte/util/dash_host/dash_host.h" #include "orte/util/dash_host/dash_host.h"
#include "orte/util/proc_info.h" #include "orte/util/proc_info.h"
@ -110,6 +111,15 @@ int orte_ras_base_allocate(orte_job_t *jdata)
} }
OBJ_DESTRUCT(&nodes); OBJ_DESTRUCT(&nodes);
goto DISPLAY; goto DISPLAY;
} else if (orte_allocation_required) {
/* if nothing was found, and an allocation is
* required, then error out
*/
OBJ_DESTRUCT(&nodes);
orte_show_help("help-ras-base.txt", "ras-base:no-allocation", true);
ORTE_UPDATE_EXIT_STATUS(ORTE_ERROR_DEFAULT_EXIT_CODE);
orte_wakeup();
return ORTE_ERROR;
} }

Просмотреть файл

@ -64,6 +64,7 @@ char **orte_launch_environ;
opal_pointer_array_t orte_daemonmap; opal_pointer_array_t orte_daemonmap;
bool orte_hnp_is_allocated = false; bool orte_hnp_is_allocated = false;
bool orte_allocation_required;
char *orte_launch_agent; char *orte_launch_agent;
char **orted_cmd_line=NULL; char **orted_cmd_line=NULL;

Просмотреть файл

@ -349,6 +349,7 @@ ORTE_DECLSPEC extern char **orte_launch_environ;
ORTE_DECLSPEC extern opal_pointer_array_t orte_daemonmap; ORTE_DECLSPEC extern opal_pointer_array_t orte_daemonmap;
ORTE_DECLSPEC extern bool orte_hnp_is_allocated; ORTE_DECLSPEC extern bool orte_hnp_is_allocated;
ORTE_DECLSPEC extern bool orte_allocation_required;
ORTE_DECLSPEC extern char *orte_launch_agent; ORTE_DECLSPEC extern char *orte_launch_agent;
ORTE_DECLSPEC extern char **orted_cmd_line; ORTE_DECLSPEC extern char **orted_cmd_line;

Просмотреть файл

@ -164,7 +164,13 @@ int orte_register_params(void)
mca_base_param_reg_string_name("orte", "launch_agent", mca_base_param_reg_string_name("orte", "launch_agent",
"Command used to start processes on remote nodes (default: orted)", "Command used to start processes on remote nodes (default: orted)",
false, false, "orted", &orte_launch_agent); false, false, "orted", &orte_launch_agent);
/* whether or not to require RM allocation */
mca_base_param_reg_int_name("orte", "allocation_required",
"Whether or not an allocation by a resource manager is required [default: no]",
false, false, (int)false, &value);
orte_allocation_required = OPAL_INT_TO_BOOL(value);
#endif /* ORTE_DISABLE_FULL_SUPPORT */ #endif /* ORTE_DISABLE_FULL_SUPPORT */
return ORTE_SUCCESS; return ORTE_SUCCESS;