Establish an MCA param "orte_allocation_required" so that a system can require that the user have an RM-provided allocation in order to run. This helps prevent the problem where a user forgets to get an allocation on an RM-managed cluster and then executes mpirun on the head node, thus causing all of their MPI procs to launch on the head node, usually bringing it to its knees.
Since OMPI allows mpirun to default to the local node, and since users want to retain the option to co-locate procs with mpirun, we needed another param to block this error case. This commit was SVN r19135.
Этот коммит содержится в:
родитель
0d08866786
Коммит
35a86b3347
@ -22,6 +22,7 @@ libmca_ras_la_SOURCES =
|
|||||||
|
|
||||||
# header setup
|
# header setup
|
||||||
nobase_orte_HEADERS =
|
nobase_orte_HEADERS =
|
||||||
|
dist_pkgdata_DATA =
|
||||||
|
|
||||||
# local files
|
# local files
|
||||||
headers = ras.h ras_types.h
|
headers = ras.h ras_types.h
|
||||||
|
@ -16,6 +16,8 @@
|
|||||||
# $HEADER$
|
# $HEADER$
|
||||||
#
|
#
|
||||||
|
|
||||||
|
dist_pkgdata_DATA += base/help-ras-base.txt
|
||||||
|
|
||||||
headers += \
|
headers += \
|
||||||
base/base.h
|
base/base.h
|
||||||
|
|
||||||
|
35
orte/mca/ras/base/help-ras-base.txt
Обычный файл
35
orte/mca/ras/base/help-ras-base.txt
Обычный файл
@ -0,0 +1,35 @@
|
|||||||
|
# -*- text -*-
|
||||||
|
#
|
||||||
|
# Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
|
||||||
|
# University Research and Technology
|
||||||
|
# Corporation. All rights reserved.
|
||||||
|
# Copyright (c) 2004-2005 The University of Tennessee and The University
|
||||||
|
# of Tennessee Research Foundation. All rights
|
||||||
|
# reserved.
|
||||||
|
# Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
|
||||||
|
# University of Stuttgart. All rights reserved.
|
||||||
|
# Copyright (c) 2004-2005 The Regents of the University of California.
|
||||||
|
# All rights reserved.
|
||||||
|
# $COPYRIGHT$
|
||||||
|
#
|
||||||
|
# Additional copyrights may follow
|
||||||
|
#
|
||||||
|
# $HEADER$
|
||||||
|
#
|
||||||
|
# This is the US/English general help file for the RAS base.
|
||||||
|
#
|
||||||
|
[ras-base:no-allocation]
|
||||||
|
We were unable to find an allocation for this job as required by
|
||||||
|
setting the "allocation required" flag. Please ensure you have
|
||||||
|
the necessary allocation before executing again.
|
||||||
|
|
||||||
|
If you wish to execute without a provided allocation (e.g., by
|
||||||
|
providing a user-specified hostfile), please ensure that the "allocation
|
||||||
|
required" flag is not set. This flag can be set in several forms, so
|
||||||
|
please check that none of the following exist:
|
||||||
|
|
||||||
|
MCA param file: orte_allocation_required = 1
|
||||||
|
Environment: OMPI_MCA_orte_allocation_required=1
|
||||||
|
Cmd line: -mca orte_allocation_required 1
|
||||||
|
|
||||||
|
|
@ -29,6 +29,7 @@
|
|||||||
#include "orte/mca/errmgr/errmgr.h"
|
#include "orte/mca/errmgr/errmgr.h"
|
||||||
#include "orte/util/name_fns.h"
|
#include "orte/util/name_fns.h"
|
||||||
#include "orte/runtime/orte_globals.h"
|
#include "orte/runtime/orte_globals.h"
|
||||||
|
#include "orte/runtime/orte_wakeup.h"
|
||||||
#include "orte/util/hostfile/hostfile.h"
|
#include "orte/util/hostfile/hostfile.h"
|
||||||
#include "orte/util/dash_host/dash_host.h"
|
#include "orte/util/dash_host/dash_host.h"
|
||||||
#include "orte/util/proc_info.h"
|
#include "orte/util/proc_info.h"
|
||||||
@ -110,6 +111,15 @@ int orte_ras_base_allocate(orte_job_t *jdata)
|
|||||||
}
|
}
|
||||||
OBJ_DESTRUCT(&nodes);
|
OBJ_DESTRUCT(&nodes);
|
||||||
goto DISPLAY;
|
goto DISPLAY;
|
||||||
|
} else if (orte_allocation_required) {
|
||||||
|
/* if nothing was found, and an allocation is
|
||||||
|
* required, then error out
|
||||||
|
*/
|
||||||
|
OBJ_DESTRUCT(&nodes);
|
||||||
|
orte_show_help("help-ras-base.txt", "ras-base:no-allocation", true);
|
||||||
|
ORTE_UPDATE_EXIT_STATUS(ORTE_ERROR_DEFAULT_EXIT_CODE);
|
||||||
|
orte_wakeup();
|
||||||
|
return ORTE_ERROR;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
@ -64,6 +64,7 @@ char **orte_launch_environ;
|
|||||||
opal_pointer_array_t orte_daemonmap;
|
opal_pointer_array_t orte_daemonmap;
|
||||||
|
|
||||||
bool orte_hnp_is_allocated = false;
|
bool orte_hnp_is_allocated = false;
|
||||||
|
bool orte_allocation_required;
|
||||||
|
|
||||||
char *orte_launch_agent;
|
char *orte_launch_agent;
|
||||||
char **orted_cmd_line=NULL;
|
char **orted_cmd_line=NULL;
|
||||||
|
@ -349,6 +349,7 @@ ORTE_DECLSPEC extern char **orte_launch_environ;
|
|||||||
ORTE_DECLSPEC extern opal_pointer_array_t orte_daemonmap;
|
ORTE_DECLSPEC extern opal_pointer_array_t orte_daemonmap;
|
||||||
|
|
||||||
ORTE_DECLSPEC extern bool orte_hnp_is_allocated;
|
ORTE_DECLSPEC extern bool orte_hnp_is_allocated;
|
||||||
|
ORTE_DECLSPEC extern bool orte_allocation_required;
|
||||||
|
|
||||||
ORTE_DECLSPEC extern char *orte_launch_agent;
|
ORTE_DECLSPEC extern char *orte_launch_agent;
|
||||||
ORTE_DECLSPEC extern char **orted_cmd_line;
|
ORTE_DECLSPEC extern char **orted_cmd_line;
|
||||||
|
@ -164,7 +164,13 @@ int orte_register_params(void)
|
|||||||
mca_base_param_reg_string_name("orte", "launch_agent",
|
mca_base_param_reg_string_name("orte", "launch_agent",
|
||||||
"Command used to start processes on remote nodes (default: orted)",
|
"Command used to start processes on remote nodes (default: orted)",
|
||||||
false, false, "orted", &orte_launch_agent);
|
false, false, "orted", &orte_launch_agent);
|
||||||
|
|
||||||
|
/* whether or not to require RM allocation */
|
||||||
|
mca_base_param_reg_int_name("orte", "allocation_required",
|
||||||
|
"Whether or not an allocation by a resource manager is required [default: no]",
|
||||||
|
false, false, (int)false, &value);
|
||||||
|
orte_allocation_required = OPAL_INT_TO_BOOL(value);
|
||||||
|
|
||||||
#endif /* ORTE_DISABLE_FULL_SUPPORT */
|
#endif /* ORTE_DISABLE_FULL_SUPPORT */
|
||||||
|
|
||||||
return ORTE_SUCCESS;
|
return ORTE_SUCCESS;
|
||||||
|
Загрузка…
x
Ссылка в новой задаче
Block a user