Extend the ras module interface to include the orte_job_t being allocated so that dynamic allocations can be supported
This commit was SVN r27627.
Этот коммит содержится в:
родитель
994d1aba50
Коммит
1237f8db57
@ -49,7 +49,7 @@ typedef struct orte_ras_alps_sysconfig_t {
|
||||
} orte_ras_alps_sysconfig_t;
|
||||
|
||||
/* /// Local Functions /// */
|
||||
static int orte_ras_alps_allocate(opal_list_t *nodes);
|
||||
static int orte_ras_alps_allocate(orte_job_t *jdata, opal_list_t *nodes);
|
||||
|
||||
static int orte_ras_alps_finalize(void);
|
||||
|
||||
@ -287,7 +287,7 @@ orte_ras_get_appinfo_path(void)
|
||||
* requested number of nodes/process slots to the job.
|
||||
*/
|
||||
static int
|
||||
orte_ras_alps_allocate(opal_list_t *nodes)
|
||||
orte_ras_alps_allocate(orte_job_t *jdata, opal_list_t *nodes)
|
||||
{
|
||||
int ret;
|
||||
char *appinfo_path = NULL;
|
||||
|
@ -141,7 +141,7 @@ void orte_ras_base_allocate(int fd, short args, void *cbdata)
|
||||
*/
|
||||
if (NULL != orte_ras_base.active_module) {
|
||||
/* read the allocation */
|
||||
if (ORTE_SUCCESS != (rc = orte_ras_base.active_module->allocate(&nodes))) {
|
||||
if (ORTE_SUCCESS != (rc = orte_ras_base.active_module->allocate(jdata, &nodes))) {
|
||||
if (ORTE_ERR_SYSTEM_WILL_BOOTSTRAP == rc) {
|
||||
/* this module indicates that nodes will be discovered
|
||||
* on a bootstrap basis, so all we do here is add our
|
||||
|
@ -41,7 +41,7 @@
|
||||
/*
|
||||
* Local functions
|
||||
*/
|
||||
static int orte_ras_ccp_allocate(opal_list_t *nodes);
|
||||
static int orte_ras_ccp_allocate(orte_job_t *jdata, opal_list_t *nodes);
|
||||
static int orte_ras_ccp_finalize(void);
|
||||
static int discover(opal_list_t* nodelist, ICluster* pCluster);
|
||||
void ras_get_cluster_message(ICluster* pCluster);
|
||||
@ -60,7 +60,7 @@ orte_ras_base_module_t orte_ras_ccp_module = {
|
||||
* Discover available (pre-allocated) nodes. Allocate the
|
||||
* requested number of nodes/process slots to the job.
|
||||
*/
|
||||
static int orte_ras_ccp_allocate(opal_list_t *nodes)
|
||||
static int orte_ras_ccp_allocate(orte_job_t *jdata, opal_list_t *nodes)
|
||||
{
|
||||
int ret, i;
|
||||
size_t len;
|
||||
|
@ -37,7 +37,7 @@
|
||||
/*
|
||||
* Local functions
|
||||
*/
|
||||
static int orte_ras_gridengine_allocate(opal_list_t *nodes);
|
||||
static int orte_ras_gridengine_allocate(orte_job_t *jdata, opal_list_t *nodes);
|
||||
static int orte_ras_gridengine_finalize(void);
|
||||
#if 0
|
||||
static int get_slot_count(char* node_name, int* slot_cnt);
|
||||
@ -56,7 +56,7 @@ orte_ras_base_module_t orte_ras_gridengine_module = {
|
||||
* requested number of nodes/process slots to the job.
|
||||
*
|
||||
*/
|
||||
static int orte_ras_gridengine_allocate(opal_list_t *nodelist)
|
||||
static int orte_ras_gridengine_allocate(orte_job_t *jdata, opal_list_t *nodelist)
|
||||
{
|
||||
char *pe_hostfile = getenv("PE_HOSTFILE");
|
||||
char *job_id = getenv("JOB_ID");
|
||||
|
@ -38,7 +38,7 @@
|
||||
/*
|
||||
* Local functions
|
||||
*/
|
||||
static int orte_ras_loadleveler_allocate(opal_list_t *nodes);
|
||||
static int orte_ras_loadleveler_allocate(orte_job_t *jdata, opal_list_t *nodes);
|
||||
static int orte_ras_loadleveler_finalize(void);
|
||||
|
||||
static int orte_ras_loadleveler_discover(opal_list_t *nodelist);
|
||||
@ -59,7 +59,7 @@ orte_ras_base_module_t orte_ras_loadleveler_module = {
|
||||
* Discover available (pre-allocated) nodes. Allocate the
|
||||
* requested number of nodes/process slots to the job.
|
||||
*/
|
||||
static int orte_ras_loadleveler_allocate(opal_list_t *nodes)
|
||||
static int orte_ras_loadleveler_allocate(orte_job_t *jdata, opal_list_t *nodes)
|
||||
{
|
||||
int ret = ORTE_SUCCESS;
|
||||
|
||||
|
@ -38,7 +38,7 @@
|
||||
/*
|
||||
* Local functions
|
||||
*/
|
||||
static int allocate(opal_list_t *nodes);
|
||||
static int allocate(orte_job_t *jdata, opal_list_t *nodes);
|
||||
static int finalize(void);
|
||||
|
||||
|
||||
@ -51,7 +51,7 @@ orte_ras_base_module_t orte_ras_lsf_module = {
|
||||
};
|
||||
|
||||
|
||||
static int allocate(opal_list_t *nodes)
|
||||
static int allocate(orte_job_t *jdata, opal_list_t *nodes)
|
||||
{
|
||||
char **nodelist;
|
||||
orte_node_t *node;
|
||||
|
@ -9,7 +9,7 @@
|
||||
* University of Stuttgart. All rights reserved.
|
||||
* Copyright (c) 2004-2005 The Regents of the University of California.
|
||||
* All rights reserved.
|
||||
* Copyright (c) 2011 Los Alamos National Security, LLC. All rights
|
||||
* Copyright (c) 2011-2012 Los Alamos National Security, LLC. All rights
|
||||
* reserved.
|
||||
* $COPYRIGHT$
|
||||
*
|
||||
@ -28,7 +28,7 @@
|
||||
* that ORTE users will execute an "mpirun" or other command that
|
||||
* invokes ORTE through one of two channels:
|
||||
*
|
||||
* 1. local: the user will login to the computing resource they intend
|
||||
* 1. the user will login to the computing resource they intend
|
||||
* to use, request a resource allocation from that system, and then
|
||||
* execute the mpirun or other command. Thus, the allocation has
|
||||
* already been obtained prior to ORTE's initialization. In most
|
||||
@ -38,121 +38,12 @@
|
||||
* seek to support (e.g., an LSF component should know that LSF passes
|
||||
* allocation parameters as a specific LSF-named entity).
|
||||
*
|
||||
* 2. remote: the user issues an mpirun command on their notebook or
|
||||
* desktop computer, indicating that the application is to be executed
|
||||
* on a specific remote resource. In this case, the allocation may
|
||||
* not have been previously requested or made. Thus, the associated
|
||||
* 2. the user issues an mpirun command or runs an application that uses
|
||||
* ORTE without obtaining an allocation in advance. Thus, the associated
|
||||
* RAS component must know how to request an allocation from the
|
||||
* designated resource. To assist in this process, the RAS can turn to
|
||||
* the information provided by the resource discovery subsystem (RDS)
|
||||
* to learn what allocator resides on the designated resource.
|
||||
*
|
||||
* The RAS operates on a per-job basis - i.e., it serves to allocate
|
||||
* the resources for a specific job. It takes several inputs,
|
||||
* depending upon what is provided and desired:
|
||||
*
|
||||
* - the jobid for which the resources are to be allocated. There are
|
||||
* two options here: (a) the jobid can be predefined and provided to
|
||||
* the allocator. In this case, the allocator will simply allocate
|
||||
* resources to the job; or (b) the jobid can be set by the allocator
|
||||
* via a request to the ORTE name services (NS) subsystem. This option
|
||||
* is selected by calling the allocate function with the illegal jobid
|
||||
* of ORTE_JOBID_MAX. In this case, the new jobid (set by the
|
||||
* allocator) will be returned in the provided address (the allocate
|
||||
* function takes a pointer to the jobid as its argument).
|
||||
*
|
||||
* - MCA parameters specifying preallocated resources. These resources
|
||||
* are allocated to the specified jobid (whether set by the allocator
|
||||
* or not) on the first request. However, subsequent requests for
|
||||
* allocation do NOT use these parameters - the parameters are "unset"
|
||||
* after initial use. This is done to prevent subsequent allocation
|
||||
* requests from unintentionally overloading the specified resources
|
||||
* in cases where the universe is persistent and therefore servicing
|
||||
* multiple applications.
|
||||
*
|
||||
* - MCA parameters specifying the name of the application(s) and the
|
||||
* number of each application to be executed. These will usually be
|
||||
* taken from the command line options, but could be provided via
|
||||
* environmental parameters.
|
||||
*
|
||||
* - the resources defined in the ORTE_RESOURCE_SEGMENT by the
|
||||
* RDS. When an allocation is requested for resources not previously
|
||||
* allocated, the RAS will attempt to obtain an allocation that meets
|
||||
* the specified requirements. For example, if the user specifies that
|
||||
* the application must run on an Intel Itanium 2 resource under the
|
||||
* Linux operating system, but doesn't provide the allocation or
|
||||
* resource identification, then the allocator can (if possible)
|
||||
* search the ORTE_RESOURCE_SEGMENT for resources meeting those
|
||||
* specifications and attempt to obtain an allocation from them.
|
||||
*
|
||||
* The RAS outputs its results into three registry segments:
|
||||
*
|
||||
* (a) the ORTE_NODE_STATUS_SEGMENT. The segment consists of a
|
||||
* registry container for each node that has been allocated to a job -
|
||||
* for proper operation, each container MUST be described by the
|
||||
* following set of tokens:
|
||||
*
|
||||
* - nodename: a unique name assigned to each node, usually obtained
|
||||
* from the preallocated information in the environmental variables or
|
||||
* the resource manager for the specified compute resource (e.g.,
|
||||
* LSF). For those cases where specific nodenames are not provided,
|
||||
* the allocator can use the info provided by the RDS to attempt to
|
||||
* determine the nodenames (e.g., if the RDS learned that the nodes
|
||||
* are named q0-q1024 and we obtain an allocation of 100 nodes
|
||||
* beginning at node 512, then the RAS can derive the nodenames from
|
||||
* this information).
|
||||
*
|
||||
* For each node, the RAS stores the following information on the segment:
|
||||
*
|
||||
* - number of cpus allocated from this node to the user. This will
|
||||
* normally be the number of cpus/node as obtained from the data
|
||||
* provided by the RDS, but could differ in some systems.
|
||||
*
|
||||
* - the jobids that are utilizing this node. In systems that allow
|
||||
* overloading of processes onto nodes, there may be multiple jobs
|
||||
* sharing a given node.
|
||||
*
|
||||
* - the status of the node (up, down, rebooting, etc.). This
|
||||
* information is provided and updated by the state-of-health (SOH)
|
||||
* monitoring subsystem.
|
||||
*
|
||||
* (b) the ORTE_JOB_SEGMENT. The RAS preallocates this segment,
|
||||
* initializing one container for each process plus one container to
|
||||
* store information that spans the job. This latter container houses
|
||||
* information such as the application names, number of processes per
|
||||
* application, process context (including argv and enviro arrays),
|
||||
* and i/o forwarding info. The RAS does NOT establish nor fill any of
|
||||
* the individual process info containers - rather, it preallocates
|
||||
* the storage for those containers and places some of the job-wide
|
||||
* information into that container. This info includes:
|
||||
*
|
||||
* - application names and number of processes per application
|
||||
*
|
||||
* - process context
|
||||
*
|
||||
* The remainder of the information in that container will be supplied
|
||||
* by other subsystems.
|
||||
*
|
||||
* (c) the ORTE_RESOURCE_SEGMENT. The RAS adds information to this
|
||||
* segment to indicate consumption of an available resource. In
|
||||
* particular, the RAS updates fields in the respective compute
|
||||
* resource to indicate the portion of that resource that has been
|
||||
* allocated and therefore can be presumed consumed. This includes
|
||||
* info on the number of nodes and cpus allocated to existing jobs -
|
||||
* these numbers are updated by the RAS when resources are deallocated
|
||||
* at the completion of a job.
|
||||
*
|
||||
* The information provided by the RAS is consumed by the resource
|
||||
* mapper subsystem (RMAPS) that defines which process is executed
|
||||
* upon which node/cpu, the process launch subsystem (PLS) that
|
||||
* actually launches each process, and others.
|
||||
*
|
||||
* Because the RAS operates as a multi-component framework (i.e.,
|
||||
* multiple components may be simultaneously instantiated), the RAS
|
||||
* functions should NOT be called directly. Instead, they should be
|
||||
* accessed via the ORTE resource manager (RMGR) subsystem.
|
||||
*
|
||||
*
|
||||
* designated resource. If it doesn't, or it cannot obtain the allocation,
|
||||
* then it shall indicate this by setting the system to an appropriate
|
||||
* state.
|
||||
*/
|
||||
|
||||
#ifndef ORTE_MCA_RAS_H
|
||||
@ -185,7 +76,8 @@ ORTE_DECLSPEC extern opal_event_t orte_allocate_event;
|
||||
/**
|
||||
* Allocate resources to a job.
|
||||
*/
|
||||
typedef int (*orte_ras_base_module_allocate_fn_t)(opal_list_t *nodes);
|
||||
typedef int (*orte_ras_base_module_allocate_fn_t)(orte_job_t *jdata,
|
||||
opal_list_t *nodes);
|
||||
|
||||
/**
|
||||
* Cleanup module resources.
|
||||
|
@ -28,7 +28,7 @@
|
||||
/*
|
||||
* Local functions
|
||||
*/
|
||||
static int allocate(opal_list_t *nodes);
|
||||
static int allocate(orte_job_t *jdata, opal_list_t *nodes);
|
||||
static int finalize(void);
|
||||
|
||||
|
||||
@ -40,7 +40,7 @@ orte_ras_base_module_t orte_ras_sim_module = {
|
||||
finalize
|
||||
};
|
||||
|
||||
static int allocate(opal_list_t *nodes)
|
||||
static int allocate(orte_job_t *jdata, opal_list_t *nodes)
|
||||
{
|
||||
int i, n, val, dig, num_nodes;
|
||||
orte_node_t *node;
|
||||
|
@ -40,7 +40,7 @@
|
||||
/*
|
||||
* Local functions
|
||||
*/
|
||||
static int orte_ras_slurm_allocate(opal_list_t *nodes);
|
||||
static int orte_ras_slurm_allocate(orte_job_t *jdata, opal_list_t *nodes);
|
||||
static int orte_ras_slurm_finalize(void);
|
||||
|
||||
static int orte_ras_slurm_discover(char *regexp, char* tasks_per_node,
|
||||
@ -63,7 +63,7 @@ orte_ras_base_module_t orte_ras_slurm_module = {
|
||||
* requested number of nodes/process slots to the job.
|
||||
*
|
||||
*/
|
||||
static int orte_ras_slurm_allocate(opal_list_t *nodes)
|
||||
static int orte_ras_slurm_allocate(orte_job_t *jdata, opal_list_t *nodes)
|
||||
{
|
||||
int ret, cpus_per_task;
|
||||
char *slurm_node_str, *regexp;
|
||||
|
@ -38,7 +38,7 @@
|
||||
/*
|
||||
* Local functions
|
||||
*/
|
||||
static int allocate(opal_list_t *nodes);
|
||||
static int allocate(orte_job_t *jdata, opal_list_t *nodes);
|
||||
static int finalize(void);
|
||||
|
||||
static int discover(opal_list_t* nodelist, char *pbs_jobid);
|
||||
@ -62,7 +62,7 @@ orte_ras_base_module_t orte_ras_tm_module = {
|
||||
* them back to the caller.
|
||||
*
|
||||
*/
|
||||
static int allocate(opal_list_t *nodes)
|
||||
static int allocate(orte_job_t *jdata, opal_list_t *nodes)
|
||||
{
|
||||
int ret;
|
||||
char *pbs_jobid;
|
||||
|
Загрузка…
x
Ссылка в новой задаче
Block a user