d0eb7d7216
Modify the mapper to better bookmark its stopping place each time, and to pick up the next time from there. This needs to be validated on a multi-node system. Fix a major memory corruption problem in the registry put/get functions that was doing multiple free's. Not sure how valgrind missed this one, though it only occurred in specific circumstances (such as comm_spawn). This commit was SVN r12179.
331 строка
12 KiB
C
331 строка
12 KiB
C
/*
|
|
* Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
|
|
* University Research and Technology
|
|
* Corporation. All rights reserved.
|
|
* Copyright (c) 2004-2006 The University of Tennessee and The University
|
|
* of Tennessee Research Foundation. All rights
|
|
* reserved.
|
|
* Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
|
|
* University of Stuttgart. All rights reserved.
|
|
* Copyright (c) 2004-2005 The Regents of the University of California.
|
|
* All rights reserved.
|
|
* $COPYRIGHT$
|
|
*
|
|
* Additional copyrights may follow
|
|
*
|
|
* $HEADER$
|
|
*/
|
|
/** @file:
|
|
*
|
|
* The Open RTE Resource Manager (RMGR) Subsystem
|
|
*
|
|
* The resource manager (RMGR) subsystem serves as the central
|
|
* switchyard for all resource management activities, including
|
|
* resource discovery, resource allocation, process mapping, and
|
|
* process launch.
|
|
*/
|
|
|
|
#ifndef ORTE_RMGR_H
|
|
#define ORTE_RMGR_H
|
|
|
|
/*
|
|
* includes
|
|
*/
|
|
|
|
#include "orte_config.h"
|
|
#include "orte/orte_constants.h"
|
|
|
|
#include "opal/mca/mca.h"
|
|
#include "orte/mca/ns/ns_types.h"
|
|
#include "orte/mca/gpr/gpr_types.h"
|
|
#include "orte/mca/smr/smr_types.h"
|
|
#include "rmgr_types.h"
|
|
|
|
#if defined(c_plusplus) || defined(__cplusplus)
|
|
extern "C" {
|
|
#endif
|
|
|
|
/*
|
|
* Component functions - all MUST be provided!
|
|
*/
|
|
|
|
/**
|
|
* Setup a job. Allocated a jobid and initializes the job segment.
|
|
*
|
|
* @param app_context Array of application context values.
|
|
* @param num_context Number of entries in the app_context array.
|
|
* @param jobid Returns id allocated to the job.
|
|
*
|
|
* @code
|
|
* orte_jobid_t jobid;
|
|
*
|
|
* return_value = orte_rmgr.setup_job(app_context,num_context,&jobid);
|
|
* @endcode
|
|
*/
|
|
typedef int (*orte_rmgr_base_module_setup_job_fn_t)(
|
|
orte_app_context_t** app_context,
|
|
orte_std_cntr_t num_context,
|
|
orte_jobid_t *jobid);
|
|
|
|
/*
|
|
* Callback function for resource manager
|
|
*/
|
|
typedef void (*orte_rmgr_cb_fn_t)(orte_jobid_t jobid, orte_proc_state_t state);
|
|
|
|
/**
|
|
* Shortcut to spawn an applications. Perform all steps required to
|
|
* launch the specified application.
|
|
*
|
|
* (1) Create the application context - create a jobid
|
|
* (2) Allocated resources to the job.
|
|
* (3) Map processes to allocated resources
|
|
* (4) Launch the job.
|
|
* (5) Callback function - gets called all procs reach specified conditions (if NULL, then no callback done)
|
|
* (6) callback conditions - flag indicating which triggers are to generate callbacks to the specified function
|
|
*
|
|
* @code
|
|
* orte_jobid_t jobid;
|
|
*
|
|
* return_value = orte_rmgr.spawn(app_context, num_context, &jobid, NULL, 0);
|
|
* @endcode
|
|
*/
|
|
typedef int (*orte_rmgr_base_module_spawn_job_fn_t)(
|
|
orte_app_context_t** app_context,
|
|
orte_std_cntr_t num_context,
|
|
orte_jobid_t *jobid,
|
|
orte_std_cntr_t num_connect,
|
|
orte_process_name_t *connect,
|
|
orte_rmgr_cb_fn_t cbfn,
|
|
orte_proc_state_t cb_conditions,
|
|
opal_list_t *attributes);
|
|
|
|
|
|
/**
|
|
* Connect a process to other processes, possibly in other jobs. Note that this
|
|
* function supports WILDCARD process name fields. Thus, a process can request
|
|
* connection to all other processes in another job by providing a single
|
|
* entry in the connect array that has a cellid of ORTE_CELLID_WILDCARD, the
|
|
* desired jobid, and a vpid of ORTE_VPID_WILDCARD.
|
|
*/
|
|
typedef int (*orte_rmgr_base_module_connect_fn_t)(orte_std_cntr_t num_connect,
|
|
orte_process_name_t *connect);
|
|
|
|
/**
|
|
* Disconnect a process from one or more other processes. Note that this
|
|
* function supports WILDCARD process name fields. Thus, a process can request
|
|
* to disconnect from all other processes in another job by providing a single
|
|
* entry in the connect array that has a cellid of ORTE_CELLID_WILDCARD, the
|
|
* desired jobid, and a vpid of ORTE_VPID_WILDCARD.
|
|
*/
|
|
typedef int (*orte_rmgr_base_module_disconnect_fn_t)(orte_std_cntr_t num_disconnect,
|
|
orte_process_name_t *disconnect);
|
|
|
|
|
|
/**
|
|
* Allow module-specific init.
|
|
*/
|
|
|
|
typedef int (*orte_rmgr_base_module_init_fn_t)(void);
|
|
|
|
/**
|
|
* Cleanup resources held by rmgr.
|
|
*/
|
|
|
|
typedef int (*orte_rmgr_base_module_finalize_fn_t)(void);
|
|
|
|
/**
|
|
* GENERAL UTILITY FUNCTIONS
|
|
*/
|
|
|
|
/**
|
|
* Find an attribute
|
|
* Given a list of attributes, return a pointer to the specified attribute
|
|
*
|
|
* @param attr_list A pointer to the list of attributes
|
|
* @param key The key indicating the attribute to be returned.
|
|
* @retval ptr A pointer to the orte_gpr_keyval_t containing the attribute.
|
|
* Note that this value is *not* being duplicated, nor is it being
|
|
* removed from the list, so alterations or release of the object will impact the list!
|
|
* @retval NULL The specified attribute was not found - this function does not
|
|
* consider this to be an error, so no ORTE_ERROR_LOG is printed out when this happens.
|
|
*/
|
|
typedef orte_attribute_t* (*orte_rmgr_base_module_find_attribute_fn_t)(opal_list_t* attr_list, char* key);
|
|
|
|
/**
|
|
* Add an attribute
|
|
* Given a list of attributes and the data for a new attribute,
|
|
* this function will create the gpr_keyval_t object for that attribute,
|
|
* populate it with the provided data, and append it to the list. If
|
|
* overwrite is set to true AND the value is found on the list, then
|
|
* it will be overwritten with the new value. If overwrite is NOT set
|
|
* and the value is found on the list, it will be left alone - the value
|
|
* will NOT be updated with the one provided.
|
|
*
|
|
* @param attr_list A pointer to the list of attributes
|
|
* @param key The key for the attribute.
|
|
* @param type The data type to be stored in the attribute. A value
|
|
* of ORTE_UNDEF is acceptable to indicate that no data is being stored -
|
|
* the existence of the attribute on the list is all that is required.
|
|
* @param data A pointer to the data to be stored in the attribute. NULL
|
|
* is acceptable IF the data type is ORTE_UNDEF.
|
|
* @param overwrite Indicates if a pre-existing value can be overwritten or not
|
|
* @retval ORTE_SUCCESS Attribute was added to list.
|
|
* @retval ORTE_ERROR An appropriate error code indicating what went wrong.
|
|
*/
|
|
typedef int (*orte_rmgr_base_module_add_attribute_fn_t)(opal_list_t* attr_list, char* key,
|
|
orte_data_type_t type, void *data,
|
|
bool overwrite);
|
|
|
|
|
|
/**
|
|
* Merge two attribute lists
|
|
* Given two lists of attributes, this function will merge the second list into
|
|
* the first. The boolean defines how to handle matching entries - if set to
|
|
* true (ORTE_RMGR_ATTR_OVERRIDE), entries in the second list will OVERWRITE
|
|
* matching entries in the first list. If set to false (ORTE_RMGR_ATTR_NO_OVERRIDE)
|
|
* matching entries in the second list will be ignored.
|
|
*
|
|
* @param attr_list A pointer to the list of attributes
|
|
* @param key The key for the attribute.
|
|
* @param type The data type to be stored in the attribute. A value
|
|
* of ORTE_UNDEF is acceptable to indicate that no data is being stored -
|
|
* the existence of the attribute on the list is all that is required.
|
|
* @param data A pointer to the data to be stored in the attribute. NULL
|
|
* is acceptable IF the data type is ORTE_UNDEF.
|
|
* @retval ORTE_SUCCESS Attribute was added to list.
|
|
* @retval ORTE_ERROR An appropriate error code indicating what went wrong.
|
|
*/
|
|
typedef int (*orte_rmgr_base_module_merge_attributes_fn_t)(opal_list_t* target,
|
|
opal_list_t* source,
|
|
bool override);
|
|
|
|
|
|
/**
|
|
* Delete an attribute
|
|
* Given a pointer array of attributes, delete the specified attribute
|
|
*
|
|
* @param attr_list A pointer to the list of attributes
|
|
* @param key The key indicating the attribute to be deleted.
|
|
* @retval ORTE_SUCCESS Attribute was added to list.
|
|
* @retval ORTE_ERROR An appropriate error code indicating what went wrong. Note that
|
|
* an error code of NOT_FOUND will be returned if the specified attribute is
|
|
* not on the provided list - it is up to the caller to decide if this is an actual
|
|
* error. This function will NOT do an ORTE_ERROR_LOG in the case of NOT_FOUND.
|
|
*/
|
|
typedef int (*orte_rmgr_base_module_delete_attribute_fn_t)(opal_list_t* attr_list, char* key);
|
|
|
|
|
|
/*** APP_CONTEXT FUNCTIONS ***/
|
|
/*
|
|
* Store an array of app_context objects for a given job/pset
|
|
*/
|
|
typedef int (*orte_rmgr_base_module_store_app_context_fn_t)(orte_jobid_t jobid,
|
|
orte_app_context_t** app_context,
|
|
orte_std_cntr_t num_context);
|
|
|
|
/*
|
|
* Get an array of app_context objects for a given job/pset
|
|
*/
|
|
typedef int (*orte_rmgr_base_module_get_app_context_fn_t)(orte_jobid_t jobid,
|
|
orte_app_context_t ***app_context,
|
|
orte_std_cntr_t *num_context);
|
|
|
|
/*
|
|
* Check the app_context for changing to a working dir or the HOME dir
|
|
*/
|
|
typedef int (*orte_rmgr_base_module_check_context_cwd_fn_t)(orte_app_context_t *context,
|
|
bool want_chdir);
|
|
|
|
/*
|
|
* Check app_context application for existence
|
|
*/
|
|
typedef int (*orte_rmgr_base_module_check_context_app_fn_t)(orte_app_context_t *context);
|
|
|
|
/**
|
|
* VPID FUNCTIONS
|
|
*/
|
|
|
|
/**
|
|
* Store the vpid range of a job
|
|
*/
|
|
typedef int (*orte_rmgr_base_module_set_vpid_range_fn_t)(orte_jobid_t jobid,
|
|
orte_vpid_t start,
|
|
orte_vpid_t range);
|
|
|
|
|
|
/**
|
|
* Retrieve the vpid range of a job
|
|
*/
|
|
typedef int (*orte_rmgr_base_module_get_vpid_range_fn_t)(orte_jobid_t jobid,
|
|
orte_vpid_t *start,
|
|
orte_vpid_t *range);
|
|
|
|
|
|
/*
|
|
* Ver 2.0
|
|
*/
|
|
struct orte_rmgr_base_module_2_0_0_t {
|
|
orte_rmgr_base_module_init_fn_t module_init;
|
|
orte_rmgr_base_module_setup_job_fn_t setup_job;
|
|
orte_rmgr_base_module_spawn_job_fn_t spawn_job;
|
|
orte_rmgr_base_module_connect_fn_t connect;
|
|
orte_rmgr_base_module_disconnect_fn_t disconnect;
|
|
orte_rmgr_base_module_finalize_fn_t finalize;
|
|
/** SUPPORT FUNCTIONS ***/
|
|
orte_rmgr_base_module_find_attribute_fn_t find_attribute;
|
|
orte_rmgr_base_module_add_attribute_fn_t add_attribute;
|
|
orte_rmgr_base_module_merge_attributes_fn_t merge_attributes;
|
|
orte_rmgr_base_module_delete_attribute_fn_t delete_attribute;
|
|
orte_rmgr_base_module_get_app_context_fn_t get_app_context;
|
|
orte_rmgr_base_module_store_app_context_fn_t store_app_context;
|
|
orte_rmgr_base_module_check_context_cwd_fn_t check_context_cwd;
|
|
orte_rmgr_base_module_check_context_app_fn_t check_context_app;
|
|
orte_rmgr_base_module_set_vpid_range_fn_t set_vpid_range;
|
|
orte_rmgr_base_module_get_vpid_range_fn_t get_vpid_range;
|
|
};
|
|
|
|
typedef struct orte_rmgr_base_module_2_0_0_t orte_rmgr_base_module_2_0_0_t;
|
|
typedef orte_rmgr_base_module_2_0_0_t orte_rmgr_base_module_t;
|
|
|
|
/*
|
|
* RMGR Component
|
|
*/
|
|
|
|
typedef orte_rmgr_base_module_t* (*orte_rmgr_base_component_init_fn_t)(
|
|
int *priority);
|
|
|
|
|
|
/*
|
|
* the standard component data structure
|
|
*/
|
|
|
|
struct orte_rmgr_base_component_2_0_0_t {
|
|
mca_base_component_t rmgr_version;
|
|
mca_base_component_data_1_0_0_t rmgr_data;
|
|
orte_rmgr_base_component_init_fn_t rmgr_init;
|
|
};
|
|
typedef struct orte_rmgr_base_component_2_0_0_t orte_rmgr_base_component_2_0_0_t;
|
|
typedef orte_rmgr_base_component_2_0_0_t orte_rmgr_base_component_t;
|
|
|
|
|
|
|
|
/**
|
|
* Macro for use in components that are of type rmgr v2.0.0
|
|
*/
|
|
#define ORTE_RMGR_BASE_VERSION_2_0_0 \
|
|
/* rmgr v2.0 is chained to MCA v1.0 */ \
|
|
MCA_BASE_VERSION_1_0_0, \
|
|
/* rmgr v2.0 */ \
|
|
"rmgr", 2, 0, 0
|
|
|
|
/**
|
|
* Global structure for accessing RAS functions
|
|
*/
|
|
ORTE_DECLSPEC extern orte_rmgr_base_module_t orte_rmgr; /* holds selected module's function pointers */
|
|
|
|
#if defined(c_plusplus) || defined(__cplusplus)
|
|
}
|
|
#endif
|
|
|
|
#endif
|