1
1

Add several new interface functions to the name services:

1. dump_xxx - analogous to the registry's dump commands, allows you to examine the contents of the name services' structures

2. get_job_peers - get an array of process names for all processes in the specified job

This commit was SVN r6759.
Этот коммит содержится в:
Ralph Castain 2005-08-07 13:21:52 +00:00
родитель 5208f9001d
Коммит c530521a8e
11 изменённых файлов: 1850 добавлений и 755 удалений

Просмотреть файл

@ -34,10 +34,9 @@ libmca_ns_base_la_SOURCES = \
ns_base_close.c \
ns_base_select.c \
ns_base_open.c \
ns_base_nds.c \
ns_base_local_fns.c \
data_type_support/ns_data_type_packing_fns.c \
data_type_support/ns_data_type_unpacking_fns.c
data_type_support/ns_data_type_packing_fns.c \
data_type_support/ns_data_type_unpacking_fns.c
# Conditionally install the header files

Просмотреть файл

@ -39,6 +39,9 @@
extern "C" {
#endif
/* default limits */
#define ORTE_NS_ARRAY_MAX_SIZE INT_MAX
#define ORTE_NS_ARRAY_BLOCK_SIZE 512
/*
* Internal definitions
*/
@ -55,12 +58,18 @@ typedef uint8_t orte_ns_cmd_bitmask_t;
/*
* define flag values for remote commands - only used internally
*/
#define ORTE_NS_CREATE_CELLID_CMD 0x01
#define ORTE_NS_CREATE_JOBID_CMD 0x02
#define ORTE_NS_RESERVE_RANGE_CMD 0x04
#define ORTE_NS_ASSIGN_OOB_TAG_CMD 0x08
#define ORTE_NS_DEFINE_DATA_TYPE_CMD 0x10
#define ORTE_NS_CREATE_MY_NAME_CMD 0x20
#define ORTE_NS_CREATE_CELLID_CMD (int8_t)0x01
#define ORTE_NS_CREATE_JOBID_CMD (int8_t)0x02
#define ORTE_NS_RESERVE_RANGE_CMD (int8_t)0x04
#define ORTE_NS_ASSIGN_OOB_TAG_CMD (int8_t)0x08
#define ORTE_NS_GET_JOB_PEERS_CMD (int8_t)0x0A
#define ORTE_NS_DEFINE_DATA_TYPE_CMD (int8_t)0x10
#define ORTE_NS_CREATE_MY_NAME_CMD (int8_t)0x20
#define ORTE_NS_DUMP_CELLS_CMD (int8_t)0x21
#define ORTE_NS_DUMP_JOBIDS_CMD (int8_t)0x22
#define ORTE_NS_DUMP_TAGS_CMD (int8_t)0x23
#define ORTE_NS_DUMP_DATATYPES_CMD (int8_t)0x24
/*
* function definitions
@ -119,6 +128,9 @@ OMPI_DECLSPEC int orte_ns_base_compare(orte_ns_cmp_bitmask_t fields,
OMPI_DECLSPEC int orte_ns_base_free_name(orte_process_name_t **name);
OMPI_DECLSPEC int orte_ns_base_print_dump(orte_buffer_t *buffer, int output_id);
/* not available functions */
OMPI_DECLSPEC int orte_ns_base_module_init_not_available(void);
@ -146,6 +158,14 @@ OMPI_DECLSPEC int orte_ns_base_define_data_type_not_available(
OMPI_DECLSPEC int orte_ns_base_create_my_name_not_available(void);
OMPI_DECLSPEC int orte_ns_base_get_job_peers_not_available(orte_process_name_t **procs,
size_t *num_procs, orte_jobid_t job);
OMPI_DECLSPEC int orte_ns_base_dump_cells_not_available(int output_id);
OMPI_DECLSPEC int orte_ns_base_dump_jobs_not_available(int output_id);
OMPI_DECLSPEC int orte_ns_base_dump_tags_not_available(int output_id);
OMPI_DECLSPEC int orte_ns_base_dump_datatypes_not_available(int output_id);
/* Base functions used everywhere */
OMPI_DECLSPEC int orte_ns_base_get_peers(orte_process_name_t **procs,
size_t *num_procs, size_t *self);

Просмотреть файл

@ -111,6 +111,43 @@ orte_ns_base_create_my_name_not_available(void)
return ORTE_ERR_UNREACH;
}
int orte_ns_base_get_job_peers_not_available(orte_process_name_t **procs,
size_t *num_procs, orte_jobid_t job)
{
*procs = NULL;
*num_procs = 0;
ORTE_ERROR_LOG(ORTE_ERR_UNREACH);
return ORTE_ERR_UNREACH;
}
int
orte_ns_base_dump_cells_not_available(int output_id)
{
ORTE_ERROR_LOG(ORTE_ERR_UNREACH);
return ORTE_ERR_UNREACH;
}
int
orte_ns_base_dump_jobs_not_available(int output_id)
{
ORTE_ERROR_LOG(ORTE_ERR_UNREACH);
return ORTE_ERR_UNREACH;
}
int
orte_ns_base_dump_tags_not_available(int output_id)
{
ORTE_ERROR_LOG(ORTE_ERR_UNREACH);
return ORTE_ERR_UNREACH;
}
int
orte_ns_base_dump_datatypes_not_available(int output_id)
{
ORTE_ERROR_LOG(ORTE_ERR_UNREACH);
return ORTE_ERR_UNREACH;
}
/*
@ -566,3 +603,72 @@ int orte_ns_base_free_name(orte_process_name_t **name)
return ORTE_SUCCESS;
}
int orte_ns_base_get_peers(orte_process_name_t **procs,
size_t *num_procs, size_t *self)
{
size_t i;
int rc;
orte_cellid_t mycellid;
orte_jobid_t myjobid;
orte_vpid_t myvpid;
*procs = (orte_process_name_t*)malloc(orte_process_info.num_procs *
sizeof(orte_process_name_t));
if (NULL == *procs) {
return ORTE_ERR_OUT_OF_RESOURCE;
}
if (ORTE_SUCCESS != (rc = orte_ns.get_cellid(&mycellid, orte_process_info.my_name))) {
ORTE_ERROR_LOG(rc);
return rc;
}
if (ORTE_SUCCESS != orte_ns.get_jobid(&myjobid, orte_process_info.my_name)) {
ORTE_ERROR_LOG(rc);
return rc;
}
if (ORTE_SUCCESS != orte_ns.get_vpid(&myvpid, orte_process_info.my_name)) {
ORTE_ERROR_LOG(rc);
return rc;
}
for (i=0; i < orte_process_info.num_procs; i++) {
(*procs)[i].cellid = mycellid;
(*procs)[i].jobid = myjobid;
(*procs)[i].vpid = orte_process_info.vpid_start + i;
}
*num_procs = orte_process_info.num_procs;
*self = (size_t)(myvpid - orte_process_info.vpid_start);
return ORTE_SUCCESS;
}
/*
* DIAGNOSTIC FUNCTIONS
*/
int orte_ns_base_print_dump(orte_buffer_t *buffer, int output_id)
{
char *line;
size_t n;
orte_data_type_t type;
int rc;
n = 1;
while (ORTE_SUCCESS == orte_dps.peek(buffer, &type, &n)) {
if (ORTE_SUCCESS !=
(rc = orte_dps.unpack(buffer, &line, &n, ORTE_STRING))) {
ORTE_ERROR_LOG(rc);
return rc;
}
opal_output(output_id, "%s", line);
free(line);
n=1;
}
return ORTE_SUCCESS;
}

Просмотреть файл

@ -48,36 +48,50 @@ orte_process_name_t orte_name_all = {ORTE_CELLID_MAX, ORTE_JOBID_MAX, ORTE_VPID_
*/
int mca_ns_base_output = -1;
OMPI_DECLSPEC mca_ns_base_module_t orte_ns = {
/* init */
orte_ns_base_module_init_not_available,
/* cell functions */
orte_ns_base_create_cellid_not_available,
orte_ns_base_get_cellid,
orte_ns_base_get_cell_info_not_available,
orte_ns_base_assign_cellid_to_process,
orte_ns_base_get_cellid_string,
orte_ns_base_convert_cellid_to_string,
orte_ns_base_convert_string_to_cellid,
/* jobid functions */
orte_ns_base_create_jobid_not_available,
orte_ns_base_get_jobid,
orte_ns_base_get_jobid_string,
orte_ns_base_convert_jobid_to_string,
orte_ns_base_convert_string_to_jobid,
/* vpid functions */
orte_ns_base_get_vpid_range_not_available,
orte_ns_base_get_vpid,
orte_ns_base_get_vpid_string,
orte_ns_base_convert_vpid_to_string,
orte_ns_base_convert_string_to_vpid,
/* name functions */
orte_ns_base_create_process_name,
orte_ns_base_create_my_name_not_available,
orte_ns_base_copy_process_name,
orte_ns_base_convert_string_to_process_name,
orte_ns_base_get_vpid_range_not_available,
orte_ns_base_free_name,
orte_ns_base_get_proc_name_string,
orte_ns_base_get_vpid_string,
orte_ns_base_convert_vpid_to_string,
orte_ns_base_convert_string_to_vpid,
orte_ns_base_get_jobid_string,
orte_ns_base_convert_jobid_to_string,
orte_ns_base_convert_string_to_jobid,
orte_ns_base_get_cellid_string,
orte_ns_base_convert_cellid_to_string,
orte_ns_base_convert_string_to_cellid,
orte_ns_base_get_vpid,
orte_ns_base_get_jobid,
orte_ns_base_get_cellid,
orte_ns_base_compare,
orte_ns_base_derive_vpid,
/* peer functions */
orte_ns_base_get_peers,
orte_ns_base_get_job_peers_not_available,
/* tag server functions */
orte_ns_base_assign_rml_tag_not_available,
/* data type functions */
orte_ns_base_define_data_type_not_available,
orte_ns_base_get_peers
/* diagnostic functions */
orte_ns_base_dump_cells_not_available,
orte_ns_base_dump_jobs_not_available,
orte_ns_base_dump_tags_not_available,
orte_ns_base_dump_datatypes_not_available
};
bool mca_ns_base_selected = false;
opal_list_t mca_ns_base_components_available;
mca_ns_base_component_t mca_ns_base_selected_component;

Просмотреть файл

@ -54,6 +54,7 @@ extern "C" {
*/
typedef int (*orte_ns_base_module_init_fn_t)(void);
/**** CELL FUNCTIONS ****/
/**
* Create a new cell id.
* The create_cellid() function allocates a new cell id for use by the caller.
@ -122,6 +123,81 @@ typedef int (*orte_ns_base_module_get_cell_info_fn_t)(orte_cellid_t cellid,
*/
typedef int (*orte_ns_base_module_assign_cellid_to_process_fn_t)(orte_process_name_t* name);
/**
* Get the cell id as a character string.
* The get_cellid_string() function returns the cell id in a character string
* representation. The string is created by expressing the field in hexadecimal. Memory
* for the string is allocated by the function - releasing that allocation is the
* responsibility of the calling program.
*
* @param *name A pointer to the name structure containing the name to be
* "translated" to a string.
*
* @retval *name_string A pointer to the character string representation of the
* cell id.
* @retval NULL Indicates an error occurred - either no memory could be allocated
* or the caller provided an incorrect name pointer (e.g., NULL).
*
* @code
* cellid-string = ompi_name_server.get_cellid_string(&name)
* @endcode
*/
typedef int (*orte_ns_base_module_get_cellid_string_fn_t)(char **cellid_string, const orte_process_name_t* name);
/**
* Convert cellid to character string
* Returns the cellid in a character string representation. The string is created
* by expressing the provided cellid in hexadecimal. Memory for the string is
* allocated by the function - releasing that allocation is the responsibility of
* the calling program.
*
* @param cellid The cellid to be converted.
*
* @retval *cellid_string A pointer to a character string representation of the cellid.
* @retval NULL Indicates an error occurred - probably no memory could be allocated.
*
* @code
* cellid-string = ompi_name_server.convert_cellid_to_string(cellid);
* @endcode
*/
typedef int (*orte_ns_base_module_convert_cellid_to_string_fn_t)(char **cellid_string, const orte_cellid_t cellid);
/**
* Convert a string to a cellid.
* Converts a characters string into a cellid. The character string must be a
* hexadecimal representation of a valid cellid.
*
* @param cellidstring The string to be converted.
*
* @retval cellid The resulting cellid
* @retval MCA_NS_BASE_CELLID_MAX String could not be converted.
*
* @code
* cellid = ompi_name_server.convert_string_to_cellid(cellidstring);
* @endcode
*/
typedef int (*orte_ns_base_module_convert_string_to_cellid_fn_t)(orte_cellid_t *cellid, const char *cellidstring);
/**
* Get the cell id as a numberic value.
* The get_cellid() function returns the cell id in a numeric representation -
* i.e., in an integer form.
*
* @param *name A pointer to the name structure containing the name.
*
* @retval cellid The cell id field of the provided name.
* @retval MCA_NS_BASE_CELLID_MAX Indicates that an error occurred - in this case, that
* the name variable provided was NULL.
*
* @code
* cellid = ompi_name_server.get_cellid(&name)
* @endcode
*/
typedef int (*orte_ns_base_module_get_cellid_fn_t)(orte_cellid_t *cellid, const orte_process_name_t* name);
/**** JOB ID FUNCTIONS ****/
/**
* Create a new job id.
* The create_jobid() function allocates a new job id for use by the caller.
@ -148,91 +224,6 @@ typedef int (*orte_ns_base_module_assign_cellid_to_process_fn_t)(orte_process_na
*/
typedef int (*orte_ns_base_module_create_jobid_fn_t)(orte_jobid_t *jobid);
/**
* Obtain a single new process name.
* The create_process_name() function creates a single process name structure and fills the
* fields with the provided values.
*
* @param cell The cell for which the process name is intended. Usually, this is
* the id of the cell where the process is initially planning to be spawned.
* @param job The id of the job to which the process will belong. Process id's are
* tracked according to jobid, but not cellid. Thus, two processes
* can have the same process id if and only if they have different jobid's. However,
* two processes in the same jobid cannot have the same process id, regardless
* of whether or not they are in the same cell.
* @param vpid The virtual process id for the name. Note that no check is made for uniqueness -
* the caller is responsible for ensuring that the requested name is, in fact, unique
* by first requesting reservation of an appropriate range of virtual process id's.
*
* @retval *name Pointer to an ompi_process_name_t structure containing the name.
* @retval NULL Indicates an error, probably due to inability to allocate memory for
* the name structure.
*
* @code
* new_name = ompi_name_server.create_process_name(cell, job, vpid);
* @endcode
*/
typedef int (*orte_ns_base_module_create_proc_name_fn_t)(orte_process_name_t **name,
orte_cellid_t cell,
orte_jobid_t job,
orte_vpid_t vpid);
/*
* Create my name
* If a process is a singleton, then it needs to create a name for itself. When
* a persistent daemon is present, this requires a communication to that daemon.
* Since the RML uses process names as its index into the RML communicator table,
* the RML automatically assigns a name to each process when it first attempts
* to communicate. This function takes advantage of that behavior to ensure that
* one, and ONLY one, name gets assigned to the process
*/
typedef int (*orte_ns_base_module_create_my_name_fn_t)(void);
/**
* Derive a process vpid.
* Given a base vpid and an offset, return the computed equivalent vpid. This function
* is required because the vpid may not be an integer - need to provide a means for
* computing the resulting vpid in case it isn't.
*/
typedef int (*orte_ns_base_module_derive_vpid_fn_t)(orte_vpid_t *vpid,
orte_vpid_t base_vpid,
int offset);
/**
* Make a copy of a process name.
* Given a process name, this function creates a copy of it and returns a pointer
* to the duplicate structure.
*
* @param *name Pointer to an existing process name structure.
*
* @retval *newname Pointer to the duplicate structure, with all fields transferred.
* @retval NULL Indicates an error - most likely due to a NULL process name
* pointer being supplied as input.
*/
typedef int (*orte_ns_base_module_copy_proc_name_fn_t)(orte_process_name_t **dest,
orte_process_name_t* src);
/**
* Convert a string representation to a process name.
* The convert_string_to_process_name() function converts a string representation of a process
* name into an Open MPI name structure. The string must be of the proper form - i.e., it
* must be in the form "cellid.jobid.vpid", where each field is expressed in hexadecimal form.
*
* @param *name_string A character string representation of a process name.
*
* @retval *name Pointer to an ompi_process_name_t structure containing the name.
* @retval NULL Indicates an error, probably due to inability to allocate memory for
* the name structure.
*
* @code
* name = ompi_name_server.convert_string_to_process_name(name_string);
* @endcode
*/
typedef int (*orte_ns_base_module_convert_string_to_process_name_fn_t)(orte_process_name_t **name,
const char* name_string);
/**
* Reserve a range of process id's.
* The reserve_range() function reserves a range of vpid's for the given jobid.
@ -255,112 +246,6 @@ typedef int (*orte_ns_base_module_reserve_range_fn_t)(orte_jobid_t job,
orte_vpid_t range,
orte_vpid_t *startvpid);
/**
* Free (release) a process name.
* The free_name() function releases the process name from the "used" list
* maintained within the name server for the jobid contained in the specified
* name. The memory for the name is also released at that time.
*
* Name values are currently \em not re-used. Hence, free-ing a name
* does not provide any noticeable benefit other than releasing the memory. In
* the future, names may be re-used if this becomes desirable.
*
* @param *name A pointer to the name structure containing the name being released.
*
* @retval OMPI_SUCCESS Indicates the release was succesfully accomplished.
* @retval OMPI_ERROR Indicates the release failed - most likely due to an
* error when free-ing the memory allocation.
*
* @code
* if (OMPI_ERROR == ompi_name_server.free_name(&name) {
* report error
* }
* @endcode
*/
typedef int (*orte_ns_base_module_free_name_fn_t)(orte_process_name_t **name);
/**
* Get the process name as a character string.
* The get_proc_name_string() function returns the entire process name in a
* character string representation. The string is created by expressing each
* field in hexadecimal separated by periods, as follows:
*
* sprintf(string_name, "%x.%x.%x", cellid, jobid, vpid)
*
* The memory required for the string is allocated by the function - releasing
* that allocation is the responsibility of the calling program.
*
* @param *name A pointer to the name structure containing the name to be
* "translated" to a string.
*
* @retval *name_string A pointer to the character string representation of the
* full name.
* @retval NULL Indicates an error occurred - either no memory could be allocated
* or the caller provided an incorrect name pointer (e.g., NULL).
*
* @code
* name-string = ompi_name_server.get_proc_name_string(&name)
* @endcode
*/
typedef int (*orte_ns_base_module_get_proc_name_string_fn_t)(char **name_string,
const orte_process_name_t* name);
/**
* Get the virtual process id as a character string.
* The get_vpid_string() function returns the vpid in a character string
* representation. The string is created by expressing the field in hexadecimal. Memory
* for the string is allocated by the function - releasing that allocation is the
* responsibility of the calling program.
*
* @param *name A pointer to the name structure containing the name to be
* "translated" to a string.
*
* @retval *name_string A pointer to the character string representation of the
* vpid.
* @retval NULL Indicates an error occurred - either no memory could be allocated
* or the caller provided an incorrect name pointer (e.g., NULL).
*
* @code
* vpid-string = ompi_name_server.get_vpid_string(&name)
* @endcode
*/
typedef int (*orte_ns_base_module_get_vpid_string_fn_t)(char **vpid_string, const orte_process_name_t* name);
/**
* Convert vpid to character string
* Returns the vpid in a character string representation. The string is created
* by expressing the provided vpid in hexadecimal. Memory for the string is
* allocated by the function - releasing that allocation is the responsibility of
* the calling program.
*
* @param vpid The vpid to be converted.
*
* @retval *vpid_string A pointer to a character string representation of the vpid.
* @retval NULL Indicates an error occurred - probably no memory could be allocated.
*
* @code
* vpid-string = ompi_name_server.convert_vpid_to_string(vpid);
* @endcode
*/
typedef int (*orte_ns_base_module_convert_vpid_to_string_fn_t)(char **vpid_string, const orte_vpid_t vpid);
/**
* Convert a string to a vpid.
* Converts a characters string into a vpid. The character string must be a
* hexadecimal representation of a valid vpid.
*
* @param vpidstring The string to be converted.
*
* @retval vpid The resulting vpid
* @retval MCA_NS_BASE_VPID_MAX String could not be converted.
*
* @code
* vpid = ompi_name_server.convert_string_to_vpid(vpidstring);
* @endcode
*/
typedef int (*orte_ns_base_module_convert_string_to_vpid_fn_t)(orte_vpid_t *vpid, const char* vpidstring);
/**
* Get the job id as a character string.
* The get_jobid_string() function returns the job id in a character string
@ -418,78 +303,6 @@ typedef int (*orte_ns_base_module_convert_jobid_to_string_fn_t)(char **jobid_str
*/
typedef int (*orte_ns_base_module_convert_string_to_jobid_fn_t)(orte_jobid_t *jobid, const char* jobidstring);
/**
* Get the cell id as a character string.
* The get_cellid_string() function returns the cell id in a character string
* representation. The string is created by expressing the field in hexadecimal. Memory
* for the string is allocated by the function - releasing that allocation is the
* responsibility of the calling program.
*
* @param *name A pointer to the name structure containing the name to be
* "translated" to a string.
*
* @retval *name_string A pointer to the character string representation of the
* cell id.
* @retval NULL Indicates an error occurred - either no memory could be allocated
* or the caller provided an incorrect name pointer (e.g., NULL).
*
* @code
* cellid-string = ompi_name_server.get_cellid_string(&name)
* @endcode
*/
typedef int (*orte_ns_base_module_get_cellid_string_fn_t)(char **cellid_string, const orte_process_name_t* name);
/**
* Convert cellid to character string
* Returns the cellid in a character string representation. The string is created
* by expressing the provided cellid in hexadecimal. Memory for the string is
* allocated by the function - releasing that allocation is the responsibility of
* the calling program.
*
* @param cellid The cellid to be converted.
*
* @retval *cellid_string A pointer to a character string representation of the cellid.
* @retval NULL Indicates an error occurred - probably no memory could be allocated.
*
* @code
* cellid-string = ompi_name_server.convert_cellid_to_string(cellid);
* @endcode
*/
typedef int (*orte_ns_base_module_convert_cellid_to_string_fn_t)(char **cellid_string, const orte_cellid_t cellid);
/**
* Convert a string to a cellid.
* Converts a characters string into a cellid. The character string must be a
* hexadecimal representation of a valid cellid.
*
* @param cellidstring The string to be converted.
*
* @retval cellid The resulting cellid
* @retval MCA_NS_BASE_CELLID_MAX String could not be converted.
*
* @code
* cellid = ompi_name_server.convert_string_to_cellid(cellidstring);
* @endcode
*/
typedef int (*orte_ns_base_module_convert_string_to_cellid_fn_t)(orte_cellid_t *cellid, const char *cellidstring);
/**
* Get the virtual process id as a numeric value.
* The get_vpid() function returns the vpid in a numeric representation -
* i.e., in an integer form.
*
* @param *name A pointer to the name structure containing the name.
*
* @retval vpid The vpid field of the provided name.
* @retval MCA_NS_BASE_VPID_MAX Indicates that an error occurred - in this case, that
* the name variable provided was NULL.
*
* @code
* vpid = ompi_name_server.get_vpid(&name)
* @endcode
*/
typedef int (*orte_ns_base_module_get_vpid_fn_t)(orte_vpid_t *vpid, const orte_process_name_t *name);
/**
* Get the job id as a numeric value.
* The get_jobid() function returns the job id in a numeric representation -
@ -507,22 +320,132 @@ typedef int (*orte_ns_base_module_get_vpid_fn_t)(orte_vpid_t *vpid, const orte_p
*/
typedef int (*orte_ns_base_module_get_jobid_fn_t)(orte_jobid_t *jobid, const orte_process_name_t* name);
/**** NAME FUNCTIONS ****/
/**
* Get the cell id as a numberic value.
* The get_cellid() function returns the cell id in a numeric representation -
* i.e., in an integer form.
* Obtain a single new process name.
* The create_process_name() function creates a single process name structure and fills the
* fields with the provided values.
*
* @param *name A pointer to the name structure containing the name.
* @param cell The cell for which the process name is intended. Usually, this is
* the id of the cell where the process is initially planning to be spawned.
* @param job The id of the job to which the process will belong. Process id's are
* tracked according to jobid, but not cellid. Thus, two processes
* can have the same process id if and only if they have different jobid's. However,
* two processes in the same jobid cannot have the same process id, regardless
* of whether or not they are in the same cell.
* @param vpid The virtual process id for the name. Note that no check is made for uniqueness -
* the caller is responsible for ensuring that the requested name is, in fact, unique
* by first requesting reservation of an appropriate range of virtual process id's.
*
* @retval cellid The cell id field of the provided name.
* @retval MCA_NS_BASE_CELLID_MAX Indicates that an error occurred - in this case, that
* the name variable provided was NULL.
* @retval *name Pointer to an ompi_process_name_t structure containing the name.
* @retval NULL Indicates an error, probably due to inability to allocate memory for
* the name structure.
*
* @code
* cellid = ompi_name_server.get_cellid(&name)
* new_name = ompi_name_server.create_process_name(cell, job, vpid);
* @endcode
*/
typedef int (*orte_ns_base_module_get_cellid_fn_t)(orte_cellid_t *cellid, const orte_process_name_t* name);
typedef int (*orte_ns_base_module_create_proc_name_fn_t)(orte_process_name_t **name,
orte_cellid_t cell,
orte_jobid_t job,
orte_vpid_t vpid);
/*
* Create my name
* If a process is a singleton, then it needs to create a name for itself. When
* a persistent daemon is present, this requires a communication to that daemon.
* Since the RML uses process names as its index into the RML communicator table,
* the RML automatically assigns a name to each process when it first attempts
* to communicate. This function takes advantage of that behavior to ensure that
* one, and ONLY one, name gets assigned to the process
*/
typedef int (*orte_ns_base_module_create_my_name_fn_t)(void);
/**
* Make a copy of a process name.
* Given a process name, this function creates a copy of it and returns a pointer
* to the duplicate structure.
*
* @param *name Pointer to an existing process name structure.
*
* @retval *newname Pointer to the duplicate structure, with all fields transferred.
* @retval NULL Indicates an error - most likely due to a NULL process name
* pointer being supplied as input.
*/
typedef int (*orte_ns_base_module_copy_proc_name_fn_t)(orte_process_name_t **dest,
orte_process_name_t* src);
/**
* Convert a string representation to a process name.
* The convert_string_to_process_name() function converts a string representation of a process
* name into an Open MPI name structure. The string must be of the proper form - i.e., it
* must be in the form "cellid.jobid.vpid", where each field is expressed in hexadecimal form.
*
* @param *name_string A character string representation of a process name.
*
* @retval *name Pointer to an ompi_process_name_t structure containing the name.
* @retval NULL Indicates an error, probably due to inability to allocate memory for
* the name structure.
*
* @code
* name = ompi_name_server.convert_string_to_process_name(name_string);
* @endcode
*/
typedef int (*orte_ns_base_module_convert_string_to_process_name_fn_t)(orte_process_name_t **name,
const char* name_string);
/**
* Free (release) a process name.
* The free_name() function releases the process name from the "used" list
* maintained within the name server for the jobid contained in the specified
* name. The memory for the name is also released at that time.
*
* Name values are currently \em not re-used. Hence, free-ing a name
* does not provide any noticeable benefit other than releasing the memory. In
* the future, names may be re-used if this becomes desirable.
*
* @param *name A pointer to the name structure containing the name being released.
*
* @retval OMPI_SUCCESS Indicates the release was succesfully accomplished.
* @retval OMPI_ERROR Indicates the release failed - most likely due to an
* error when free-ing the memory allocation.
*
* @code
* if (OMPI_ERROR == ompi_name_server.free_name(&name) {
* report error
* }
* @endcode
*/
typedef int (*orte_ns_base_module_free_name_fn_t)(orte_process_name_t **name);
/**
* Get the process name as a character string.
* The get_proc_name_string() function returns the entire process name in a
* character string representation. The string is created by expressing each
* field in hexadecimal separated by periods, as follows:
*
* sprintf(string_name, "%x.%x.%x", cellid, jobid, vpid)
*
* The memory required for the string is allocated by the function - releasing
* that allocation is the responsibility of the calling program.
*
* @param *name A pointer to the name structure containing the name to be
* "translated" to a string.
*
* @retval *name_string A pointer to the character string representation of the
* full name.
* @retval NULL Indicates an error occurred - either no memory could be allocated
* or the caller provided an incorrect name pointer (e.g., NULL).
*
* @code
* name-string = ompi_name_server.get_proc_name_string(&name)
* @endcode
*/
typedef int (*orte_ns_base_module_get_proc_name_string_fn_t)(char **name_string,
const orte_process_name_t* name);
/**
* Compare two name values.
@ -557,6 +480,81 @@ typedef int (*orte_ns_base_module_compare_fn_t)(orte_ns_cmp_bitmask_t fields,
const orte_process_name_t* name1,
const orte_process_name_t* name2);
/**** VPID FUNCTIONS ****/
/**
* Get the virtual process id as a character string.
* The get_vpid_string() function returns the vpid in a character string
* representation. The string is created by expressing the field in hexadecimal. Memory
* for the string is allocated by the function - releasing that allocation is the
* responsibility of the calling program.
*
* @param *name A pointer to the name structure containing the name to be
* "translated" to a string.
*
* @retval *name_string A pointer to the character string representation of the
* vpid.
* @retval NULL Indicates an error occurred - either no memory could be allocated
* or the caller provided an incorrect name pointer (e.g., NULL).
*
* @code
* vpid-string = ompi_name_server.get_vpid_string(&name)
* @endcode
*/
typedef int (*orte_ns_base_module_get_vpid_string_fn_t)(char **vpid_string, const orte_process_name_t* name);
/**
* Convert vpid to character string
* Returns the vpid in a character string representation. The string is created
* by expressing the provided vpid in hexadecimal. Memory for the string is
* allocated by the function - releasing that allocation is the responsibility of
* the calling program.
*
* @param vpid The vpid to be converted.
*
* @retval *vpid_string A pointer to a character string representation of the vpid.
* @retval NULL Indicates an error occurred - probably no memory could be allocated.
*
* @code
* vpid-string = ompi_name_server.convert_vpid_to_string(vpid);
* @endcode
*/
typedef int (*orte_ns_base_module_convert_vpid_to_string_fn_t)(char **vpid_string, const orte_vpid_t vpid);
/**
* Convert a string to a vpid.
* Converts a characters string into a vpid. The character string must be a
* hexadecimal representation of a valid vpid.
*
* @param vpidstring The string to be converted.
*
* @retval vpid The resulting vpid
* @retval MCA_NS_BASE_VPID_MAX String could not be converted.
*
* @code
* vpid = ompi_name_server.convert_string_to_vpid(vpidstring);
* @endcode
*/
typedef int (*orte_ns_base_module_convert_string_to_vpid_fn_t)(orte_vpid_t *vpid, const char* vpidstring);
/**
* Get the virtual process id as a numeric value.
* The get_vpid() function returns the vpid in a numeric representation -
* i.e., in an integer form.
*
* @param *name A pointer to the name structure containing the name.
*
* @retval vpid The vpid field of the provided name.
* @retval MCA_NS_BASE_VPID_MAX Indicates that an error occurred - in this case, that
* the name variable provided was NULL.
*
* @code
* vpid = ompi_name_server.get_vpid(&name)
* @endcode
*/
typedef int (*orte_ns_base_module_get_vpid_fn_t)(orte_vpid_t *vpid, const orte_process_name_t *name);
/**** TAG SERVER ****/
/*
* Allocate a tag
* If name is NULL, tag server provides next unique tag but cannot look
@ -565,6 +563,7 @@ typedef int (*orte_ns_base_module_compare_fn_t)(orte_ns_cmp_bitmask_t fields,
typedef int (*orte_ns_base_module_assign_rml_tag_fn_t)(orte_rml_tag_t *tag,
char *name);
/**** DATA TYPE SERVER ****/
/* This function defines a new data type and gives it a system-wide unique
* identifier for use in the data packing subsystem. Only called from the
* dps when needing a new identifier.
@ -573,6 +572,8 @@ typedef int (*orte_ns_base_module_define_data_type_fn_t)(
const char *name,
orte_data_type_t *type);
/**** PEER RETRIEVAL ****/
/*
* Get my peers
*
@ -582,40 +583,75 @@ typedef int (*orte_ns_base_module_define_data_type_fn_t)(
typedef int (*orte_ns_base_module_get_peers_fn_t)(orte_process_name_t **procs,
size_t *num_procs, size_t *self);
/*
* Get the list of peers from a specified job
*
* THIS FUNCTION MAY BE ELIMINATED IN FUTURE VERSIONS TO REMOVE MULTIPLE STORAGE
* OF O(N) ARRAYS IN THE SYSTEM
*/
typedef int (*orte_ns_base_module_get_job_peers_fn_t)(orte_process_name_t **procs,
size_t *num_procs, orte_jobid_t job);
/*
* DIAGNOSTIC INTERFACES
*/
typedef int (*orte_ns_base_module_dump_cells_fn_t)(int output_id);
typedef int (*orte_ns_base_module_dump_jobs_fn_t)(int output_id);
typedef int (*orte_ns_base_module_dump_tags_fn_t)(int output_id);
typedef int (*orte_ns_base_module_dump_datatypes_fn_t)(int output_id);
/*
* Ver 1.0.0
*/
struct mca_ns_base_module_1_0_0_t {
/* init */
orte_ns_base_module_init_fn_t init;
/* cell functions */
orte_ns_base_module_create_cellid_fn_t create_cellid;
orte_ns_base_module_get_cellid_fn_t get_cellid;
orte_ns_base_module_get_cell_info_fn_t get_cell_info;
orte_ns_base_module_assign_cellid_to_process_fn_t assign_cellid_to_process;
orte_ns_base_module_get_cellid_string_fn_t get_cellid_string;
orte_ns_base_module_convert_cellid_to_string_fn_t convert_cellid_to_string;
orte_ns_base_module_convert_string_to_cellid_fn_t convert_string_to_cellid;
/* jobid functions */
orte_ns_base_module_create_jobid_fn_t create_jobid;
orte_ns_base_module_get_jobid_fn_t get_jobid;
orte_ns_base_module_get_jobid_string_fn_t get_jobid_string;
orte_ns_base_module_convert_jobid_to_string_fn_t convert_jobid_to_string;
orte_ns_base_module_convert_string_to_jobid_fn_t convert_string_to_jobid;
/* vpid functions */
orte_ns_base_module_reserve_range_fn_t reserve_range;
orte_ns_base_module_get_vpid_fn_t get_vpid;
orte_ns_base_module_get_vpid_string_fn_t get_vpid_string;
orte_ns_base_module_convert_vpid_to_string_fn_t convert_vpid_to_string;
orte_ns_base_module_convert_string_to_vpid_fn_t convert_string_to_vpid;
/* name functions */
orte_ns_base_module_create_proc_name_fn_t create_process_name;
orte_ns_base_module_create_my_name_fn_t create_my_name;
orte_ns_base_module_copy_proc_name_fn_t copy_process_name;
orte_ns_base_module_convert_string_to_process_name_fn_t convert_string_to_process_name;
orte_ns_base_module_reserve_range_fn_t reserve_range;
orte_ns_base_module_free_name_fn_t free_name;
orte_ns_base_module_get_proc_name_string_fn_t get_proc_name_string;
orte_ns_base_module_get_vpid_string_fn_t get_vpid_string;
orte_ns_base_module_convert_vpid_to_string_fn_t convert_vpid_to_string;
orte_ns_base_module_convert_string_to_vpid_fn_t convert_string_to_vpid;
orte_ns_base_module_get_jobid_string_fn_t get_jobid_string;
orte_ns_base_module_convert_jobid_to_string_fn_t convert_jobid_to_string;
orte_ns_base_module_convert_string_to_jobid_fn_t convert_string_to_jobid;
orte_ns_base_module_get_cellid_string_fn_t get_cellid_string;
orte_ns_base_module_convert_cellid_to_string_fn_t convert_cellid_to_string;
orte_ns_base_module_convert_string_to_cellid_fn_t convert_string_to_cellid;
orte_ns_base_module_get_vpid_fn_t get_vpid;
orte_ns_base_module_get_jobid_fn_t get_jobid;
orte_ns_base_module_get_cellid_fn_t get_cellid;
orte_ns_base_module_compare_fn_t compare;
orte_ns_base_module_derive_vpid_fn_t derive_vpid;
orte_ns_base_module_assign_rml_tag_fn_t assign_rml_tag;
orte_ns_base_module_define_data_type_fn_t define_data_type;
/* peer functions */
orte_ns_base_module_get_peers_fn_t get_peers;
orte_ns_base_module_get_job_peers_fn_t get_job_peers;
/* tag server functions */
orte_ns_base_module_assign_rml_tag_fn_t assign_rml_tag;
/* data type functions */
orte_ns_base_module_define_data_type_fn_t define_data_type;
/* diagnostic functions */
orte_ns_base_module_dump_cells_fn_t dump_cells;
orte_ns_base_module_dump_jobs_fn_t dump_jobs;
orte_ns_base_module_dump_tags_fn_t dump_tags;
orte_ns_base_module_dump_datatypes_fn_t dump_datatypes;
};
typedef struct mca_ns_base_module_1_0_0_t mca_ns_base_module_1_0_0_t;

Просмотреть файл

@ -21,12 +21,13 @@
#include <string.h>
#include "include/orte_constants.h"
#include "include/orte_types.h"
#include "mca/mca.h"
#include "dps/dps.h"
#include "mca/errmgr/errmgr.h"
#include "mca/rml/rml.h"
#include "orte/include/orte_constants.h"
#include "orte/include/orte_types.h"
#include "opal/mca/mca.h"
#include "opal/util/output.h"
#include "orte/dps/dps.h"
#include "orte/mca/errmgr/errmgr.h"
#include "orte/mca/rml/rml.h"
#include "ns_proxy.h"
@ -43,19 +44,13 @@ int orte_ns_proxy_create_cellid(orte_cellid_t *cellid, char *site, char *resourc
orte_buffer_t* cmd;
orte_buffer_t* answer;
orte_ns_cmd_flag_t command;
size_t count;
size_t count, index;
int rc;
orte_ns_proxy_cell_info_t *cptr;
orte_ns_proxy_cell_info_t *new_cell;
/* set the default value of error */
*cellid = ORTE_CELLID_MAX;
/* check for errors */
if (NULL == site || NULL == resource) {
ORTE_ERROR_LOG(ORTE_ERR_BAD_PARAM);
return ORTE_ERR_BAD_PARAM;
}
command = ORTE_NS_CREATE_CELLID_CMD;
cmd = OBJ_NEW(orte_buffer_t);
@ -82,7 +77,7 @@ int orte_ns_proxy_create_cellid(orte_cellid_t *cellid, char *site, char *resourc
return rc;
}
if (0 > orte_rml.send_buffer(orte_ns_my_replica, cmd, MCA_OOB_TAG_NS, 0)) {
if (0 > orte_rml.send_buffer(orte_ns_proxy.my_replica, cmd, MCA_OOB_TAG_NS, 0)) {
ORTE_ERROR_LOG(ORTE_ERR_COMM_FAILURE);
OBJ_RELEASE(cmd);
return ORTE_ERR_COMM_FAILURE;
@ -95,7 +90,7 @@ int orte_ns_proxy_create_cellid(orte_cellid_t *cellid, char *site, char *resourc
return ORTE_ERR_OUT_OF_RESOURCE;
}
if (0 > orte_rml.recv_buffer(orte_ns_my_replica, answer, ORTE_RML_TAG_NS)) {
if (0 > orte_rml.recv_buffer(orte_ns_proxy.my_replica, answer, ORTE_RML_TAG_NS)) {
ORTE_ERROR_LOG(ORTE_ERR_COMM_FAILURE);
OBJ_RELEASE(answer);
return ORTE_ERR_COMM_FAILURE;
@ -123,17 +118,31 @@ int orte_ns_proxy_create_cellid(orte_cellid_t *cellid, char *site, char *resourc
OBJ_RELEASE(answer);
/* store the info locally for later retrieval */
cptr = OBJ_NEW(orte_ns_proxy_cell_info_t);
if (NULL == cptr) {
OPAL_THREAD_LOCK(&orte_ns_proxy.mutex);
new_cell = OBJ_NEW(orte_ns_proxy_cell_info_t);
if (NULL == new_cell) {
ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE);
OPAL_THREAD_UNLOCK(&orte_ns_proxy.mutex);
return ORTE_ERR_OUT_OF_RESOURCE;
}
if (ORTE_SUCCESS != (rc = orte_pointer_array_add(&index,
orte_ns_proxy.cells, new_cell))) {
ORTE_ERROR_LOG(rc);
OPAL_THREAD_UNLOCK(&orte_ns_proxy.mutex);
return rc;
}
if (NULL != site) {
new_cell->site = strdup(site);
}
if (NULL != resource) {
new_cell->resource = strdup(resource);
}
new_cell->cellid = orte_ns_proxy.num_cells;
*cellid = new_cell->cellid;
(orte_ns_proxy.num_cells)++;
cptr->cellid = *cellid;
cptr->site = strdup(site);
cptr->resource = strdup(resource);
opal_list_append(&orte_ns_proxy_cell_info_list, &cptr->item);
OPAL_THREAD_UNLOCK(&orte_ns_proxy.mutex);
return ORTE_SUCCESS;
}
@ -141,23 +150,29 @@ int orte_ns_proxy_create_cellid(orte_cellid_t *cellid, char *site, char *resourc
int orte_ns_proxy_get_cell_info(orte_cellid_t cellid,
char **site, char **resource)
{
opal_list_item_t *item;
orte_ns_proxy_cell_info_t *cell;
size_t i, j;
orte_ns_proxy_cell_info_t **cell;
*site = NULL;
*resource = NULL;
for (item = opal_list_get_first(&orte_ns_proxy_cell_info_list);
item != opal_list_get_end(&orte_ns_proxy_cell_info_list);
item = opal_list_get_next(item)) {
cell = (orte_ns_proxy_cell_info_t*)item;
if (cellid == cell->cellid) {
*site = strdup(cell->site);
*resource = strdup(cell->resource);
return ORTE_SUCCESS;
}
}
return ORTE_ERR_NOT_FOUND;
/* see if we already have the info locally */
OPAL_THREAD_LOCK(&orte_ns_proxy.mutex);
cell = (orte_ns_proxy_cell_info_t**)(orte_ns_proxy.cells)->addr;
for (i=0, j=0; j < orte_ns_proxy.num_cells &&
i < (orte_ns_proxy.cells)->size; i++) {
if (NULL != cell[i]) {
j++;
if (cellid == cell[i]->cellid) {
*site = strdup(cell[i]->site);
*resource = strdup(cell[i]->resource);
OPAL_THREAD_UNLOCK(&orte_ns_proxy.mutex);
return ORTE_SUCCESS;
}
}
}
/* okay, don't have it locally - go ask for it */
OPAL_THREAD_UNLOCK(&orte_ns_proxy.mutex);
return ORTE_SUCCESS;
}
int orte_ns_proxy_create_jobid(orte_jobid_t *job)
@ -183,7 +198,7 @@ int orte_ns_proxy_create_jobid(orte_jobid_t *job)
return rc;
}
if (0 > orte_rml.send_buffer(orte_ns_my_replica, cmd, ORTE_RML_TAG_NS, 0)) {
if (0 > orte_rml.send_buffer(orte_ns_proxy.my_replica, cmd, ORTE_RML_TAG_NS, 0)) {
ORTE_ERROR_LOG(ORTE_ERR_COMM_FAILURE);
OBJ_RELEASE(cmd);
return ORTE_ERR_COMM_FAILURE;
@ -195,7 +210,7 @@ int orte_ns_proxy_create_jobid(orte_jobid_t *job)
OBJ_RELEASE(answer);
return ORTE_ERR_OUT_OF_RESOURCE;
}
if (0 > orte_rml.recv_buffer(orte_ns_my_replica, answer, ORTE_RML_TAG_NS)) {
if (0 > orte_rml.recv_buffer(orte_ns_proxy.my_replica, answer, ORTE_RML_TAG_NS)) {
ORTE_ERROR_LOG(ORTE_ERR_COMM_FAILURE);
OBJ_RELEASE(answer);
return ORTE_ERR_COMM_FAILURE;
@ -261,7 +276,7 @@ int orte_ns_proxy_reserve_range(orte_jobid_t job, orte_vpid_t range, orte_vpid_t
return rc;
}
if (0 > orte_rml.send_buffer(orte_ns_my_replica, cmd, ORTE_RML_TAG_NS, 0)) {
if (0 > orte_rml.send_buffer(orte_ns_proxy.my_replica, cmd, ORTE_RML_TAG_NS, 0)) {
ORTE_ERROR_LOG(ORTE_ERR_COMM_FAILURE);
OBJ_RELEASE(cmd);
return ORTE_ERR_COMM_FAILURE;
@ -274,7 +289,7 @@ int orte_ns_proxy_reserve_range(orte_jobid_t job, orte_vpid_t range, orte_vpid_t
return ORTE_ERR_OUT_OF_RESOURCE;
}
if (0 > orte_rml.recv_buffer(orte_ns_my_replica, answer, ORTE_RML_TAG_NS)) {
if (0 > orte_rml.recv_buffer(orte_ns_proxy.my_replica, answer, ORTE_RML_TAG_NS)) {
ORTE_ERROR_LOG(ORTE_ERR_COMM_FAILURE);
OBJ_RELEASE(answer);
return ORTE_ERR_COMM_FAILURE;
@ -299,32 +314,122 @@ int orte_ns_proxy_reserve_range(orte_jobid_t job, orte_vpid_t range, orte_vpid_t
}
/*
* PEER functions
*/
int orte_ns_proxy_get_job_peers(orte_process_name_t **procs,
size_t *num_procs, orte_jobid_t job)
{
orte_buffer_t* cmd;
orte_buffer_t* answer;
orte_ns_cmd_flag_t command;
size_t count;
int rc;
/* set default value */
*procs = NULL;
*num_procs = 0;
if ((cmd = OBJ_NEW(orte_buffer_t)) == NULL) {
ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE);
return ORTE_ERR_OUT_OF_RESOURCE;
}
command = ORTE_NS_GET_JOB_PEERS_CMD;
if (ORTE_SUCCESS != (rc = orte_dps.pack(cmd, (void*)&command, 1, ORTE_NS_CMD))) { /* got a problem */
ORTE_ERROR_LOG(rc);
OBJ_RELEASE(cmd);
return rc;
}
if (0 > orte_rml.send_buffer(orte_ns_proxy.my_replica, cmd, ORTE_RML_TAG_NS, 0)) {
ORTE_ERROR_LOG(ORTE_ERR_COMM_FAILURE);
OBJ_RELEASE(cmd);
return ORTE_ERR_COMM_FAILURE;
}
OBJ_RELEASE(cmd);
if ((answer = OBJ_NEW(orte_buffer_t)) == NULL) {
ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE);
OBJ_RELEASE(answer);
return ORTE_ERR_OUT_OF_RESOURCE;
}
if (0 > orte_rml.recv_buffer(orte_ns_proxy.my_replica, answer, ORTE_RML_TAG_NS)) {
ORTE_ERROR_LOG(ORTE_ERR_COMM_FAILURE);
OBJ_RELEASE(answer);
return ORTE_ERR_COMM_FAILURE;
}
count = 1;
if (ORTE_SUCCESS != (rc = orte_dps.unpack(answer, &command, &count, ORTE_NS_CMD))) {
ORTE_ERROR_LOG(rc);
OBJ_RELEASE(answer);
return rc;
}
if (ORTE_NS_GET_JOB_PEERS_CMD != command) {
ORTE_ERROR_LOG(ORTE_ERR_COMM_FAILURE);
OBJ_RELEASE(answer);
return ORTE_ERR_COMM_FAILURE;
}
count = 1;
if (ORTE_SUCCESS != (rc = orte_dps.unpack(answer, &num_procs, &count, ORTE_SIZE))) {
ORTE_ERROR_LOG(rc);
OBJ_RELEASE(answer);
return rc;
}
/* allocate space for array of proc names */
if (0 < *num_procs) {
*procs = (orte_process_name_t*)malloc((*num_procs) * sizeof(orte_process_name_t));
if (NULL == *procs) {
ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE);
OBJ_RELEASE(answer);
return ORTE_ERR_OUT_OF_RESOURCE;
}
if (ORTE_SUCCESS != (rc = orte_dps.unpack(answer, procs, num_procs, ORTE_NAME))) {
ORTE_ERROR_LOG(rc);
OBJ_RELEASE(answer);
return rc;
}
}
OBJ_RELEASE(answer);
return ORTE_SUCCESS;
}
int orte_ns_proxy_assign_rml_tag(orte_rml_tag_t *tag,
char *name)
{
orte_buffer_t* cmd;
orte_buffer_t* answer;
orte_ns_cmd_flag_t command;
orte_ns_proxy_tagitem_t* tagitem;
size_t count;
orte_ns_proxy_tagitem_t* tagitem, **tags;
size_t count, i, j;
int rc;
OPAL_THREAD_LOCK(&orte_ns_proxy_mutex);
OPAL_THREAD_LOCK(&orte_ns_proxy.mutex);
if (NULL != name) {
/* first, check to see if name is already on local list
* if so, return tag
*/
for (tagitem = (orte_ns_proxy_tagitem_t*)opal_list_get_first(&orte_ns_proxy_taglist);
tagitem != (orte_ns_proxy_tagitem_t*)opal_list_get_end(&orte_ns_proxy_taglist);
tagitem = (orte_ns_proxy_tagitem_t*)opal_list_get_next(tagitem)) {
if (0 == strcmp(name, tagitem->name)) { /* found name on list */
*tag = tagitem->tag;
OPAL_THREAD_UNLOCK(&orte_ns_proxy_mutex);
return ORTE_SUCCESS;
/* see if this name is already in list - if so, return tag */
tags = (orte_ns_proxy_tagitem_t**)orte_ns_proxy.tags->addr;
for (i=0, j=0; j < orte_ns_proxy.num_tags &&
i < (orte_ns_proxy.tags)->size; i++) {
if (NULL != tags[i]) {
j++;
if (tags[i]->name != NULL &&
0 == strcmp(name, tags[i]->name)) { /* found name on list */
*tag = tags[i]->tag;
OPAL_THREAD_UNLOCK(&orte_ns_proxy.mutex);
return ORTE_SUCCESS;
}
}
}
}
}
/* okay, not on local list - so go get one from tag server */
command = ORTE_NS_ASSIGN_OOB_TAG_CMD;
@ -332,14 +437,14 @@ int orte_ns_proxy_assign_rml_tag(orte_rml_tag_t *tag,
if ((cmd = OBJ_NEW(orte_buffer_t)) == NULL) {
ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE);
OPAL_THREAD_UNLOCK(&orte_ns_proxy_mutex);
OPAL_THREAD_UNLOCK(&orte_ns_proxy.mutex);
return ORTE_ERR_OUT_OF_RESOURCE;
}
if (ORTE_SUCCESS != (rc = orte_dps.pack(cmd, (void*)&command, 1, ORTE_NS_CMD))) {
ORTE_ERROR_LOG(rc);
OBJ_RELEASE(cmd);
OPAL_THREAD_UNLOCK(&orte_ns_proxy_mutex);
OPAL_THREAD_UNLOCK(&orte_ns_proxy.mutex);
return rc;
}
@ -350,28 +455,28 @@ int orte_ns_proxy_assign_rml_tag(orte_rml_tag_t *tag,
if (0 > (rc = orte_dps.pack(cmd, &name, 1, ORTE_STRING))) {
ORTE_ERROR_LOG(rc);
OBJ_RELEASE(cmd);
OPAL_THREAD_UNLOCK(&orte_ns_proxy_mutex);
OPAL_THREAD_UNLOCK(&orte_ns_proxy.mutex);
return rc;
}
if (0 > orte_rml.send_buffer(orte_ns_my_replica, cmd, ORTE_RML_TAG_NS, 0)) {
if (0 > orte_rml.send_buffer(orte_ns_proxy.my_replica, cmd, ORTE_RML_TAG_NS, 0)) {
ORTE_ERROR_LOG(ORTE_ERR_COMM_FAILURE);
OBJ_RELEASE(cmd);
OPAL_THREAD_UNLOCK(&orte_ns_proxy_mutex);
OPAL_THREAD_UNLOCK(&orte_ns_proxy.mutex);
return ORTE_ERR_COMM_FAILURE;
}
OBJ_RELEASE(cmd);
if ((answer = OBJ_NEW(orte_buffer_t)) == NULL) {
ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE);
OPAL_THREAD_UNLOCK(&orte_ns_proxy_mutex);
OPAL_THREAD_UNLOCK(&orte_ns_proxy.mutex);
return ORTE_ERR_OUT_OF_RESOURCE;
}
if (0 > orte_rml.recv_buffer(orte_ns_my_replica, answer, ORTE_RML_TAG_NS)) {
if (0 > orte_rml.recv_buffer(orte_ns_proxy.my_replica, answer, ORTE_RML_TAG_NS)) {
ORTE_ERROR_LOG(ORTE_ERR_COMM_FAILURE);
OBJ_RELEASE(answer);
OPAL_THREAD_UNLOCK(&orte_ns_proxy_mutex);
OPAL_THREAD_UNLOCK(&orte_ns_proxy.mutex);
return ORTE_ERR_COMM_FAILURE;
}
@ -379,14 +484,14 @@ int orte_ns_proxy_assign_rml_tag(orte_rml_tag_t *tag,
if (ORTE_SUCCESS != (rc = orte_dps.unpack(answer, &command, &count, ORTE_NS_CMD))) {
ORTE_ERROR_LOG(rc);
OBJ_RELEASE(answer);
OPAL_THREAD_UNLOCK(&orte_ns_proxy_mutex);
OPAL_THREAD_UNLOCK(&orte_ns_proxy.mutex);
return rc;
}
if (ORTE_NS_ASSIGN_OOB_TAG_CMD != command) {
ORTE_ERROR_LOG(ORTE_ERR_COMM_FAILURE);
OBJ_RELEASE(answer);
OPAL_THREAD_UNLOCK(&orte_ns_proxy_mutex);
OPAL_THREAD_UNLOCK(&orte_ns_proxy.mutex);
return ORTE_ERR_COMM_FAILURE;
}
@ -394,7 +499,7 @@ int orte_ns_proxy_assign_rml_tag(orte_rml_tag_t *tag,
if (ORTE_SUCCESS != (rc = orte_dps.unpack(answer, tag, &count, ORTE_UINT32))) {
ORTE_ERROR_LOG(rc);
OBJ_RELEASE(answer);
OPAL_THREAD_UNLOCK(&orte_ns_proxy_mutex);
OPAL_THREAD_UNLOCK(&orte_ns_proxy.mutex);
return rc;
}
@ -404,18 +509,23 @@ int orte_ns_proxy_assign_rml_tag(orte_rml_tag_t *tag,
tagitem = OBJ_NEW(orte_ns_proxy_tagitem_t);
if (NULL == tagitem) { /* out of memory */
ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE);
*tag = ORTE_RML_TAG_MAX;
OPAL_THREAD_UNLOCK(&orte_ns_proxy_mutex);
OPAL_THREAD_UNLOCK(&orte_ns_proxy.mutex);
return ORTE_ERR_OUT_OF_RESOURCE;
}
if (ORTE_SUCCESS != (rc = orte_pointer_array_add(&i,
orte_ns_proxy.tags, tagitem))) {
ORTE_ERROR_LOG(rc);
OPAL_THREAD_UNLOCK(&orte_ns_proxy.mutex);
return rc;
}
tagitem->tag = *tag;
if (NULL != name) {
(orte_ns_proxy.num_tags)++;
if (NULL != name) { /* provided - can look it up later */
tagitem->name = strdup(name);
} else {
tagitem->name = NULL;
}
opal_list_append(&orte_ns_proxy_taglist, &tagitem->item);
OPAL_THREAD_UNLOCK(&orte_ns_proxy_mutex);
OPAL_THREAD_UNLOCK(&orte_ns_proxy.mutex);
/* all done */
return ORTE_SUCCESS;
@ -428,8 +538,8 @@ int orte_ns_proxy_define_data_type(const char *name,
orte_buffer_t* cmd;
orte_buffer_t* answer;
orte_ns_cmd_flag_t command;
orte_ns_proxy_dti_t *dti;
size_t count;
orte_ns_proxy_dti_t **dti, *dtip;
size_t count, i, j;
int rc=ORTE_SUCCESS;
if (NULL == name || 0 < *type) {
@ -437,20 +547,25 @@ int orte_ns_proxy_define_data_type(const char *name,
return ORTE_ERR_BAD_PARAM;
}
OPAL_THREAD_LOCK(&orte_ns_proxy_mutex);
OPAL_THREAD_LOCK(&orte_ns_proxy.mutex);
/* first, check to see if name is already on local list
* if so, return id, ensure registered with dps
*/
for (dti = (orte_ns_proxy_dti_t*)opal_list_get_first(&orte_ns_proxy_dtlist);
dti != (orte_ns_proxy_dti_t*)opal_list_get_end(&orte_ns_proxy_dtlist);
dti = (orte_ns_proxy_dti_t*)opal_list_get_next(dti)) {
if (0 == strcmp(name, dti->name)) { /* found name on list */
*type = dti->id;
OPAL_THREAD_UNLOCK(&orte_ns_proxy_mutex);
return ORTE_SUCCESS;
dti = (orte_ns_proxy_dti_t**)orte_ns_proxy.dts->addr;
for (i=0, j=0; j < orte_ns_proxy.num_dts &&
i < orte_ns_proxy.dts->size; i++) {
if (NULL != dti[i]) {
j++;
if (dti[i]->name != NULL &&
0 == strcmp(name, dti[i]->name)) { /* found name on list */
*type = dti[i]->id;
OPAL_THREAD_UNLOCK(&orte_ns_proxy.mutex);
return ORTE_SUCCESS;
}
}
}
/* okay, not on local list - so go get one from tag server */
command = ORTE_NS_DEFINE_DATA_TYPE_CMD;
@ -458,42 +573,42 @@ int orte_ns_proxy_define_data_type(const char *name,
if ((cmd = OBJ_NEW(orte_buffer_t)) == NULL) {
ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE);
OPAL_THREAD_UNLOCK(&orte_ns_proxy_mutex);
OPAL_THREAD_UNLOCK(&orte_ns_proxy.mutex);
return ORTE_ERR_OUT_OF_RESOURCE;
}
if (ORTE_SUCCESS != (rc = orte_dps.pack(cmd, (void*)&command, 1, ORTE_NS_CMD))) {
ORTE_ERROR_LOG(rc);
OBJ_RELEASE(cmd);
OPAL_THREAD_UNLOCK(&orte_ns_proxy_mutex);
OPAL_THREAD_UNLOCK(&orte_ns_proxy.mutex);
return rc;
}
if (ORTE_SUCCESS != (rc = orte_dps.pack(cmd, &name, 1, ORTE_STRING))) {
ORTE_ERROR_LOG(rc);
OBJ_RELEASE(cmd);
OPAL_THREAD_UNLOCK(&orte_ns_proxy_mutex);
OPAL_THREAD_UNLOCK(&orte_ns_proxy.mutex);
return rc;
}
if (0 > orte_rml.send_buffer(orte_ns_my_replica, cmd, ORTE_RML_TAG_NS, 0)) {
if (0 > orte_rml.send_buffer(orte_ns_proxy.my_replica, cmd, ORTE_RML_TAG_NS, 0)) {
ORTE_ERROR_LOG(ORTE_ERR_COMM_FAILURE);
OBJ_RELEASE(cmd);
OPAL_THREAD_UNLOCK(&orte_ns_proxy_mutex);
OPAL_THREAD_UNLOCK(&orte_ns_proxy.mutex);
return ORTE_ERR_COMM_FAILURE;
}
OBJ_RELEASE(cmd);
if ((answer = OBJ_NEW(orte_buffer_t)) == NULL) {
ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE);
OPAL_THREAD_UNLOCK(&orte_ns_proxy_mutex);
OPAL_THREAD_UNLOCK(&orte_ns_proxy.mutex);
return ORTE_ERR_OUT_OF_RESOURCE;
}
if (0 > orte_rml.recv_buffer(orte_ns_my_replica, answer, ORTE_RML_TAG_NS)) {
if (0 > orte_rml.recv_buffer(orte_ns_proxy.my_replica, answer, ORTE_RML_TAG_NS)) {
ORTE_ERROR_LOG(ORTE_ERR_COMM_FAILURE);
OBJ_RELEASE(answer);
OPAL_THREAD_UNLOCK(&orte_ns_proxy_mutex);
OPAL_THREAD_UNLOCK(&orte_ns_proxy.mutex);
return ORTE_ERR_COMM_FAILURE;
}
@ -501,14 +616,14 @@ int orte_ns_proxy_define_data_type(const char *name,
if (ORTE_SUCCESS != (rc = orte_dps.unpack(answer, &command, &count, ORTE_NS_CMD))) {
ORTE_ERROR_LOG(rc);
OBJ_RELEASE(answer);
OPAL_THREAD_UNLOCK(&orte_ns_proxy_mutex);
OPAL_THREAD_UNLOCK(&orte_ns_proxy.mutex);
return rc;
}
if (ORTE_NS_ASSIGN_OOB_TAG_CMD != command) {
ORTE_ERROR_LOG(ORTE_ERR_COMM_FAILURE);
OBJ_RELEASE(answer);
OPAL_THREAD_UNLOCK(&orte_ns_proxy_mutex);
OPAL_THREAD_UNLOCK(&orte_ns_proxy.mutex);
return ORTE_ERR_COMM_FAILURE;
}
@ -516,23 +631,29 @@ int orte_ns_proxy_define_data_type(const char *name,
if (ORTE_SUCCESS != (rc = orte_dps.unpack(answer, type, &count, ORTE_DATA_TYPE))) {
ORTE_ERROR_LOG(ORTE_ERR_UNPACK_FAILURE);
OBJ_RELEASE(answer);
OPAL_THREAD_UNLOCK(&orte_ns_proxy_mutex);
OPAL_THREAD_UNLOCK(&orte_ns_proxy.mutex);
return ORTE_ERR_UNPACK_FAILURE;
}
OBJ_RELEASE(answer);
/* add the new id to the local list so we don't have to get it again */
dti = OBJ_NEW(orte_ns_proxy_dti_t);
if (NULL == dti) { /* out of memory */
dtip = OBJ_NEW(orte_ns_proxy_dti_t);
if (NULL == dtip) { /* out of memory */
ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE);
*type = ORTE_DPS_ID_MAX;
OPAL_THREAD_UNLOCK(&orte_ns_proxy_mutex);
OPAL_THREAD_UNLOCK(&orte_ns_proxy.mutex);
return ORTE_ERR_OUT_OF_RESOURCE;
}
dti->id = *type;
dti->name = strdup(name);
opal_list_append(&orte_ns_proxy_taglist, &dti->item);
OPAL_THREAD_UNLOCK(&orte_ns_proxy_mutex);
dtip->name = strdup(name);
if (ORTE_SUCCESS != (rc = orte_pointer_array_add(&i,
orte_ns_proxy.dts, dtip))) {
ORTE_ERROR_LOG(rc);
OPAL_THREAD_UNLOCK(&orte_ns_proxy.mutex);
return rc;
}
dtip->id = *type;
(orte_ns_proxy.num_dts)++;
OPAL_THREAD_UNLOCK(&orte_ns_proxy.mutex);
/* all done */
return rc;
@ -565,7 +686,7 @@ int orte_ns_proxy_create_my_name(void)
return rc;
}
if (0 > orte_rml.send_buffer(orte_ns_my_replica, cmd, MCA_OOB_TAG_NS, 0)) {
if (0 > orte_rml.send_buffer(orte_ns_proxy.my_replica, cmd, MCA_OOB_TAG_NS, 0)) {
ORTE_ERROR_LOG(ORTE_ERR_COMM_FAILURE);
OBJ_RELEASE(cmd);
return ORTE_ERR_COMM_FAILURE;
@ -575,3 +696,235 @@ int orte_ns_proxy_create_my_name(void)
return ORTE_SUCCESS;
}
/*
* DIAGNOSTIC functions
*/
int orte_ns_proxy_dump_cells(int output_id)
{
orte_buffer_t cmd;
orte_buffer_t answer;
orte_ns_cmd_flag_t command;
size_t i, j;
orte_ns_proxy_cell_info_t **ptr;
int rc;
command = ORTE_NS_DUMP_CELLS_CMD;
OPAL_THREAD_LOCK(&orte_ns_proxy.mutex);
/* dump name service replica cell tracker */
OBJ_CONSTRUCT(&cmd, orte_buffer_t);
if (ORTE_SUCCESS != (rc = orte_dps.pack(&cmd, &command, 1, ORTE_NS_CMD))) {
ORTE_ERROR_LOG(rc);
OPAL_THREAD_UNLOCK(&orte_ns_proxy.mutex);
OBJ_DESTRUCT(&cmd);
return rc;
}
if (0 > orte_rml.send_buffer(orte_ns_proxy.my_replica, &cmd, MCA_OOB_TAG_NS, 0)) {
ORTE_ERROR_LOG(ORTE_ERR_COMM_FAILURE);
OBJ_DESTRUCT(&cmd);
OPAL_THREAD_UNLOCK(&orte_ns_proxy.mutex);
return ORTE_ERR_COMM_FAILURE;
}
OBJ_DESTRUCT(&cmd);
OBJ_CONSTRUCT(&answer, orte_buffer_t);
if (0 > orte_rml.recv_buffer(orte_ns_proxy.my_replica, &answer, ORTE_RML_TAG_NS)) {
ORTE_ERROR_LOG(ORTE_ERR_COMM_FAILURE);
OBJ_DESTRUCT(&answer);
OPAL_THREAD_UNLOCK(&orte_ns_proxy.mutex);
return ORTE_ERR_COMM_FAILURE;
}
if (ORTE_SUCCESS != (rc = orte_ns_base_print_dump(&answer, output_id))) {
ORTE_ERROR_LOG(rc);
OBJ_DESTRUCT(&answer);
return rc;
}
/* dump local cell tracker */
opal_output(output_id, "\n\n[%lu,%lu,%lu] Dump of Local Cell Tracker\n",
ORTE_NAME_ARGS(orte_process_info.my_name));
ptr = (orte_ns_proxy_cell_info_t**)(orte_ns_proxy.cells)->addr;
for (i=0, j=0; j < orte_ns_proxy.num_cells &&
i < (orte_ns_proxy.cells)->size; i++) {
if (NULL != ptr[i]) {
j++;
opal_output(output_id, "Num: %lu\tCell: %lu\n",
(unsigned long)j, (unsigned long)ptr[i]->cellid);
}
}
return ORTE_SUCCESS;
}
int orte_ns_proxy_dump_jobs(int output_id)
{
orte_buffer_t cmd;
orte_buffer_t answer;
orte_ns_cmd_flag_t command;
int rc;
command = ORTE_NS_DUMP_JOBIDS_CMD;
OPAL_THREAD_LOCK(&orte_ns_proxy.mutex);
/* dump name service replica jobid tracker */
OBJ_CONSTRUCT(&cmd, orte_buffer_t);
if (ORTE_SUCCESS != (rc = orte_dps.pack(&cmd, &command, 1, ORTE_NS_CMD))) {
ORTE_ERROR_LOG(rc);
OPAL_THREAD_UNLOCK(&orte_ns_proxy.mutex);
OBJ_DESTRUCT(&cmd);
return rc;
}
if (0 > orte_rml.send_buffer(orte_ns_proxy.my_replica, &cmd, MCA_OOB_TAG_NS, 0)) {
ORTE_ERROR_LOG(ORTE_ERR_COMM_FAILURE);
OBJ_DESTRUCT(&cmd);
OPAL_THREAD_UNLOCK(&orte_ns_proxy.mutex);
return ORTE_ERR_COMM_FAILURE;
}
OBJ_DESTRUCT(&cmd);
OBJ_CONSTRUCT(&answer, orte_buffer_t);
if (0 > orte_rml.recv_buffer(orte_ns_proxy.my_replica, &answer, ORTE_RML_TAG_NS)) {
ORTE_ERROR_LOG(ORTE_ERR_COMM_FAILURE);
OBJ_DESTRUCT(&answer);
OPAL_THREAD_UNLOCK(&orte_ns_proxy.mutex);
return ORTE_ERR_COMM_FAILURE;
}
if (ORTE_SUCCESS != (rc = orte_ns_base_print_dump(&answer, output_id))) {
ORTE_ERROR_LOG(rc);
OBJ_DESTRUCT(&answer);
return rc;
}
return ORTE_SUCCESS;
}
int orte_ns_proxy_dump_tags(int output_id)
{
orte_buffer_t cmd;
orte_buffer_t answer;
orte_ns_cmd_flag_t command;
size_t i, j;
orte_ns_proxy_tagitem_t **ptr;
int rc;
command = ORTE_NS_DUMP_TAGS_CMD;
OPAL_THREAD_LOCK(&orte_ns_proxy.mutex);
/* dump name service replica tag tracker */
OBJ_CONSTRUCT(&cmd, orte_buffer_t);
if (ORTE_SUCCESS != (rc = orte_dps.pack(&cmd, &command, 1, ORTE_NS_CMD))) {
ORTE_ERROR_LOG(rc);
OPAL_THREAD_UNLOCK(&orte_ns_proxy.mutex);
OBJ_DESTRUCT(&cmd);
return rc;
}
if (0 > orte_rml.send_buffer(orte_ns_proxy.my_replica, &cmd, MCA_OOB_TAG_NS, 0)) {
ORTE_ERROR_LOG(ORTE_ERR_COMM_FAILURE);
OBJ_DESTRUCT(&cmd);
OPAL_THREAD_UNLOCK(&orte_ns_proxy.mutex);
return ORTE_ERR_COMM_FAILURE;
}
OBJ_DESTRUCT(&cmd);
OBJ_CONSTRUCT(&answer, orte_buffer_t);
if (0 > orte_rml.recv_buffer(orte_ns_proxy.my_replica, &answer, ORTE_RML_TAG_NS)) {
ORTE_ERROR_LOG(ORTE_ERR_COMM_FAILURE);
OBJ_DESTRUCT(&answer);
OPAL_THREAD_UNLOCK(&orte_ns_proxy.mutex);
return ORTE_ERR_COMM_FAILURE;
}
if (ORTE_SUCCESS != (rc = orte_ns_base_print_dump(&answer, output_id))) {
ORTE_ERROR_LOG(rc);
OBJ_DESTRUCT(&answer);
return rc;
}
/* dump local tag tracker */
opal_output(output_id, "\n\n[%lu,%lu,%lu] Dump of Local Tag Tracker\n",
ORTE_NAME_ARGS(orte_process_info.my_name));
ptr = (orte_ns_proxy_tagitem_t**)(orte_ns_proxy.tags)->addr;
for (i=0, j=0; j < orte_ns_proxy.num_tags &&
i < (orte_ns_proxy.tags)->size; i++) {
if (NULL != ptr[i]) {
j++;
opal_output(output_id, "Num: %lu\tTag: %lu\tTag name: %s\n",
(unsigned long)j, (unsigned long)ptr[i]->tag, ptr[i]->name);
}
}
return ORTE_SUCCESS;
}
int orte_ns_proxy_dump_datatypes(int output_id)
{
orte_buffer_t cmd;
orte_buffer_t answer;
orte_ns_cmd_flag_t command;
size_t i, j;
orte_ns_proxy_dti_t **ptr;
int rc;
command = ORTE_NS_DUMP_DATATYPES_CMD;
OPAL_THREAD_LOCK(&orte_ns_proxy.mutex);
/* dump name service replica datatype tracker */
OBJ_CONSTRUCT(&cmd, orte_buffer_t);
if (ORTE_SUCCESS != (rc = orte_dps.pack(&cmd, &command, 1, ORTE_NS_CMD))) {
ORTE_ERROR_LOG(rc);
OPAL_THREAD_UNLOCK(&orte_ns_proxy.mutex);
OBJ_DESTRUCT(&cmd);
return rc;
}
if (0 > orte_rml.send_buffer(orte_ns_proxy.my_replica, &cmd, MCA_OOB_TAG_NS, 0)) {
ORTE_ERROR_LOG(ORTE_ERR_COMM_FAILURE);
OBJ_DESTRUCT(&cmd);
OPAL_THREAD_UNLOCK(&orte_ns_proxy.mutex);
return ORTE_ERR_COMM_FAILURE;
}
OBJ_DESTRUCT(&cmd);
OBJ_CONSTRUCT(&answer, orte_buffer_t);
if (0 > orte_rml.recv_buffer(orte_ns_proxy.my_replica, &answer, ORTE_RML_TAG_NS)) {
ORTE_ERROR_LOG(ORTE_ERR_COMM_FAILURE);
OBJ_DESTRUCT(&answer);
OPAL_THREAD_UNLOCK(&orte_ns_proxy.mutex);
return ORTE_ERR_COMM_FAILURE;
}
if (ORTE_SUCCESS != (rc = orte_ns_base_print_dump(&answer, output_id))) {
ORTE_ERROR_LOG(rc);
OBJ_DESTRUCT(&answer);
return rc;
}
/* dump local datatype tracker */
opal_output(output_id, "\n\n[%lu,%lu,%lu] Dump of Local Datatype Tracker\n",
ORTE_NAME_ARGS(orte_process_info.my_name));
ptr = (orte_ns_proxy_dti_t**)(orte_ns_proxy.dts)->addr;
for (i=0, j=0; j < orte_ns_proxy.num_dts &&
i < (orte_ns_proxy.dts)->size; i++) {
if (NULL != ptr[i]) {
j++;
opal_output(output_id, "Num: %lu\tDatatype id: %lu\tDatatype name: %s\n",
(unsigned long)j, (unsigned long)ptr[i]->id, ptr[i]->name);
}
}
return ORTE_SUCCESS;
}

Просмотреть файл

@ -32,7 +32,7 @@ extern "C" {
#endif
struct orte_ns_proxy_cell_info_t {
opal_list_item_t item; /**< Allows this item to be placed on a list */
opal_object_t super;
orte_cellid_t cellid;
char *site;
char *resource;
@ -42,7 +42,7 @@ typedef struct orte_ns_proxy_cell_info_t orte_ns_proxy_cell_info_t;
OBJ_CLASS_DECLARATION(orte_ns_proxy_cell_info_t);
struct orte_ns_proxy_tagitem_t {
opal_list_item_t item; /**< Allows this item to be placed on a list */
opal_object_t super;
orte_rml_tag_t tag; /**< OOB tag */
char *name; /**< Name associated with tag */
};
@ -51,7 +51,7 @@ typedef struct orte_ns_proxy_tagitem_t orte_ns_proxy_tagitem_t;
OBJ_CLASS_DECLARATION(orte_ns_proxy_tagitem_t);
struct orte_ns_proxy_dti_t {
opal_list_item_t item; /**< Allows this item to be placed on a list */
opal_object_t super;
orte_data_type_t id; /**< data type id */
char *name; /**< Name associated with data type */
};
@ -77,13 +77,20 @@ int orte_ns_proxy_finalize(void);
/*
* globals used within proxy component
*/
typedef struct {
size_t max_size, block_size;
orte_process_name_t *my_replica;
int debug;
orte_cellid_t num_cells;
orte_pointer_array_t *cells;
orte_pointer_array_t *tags;
orte_rml_tag_t num_tags;
orte_pointer_array_t *dts;
orte_data_type_t num_dts;
opal_mutex_t mutex;
} orte_ns_proxy_globals_t;
extern orte_process_name_t *orte_ns_my_replica;
extern int orte_ns_proxy_debug;
extern opal_list_t orte_ns_proxy_cell_info_list;
extern opal_list_t orte_ns_proxy_taglist;
extern opal_list_t orte_ns_proxy_dtlist;
extern opal_mutex_t orte_ns_proxy_mutex;
extern orte_ns_proxy_globals_t orte_ns_proxy;
/*
* proxy function prototypes
@ -97,6 +104,9 @@ int orte_ns_proxy_create_jobid(orte_jobid_t *jobid);
int orte_ns_proxy_reserve_range(orte_jobid_t job, orte_vpid_t range,
orte_vpid_t *startvpid);
int orte_ns_proxy_get_job_peers(orte_process_name_t **procs,
size_t *num_procs, orte_jobid_t job);
int orte_ns_proxy_assign_rml_tag(orte_rml_tag_t *tag, char *name);
int orte_ns_proxy_define_data_type(const char *name,
@ -104,6 +114,19 @@ int orte_ns_proxy_define_data_type(const char *name,
int orte_ns_proxy_create_my_name(void);
/*
* Diagnostic functions
*/
int orte_ns_proxy_dump_cells(int output_id);
int orte_ns_proxy_dump_jobs(int output_id);
int orte_ns_proxy_dump_tags(int output_id);
int orte_ns_proxy_dump_datatypes(int output_id);
#if defined(c_plusplus) || defined(__cplusplus)
}
#endif

Просмотреть файл

@ -64,36 +64,49 @@ OMPI_COMP_EXPORT mca_ns_base_component_t mca_ns_proxy_component = {
/*
* setup the function pointers for the module
*/
static mca_ns_base_module_t orte_ns_proxy = {
static mca_ns_base_module_t orte_ns_proxy_module = {
/* init */
orte_ns_proxy_module_init,
/* cell functions */
orte_ns_proxy_create_cellid,
orte_ns_base_get_cellid,
orte_ns_proxy_get_cell_info,
orte_ns_base_assign_cellid_to_process,
orte_ns_base_get_cellid_string,
orte_ns_base_convert_cellid_to_string,
orte_ns_base_convert_string_to_cellid,
/* jobid functions */
orte_ns_proxy_create_jobid,
orte_ns_base_get_jobid,
orte_ns_base_get_jobid_string,
orte_ns_base_convert_jobid_to_string,
orte_ns_base_convert_string_to_jobid,
/* vpid functions */
orte_ns_proxy_reserve_range,
orte_ns_base_get_vpid,
orte_ns_base_get_vpid_string,
orte_ns_base_convert_vpid_to_string,
orte_ns_base_convert_string_to_vpid,
/* name functions */
orte_ns_base_create_process_name,
orte_ns_proxy_create_my_name,
orte_ns_base_copy_process_name,
orte_ns_base_convert_string_to_process_name,
orte_ns_proxy_reserve_range,
orte_ns_base_free_name,
orte_ns_base_get_proc_name_string,
orte_ns_base_get_vpid_string,
orte_ns_base_convert_vpid_to_string,
orte_ns_base_convert_string_to_vpid,
orte_ns_base_get_jobid_string,
orte_ns_base_convert_jobid_to_string,
orte_ns_base_convert_string_to_jobid,
orte_ns_base_get_cellid_string,
orte_ns_base_convert_cellid_to_string,
orte_ns_base_convert_string_to_cellid,
orte_ns_base_get_vpid,
orte_ns_base_get_jobid,
orte_ns_base_get_cellid,
orte_ns_base_compare,
orte_ns_base_derive_vpid,
/* peer functions */
orte_ns_base_get_peers,
orte_ns_proxy_get_job_peers,
/* tag server functions */
orte_ns_proxy_assign_rml_tag,
/* data type functions */
orte_ns_proxy_define_data_type,
orte_ns_base_get_peers
/* diagnostic functions */
orte_ns_proxy_dump_cells,
orte_ns_proxy_dump_jobs,
orte_ns_proxy_dump_tags,
orte_ns_proxy_dump_datatypes
};
/*
@ -122,7 +135,7 @@ static void orte_ns_proxy_cell_info_destructor(orte_ns_proxy_cell_info_t* ptr)
/* define instance of opal_class_t */
OBJ_CLASS_INSTANCE(
orte_ns_proxy_cell_info_t, /* type name */
opal_list_item_t, /* parent "class" name */
opal_object_t, /* parent "class" name */
orte_ns_proxy_cell_info_construct, /* constructor */
orte_ns_proxy_cell_info_destructor); /* destructor */
@ -144,7 +157,7 @@ static void orte_ns_proxy_tagitem_destructor(orte_ns_proxy_tagitem_t* tagitem)
/* define instance of opal_class_t */
OBJ_CLASS_INSTANCE(
orte_ns_proxy_tagitem_t, /* type name */
opal_list_item_t, /* parent "class" name */
opal_object_t, /* parent "class" name */
orte_ns_proxy_tagitem_construct, /* constructor */
orte_ns_proxy_tagitem_destructor); /* destructor */
@ -166,7 +179,7 @@ static void orte_ns_proxy_dti_destructor(orte_ns_proxy_dti_t* dti)
/* define instance of opal_class_t */
OBJ_CLASS_INSTANCE(
orte_ns_proxy_dti_t, /* type name */
opal_list_item_t, /* parent "class" name */
opal_object_t, /* parent "class" name */
orte_ns_proxy_dti_construct, /* constructor */
orte_ns_proxy_dti_destructor); /* destructor */
@ -174,22 +187,28 @@ OBJ_CLASS_INSTANCE(
* globals needed within proxy component
*/
orte_process_name_t* orte_ns_my_replica=NULL;
int orte_ns_proxy_debug=0;
opal_list_t orte_ns_proxy_cell_info_list;
opal_list_t orte_ns_proxy_taglist;
opal_list_t orte_ns_proxy_dtlist;
opal_mutex_t orte_ns_proxy_mutex;
orte_ns_proxy_globals_t orte_ns_proxy;
/*
* Open the proxy component and obtain the name of my replica.
* Open the proxy component and obtain the name of my proxy.
*/
int orte_ns_proxy_open(void)
{
int id;
int id, param;
id = mca_base_param_register_int("ns", "proxy", "debug", NULL, 0);
mca_base_param_lookup_int(id, &orte_ns_proxy_debug);
mca_base_param_lookup_int(id, &orte_ns_proxy.debug);
id = mca_base_param_register_int("ns", "proxy", "maxsize", NULL,
ORTE_NS_ARRAY_MAX_SIZE);
mca_base_param_lookup_int(id, &param);
orte_ns_proxy.max_size = (size_t)param;
id = mca_base_param_register_int("ns", "proxy", "blocksize", NULL,
ORTE_NS_ARRAY_BLOCK_SIZE);
mca_base_param_lookup_int(id, &param);
orte_ns_proxy.block_size = (size_t)param;
return ORTE_SUCCESS;
}
@ -205,9 +224,9 @@ int orte_ns_proxy_close(void)
mca_ns_base_module_t* orte_ns_proxy_init(int *priority)
{
orte_process_name_t name;
int ret;
int ret, rc;
/* If we are NOT to host a replica, then we want to be selected, so do all
/* If we are NOT to host a proxy, then we want to be selected, so do all
the setup and return the module */
/* opal_output(mca_ns_base_output, "ns_proxy: entered init\n"); */
if (NULL != orte_process_info.ns_replica_uri) {
@ -219,7 +238,7 @@ mca_ns_base_module_t* orte_ns_proxy_init(int *priority)
*priority = 10;
/* define the replica for us to use */
/* define the proxy for us to use */
if(ORTE_SUCCESS != (ret = orte_rml.parse_uris(orte_process_info.ns_replica_uri, &name, NULL))) {
ORTE_ERROR_LOG(ret);
return NULL;
@ -228,24 +247,51 @@ mca_ns_base_module_t* orte_ns_proxy_init(int *priority)
ORTE_ERROR_LOG(ret);
return NULL;
}
if (ORTE_SUCCESS != orte_ns_base_copy_process_name(&orte_ns_my_replica,
if (ORTE_SUCCESS != orte_ns_base_copy_process_name(&orte_ns_proxy.my_replica,
orte_process_info.ns_replica)) { /* can't operate */
return NULL;
}
/* initialize the cell info list */
OBJ_CONSTRUCT(&orte_ns_proxy_cell_info_list, opal_list_t);
/* initialize the cell info tracker */
if (ORTE_SUCCESS != (rc = orte_pointer_array_init(&(orte_ns_proxy.cells),
orte_ns_proxy.block_size,
orte_ns_proxy.max_size,
orte_ns_proxy.block_size))) {
ORTE_ERROR_LOG(rc);
return NULL;
}
orte_ns_proxy.num_cells = 0;
/* initialize the taglist */
OBJ_CONSTRUCT(&orte_ns_proxy_taglist, opal_list_t);
/* initialize the taglist */
/* initialize the dtlist */
OBJ_CONSTRUCT(&orte_ns_proxy_dtlist, opal_list_t);
if (ORTE_SUCCESS != (rc = orte_pointer_array_init(&(orte_ns_proxy.tags),
orte_ns_proxy.block_size,
orte_ns_proxy.max_size,
orte_ns_proxy.block_size))) {
ORTE_ERROR_LOG(rc);
return NULL;
}
orte_ns_proxy.num_tags = 0;
/* initialize the dtlist */
if (ORTE_SUCCESS != (rc = orte_pointer_array_init(&(orte_ns_proxy.dts),
orte_ns_proxy.block_size,
orte_ns_proxy.max_size,
orte_ns_proxy.block_size))) {
ORTE_ERROR_LOG(rc);
return NULL;
}
orte_ns_proxy.num_dts = 0;
/* setup the thread lock */
OBJ_CONSTRUCT(&orte_ns_proxy.mutex, opal_mutex_t);
/* Return the module */
initialized = true;
return &orte_ns_proxy;
return &orte_ns_proxy_module;
} else {
return NULL;
@ -267,24 +313,34 @@ int orte_ns_proxy_module_init(void)
*/
int orte_ns_proxy_finalize(void)
{
orte_ns_proxy_tagitem_t *tagitem;
orte_ns_proxy_cell_info_t *cptr;
orte_ns_proxy_cell_info_t **cptr;
orte_ns_proxy_tagitem_t **tag;
orte_ns_proxy_dti_t **dti;
size_t i;
if (orte_ns_proxy_debug) {
opal_output(0, "finalizing ns proxy");
}
/* free the storage, but only if this component was initialized */
/* free all tracking storage, but only if this component was initialized */
if (initialized) {
while (NULL != (cptr = (orte_ns_proxy_cell_info_t*)opal_list_remove_first(&orte_ns_proxy_cell_info_list))) {
OBJ_RELEASE(cptr);
cptr = (orte_ns_proxy_cell_info_t**)(orte_ns_proxy.cells)->addr;
for (i=0; i < (orte_ns_proxy.cells)->size; i++) {
if (NULL != cptr[i]) {
OBJ_RELEASE(cptr[i]);
}
}
OBJ_DESTRUCT(&orte_ns_proxy_cell_info_list);
while (NULL != (tagitem = (orte_ns_proxy_tagitem_t*)opal_list_remove_first(&orte_ns_proxy_taglist))) {
OBJ_RELEASE(tagitem);
OBJ_RELEASE(orte_ns_proxy.cells);
tag = (orte_ns_proxy_tagitem_t**)(orte_ns_proxy.tags)->addr;
for (i=0; i < (orte_ns_proxy.tags)->size; i++) {
if (NULL != tag[i]) OBJ_RELEASE(tag[i]);
}
OBJ_DESTRUCT(&orte_ns_proxy_taglist);
OBJ_RELEASE(orte_ns_proxy.tags);
dti = (orte_ns_proxy_dti_t**)(orte_ns_proxy.dts)->addr;
for (i=0; i < (orte_ns_proxy.dts)->size; i++) {
if (NULL != dti[i]) OBJ_RELEASE(dti[i]);
}
OBJ_RELEASE(orte_ns_proxy.dts);
initialized = false;
}

Просмотреть файл

@ -32,6 +32,7 @@
/**
* globals
*/
#define NS_REPLICA_MAX_STRING_SIZE 256
/*
* functions
@ -40,225 +41,593 @@
int orte_ns_replica_create_cellid(orte_cellid_t *cellid, char *site, char *resource)
{
orte_ns_replica_cell_tracker_t *new_cell;
int rc;
size_t index;
OPAL_THREAD_LOCK(&orte_ns_replica_mutex);
OPAL_THREAD_LOCK(&orte_ns_replica.mutex);
if (ORTE_CELLID_MAX > orte_ns_replica_next_cellid) {
*cellid = orte_ns_replica_next_cellid;
orte_ns_replica_next_cellid++;
new_cell = OBJ_NEW(orte_ns_replica_cell_tracker_t);
if (NULL == new_cell) {
*cellid = ORTE_CELLID_MAX;
OPAL_THREAD_UNLOCK(&orte_ns_replica_mutex);
ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE);
return ORTE_ERR_OUT_OF_RESOURCE;
}
new_cell->cell = *cellid;
new_cell->site = strdup(site);
new_cell->resource = strdup(resource);
opal_list_append(&orte_ns_replica_cell_tracker, &new_cell->item);
OPAL_THREAD_UNLOCK(&orte_ns_replica_mutex);
return ORTE_SUCCESS;
}
*cellid = ORTE_CELLID_MAX;
OPAL_THREAD_UNLOCK(&orte_ns_replica_mutex);
ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE);
return ORTE_ERR_OUT_OF_RESOURCE;
/* check if cellid is available. NOTE: need to reserve
* ORTE_CELLID_MAX as an invalid value, so can't allow
* num_cells to get there
*/
if (ORTE_CELLID_MAX-2 < orte_ns_replica.num_cells) {
ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE);
OPAL_THREAD_UNLOCK(&orte_ns_replica.mutex);
return ORTE_ERR_OUT_OF_RESOURCE;
}
new_cell = OBJ_NEW(orte_ns_replica_cell_tracker_t);
if (NULL == new_cell) {
ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE);
OPAL_THREAD_UNLOCK(&orte_ns_replica.mutex);
return ORTE_ERR_OUT_OF_RESOURCE;
}
if (ORTE_SUCCESS != (rc = orte_pointer_array_add(&index,
orte_ns_replica.cells, new_cell))) {
ORTE_ERROR_LOG(rc);
OPAL_THREAD_UNLOCK(&orte_ns_replica.mutex);
return rc;
}
new_cell->site = strdup(site);
new_cell->resource = strdup(resource);
new_cell->cell = orte_ns_replica.num_cells;
*cellid = new_cell->cell;
(orte_ns_replica.num_cells)++;
OPAL_THREAD_UNLOCK(&orte_ns_replica.mutex);
return ORTE_SUCCESS;
}
int orte_ns_replica_get_cell_info(orte_cellid_t cellid,
char **site, char **resource)
{
opal_list_item_t *item;
orte_ns_replica_cell_tracker_t *cell;
size_t i, j;
orte_ns_replica_cell_tracker_t **cell;
for (item = opal_list_get_first(&orte_ns_replica_cell_tracker);
item != opal_list_get_end(&orte_ns_replica_cell_tracker);
item = opal_list_get_next(item)) {
cell = (orte_ns_replica_cell_tracker_t*)item;
if (cellid == cell->cell) {
*site = strdup(cell->site);
*resource = strdup(cell->resource);
return ORTE_SUCCESS;
}
}
OPAL_THREAD_LOCK(&orte_ns_replica.mutex);
cell = (orte_ns_replica_cell_tracker_t**)(orte_ns_replica.cells)->addr;
for (i=0, j=0; j < orte_ns_replica.num_cells &&
i < (orte_ns_replica.cells)->size; i++) {
if (NULL != cell[i]) {
j++;
if (cellid == cell[i]->cell) {
*site = strdup(cell[i]->site);
*resource = strdup(cell[i]->resource);
OPAL_THREAD_UNLOCK(&orte_ns_replica.mutex);
return ORTE_SUCCESS;
}
}
}
ORTE_ERROR_LOG(ORTE_ERR_NOT_FOUND);
OPAL_THREAD_UNLOCK(&orte_ns_replica.mutex);
return ORTE_ERR_NOT_FOUND;
}
/*
* JOBID functions
*/
int orte_ns_replica_create_jobid(orte_jobid_t *jobid)
{
orte_ns_replica_name_tracker_t *new_nt;
OPAL_THREAD_LOCK(&orte_ns_replica_mutex);
if (ORTE_JOBID_MAX > orte_ns_replica_next_jobid) {
*jobid = orte_ns_replica_next_jobid;
orte_ns_replica_next_jobid++;
new_nt = OBJ_NEW(orte_ns_replica_name_tracker_t);
if (NULL == new_nt) { /* out of memory */
*jobid = ORTE_JOBID_MAX;
OPAL_THREAD_UNLOCK(&orte_ns_replica_mutex);
ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE);
return ORTE_ERR_OUT_OF_RESOURCE;
}
new_nt->job = *jobid;
new_nt->last_used_vpid = 0;
opal_list_append(&orte_ns_replica_name_tracker, &new_nt->item);
OPAL_THREAD_UNLOCK(&orte_ns_replica_mutex);
return ORTE_SUCCESS;
}
orte_ns_replica_jobid_tracker_t *ptr;
int rc;
size_t index;
OPAL_THREAD_LOCK(&orte_ns_replica.mutex);
*jobid = ORTE_JOBID_MAX;
OPAL_THREAD_UNLOCK(&orte_ns_replica_mutex);
ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE);
return ORTE_ERR_OUT_OF_RESOURCE;
}
int orte_ns_replica_reserve_range(orte_jobid_t job, orte_vpid_t range, orte_vpid_t *start)
{
orte_ns_replica_name_tracker_t *ptr;
OPAL_THREAD_LOCK(&orte_ns_replica_mutex);
for (ptr = (orte_ns_replica_name_tracker_t*)opal_list_get_first(&orte_ns_replica_name_tracker);
ptr != (orte_ns_replica_name_tracker_t*)opal_list_get_end(&orte_ns_replica_name_tracker);
ptr = (orte_ns_replica_name_tracker_t*)opal_list_get_next(ptr)) {
if (job == ptr->job) { /* found the specified job */
if ((ORTE_VPID_MAX-range) >= ptr->last_used_vpid) { /* requested range available */
*start = ptr->last_used_vpid;
if (0 == job && *start == 0) { /* vpid=0 reserved for job=0 */
*start = 1;
}
ptr->last_used_vpid = *start + range;
OPAL_THREAD_UNLOCK(&orte_ns_replica_mutex);
return ORTE_SUCCESS;
} else { /* range not available */
*start = ORTE_VPID_MAX;
OPAL_THREAD_UNLOCK(&orte_ns_replica_mutex);
ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE);
return ORTE_ERR_OUT_OF_RESOURCE;
}
}
/* check if a jobid is available. NOTE: need to
* reserve ORTE_JOBID_MAX as an invalid value, so can't let
* num_jobids get there
*/
if (ORTE_JOBID_MAX-2 < orte_ns_replica.num_jobids) {
ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE);
OPAL_THREAD_UNLOCK(&orte_ns_replica.mutex);
return ORTE_ERR_OUT_OF_RESOURCE;
}
ptr = OBJ_NEW(orte_ns_replica_jobid_tracker_t);
if (NULL == ptr) {
ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE);
OPAL_THREAD_UNLOCK(&orte_ns_replica.mutex);
return ORTE_ERR_OUT_OF_RESOURCE;
}
if (ORTE_SUCCESS != (rc = orte_pointer_array_add(&index,
orte_ns_replica.jobids, ptr))) {
ORTE_ERROR_LOG(rc);
OBJ_RELEASE(ptr);
OPAL_THREAD_UNLOCK(&orte_ns_replica.mutex);
return rc;
}
/* get here if the job couldn't be found */
OPAL_THREAD_UNLOCK(&orte_ns_replica_mutex);
ORTE_ERROR_LOG(ORTE_ERR_NOT_FOUND);
return ORTE_ERR_NOT_FOUND;
ptr->jobid = orte_ns_replica.num_jobids;
*jobid = ptr->jobid;
(orte_ns_replica.num_jobids)++;
OPAL_THREAD_UNLOCK(&orte_ns_replica.mutex);
return ORTE_SUCCESS;
}
int orte_ns_replica_reserve_range(orte_jobid_t job, orte_vpid_t range,
orte_vpid_t *start)
{
orte_ns_replica_jobid_tracker_t **ptr;
size_t j;
orte_jobid_t k;
OPAL_THREAD_LOCK(&orte_ns_replica.mutex);
/* find the jobid */
ptr = (orte_ns_replica_jobid_tracker_t**)(orte_ns_replica.jobids)->addr;
for (j=0, k=0; k < orte_ns_replica.num_jobids &&
j < (orte_ns_replica.jobids)->size; j++) {
if (NULL != ptr[j]) {
k++;
if (job == ptr[j]->jobid) {
goto PROCESS;
}
}
}
/* didn't find the specified jobid - error */
ORTE_ERROR_LOG(ORTE_ERR_NOT_FOUND);
OPAL_THREAD_UNLOCK(&orte_ns_replica.mutex);
return ORTE_ERR_NOT_FOUND;
PROCESS:
if ((ORTE_VPID_MAX-range-(ptr[j]->next_vpid)) > 0) {
*start = ptr[j]->next_vpid;
ptr[j]->next_vpid += range;
OPAL_THREAD_UNLOCK(&orte_ns_replica.mutex);
return ORTE_SUCCESS;
}
/* get here if the range isn't available */
ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE);
OPAL_THREAD_UNLOCK(&orte_ns_replica.mutex);
return ORTE_ERR_OUT_OF_RESOURCE;
}
int orte_ns_replica_get_job_peers(orte_process_name_t **procs,
size_t *num_procs, orte_jobid_t job)
{
orte_ns_replica_jobid_tracker_t **ptr;
orte_process_name_t *nptr;
size_t j;
orte_jobid_t k;
OPAL_THREAD_LOCK(&orte_ns_replica.mutex);
/* find the jobid */
ptr = (orte_ns_replica_jobid_tracker_t**)(orte_ns_replica.jobids)->addr;
for (j=0, k=0; k < orte_ns_replica.num_jobids &&
j < (orte_ns_replica.jobids)->size; j++) {
if (NULL != ptr[j]) {
k++;
if (job == ptr[j]->jobid) {
goto PROCESS;
}
}
}
/* didn't find the specified jobid - error */
ORTE_ERROR_LOG(ORTE_ERR_NOT_FOUND);
OPAL_THREAD_UNLOCK(&orte_ns_replica.mutex);
return ORTE_ERR_NOT_FOUND;
PROCESS:
/* the field next_vpid contains the value of the next unassigned
* vpid, so the job extends from vpid=0 to that value. create
* an array of process names containing those values
*/
*procs = (orte_process_name_t*)malloc(ptr[j]->next_vpid * sizeof(orte_process_name_t));
if (NULL == *procs) {
ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE);
OPAL_THREAD_UNLOCK(&orte_ns_replica.mutex);
return ORTE_ERR_OUT_OF_RESOURCE;
}
nptr = *procs;
for (k=0; k < ptr[j]->next_vpid; k++) {
nptr->cellid = 0;
nptr->jobid = job;
nptr->vpid = (orte_vpid_t)k;
}
*num_procs = (size_t)ptr[j]->next_vpid;
OPAL_THREAD_UNLOCK(&orte_ns_replica.mutex);
return ORTE_SUCCESS;
}
/*
* DIAGNOSTIC functions
*/
int orte_ns_replica_dump_cells(int output_id)
{
orte_buffer_t buffer;
int rc;
OBJ_CONSTRUCT(&buffer, orte_buffer_t);
if (ORTE_SUCCESS != (rc = orte_ns_replica_dump_cells_fn(&buffer))) {
ORTE_ERROR_LOG(rc);
return rc;
}
if (ORTE_SUCCESS != (rc = orte_ns_base_print_dump(&buffer, output_id))) {
ORTE_ERROR_LOG(rc);
OBJ_DESTRUCT(&buffer);
return rc;
}
OBJ_DESTRUCT(&buffer);
return ORTE_SUCCESS;
}
int orte_ns_replica_dump_cells_fn(orte_buffer_t *buffer)
{
size_t i, j;
orte_ns_replica_cell_tracker_t **cell;
char tmp_out[NS_REPLICA_MAX_STRING_SIZE], *tmp;
int rc;
OPAL_THREAD_LOCK(&orte_ns_replica.mutex);
tmp = tmp_out;
snprintf(tmp, NS_REPLICA_MAX_STRING_SIZE, "Dump of Name Service Cell Tracker\n");
if (ORTE_SUCCESS != (rc = orte_dps.pack(buffer, &tmp, 1, ORTE_STRING))) {
ORTE_ERROR_LOG(rc);
OPAL_THREAD_UNLOCK(&orte_ns_replica.mutex);
return rc;
}
cell = (orte_ns_replica_cell_tracker_t**)(orte_ns_replica.cells)->addr;
for (i=0, j=0; j < orte_ns_replica.num_cells &&
i < (orte_ns_replica.cells)->size; i++) {
if (NULL != cell[i]) {
j++;
snprintf(tmp, NS_REPLICA_MAX_STRING_SIZE, "Num: %lu\tCell: %lu\n",
(unsigned long)j, (unsigned long)cell[i]->cell);
if (ORTE_SUCCESS != (rc = orte_dps.pack(buffer, &tmp, 1, ORTE_STRING))) {
ORTE_ERROR_LOG(rc);
OPAL_THREAD_UNLOCK(&orte_ns_replica.mutex);
return rc;
}
snprintf(tmp, NS_REPLICA_MAX_STRING_SIZE, "\tSite: %s\n\tResource: %s\n",
cell[i]->site, cell[i]->resource);
if (ORTE_SUCCESS != (rc = orte_dps.pack(buffer, &tmp, 1, ORTE_STRING))) {
ORTE_ERROR_LOG(rc);
OPAL_THREAD_UNLOCK(&orte_ns_replica.mutex);
return rc;
}
}
}
OPAL_THREAD_UNLOCK(&orte_ns_replica.mutex);
return ORTE_SUCCESS;
}
int orte_ns_replica_dump_jobs(int output_id)
{
orte_buffer_t buffer;
int rc;
OBJ_CONSTRUCT(&buffer, orte_buffer_t);
if (ORTE_SUCCESS != (rc = orte_ns_replica_dump_jobs_fn(&buffer))) {
ORTE_ERROR_LOG(rc);
return rc;
}
if (ORTE_SUCCESS != (rc = orte_ns_base_print_dump(&buffer, output_id))) {
ORTE_ERROR_LOG(rc);
OBJ_DESTRUCT(&buffer);
return rc;
}
OBJ_DESTRUCT(&buffer);
return ORTE_SUCCESS;
}
int orte_ns_replica_dump_jobs_fn(orte_buffer_t *buffer)
{
size_t i, j;
orte_ns_replica_jobid_tracker_t **ptr;
char tmp_out[NS_REPLICA_MAX_STRING_SIZE], *tmp;
int rc;
OPAL_THREAD_LOCK(&orte_ns_replica.mutex);
tmp = tmp_out;
snprintf(tmp, NS_REPLICA_MAX_STRING_SIZE, "Dump of Name Service Jobid Tracker\n");
if (ORTE_SUCCESS != (rc = orte_dps.pack(buffer, &tmp, 1, ORTE_STRING))) {
ORTE_ERROR_LOG(rc);
OPAL_THREAD_UNLOCK(&orte_ns_replica.mutex);
return rc;
}
ptr = (orte_ns_replica_jobid_tracker_t**)(orte_ns_replica.jobids)->addr;
for (i=0, j=0; j < orte_ns_replica.num_jobids &&
i < (orte_ns_replica.jobids)->size; i++) {
if (NULL != ptr[i]) {
j++;
snprintf(tmp, NS_REPLICA_MAX_STRING_SIZE, "Num: %lu\tJobid: %lu\tNext vpid: %lu\n",
(unsigned long)j, (unsigned long)ptr[i]->jobid,
(unsigned long)ptr[i]->next_vpid);
if (ORTE_SUCCESS != (rc = orte_dps.pack(buffer, &tmp, 1, ORTE_STRING))) {
ORTE_ERROR_LOG(rc);
OPAL_THREAD_UNLOCK(&orte_ns_replica.mutex);
return rc;
}
}
}
OPAL_THREAD_UNLOCK(&orte_ns_replica.mutex);
return ORTE_SUCCESS;
}
int orte_ns_replica_dump_tags(int output_id)
{
orte_buffer_t buffer;
int rc;
OBJ_CONSTRUCT(&buffer, orte_buffer_t);
if (ORTE_SUCCESS != (rc = orte_ns_replica_dump_tags_fn(&buffer))) {
ORTE_ERROR_LOG(rc);
return rc;
}
if (ORTE_SUCCESS != (rc = orte_ns_base_print_dump(&buffer, output_id))) {
ORTE_ERROR_LOG(rc);
OBJ_DESTRUCT(&buffer);
return rc;
}
OBJ_DESTRUCT(&buffer);
return ORTE_SUCCESS;
}
int orte_ns_replica_dump_tags_fn(orte_buffer_t *buffer)
{
size_t i, j;
orte_ns_replica_tagitem_t **ptr;
char tmp_out[NS_REPLICA_MAX_STRING_SIZE], *tmp;
int rc;
OPAL_THREAD_LOCK(&orte_ns_replica.mutex);
tmp = tmp_out;
snprintf(tmp, NS_REPLICA_MAX_STRING_SIZE, "Dump of Name Service RML Tag Tracker\n");
if (ORTE_SUCCESS != (rc = orte_dps.pack(buffer, &tmp, 1, ORTE_STRING))) {
ORTE_ERROR_LOG(rc);
OPAL_THREAD_UNLOCK(&orte_ns_replica.mutex);
return rc;
}
ptr = (orte_ns_replica_tagitem_t**)(orte_ns_replica.tags)->addr;
for (i=0, j=0; j < orte_ns_replica.num_tags &&
i < (orte_ns_replica.tags)->size; i++) {
if (NULL != ptr[i]) {
j++;
snprintf(tmp, NS_REPLICA_MAX_STRING_SIZE, "Num: %lu\tTag id: %lu\tName: %s\n",
(unsigned long)j, (unsigned long)ptr[i]->tag, ptr[i]->name);
if (ORTE_SUCCESS != (rc = orte_dps.pack(buffer, &tmp, 1, ORTE_STRING))) {
ORTE_ERROR_LOG(rc);
OPAL_THREAD_UNLOCK(&orte_ns_replica.mutex);
return rc;
}
}
}
OPAL_THREAD_UNLOCK(&orte_ns_replica.mutex);
return ORTE_SUCCESS;
}
int orte_ns_replica_dump_datatypes(int output_id)
{
orte_buffer_t buffer;
int rc;
OBJ_CONSTRUCT(&buffer, orte_buffer_t);
if (ORTE_SUCCESS != (rc = orte_ns_replica_dump_datatypes_fn(&buffer))) {
ORTE_ERROR_LOG(rc);
return rc;
}
if (ORTE_SUCCESS != (rc = orte_ns_base_print_dump(&buffer, output_id))) {
ORTE_ERROR_LOG(rc);
OBJ_DESTRUCT(&buffer);
return rc;
}
OBJ_DESTRUCT(&buffer);
return ORTE_SUCCESS;
}
int orte_ns_replica_dump_datatypes_fn(orte_buffer_t *buffer)
{
size_t i, j;
orte_ns_replica_dti_t **ptr;
char tmp_out[NS_REPLICA_MAX_STRING_SIZE], *tmp;
int rc;
OPAL_THREAD_LOCK(&orte_ns_replica.mutex);
tmp = tmp_out;
snprintf(tmp, NS_REPLICA_MAX_STRING_SIZE, "Dump of Name Service Datatype Tracker\n");
if (ORTE_SUCCESS != (rc = orte_dps.pack(buffer, &tmp, 1, ORTE_STRING))) {
ORTE_ERROR_LOG(rc);
OPAL_THREAD_UNLOCK(&orte_ns_replica.mutex);
return rc;
}
ptr = (orte_ns_replica_dti_t**)(orte_ns_replica.dts)->addr;
for (i=0, j=0; j < orte_ns_replica.num_dts &&
i < (orte_ns_replica.dts)->size; i++) {
if (NULL != ptr[i]) {
j++;
snprintf(tmp, NS_REPLICA_MAX_STRING_SIZE, "Num: %lu\tDatatype id: %lu\tName: %s\n",
(unsigned long)j, (unsigned long)ptr[i]->id, ptr[i]->name);
if (ORTE_SUCCESS != (rc = orte_dps.pack(buffer, &tmp, 1, ORTE_STRING))) {
ORTE_ERROR_LOG(rc);
OPAL_THREAD_UNLOCK(&orte_ns_replica.mutex);
return rc;
}
}
}
OPAL_THREAD_UNLOCK(&orte_ns_replica.mutex);
return ORTE_SUCCESS;
}
/*
* TAG SERVER functions
*/
int orte_ns_replica_assign_rml_tag(orte_rml_tag_t *tag,
char *name)
{
orte_ns_replica_tagitem_t *tagitem;
orte_ns_replica_tagitem_t *tagitem, **tags;
size_t i, j;
int rc;
OPAL_THREAD_LOCK(&orte_ns_replica_mutex);
OPAL_THREAD_LOCK(&orte_ns_replica.mutex);
if (NULL != name) {
/* see if this name is already in list - if so, return tag */
for (tagitem = (orte_ns_replica_tagitem_t*)opal_list_get_first(&orte_ns_replica_taglist);
tagitem != (orte_ns_replica_tagitem_t*)opal_list_get_end(&orte_ns_replica_taglist);
tagitem = (orte_ns_replica_tagitem_t*)opal_list_get_next(tagitem)) {
if (tagitem->name != NULL && 0 == strcmp(name, tagitem->name)) { /* found name on list */
*tag = tagitem->tag;
OPAL_THREAD_UNLOCK(&orte_ns_replica_mutex);
return ORTE_SUCCESS;
tags = (orte_ns_replica_tagitem_t**)orte_ns_replica.tags->addr;
for (i=0, j=0; j < orte_ns_replica.num_tags &&
i < (orte_ns_replica.tags)->size; i++) {
if (NULL != tags[i]) {
j++;
if (tags[i]->name != NULL &&
0 == strcmp(name, tags[i]->name)) { /* found name on list */
*tag = tags[i]->tag;
OPAL_THREAD_UNLOCK(&orte_ns_replica.mutex);
return ORTE_SUCCESS;
}
}
}
}
/* not in list or not provided, so allocate next tag
* first check to see if one available - else error
*/
if (ORTE_RML_TAG_MAX > orte_ns_replica_next_rml_tag) {
/* okay, one available - assign it */
tagitem = OBJ_NEW(orte_ns_replica_tagitem_t);
if (NULL == tagitem) { /* out of memory */
*tag = ORTE_RML_TAG_MAX;
OPAL_THREAD_UNLOCK(&orte_ns_replica_mutex);
ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE);
return ORTE_ERR_OUT_OF_RESOURCE;
}
tagitem->tag = orte_ns_replica_next_rml_tag;
if (NULL != name) { /* provided - can look it up later */
tagitem->name = strdup(name);
} else {
tagitem->name = NULL;
}
orte_ns_replica_next_rml_tag++;
opal_list_append(&orte_ns_replica_taglist, &tagitem->item);
/* not in list or not provided, so allocate next tag */
*tag = ORTE_RML_TAG_MAX;
*tag = tagitem->tag;
OPAL_THREAD_UNLOCK(&orte_ns_replica_mutex);
return ORTE_SUCCESS;
/* check if tag is available - need to do this since the tag type
* is probably not going to be a size_t, so we cannot just rely
* on the pointer_array's size limits to protect us. NOTE: need to
* reserve ORTE_RML_TAG_MAX as an invalid value, so can't let
* num_tags get there
*/
if (ORTE_RML_TAG_MAX-2 < orte_ns_replica.num_tags) {
ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE);
OPAL_THREAD_UNLOCK(&orte_ns_replica.mutex);
return ORTE_ERR_OUT_OF_RESOURCE;
}
/* no tag available */
*tag = ORTE_RML_TAG_MAX;
OPAL_THREAD_UNLOCK(&orte_ns_replica_mutex);
ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE);
return ORTE_ERR_OUT_OF_RESOURCE;
tagitem = OBJ_NEW(orte_ns_replica_tagitem_t);
if (NULL == tagitem) { /* out of memory */
ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE);
OPAL_THREAD_UNLOCK(&orte_ns_replica.mutex);
return ORTE_ERR_OUT_OF_RESOURCE;
}
if (ORTE_SUCCESS != (rc = orte_pointer_array_add(&i,
orte_ns_replica.tags, tagitem))) {
ORTE_ERROR_LOG(rc);
OPAL_THREAD_UNLOCK(&orte_ns_replica.mutex);
return rc;
}
tagitem->tag = orte_ns_replica.num_tags;
(orte_ns_replica.num_tags)++;
if (NULL != name) { /* provided - can look it up later */
tagitem->name = strdup(name);
} else {
tagitem->name = NULL;
}
*tag = tagitem->tag;
OPAL_THREAD_UNLOCK(&orte_ns_replica.mutex);
return ORTE_SUCCESS;
}
/*
* DATA TYPE SERVER functions
*/
int orte_ns_replica_define_data_type(const char *name,
orte_data_type_t *type)
{
orte_ns_replica_dti_t *dti;
orte_ns_replica_dti_t **dti, *dtip;
size_t i, j;
int rc;
if (NULL == name || 0 < *type) {
ORTE_ERROR_LOG(ORTE_ERR_BAD_PARAM);
return ORTE_ERR_BAD_PARAM;
}
OPAL_THREAD_LOCK(&orte_ns_replica_mutex);
OPAL_THREAD_LOCK(&orte_ns_replica.mutex);
/* see if this name is already in list - if so, return id */
for (dti = (orte_ns_replica_dti_t*)opal_list_get_first(&orte_ns_replica_dtlist);
dti != (orte_ns_replica_dti_t*)opal_list_get_end(&orte_ns_replica_dtlist);
dti = (orte_ns_replica_dti_t*)opal_list_get_next(dti)) {
if (dti->name != NULL && 0 == strcmp(name, dti->name)) { /* found name on list */
*type = dti->id;
OPAL_THREAD_UNLOCK(&orte_ns_replica_mutex);
return ORTE_SUCCESS;
dti = (orte_ns_replica_dti_t**)orte_ns_replica.dts->addr;
for (i=0, j=0; j < orte_ns_replica.num_dts &&
i < orte_ns_replica.dts->size; i++) {
if (NULL != dti[i]) {
j++;
if (dti[i]->name != NULL &&
0 == strcmp(name, dti[i]->name)) { /* found name on list */
*type = dti[i]->id;
OPAL_THREAD_UNLOCK(&orte_ns_replica.mutex);
return ORTE_SUCCESS;
}
}
}
/* not in list or not provided, so allocate next id
* first check to see if one available - else error
*/
if (ORTE_DPS_ID_MAX > orte_ns_replica_next_dti) {
/* okay, one available - assign it */
dti = OBJ_NEW(orte_ns_replica_dti_t);
if (NULL == dti) { /* out of memory */
*type = ORTE_DPS_ID_MAX;
OPAL_THREAD_UNLOCK(&orte_ns_replica_mutex);
ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE);
return ORTE_ERR_OUT_OF_RESOURCE;
}
dti->id = orte_ns_replica_next_dti;
dti->name = strdup(name);
orte_ns_replica_next_dti++;
opal_list_append(&orte_ns_replica_dtlist, &dti->item);
*type = dti->id;
OPAL_THREAD_UNLOCK(&orte_ns_replica_mutex);
return ORTE_SUCCESS;
}
/* no id available */
/* not in list or not provided, so allocate next id */
*type = ORTE_DPS_ID_MAX;
OPAL_THREAD_UNLOCK(&orte_ns_replica_mutex);
ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE);
return ORTE_ERR_OUT_OF_RESOURCE;
/* check if id is available - need to do this since the data type
* is probably not going to be a size_t, so we cannot just rely
* on the pointer_array's size limits to protect us.
*/
if (ORTE_DPS_ID_MAX-2 < orte_ns_replica.num_dts) {
ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE);
OPAL_THREAD_UNLOCK(&orte_ns_replica.mutex);
return ORTE_ERR_OUT_OF_RESOURCE;
}
dtip = OBJ_NEW(orte_ns_replica_dti_t);
if (NULL == dtip) { /* out of memory */
ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE);
OPAL_THREAD_UNLOCK(&orte_ns_replica.mutex);
return ORTE_ERR_OUT_OF_RESOURCE;
}
dtip->name = strdup(name);
if (ORTE_SUCCESS != (rc = orte_pointer_array_add(&i,
orte_ns_replica.dts, dtip))) {
ORTE_ERROR_LOG(rc);
OPAL_THREAD_UNLOCK(&orte_ns_replica.mutex);
return rc;
}
dtip->id = orte_ns_replica.num_dts;
(orte_ns_replica.num_dts)++;
*type = dtip->id;
OPAL_THREAD_UNLOCK(&orte_ns_replica.mutex);
return ORTE_SUCCESS;
}
/*
* NAME functions
*/
int orte_ns_replica_create_my_name(void)
{
orte_jobid_t jobid;

Просмотреть файл

@ -19,13 +19,14 @@
#define NS_REPLICA_H
#include "orte_config.h"
#include "include/types.h"
#include "orte/include/orte_types.h"
#include "include/orte_constants.h"
#include "opal/threads/mutex.h"
#include "opal/class/opal_list.h"
#include "dps/dps.h"
#include "mca/oob/oob_types.h"
#include "mca/ns/base/base.h"
#include "opal/class/opal_object.h"
#include "orte/class/orte_pointer_array.h"
#include "orte/dps/dps.h"
#include "orte/mca/oob/oob_types.h"
#include "orte/mca/ns/base/base.h"
#if defined(c_plusplus) || defined(__cplusplus)
extern "C" {
@ -34,7 +35,7 @@ extern "C" {
/* list class for tracking cellid's
*/
struct orte_ns_replica_cell_tracker_t {
opal_list_item_t item;
opal_object_t super;
orte_cellid_t cell;
char *site;
char *resource;
@ -45,21 +46,21 @@ OBJ_CLASS_DECLARATION(orte_ns_replica_cell_tracker_t);
/*
* list class for tracking vpids/jobid
* This structure is used to create a linked list of jobid-max vpid pairs. Basically, we
* object for tracking vpids/jobids
* This structure is used to track jobid-max vpid pairs. Basically, we
* are tracking the max used vpid for each jobid that has been created.
*/
struct orte_ns_replica_name_tracker_t {
opal_list_item_t item; /**< Allows this item to be placed on a list */
orte_jobid_t job; /**< Job id */
orte_vpid_t last_used_vpid; /**< Tracks the vpid last given out */
struct orte_ns_replica_jobid_tracker_t {
opal_object_t super;
orte_jobid_t jobid; /**< Job id */
orte_vpid_t next_vpid;
};
typedef struct orte_ns_replica_name_tracker_t orte_ns_replica_name_tracker_t;
typedef struct orte_ns_replica_jobid_tracker_t orte_ns_replica_jobid_tracker_t;
OBJ_CLASS_DECLARATION(orte_ns_replica_name_tracker_t);
OBJ_CLASS_DECLARATION(orte_ns_replica_jobid_tracker_t);
struct orte_ns_replica_tagitem_t {
opal_list_item_t item; /**< Allows this item to be placed on a list */
opal_object_t super;
orte_rml_tag_t tag; /**< OOB tag */
char *name; /**< Name associated with tag */
};
@ -68,7 +69,7 @@ typedef struct orte_ns_replica_tagitem_t orte_ns_replica_tagitem_t;
OBJ_CLASS_DECLARATION(orte_ns_replica_tagitem_t);
struct orte_ns_replica_dti_t {
opal_list_item_t item; /**< Allows this item to be placed on a list */
opal_object_t super;
orte_data_type_t id; /**< data type id */
char *name; /**< Name associated with data type */
};
@ -79,16 +80,26 @@ OBJ_CLASS_DECLARATION(orte_ns_replica_dti_t);
/*
* globals needed within component
*/
extern orte_cellid_t orte_ns_replica_next_cellid;
extern orte_jobid_t orte_ns_replica_next_jobid;
extern opal_list_t orte_ns_replica_cell_tracker;
extern opal_list_t orte_ns_replica_name_tracker;
extern orte_rml_tag_t orte_ns_replica_next_rml_tag;
extern orte_data_type_t orte_ns_replica_next_dti;
extern opal_list_t orte_ns_replica_taglist;
extern opal_list_t orte_ns_replica_dtlist;
extern int orte_ns_replica_debug;
extern opal_mutex_t orte_ns_replica_mutex;
typedef struct {
size_t max_size, block_size;
orte_cellid_t num_cells;
orte_pointer_array_t *cells;
#if 0
orte_jobgrp_t num_jobgrps;
orte_pointer_array_t *jobgrps;
#endif
orte_jobid_t num_jobids;
orte_pointer_array_t *jobids;
orte_pointer_array_t *tags;
orte_rml_tag_t num_tags;
orte_pointer_array_t *dts;
orte_data_type_t num_dts;
int debug;
bool isolate;
opal_mutex_t mutex;
} orte_ns_replica_globals_t;
extern orte_ns_replica_globals_t orte_ns_replica;
/*
* Module open / close
@ -135,6 +146,29 @@ int orte_ns_replica_reserve_range(orte_jobid_t job,
orte_vpid_t range,
orte_vpid_t *startvpid);
/*
* Peer functions
*/
int orte_ns_replica_get_job_peers(orte_process_name_t **procs,
size_t *num_procs, orte_jobid_t job);
/*
* Diagnostic functions
*/
int orte_ns_replica_dump_cells(int output_id);
int orte_ns_replica_dump_cells_fn(orte_buffer_t *buffer);
int orte_ns_replica_dump_jobs(int output_id);
int orte_ns_replica_dump_jobs_fn(orte_buffer_t *buffer);
int orte_ns_replica_dump_tags(int output_id);
int orte_ns_replica_dump_tags_fn(orte_buffer_t *buffer);
int orte_ns_replica_dump_datatypes(int output_id);
int orte_ns_replica_dump_datatypes_fn(orte_buffer_t *buffer);
/*
* Implementation of assign rml tag
*/

Просмотреть файл

@ -67,36 +67,49 @@ OMPI_COMP_EXPORT mca_ns_base_component_t mca_ns_replica_component = {
/*
* setup the function pointers for the module
*/
static mca_ns_base_module_t orte_ns_replica = {
static mca_ns_base_module_t orte_ns_replica_module = {
/* init */
orte_ns_replica_module_init,
/* cell functions */
orte_ns_replica_create_cellid,
orte_ns_base_get_cellid,
orte_ns_replica_get_cell_info,
orte_ns_base_assign_cellid_to_process,
orte_ns_base_get_cellid_string,
orte_ns_base_convert_cellid_to_string,
orte_ns_base_convert_string_to_cellid,
/* jobid functions */
orte_ns_replica_create_jobid,
orte_ns_base_get_jobid,
orte_ns_base_get_jobid_string,
orte_ns_base_convert_jobid_to_string,
orte_ns_base_convert_string_to_jobid,
/* vpid functions */
orte_ns_replica_reserve_range,
orte_ns_base_get_vpid,
orte_ns_base_get_vpid_string,
orte_ns_base_convert_vpid_to_string,
orte_ns_base_convert_string_to_vpid,
/* name functions */
orte_ns_base_create_process_name,
orte_ns_replica_create_my_name,
orte_ns_base_copy_process_name,
orte_ns_base_convert_string_to_process_name,
orte_ns_replica_reserve_range,
orte_ns_base_free_name,
orte_ns_base_get_proc_name_string,
orte_ns_base_get_vpid_string,
orte_ns_base_convert_vpid_to_string,
orte_ns_base_convert_string_to_vpid,
orte_ns_base_get_jobid_string,
orte_ns_base_convert_jobid_to_string,
orte_ns_base_convert_string_to_jobid,
orte_ns_base_get_cellid_string,
orte_ns_base_convert_cellid_to_string,
orte_ns_base_convert_string_to_cellid,
orte_ns_base_get_vpid,
orte_ns_base_get_jobid,
orte_ns_base_get_cellid,
orte_ns_base_compare,
orte_ns_base_derive_vpid,
/* peer functions */
orte_ns_base_get_peers,
orte_ns_replica_get_job_peers,
/* tag server functions */
orte_ns_replica_assign_rml_tag,
/* data type functions */
orte_ns_replica_define_data_type,
orte_ns_base_get_peers
/* diagnostic functions */
orte_ns_replica_dump_cells,
orte_ns_replica_dump_jobs,
orte_ns_replica_dump_tags,
orte_ns_replica_dump_datatypes
};
/*
@ -123,29 +136,28 @@ static void orte_ns_replica_cell_tracker_destructor(orte_ns_replica_cell_tracker
/* define instance of opal_class_t */
OBJ_CLASS_INSTANCE(
orte_ns_replica_cell_tracker_t, /* type name */
opal_list_item_t, /* parent "class" name */
opal_object_t, /* parent "class" name */
orte_ns_replica_cell_tracker_construct, /* constructor */
orte_ns_replica_cell_tracker_destructor); /* destructor */
/* constructor - used to initialize state of name_tracker instance */
static void orte_ns_replica_tracker_construct(orte_ns_replica_name_tracker_t* name_tracker)
/* constructor - used to initialize state of jobid_tracker instance */
static void orte_ns_replica_jobid_tracker_construct(orte_ns_replica_jobid_tracker_t* jobid_tracker)
{
name_tracker->job = 0;
name_tracker->last_used_vpid = 0;
jobid_tracker->jobid = ORTE_JOBID_MAX;
jobid_tracker->next_vpid = 0;
}
/* destructor - used to free any resources held by instance */
static void orte_ns_replica_tracker_destructor(orte_ns_replica_name_tracker_t* name_tracker)
{
static void orte_ns_replica_jobid_tracker_destructor(orte_ns_replica_jobid_tracker_t* jobid_tracker){
}
/* define instance of opal_class_t */
OBJ_CLASS_INSTANCE(
orte_ns_replica_name_tracker_t, /* type name */
opal_list_item_t, /* parent "class" name */
orte_ns_replica_tracker_construct, /* constructor */
orte_ns_replica_tracker_destructor); /* destructor */
orte_ns_replica_jobid_tracker_t, /* type name */
opal_object_t, /* parent "class" name */
orte_ns_replica_jobid_tracker_construct, /* constructor */
orte_ns_replica_jobid_tracker_destructor); /* destructor */
/* constructor - used to initialize state of taglist instance */
@ -166,7 +178,7 @@ static void orte_ns_replica_tagitem_destructor(orte_ns_replica_tagitem_t* tagite
/* define instance of opal_class_t */
OBJ_CLASS_INSTANCE(
orte_ns_replica_tagitem_t, /* type name */
opal_list_item_t, /* parent "class" name */
opal_object_t, /* parent "class" name */
orte_ns_replica_tagitem_construct, /* constructor */
orte_ns_replica_tagitem_destructor); /* destructor */
@ -189,24 +201,14 @@ static void orte_ns_replica_dti_destructor(orte_ns_replica_dti_t* dti)
/* define instance of opal_class_t */
OBJ_CLASS_INSTANCE(
orte_ns_replica_dti_t, /* type name */
opal_list_item_t, /* parent "class" name */
opal_object_t, /* parent "class" name */
orte_ns_replica_dti_construct, /* constructor */
orte_ns_replica_dti_destructor); /* destructor */
/*
* globals needed within replica component
*/
orte_cellid_t orte_ns_replica_next_cellid;
orte_jobid_t orte_ns_replica_next_jobid;
opal_list_t orte_ns_replica_cell_tracker;
opal_list_t orte_ns_replica_name_tracker;
orte_rml_tag_t orte_ns_replica_next_rml_tag;
orte_data_type_t orte_ns_replica_next_dti;
opal_list_t orte_ns_replica_taglist;
opal_list_t orte_ns_replica_dtlist;
int orte_ns_replica_debug;
opal_mutex_t orte_ns_replica_mutex;
int orte_ns_replica_isolate;
orte_ns_replica_globals_t orte_ns_replica;
/*
* don't really need this function - could just put NULL in the above structure
@ -214,13 +216,28 @@ int orte_ns_replica_isolate;
*/
int orte_ns_replica_open(void)
{
int id;
int id, param;
id = mca_base_param_register_int("ns", "replica", "debug", NULL, (int)false);
mca_base_param_lookup_int(id, &orte_ns_replica_debug);
mca_base_param_lookup_int(id, &orte_ns_replica.debug);
id = mca_base_param_register_int("ns", "replica", "isolate", NULL, (int)false);
mca_base_param_lookup_int(id, &orte_ns_replica_isolate);
mca_base_param_lookup_int(id, &param);
if (param) {
orte_ns_replica.isolate = true;
} else {
orte_ns_replica.isolate = false;
}
id = mca_base_param_register_int("ns", "replica", "maxsize", NULL,
ORTE_NS_ARRAY_MAX_SIZE);
mca_base_param_lookup_int(id, &param);
orte_ns_replica.max_size = (size_t)param;
id = mca_base_param_register_int("ns", "replica", "blocksize", NULL,
ORTE_NS_ARRAY_BLOCK_SIZE);
mca_base_param_lookup_int(id, &param);
orte_ns_replica.block_size = (size_t)param;
return ORTE_SUCCESS;
}
@ -235,16 +252,13 @@ int orte_ns_replica_close(void)
mca_ns_base_module_t* orte_ns_replica_init(int *priority)
{
orte_ns_replica_name_tracker_t *new_nt;
int rc;
/* If we are to host a replica, then we want to be selected, so do all the
setup and return the module */
if (NULL == orte_process_info.ns_replica_uri) {
orte_ns_replica_next_cellid = 0;
orte_ns_replica_next_jobid = 1; /* jobid 0 reserved for universe */
/* Return a module (choose an arbitrary, positive priority --
it's only relevant compared to other ns components). If
we're not the seed, then we don't want to be selected, so
@ -252,43 +266,55 @@ mca_ns_base_module_t* orte_ns_replica_init(int *priority)
*priority = 50;
/* initialize the cell tracker */
OBJ_CONSTRUCT(&orte_ns_replica_cell_tracker, opal_list_t);
orte_ns_replica_next_cellid = 0;
/* initialize the name tracker */
OBJ_CONSTRUCT(&orte_ns_replica_name_tracker, opal_list_t);
/* initialize the cell info tracker */
if (ORTE_SUCCESS != (rc = orte_pointer_array_init(&(orte_ns_replica.cells),
orte_ns_replica.block_size,
orte_ns_replica.max_size,
orte_ns_replica.block_size))) {
ORTE_ERROR_LOG(rc);
return NULL;
}
orte_ns_replica.num_cells = 0;
/* initialize the job id tracker */
if (ORTE_SUCCESS != (rc = orte_pointer_array_init(&(orte_ns_replica.jobids),
orte_ns_replica.block_size,
orte_ns_replica.max_size,
orte_ns_replica.block_size))) {
ORTE_ERROR_LOG(rc);
return NULL;
}
orte_ns_replica.num_jobids = 0;
/* initialize the taglist */
OBJ_CONSTRUCT(&orte_ns_replica_taglist, opal_list_t);
orte_ns_replica_next_rml_tag = ORTE_RML_TAG_DYNAMIC;
if (ORTE_SUCCESS != (rc = orte_pointer_array_init(&(orte_ns_replica.tags),
orte_ns_replica.block_size,
orte_ns_replica.max_size,
orte_ns_replica.block_size))) {
ORTE_ERROR_LOG(rc);
return NULL;
}
orte_ns_replica.num_tags = 0;
/* initialize the dtlist */
OBJ_CONSTRUCT(&orte_ns_replica_dtlist, opal_list_t);
orte_ns_replica_next_dti = ORTE_DPS_ID_DYNAMIC;
if (ORTE_SUCCESS != (rc = orte_pointer_array_init(&(orte_ns_replica.dts),
orte_ns_replica.block_size,
orte_ns_replica.max_size,
orte_ns_replica.block_size))) {
ORTE_ERROR_LOG(rc);
return NULL;
}
orte_ns_replica.num_dts = 0;
/* setup the thread lock */
OBJ_CONSTRUCT(&orte_ns_replica_mutex, opal_mutex_t);
OBJ_CONSTRUCT(&orte_ns_replica.mutex, opal_mutex_t);
/* setup the "0" job counter - this is the default one that belongs to
* all daemons. Seed must automatically have it.
*/
new_nt = OBJ_NEW(orte_ns_replica_name_tracker_t);
if (NULL == new_nt) { /* out of memory */
return NULL;
}
new_nt->job = 0;
new_nt->last_used_vpid = 0;
opal_list_append(&orte_ns_replica_name_tracker, &new_nt->item);
/* Return the module */
initialized = true;
return &orte_ns_replica;
return &orte_ns_replica_module;
} else {
return NULL;
}
@ -297,7 +323,7 @@ mca_ns_base_module_t* orte_ns_replica_init(int *priority)
int orte_ns_replica_module_init(void)
{
int rc;
if (orte_ns_replica_isolate) {
if (orte_ns_replica.isolate) {
return ORTE_SUCCESS;
}
@ -316,32 +342,48 @@ int orte_ns_replica_module_init(void)
*/
int orte_ns_replica_finalize(void)
{
orte_ns_replica_tagitem_t *tagitem;
orte_ns_replica_dti_t *dti;
orte_ns_replica_cell_tracker_t **cptr;
orte_ns_replica_jobid_tracker_t **jptr;
orte_ns_replica_tagitem_t **tag;
orte_ns_replica_dti_t **dti;
size_t i;
if (orte_ns_replica_debug) {
opal_output(0, "finalizing ns replica");
}
/* free all tracking storage, but only if this component was initialized */
if (initialized) {
/* OBJ_DESTRUCT(&orte_ns_replica_name_tracker); */
while (NULL != (tagitem = (orte_ns_replica_tagitem_t*)opal_list_remove_first(&orte_ns_replica_taglist))) {
OBJ_RELEASE(tagitem);
cptr = (orte_ns_replica_cell_tracker_t**)(orte_ns_replica.cells)->addr;
for (i=0; i < (orte_ns_replica.cells)->size; i++) {
if (NULL != cptr[i]) {
OBJ_RELEASE(cptr[i]);
}
}
OBJ_DESTRUCT(&orte_ns_replica_taglist);
while (NULL != (dti = (orte_ns_replica_dti_t*)opal_list_remove_first(&orte_ns_replica_dtlist))) {
OBJ_RELEASE(dti);
OBJ_RELEASE(orte_ns_replica.cells);
jptr = (orte_ns_replica_jobid_tracker_t**)(orte_ns_replica.jobids)->addr;
for (i=0; i < (orte_ns_replica.jobids)->size; i++) {
if (NULL != jptr[i]) {
OBJ_RELEASE(jptr[i]);
}
}
OBJ_DESTRUCT(&orte_ns_replica_dtlist);
OBJ_DESTRUCT(&orte_ns_replica_mutex);
OBJ_RELEASE(orte_ns_replica.jobids);
tag = (orte_ns_replica_tagitem_t**)(orte_ns_replica.tags)->addr;
for (i=0; i < (orte_ns_replica.tags)->size; i++) {
if (NULL != tag[i]) OBJ_RELEASE(tag[i]);
}
OBJ_RELEASE(orte_ns_replica.tags);
dti = (orte_ns_replica_dti_t**)(orte_ns_replica.dts)->addr;
for (i=0; i < (orte_ns_replica.dts)->size; i++) {
if (NULL != dti[i]) OBJ_RELEASE(dti[i]);
}
OBJ_RELEASE(orte_ns_replica.dts);
initialized = false;
}
/* All done */
if (orte_ns_replica_isolate) {
if (orte_ns_replica.isolate) {
return ORTE_SUCCESS;
}
@ -527,7 +569,50 @@ void orte_ns_replica_recv(int status, orte_process_name_t* sender,
case ORTE_NS_CREATE_MY_NAME_CMD:
/* ignore this command */
goto CLEANUP;
break;
case ORTE_NS_DUMP_CELLS_CMD:
if (ORTE_SUCCESS != (rc = orte_ns_replica_dump_cells_fn(&answer))) {
ORTE_ERROR_LOG(rc);
goto RETURN_ERROR;
}
if (0 > orte_rml.send_buffer(sender, &answer, tag, 0)) {
ORTE_ERROR_LOG(ORTE_ERR_COMM_FAILURE);
goto RETURN_ERROR;
}
break;
case ORTE_NS_DUMP_JOBIDS_CMD:
if (ORTE_SUCCESS != (rc = orte_ns_replica_dump_jobs_fn(&answer))) {
ORTE_ERROR_LOG(rc);
goto RETURN_ERROR;
}
if (0 > orte_rml.send_buffer(sender, &answer, tag, 0)) {
ORTE_ERROR_LOG(ORTE_ERR_COMM_FAILURE);
goto RETURN_ERROR;
}
break;
case ORTE_NS_DUMP_TAGS_CMD:
if (ORTE_SUCCESS != (rc = orte_ns_replica_dump_tags_fn(&answer))) {
ORTE_ERROR_LOG(rc);
goto RETURN_ERROR;
}
if (0 > orte_rml.send_buffer(sender, &answer, tag, 0)) {
ORTE_ERROR_LOG(ORTE_ERR_COMM_FAILURE);
goto RETURN_ERROR;
}
break;
case ORTE_NS_DUMP_DATATYPES_CMD:
if (ORTE_SUCCESS != (rc = orte_ns_replica_dump_datatypes_fn(&answer))) {
ORTE_ERROR_LOG(rc);
goto RETURN_ERROR;
}
if (0 > orte_rml.send_buffer(sender, &answer, tag, 0)) {
ORTE_ERROR_LOG(ORTE_ERR_COMM_FAILURE);
goto RETURN_ERROR;
}
break;
default: