1
1

Add several new interface functions to the name services:

1. dump_xxx - analogous to the registry's dump commands, allows you to examine the contents of the name services' structures

2. get_job_peers - get an array of process names for all processes in the specified job

This commit was SVN r6759.
Этот коммит содержится в:
Ralph Castain 2005-08-07 13:21:52 +00:00
родитель 5208f9001d
Коммит c530521a8e
11 изменённых файлов: 1850 добавлений и 755 удалений

Просмотреть файл

@ -34,10 +34,9 @@ libmca_ns_base_la_SOURCES = \
ns_base_close.c \ ns_base_close.c \
ns_base_select.c \ ns_base_select.c \
ns_base_open.c \ ns_base_open.c \
ns_base_nds.c \
ns_base_local_fns.c \ ns_base_local_fns.c \
data_type_support/ns_data_type_packing_fns.c \ data_type_support/ns_data_type_packing_fns.c \
data_type_support/ns_data_type_unpacking_fns.c data_type_support/ns_data_type_unpacking_fns.c
# Conditionally install the header files # Conditionally install the header files

Просмотреть файл

@ -39,6 +39,9 @@
extern "C" { extern "C" {
#endif #endif
/* default limits */
#define ORTE_NS_ARRAY_MAX_SIZE INT_MAX
#define ORTE_NS_ARRAY_BLOCK_SIZE 512
/* /*
* Internal definitions * Internal definitions
*/ */
@ -55,12 +58,18 @@ typedef uint8_t orte_ns_cmd_bitmask_t;
/* /*
* define flag values for remote commands - only used internally * define flag values for remote commands - only used internally
*/ */
#define ORTE_NS_CREATE_CELLID_CMD 0x01 #define ORTE_NS_CREATE_CELLID_CMD (int8_t)0x01
#define ORTE_NS_CREATE_JOBID_CMD 0x02 #define ORTE_NS_CREATE_JOBID_CMD (int8_t)0x02
#define ORTE_NS_RESERVE_RANGE_CMD 0x04 #define ORTE_NS_RESERVE_RANGE_CMD (int8_t)0x04
#define ORTE_NS_ASSIGN_OOB_TAG_CMD 0x08 #define ORTE_NS_ASSIGN_OOB_TAG_CMD (int8_t)0x08
#define ORTE_NS_DEFINE_DATA_TYPE_CMD 0x10 #define ORTE_NS_GET_JOB_PEERS_CMD (int8_t)0x0A
#define ORTE_NS_CREATE_MY_NAME_CMD 0x20 #define ORTE_NS_DEFINE_DATA_TYPE_CMD (int8_t)0x10
#define ORTE_NS_CREATE_MY_NAME_CMD (int8_t)0x20
#define ORTE_NS_DUMP_CELLS_CMD (int8_t)0x21
#define ORTE_NS_DUMP_JOBIDS_CMD (int8_t)0x22
#define ORTE_NS_DUMP_TAGS_CMD (int8_t)0x23
#define ORTE_NS_DUMP_DATATYPES_CMD (int8_t)0x24
/* /*
* function definitions * function definitions
@ -119,6 +128,9 @@ OMPI_DECLSPEC int orte_ns_base_compare(orte_ns_cmp_bitmask_t fields,
OMPI_DECLSPEC int orte_ns_base_free_name(orte_process_name_t **name); OMPI_DECLSPEC int orte_ns_base_free_name(orte_process_name_t **name);
OMPI_DECLSPEC int orte_ns_base_print_dump(orte_buffer_t *buffer, int output_id);
/* not available functions */ /* not available functions */
OMPI_DECLSPEC int orte_ns_base_module_init_not_available(void); OMPI_DECLSPEC int orte_ns_base_module_init_not_available(void);
@ -146,6 +158,14 @@ OMPI_DECLSPEC int orte_ns_base_define_data_type_not_available(
OMPI_DECLSPEC int orte_ns_base_create_my_name_not_available(void); OMPI_DECLSPEC int orte_ns_base_create_my_name_not_available(void);
OMPI_DECLSPEC int orte_ns_base_get_job_peers_not_available(orte_process_name_t **procs,
size_t *num_procs, orte_jobid_t job);
OMPI_DECLSPEC int orte_ns_base_dump_cells_not_available(int output_id);
OMPI_DECLSPEC int orte_ns_base_dump_jobs_not_available(int output_id);
OMPI_DECLSPEC int orte_ns_base_dump_tags_not_available(int output_id);
OMPI_DECLSPEC int orte_ns_base_dump_datatypes_not_available(int output_id);
/* Base functions used everywhere */ /* Base functions used everywhere */
OMPI_DECLSPEC int orte_ns_base_get_peers(orte_process_name_t **procs, OMPI_DECLSPEC int orte_ns_base_get_peers(orte_process_name_t **procs,
size_t *num_procs, size_t *self); size_t *num_procs, size_t *self);

Просмотреть файл

@ -111,6 +111,43 @@ orte_ns_base_create_my_name_not_available(void)
return ORTE_ERR_UNREACH; return ORTE_ERR_UNREACH;
} }
int orte_ns_base_get_job_peers_not_available(orte_process_name_t **procs,
size_t *num_procs, orte_jobid_t job)
{
*procs = NULL;
*num_procs = 0;
ORTE_ERROR_LOG(ORTE_ERR_UNREACH);
return ORTE_ERR_UNREACH;
}
int
orte_ns_base_dump_cells_not_available(int output_id)
{
ORTE_ERROR_LOG(ORTE_ERR_UNREACH);
return ORTE_ERR_UNREACH;
}
int
orte_ns_base_dump_jobs_not_available(int output_id)
{
ORTE_ERROR_LOG(ORTE_ERR_UNREACH);
return ORTE_ERR_UNREACH;
}
int
orte_ns_base_dump_tags_not_available(int output_id)
{
ORTE_ERROR_LOG(ORTE_ERR_UNREACH);
return ORTE_ERR_UNREACH;
}
int
orte_ns_base_dump_datatypes_not_available(int output_id)
{
ORTE_ERROR_LOG(ORTE_ERR_UNREACH);
return ORTE_ERR_UNREACH;
}
/* /*
@ -566,3 +603,72 @@ int orte_ns_base_free_name(orte_process_name_t **name)
return ORTE_SUCCESS; return ORTE_SUCCESS;
} }
int orte_ns_base_get_peers(orte_process_name_t **procs,
size_t *num_procs, size_t *self)
{
size_t i;
int rc;
orte_cellid_t mycellid;
orte_jobid_t myjobid;
orte_vpid_t myvpid;
*procs = (orte_process_name_t*)malloc(orte_process_info.num_procs *
sizeof(orte_process_name_t));
if (NULL == *procs) {
return ORTE_ERR_OUT_OF_RESOURCE;
}
if (ORTE_SUCCESS != (rc = orte_ns.get_cellid(&mycellid, orte_process_info.my_name))) {
ORTE_ERROR_LOG(rc);
return rc;
}
if (ORTE_SUCCESS != orte_ns.get_jobid(&myjobid, orte_process_info.my_name)) {
ORTE_ERROR_LOG(rc);
return rc;
}
if (ORTE_SUCCESS != orte_ns.get_vpid(&myvpid, orte_process_info.my_name)) {
ORTE_ERROR_LOG(rc);
return rc;
}
for (i=0; i < orte_process_info.num_procs; i++) {
(*procs)[i].cellid = mycellid;
(*procs)[i].jobid = myjobid;
(*procs)[i].vpid = orte_process_info.vpid_start + i;
}
*num_procs = orte_process_info.num_procs;
*self = (size_t)(myvpid - orte_process_info.vpid_start);
return ORTE_SUCCESS;
}
/*
* DIAGNOSTIC FUNCTIONS
*/
int orte_ns_base_print_dump(orte_buffer_t *buffer, int output_id)
{
char *line;
size_t n;
orte_data_type_t type;
int rc;
n = 1;
while (ORTE_SUCCESS == orte_dps.peek(buffer, &type, &n)) {
if (ORTE_SUCCESS !=
(rc = orte_dps.unpack(buffer, &line, &n, ORTE_STRING))) {
ORTE_ERROR_LOG(rc);
return rc;
}
opal_output(output_id, "%s", line);
free(line);
n=1;
}
return ORTE_SUCCESS;
}

Просмотреть файл

@ -48,36 +48,50 @@ orte_process_name_t orte_name_all = {ORTE_CELLID_MAX, ORTE_JOBID_MAX, ORTE_VPID_
*/ */
int mca_ns_base_output = -1; int mca_ns_base_output = -1;
OMPI_DECLSPEC mca_ns_base_module_t orte_ns = { OMPI_DECLSPEC mca_ns_base_module_t orte_ns = {
/* init */
orte_ns_base_module_init_not_available, orte_ns_base_module_init_not_available,
/* cell functions */
orte_ns_base_create_cellid_not_available, orte_ns_base_create_cellid_not_available,
orte_ns_base_get_cellid,
orte_ns_base_get_cell_info_not_available, orte_ns_base_get_cell_info_not_available,
orte_ns_base_assign_cellid_to_process, orte_ns_base_assign_cellid_to_process,
orte_ns_base_get_cellid_string,
orte_ns_base_convert_cellid_to_string,
orte_ns_base_convert_string_to_cellid,
/* jobid functions */
orte_ns_base_create_jobid_not_available, orte_ns_base_create_jobid_not_available,
orte_ns_base_get_jobid,
orte_ns_base_get_jobid_string,
orte_ns_base_convert_jobid_to_string,
orte_ns_base_convert_string_to_jobid,
/* vpid functions */
orte_ns_base_get_vpid_range_not_available,
orte_ns_base_get_vpid,
orte_ns_base_get_vpid_string,
orte_ns_base_convert_vpid_to_string,
orte_ns_base_convert_string_to_vpid,
/* name functions */
orte_ns_base_create_process_name, orte_ns_base_create_process_name,
orte_ns_base_create_my_name_not_available, orte_ns_base_create_my_name_not_available,
orte_ns_base_copy_process_name, orte_ns_base_copy_process_name,
orte_ns_base_convert_string_to_process_name, orte_ns_base_convert_string_to_process_name,
orte_ns_base_get_vpid_range_not_available,
orte_ns_base_free_name, orte_ns_base_free_name,
orte_ns_base_get_proc_name_string, orte_ns_base_get_proc_name_string,
orte_ns_base_get_vpid_string,
orte_ns_base_convert_vpid_to_string,
orte_ns_base_convert_string_to_vpid,
orte_ns_base_get_jobid_string,
orte_ns_base_convert_jobid_to_string,
orte_ns_base_convert_string_to_jobid,
orte_ns_base_get_cellid_string,
orte_ns_base_convert_cellid_to_string,
orte_ns_base_convert_string_to_cellid,
orte_ns_base_get_vpid,
orte_ns_base_get_jobid,
orte_ns_base_get_cellid,
orte_ns_base_compare, orte_ns_base_compare,
orte_ns_base_derive_vpid, /* peer functions */
orte_ns_base_get_peers,
orte_ns_base_get_job_peers_not_available,
/* tag server functions */
orte_ns_base_assign_rml_tag_not_available, orte_ns_base_assign_rml_tag_not_available,
/* data type functions */
orte_ns_base_define_data_type_not_available, orte_ns_base_define_data_type_not_available,
orte_ns_base_get_peers /* diagnostic functions */
orte_ns_base_dump_cells_not_available,
orte_ns_base_dump_jobs_not_available,
orte_ns_base_dump_tags_not_available,
orte_ns_base_dump_datatypes_not_available
}; };
bool mca_ns_base_selected = false; bool mca_ns_base_selected = false;
opal_list_t mca_ns_base_components_available; opal_list_t mca_ns_base_components_available;
mca_ns_base_component_t mca_ns_base_selected_component; mca_ns_base_component_t mca_ns_base_selected_component;

Просмотреть файл

@ -54,6 +54,7 @@ extern "C" {
*/ */
typedef int (*orte_ns_base_module_init_fn_t)(void); typedef int (*orte_ns_base_module_init_fn_t)(void);
/**** CELL FUNCTIONS ****/
/** /**
* Create a new cell id. * Create a new cell id.
* The create_cellid() function allocates a new cell id for use by the caller. * The create_cellid() function allocates a new cell id for use by the caller.
@ -122,6 +123,81 @@ typedef int (*orte_ns_base_module_get_cell_info_fn_t)(orte_cellid_t cellid,
*/ */
typedef int (*orte_ns_base_module_assign_cellid_to_process_fn_t)(orte_process_name_t* name); typedef int (*orte_ns_base_module_assign_cellid_to_process_fn_t)(orte_process_name_t* name);
/**
* Get the cell id as a character string.
* The get_cellid_string() function returns the cell id in a character string
* representation. The string is created by expressing the field in hexadecimal. Memory
* for the string is allocated by the function - releasing that allocation is the
* responsibility of the calling program.
*
* @param *name A pointer to the name structure containing the name to be
* "translated" to a string.
*
* @retval *name_string A pointer to the character string representation of the
* cell id.
* @retval NULL Indicates an error occurred - either no memory could be allocated
* or the caller provided an incorrect name pointer (e.g., NULL).
*
* @code
* cellid-string = ompi_name_server.get_cellid_string(&name)
* @endcode
*/
typedef int (*orte_ns_base_module_get_cellid_string_fn_t)(char **cellid_string, const orte_process_name_t* name);
/**
* Convert cellid to character string
* Returns the cellid in a character string representation. The string is created
* by expressing the provided cellid in hexadecimal. Memory for the string is
* allocated by the function - releasing that allocation is the responsibility of
* the calling program.
*
* @param cellid The cellid to be converted.
*
* @retval *cellid_string A pointer to a character string representation of the cellid.
* @retval NULL Indicates an error occurred - probably no memory could be allocated.
*
* @code
* cellid-string = ompi_name_server.convert_cellid_to_string(cellid);
* @endcode
*/
typedef int (*orte_ns_base_module_convert_cellid_to_string_fn_t)(char **cellid_string, const orte_cellid_t cellid);
/**
* Convert a string to a cellid.
* Converts a characters string into a cellid. The character string must be a
* hexadecimal representation of a valid cellid.
*
* @param cellidstring The string to be converted.
*
* @retval cellid The resulting cellid
* @retval MCA_NS_BASE_CELLID_MAX String could not be converted.
*
* @code
* cellid = ompi_name_server.convert_string_to_cellid(cellidstring);
* @endcode
*/
typedef int (*orte_ns_base_module_convert_string_to_cellid_fn_t)(orte_cellid_t *cellid, const char *cellidstring);
/**
* Get the cell id as a numberic value.
* The get_cellid() function returns the cell id in a numeric representation -
* i.e., in an integer form.
*
* @param *name A pointer to the name structure containing the name.
*
* @retval cellid The cell id field of the provided name.
* @retval MCA_NS_BASE_CELLID_MAX Indicates that an error occurred - in this case, that
* the name variable provided was NULL.
*
* @code
* cellid = ompi_name_server.get_cellid(&name)
* @endcode
*/
typedef int (*orte_ns_base_module_get_cellid_fn_t)(orte_cellid_t *cellid, const orte_process_name_t* name);
/**** JOB ID FUNCTIONS ****/
/** /**
* Create a new job id. * Create a new job id.
* The create_jobid() function allocates a new job id for use by the caller. * The create_jobid() function allocates a new job id for use by the caller.
@ -148,91 +224,6 @@ typedef int (*orte_ns_base_module_assign_cellid_to_process_fn_t)(orte_process_na
*/ */
typedef int (*orte_ns_base_module_create_jobid_fn_t)(orte_jobid_t *jobid); typedef int (*orte_ns_base_module_create_jobid_fn_t)(orte_jobid_t *jobid);
/**
* Obtain a single new process name.
* The create_process_name() function creates a single process name structure and fills the
* fields with the provided values.
*
* @param cell The cell for which the process name is intended. Usually, this is
* the id of the cell where the process is initially planning to be spawned.
* @param job The id of the job to which the process will belong. Process id's are
* tracked according to jobid, but not cellid. Thus, two processes
* can have the same process id if and only if they have different jobid's. However,
* two processes in the same jobid cannot have the same process id, regardless
* of whether or not they are in the same cell.
* @param vpid The virtual process id for the name. Note that no check is made for uniqueness -
* the caller is responsible for ensuring that the requested name is, in fact, unique
* by first requesting reservation of an appropriate range of virtual process id's.
*
* @retval *name Pointer to an ompi_process_name_t structure containing the name.
* @retval NULL Indicates an error, probably due to inability to allocate memory for
* the name structure.
*
* @code
* new_name = ompi_name_server.create_process_name(cell, job, vpid);
* @endcode
*/
typedef int (*orte_ns_base_module_create_proc_name_fn_t)(orte_process_name_t **name,
orte_cellid_t cell,
orte_jobid_t job,
orte_vpid_t vpid);
/*
* Create my name
* If a process is a singleton, then it needs to create a name for itself. When
* a persistent daemon is present, this requires a communication to that daemon.
* Since the RML uses process names as its index into the RML communicator table,
* the RML automatically assigns a name to each process when it first attempts
* to communicate. This function takes advantage of that behavior to ensure that
* one, and ONLY one, name gets assigned to the process
*/
typedef int (*orte_ns_base_module_create_my_name_fn_t)(void);
/**
* Derive a process vpid.
* Given a base vpid and an offset, return the computed equivalent vpid. This function
* is required because the vpid may not be an integer - need to provide a means for
* computing the resulting vpid in case it isn't.
*/
typedef int (*orte_ns_base_module_derive_vpid_fn_t)(orte_vpid_t *vpid,
orte_vpid_t base_vpid,
int offset);
/**
* Make a copy of a process name.
* Given a process name, this function creates a copy of it and returns a pointer
* to the duplicate structure.
*
* @param *name Pointer to an existing process name structure.
*
* @retval *newname Pointer to the duplicate structure, with all fields transferred.
* @retval NULL Indicates an error - most likely due to a NULL process name
* pointer being supplied as input.
*/
typedef int (*orte_ns_base_module_copy_proc_name_fn_t)(orte_process_name_t **dest,
orte_process_name_t* src);
/**
* Convert a string representation to a process name.
* The convert_string_to_process_name() function converts a string representation of a process
* name into an Open MPI name structure. The string must be of the proper form - i.e., it
* must be in the form "cellid.jobid.vpid", where each field is expressed in hexadecimal form.
*
* @param *name_string A character string representation of a process name.
*
* @retval *name Pointer to an ompi_process_name_t structure containing the name.
* @retval NULL Indicates an error, probably due to inability to allocate memory for
* the name structure.
*
* @code
* name = ompi_name_server.convert_string_to_process_name(name_string);
* @endcode
*/
typedef int (*orte_ns_base_module_convert_string_to_process_name_fn_t)(orte_process_name_t **name,
const char* name_string);
/** /**
* Reserve a range of process id's. * Reserve a range of process id's.
* The reserve_range() function reserves a range of vpid's for the given jobid. * The reserve_range() function reserves a range of vpid's for the given jobid.
@ -255,112 +246,6 @@ typedef int (*orte_ns_base_module_reserve_range_fn_t)(orte_jobid_t job,
orte_vpid_t range, orte_vpid_t range,
orte_vpid_t *startvpid); orte_vpid_t *startvpid);
/**
* Free (release) a process name.
* The free_name() function releases the process name from the "used" list
* maintained within the name server for the jobid contained in the specified
* name. The memory for the name is also released at that time.
*
* Name values are currently \em not re-used. Hence, free-ing a name
* does not provide any noticeable benefit other than releasing the memory. In
* the future, names may be re-used if this becomes desirable.
*
* @param *name A pointer to the name structure containing the name being released.
*
* @retval OMPI_SUCCESS Indicates the release was succesfully accomplished.
* @retval OMPI_ERROR Indicates the release failed - most likely due to an
* error when free-ing the memory allocation.
*
* @code
* if (OMPI_ERROR == ompi_name_server.free_name(&name) {
* report error
* }
* @endcode
*/
typedef int (*orte_ns_base_module_free_name_fn_t)(orte_process_name_t **name);
/**
* Get the process name as a character string.
* The get_proc_name_string() function returns the entire process name in a
* character string representation. The string is created by expressing each
* field in hexadecimal separated by periods, as follows:
*
* sprintf(string_name, "%x.%x.%x", cellid, jobid, vpid)
*
* The memory required for the string is allocated by the function - releasing
* that allocation is the responsibility of the calling program.
*
* @param *name A pointer to the name structure containing the name to be
* "translated" to a string.
*
* @retval *name_string A pointer to the character string representation of the
* full name.
* @retval NULL Indicates an error occurred - either no memory could be allocated
* or the caller provided an incorrect name pointer (e.g., NULL).
*
* @code
* name-string = ompi_name_server.get_proc_name_string(&name)
* @endcode
*/
typedef int (*orte_ns_base_module_get_proc_name_string_fn_t)(char **name_string,
const orte_process_name_t* name);
/**
* Get the virtual process id as a character string.
* The get_vpid_string() function returns the vpid in a character string
* representation. The string is created by expressing the field in hexadecimal. Memory
* for the string is allocated by the function - releasing that allocation is the
* responsibility of the calling program.
*
* @param *name A pointer to the name structure containing the name to be
* "translated" to a string.
*
* @retval *name_string A pointer to the character string representation of the
* vpid.
* @retval NULL Indicates an error occurred - either no memory could be allocated
* or the caller provided an incorrect name pointer (e.g., NULL).
*
* @code
* vpid-string = ompi_name_server.get_vpid_string(&name)
* @endcode
*/
typedef int (*orte_ns_base_module_get_vpid_string_fn_t)(char **vpid_string, const orte_process_name_t* name);
/**
* Convert vpid to character string
* Returns the vpid in a character string representation. The string is created
* by expressing the provided vpid in hexadecimal. Memory for the string is
* allocated by the function - releasing that allocation is the responsibility of
* the calling program.
*
* @param vpid The vpid to be converted.
*
* @retval *vpid_string A pointer to a character string representation of the vpid.
* @retval NULL Indicates an error occurred - probably no memory could be allocated.
*
* @code
* vpid-string = ompi_name_server.convert_vpid_to_string(vpid);
* @endcode
*/
typedef int (*orte_ns_base_module_convert_vpid_to_string_fn_t)(char **vpid_string, const orte_vpid_t vpid);
/**
* Convert a string to a vpid.
* Converts a characters string into a vpid. The character string must be a
* hexadecimal representation of a valid vpid.
*
* @param vpidstring The string to be converted.
*
* @retval vpid The resulting vpid
* @retval MCA_NS_BASE_VPID_MAX String could not be converted.
*
* @code
* vpid = ompi_name_server.convert_string_to_vpid(vpidstring);
* @endcode
*/
typedef int (*orte_ns_base_module_convert_string_to_vpid_fn_t)(orte_vpid_t *vpid, const char* vpidstring);
/** /**
* Get the job id as a character string. * Get the job id as a character string.
* The get_jobid_string() function returns the job id in a character string * The get_jobid_string() function returns the job id in a character string
@ -418,78 +303,6 @@ typedef int (*orte_ns_base_module_convert_jobid_to_string_fn_t)(char **jobid_str
*/ */
typedef int (*orte_ns_base_module_convert_string_to_jobid_fn_t)(orte_jobid_t *jobid, const char* jobidstring); typedef int (*orte_ns_base_module_convert_string_to_jobid_fn_t)(orte_jobid_t *jobid, const char* jobidstring);
/**
* Get the cell id as a character string.
* The get_cellid_string() function returns the cell id in a character string
* representation. The string is created by expressing the field in hexadecimal. Memory
* for the string is allocated by the function - releasing that allocation is the
* responsibility of the calling program.
*
* @param *name A pointer to the name structure containing the name to be
* "translated" to a string.
*
* @retval *name_string A pointer to the character string representation of the
* cell id.
* @retval NULL Indicates an error occurred - either no memory could be allocated
* or the caller provided an incorrect name pointer (e.g., NULL).
*
* @code
* cellid-string = ompi_name_server.get_cellid_string(&name)
* @endcode
*/
typedef int (*orte_ns_base_module_get_cellid_string_fn_t)(char **cellid_string, const orte_process_name_t* name);
/**
* Convert cellid to character string
* Returns the cellid in a character string representation. The string is created
* by expressing the provided cellid in hexadecimal. Memory for the string is
* allocated by the function - releasing that allocation is the responsibility of
* the calling program.
*
* @param cellid The cellid to be converted.
*
* @retval *cellid_string A pointer to a character string representation of the cellid.
* @retval NULL Indicates an error occurred - probably no memory could be allocated.
*
* @code
* cellid-string = ompi_name_server.convert_cellid_to_string(cellid);
* @endcode
*/
typedef int (*orte_ns_base_module_convert_cellid_to_string_fn_t)(char **cellid_string, const orte_cellid_t cellid);
/**
* Convert a string to a cellid.
* Converts a characters string into a cellid. The character string must be a
* hexadecimal representation of a valid cellid.
*
* @param cellidstring The string to be converted.
*
* @retval cellid The resulting cellid
* @retval MCA_NS_BASE_CELLID_MAX String could not be converted.
*
* @code
* cellid = ompi_name_server.convert_string_to_cellid(cellidstring);
* @endcode
*/
typedef int (*orte_ns_base_module_convert_string_to_cellid_fn_t)(orte_cellid_t *cellid, const char *cellidstring);
/**
* Get the virtual process id as a numeric value.
* The get_vpid() function returns the vpid in a numeric representation -
* i.e., in an integer form.
*
* @param *name A pointer to the name structure containing the name.
*
* @retval vpid The vpid field of the provided name.
* @retval MCA_NS_BASE_VPID_MAX Indicates that an error occurred - in this case, that
* the name variable provided was NULL.
*
* @code
* vpid = ompi_name_server.get_vpid(&name)
* @endcode
*/
typedef int (*orte_ns_base_module_get_vpid_fn_t)(orte_vpid_t *vpid, const orte_process_name_t *name);
/** /**
* Get the job id as a numeric value. * Get the job id as a numeric value.
* The get_jobid() function returns the job id in a numeric representation - * The get_jobid() function returns the job id in a numeric representation -
@ -507,22 +320,132 @@ typedef int (*orte_ns_base_module_get_vpid_fn_t)(orte_vpid_t *vpid, const orte_p
*/ */
typedef int (*orte_ns_base_module_get_jobid_fn_t)(orte_jobid_t *jobid, const orte_process_name_t* name); typedef int (*orte_ns_base_module_get_jobid_fn_t)(orte_jobid_t *jobid, const orte_process_name_t* name);
/**** NAME FUNCTIONS ****/
/** /**
* Get the cell id as a numberic value. * Obtain a single new process name.
* The get_cellid() function returns the cell id in a numeric representation - * The create_process_name() function creates a single process name structure and fills the
* i.e., in an integer form. * fields with the provided values.
* *
* @param *name A pointer to the name structure containing the name. * @param cell The cell for which the process name is intended. Usually, this is
* the id of the cell where the process is initially planning to be spawned.
* @param job The id of the job to which the process will belong. Process id's are
* tracked according to jobid, but not cellid. Thus, two processes
* can have the same process id if and only if they have different jobid's. However,
* two processes in the same jobid cannot have the same process id, regardless
* of whether or not they are in the same cell.
* @param vpid The virtual process id for the name. Note that no check is made for uniqueness -
* the caller is responsible for ensuring that the requested name is, in fact, unique
* by first requesting reservation of an appropriate range of virtual process id's.
* *
* @retval cellid The cell id field of the provided name. * @retval *name Pointer to an ompi_process_name_t structure containing the name.
* @retval MCA_NS_BASE_CELLID_MAX Indicates that an error occurred - in this case, that * @retval NULL Indicates an error, probably due to inability to allocate memory for
* the name variable provided was NULL. * the name structure.
* *
* @code * @code
* cellid = ompi_name_server.get_cellid(&name) * new_name = ompi_name_server.create_process_name(cell, job, vpid);
* @endcode * @endcode
*/ */
typedef int (*orte_ns_base_module_get_cellid_fn_t)(orte_cellid_t *cellid, const orte_process_name_t* name); typedef int (*orte_ns_base_module_create_proc_name_fn_t)(orte_process_name_t **name,
orte_cellid_t cell,
orte_jobid_t job,
orte_vpid_t vpid);
/*
* Create my name
* If a process is a singleton, then it needs to create a name for itself. When
* a persistent daemon is present, this requires a communication to that daemon.
* Since the RML uses process names as its index into the RML communicator table,
* the RML automatically assigns a name to each process when it first attempts
* to communicate. This function takes advantage of that behavior to ensure that
* one, and ONLY one, name gets assigned to the process
*/
typedef int (*orte_ns_base_module_create_my_name_fn_t)(void);
/**
* Make a copy of a process name.
* Given a process name, this function creates a copy of it and returns a pointer
* to the duplicate structure.
*
* @param *name Pointer to an existing process name structure.
*
* @retval *newname Pointer to the duplicate structure, with all fields transferred.
* @retval NULL Indicates an error - most likely due to a NULL process name
* pointer being supplied as input.
*/
typedef int (*orte_ns_base_module_copy_proc_name_fn_t)(orte_process_name_t **dest,
orte_process_name_t* src);
/**
* Convert a string representation to a process name.
* The convert_string_to_process_name() function converts a string representation of a process
* name into an Open MPI name structure. The string must be of the proper form - i.e., it
* must be in the form "cellid.jobid.vpid", where each field is expressed in hexadecimal form.
*
* @param *name_string A character string representation of a process name.
*
* @retval *name Pointer to an ompi_process_name_t structure containing the name.
* @retval NULL Indicates an error, probably due to inability to allocate memory for
* the name structure.
*
* @code
* name = ompi_name_server.convert_string_to_process_name(name_string);
* @endcode
*/
typedef int (*orte_ns_base_module_convert_string_to_process_name_fn_t)(orte_process_name_t **name,
const char* name_string);
/**
* Free (release) a process name.
* The free_name() function releases the process name from the "used" list
* maintained within the name server for the jobid contained in the specified
* name. The memory for the name is also released at that time.
*
* Name values are currently \em not re-used. Hence, free-ing a name
* does not provide any noticeable benefit other than releasing the memory. In
* the future, names may be re-used if this becomes desirable.
*
* @param *name A pointer to the name structure containing the name being released.
*
* @retval OMPI_SUCCESS Indicates the release was succesfully accomplished.
* @retval OMPI_ERROR Indicates the release failed - most likely due to an
* error when free-ing the memory allocation.
*
* @code
* if (OMPI_ERROR == ompi_name_server.free_name(&name) {
* report error
* }
* @endcode
*/
typedef int (*orte_ns_base_module_free_name_fn_t)(orte_process_name_t **name);
/**
* Get the process name as a character string.
* The get_proc_name_string() function returns the entire process name in a
* character string representation. The string is created by expressing each
* field in hexadecimal separated by periods, as follows:
*
* sprintf(string_name, "%x.%x.%x", cellid, jobid, vpid)
*
* The memory required for the string is allocated by the function - releasing
* that allocation is the responsibility of the calling program.
*
* @param *name A pointer to the name structure containing the name to be
* "translated" to a string.
*
* @retval *name_string A pointer to the character string representation of the
* full name.
* @retval NULL Indicates an error occurred - either no memory could be allocated
* or the caller provided an incorrect name pointer (e.g., NULL).
*
* @code
* name-string = ompi_name_server.get_proc_name_string(&name)
* @endcode
*/
typedef int (*orte_ns_base_module_get_proc_name_string_fn_t)(char **name_string,
const orte_process_name_t* name);
/** /**
* Compare two name values. * Compare two name values.
@ -557,6 +480,81 @@ typedef int (*orte_ns_base_module_compare_fn_t)(orte_ns_cmp_bitmask_t fields,
const orte_process_name_t* name1, const orte_process_name_t* name1,
const orte_process_name_t* name2); const orte_process_name_t* name2);
/**** VPID FUNCTIONS ****/
/**
* Get the virtual process id as a character string.
* The get_vpid_string() function returns the vpid in a character string
* representation. The string is created by expressing the field in hexadecimal. Memory
* for the string is allocated by the function - releasing that allocation is the
* responsibility of the calling program.
*
* @param *name A pointer to the name structure containing the name to be
* "translated" to a string.
*
* @retval *name_string A pointer to the character string representation of the
* vpid.
* @retval NULL Indicates an error occurred - either no memory could be allocated
* or the caller provided an incorrect name pointer (e.g., NULL).
*
* @code
* vpid-string = ompi_name_server.get_vpid_string(&name)
* @endcode
*/
typedef int (*orte_ns_base_module_get_vpid_string_fn_t)(char **vpid_string, const orte_process_name_t* name);
/**
* Convert vpid to character string
* Returns the vpid in a character string representation. The string is created
* by expressing the provided vpid in hexadecimal. Memory for the string is
* allocated by the function - releasing that allocation is the responsibility of
* the calling program.
*
* @param vpid The vpid to be converted.
*
* @retval *vpid_string A pointer to a character string representation of the vpid.
* @retval NULL Indicates an error occurred - probably no memory could be allocated.
*
* @code
* vpid-string = ompi_name_server.convert_vpid_to_string(vpid);
* @endcode
*/
typedef int (*orte_ns_base_module_convert_vpid_to_string_fn_t)(char **vpid_string, const orte_vpid_t vpid);
/**
* Convert a string to a vpid.
* Converts a characters string into a vpid. The character string must be a
* hexadecimal representation of a valid vpid.
*
* @param vpidstring The string to be converted.
*
* @retval vpid The resulting vpid
* @retval MCA_NS_BASE_VPID_MAX String could not be converted.
*
* @code
* vpid = ompi_name_server.convert_string_to_vpid(vpidstring);
* @endcode
*/
typedef int (*orte_ns_base_module_convert_string_to_vpid_fn_t)(orte_vpid_t *vpid, const char* vpidstring);
/**
* Get the virtual process id as a numeric value.
* The get_vpid() function returns the vpid in a numeric representation -
* i.e., in an integer form.
*
* @param *name A pointer to the name structure containing the name.
*
* @retval vpid The vpid field of the provided name.
* @retval MCA_NS_BASE_VPID_MAX Indicates that an error occurred - in this case, that
* the name variable provided was NULL.
*
* @code
* vpid = ompi_name_server.get_vpid(&name)
* @endcode
*/
typedef int (*orte_ns_base_module_get_vpid_fn_t)(orte_vpid_t *vpid, const orte_process_name_t *name);
/**** TAG SERVER ****/
/* /*
* Allocate a tag * Allocate a tag
* If name is NULL, tag server provides next unique tag but cannot look * If name is NULL, tag server provides next unique tag but cannot look
@ -565,6 +563,7 @@ typedef int (*orte_ns_base_module_compare_fn_t)(orte_ns_cmp_bitmask_t fields,
typedef int (*orte_ns_base_module_assign_rml_tag_fn_t)(orte_rml_tag_t *tag, typedef int (*orte_ns_base_module_assign_rml_tag_fn_t)(orte_rml_tag_t *tag,
char *name); char *name);
/**** DATA TYPE SERVER ****/
/* This function defines a new data type and gives it a system-wide unique /* This function defines a new data type and gives it a system-wide unique
* identifier for use in the data packing subsystem. Only called from the * identifier for use in the data packing subsystem. Only called from the
* dps when needing a new identifier. * dps when needing a new identifier.
@ -573,6 +572,8 @@ typedef int (*orte_ns_base_module_define_data_type_fn_t)(
const char *name, const char *name,
orte_data_type_t *type); orte_data_type_t *type);
/**** PEER RETRIEVAL ****/
/* /*
* Get my peers * Get my peers
* *
@ -582,40 +583,75 @@ typedef int (*orte_ns_base_module_define_data_type_fn_t)(
typedef int (*orte_ns_base_module_get_peers_fn_t)(orte_process_name_t **procs, typedef int (*orte_ns_base_module_get_peers_fn_t)(orte_process_name_t **procs,
size_t *num_procs, size_t *self); size_t *num_procs, size_t *self);
/*
* Get the list of peers from a specified job
*
* THIS FUNCTION MAY BE ELIMINATED IN FUTURE VERSIONS TO REMOVE MULTIPLE STORAGE
* OF O(N) ARRAYS IN THE SYSTEM
*/
typedef int (*orte_ns_base_module_get_job_peers_fn_t)(orte_process_name_t **procs,
size_t *num_procs, orte_jobid_t job);
/*
* DIAGNOSTIC INTERFACES
*/
typedef int (*orte_ns_base_module_dump_cells_fn_t)(int output_id);
typedef int (*orte_ns_base_module_dump_jobs_fn_t)(int output_id);
typedef int (*orte_ns_base_module_dump_tags_fn_t)(int output_id);
typedef int (*orte_ns_base_module_dump_datatypes_fn_t)(int output_id);
/* /*
* Ver 1.0.0 * Ver 1.0.0
*/ */
struct mca_ns_base_module_1_0_0_t { struct mca_ns_base_module_1_0_0_t {
/* init */
orte_ns_base_module_init_fn_t init; orte_ns_base_module_init_fn_t init;
/* cell functions */
orte_ns_base_module_create_cellid_fn_t create_cellid; orte_ns_base_module_create_cellid_fn_t create_cellid;
orte_ns_base_module_get_cellid_fn_t get_cellid;
orte_ns_base_module_get_cell_info_fn_t get_cell_info; orte_ns_base_module_get_cell_info_fn_t get_cell_info;
orte_ns_base_module_assign_cellid_to_process_fn_t assign_cellid_to_process; orte_ns_base_module_assign_cellid_to_process_fn_t assign_cellid_to_process;
orte_ns_base_module_get_cellid_string_fn_t get_cellid_string;
orte_ns_base_module_convert_cellid_to_string_fn_t convert_cellid_to_string;
orte_ns_base_module_convert_string_to_cellid_fn_t convert_string_to_cellid;
/* jobid functions */
orte_ns_base_module_create_jobid_fn_t create_jobid; orte_ns_base_module_create_jobid_fn_t create_jobid;
orte_ns_base_module_get_jobid_fn_t get_jobid;
orte_ns_base_module_get_jobid_string_fn_t get_jobid_string;
orte_ns_base_module_convert_jobid_to_string_fn_t convert_jobid_to_string;
orte_ns_base_module_convert_string_to_jobid_fn_t convert_string_to_jobid;
/* vpid functions */
orte_ns_base_module_reserve_range_fn_t reserve_range;
orte_ns_base_module_get_vpid_fn_t get_vpid;
orte_ns_base_module_get_vpid_string_fn_t get_vpid_string;
orte_ns_base_module_convert_vpid_to_string_fn_t convert_vpid_to_string;
orte_ns_base_module_convert_string_to_vpid_fn_t convert_string_to_vpid;
/* name functions */
orte_ns_base_module_create_proc_name_fn_t create_process_name; orte_ns_base_module_create_proc_name_fn_t create_process_name;
orte_ns_base_module_create_my_name_fn_t create_my_name; orte_ns_base_module_create_my_name_fn_t create_my_name;
orte_ns_base_module_copy_proc_name_fn_t copy_process_name; orte_ns_base_module_copy_proc_name_fn_t copy_process_name;
orte_ns_base_module_convert_string_to_process_name_fn_t convert_string_to_process_name; orte_ns_base_module_convert_string_to_process_name_fn_t convert_string_to_process_name;
orte_ns_base_module_reserve_range_fn_t reserve_range;
orte_ns_base_module_free_name_fn_t free_name; orte_ns_base_module_free_name_fn_t free_name;
orte_ns_base_module_get_proc_name_string_fn_t get_proc_name_string; orte_ns_base_module_get_proc_name_string_fn_t get_proc_name_string;
orte_ns_base_module_get_vpid_string_fn_t get_vpid_string;
orte_ns_base_module_convert_vpid_to_string_fn_t convert_vpid_to_string;
orte_ns_base_module_convert_string_to_vpid_fn_t convert_string_to_vpid;
orte_ns_base_module_get_jobid_string_fn_t get_jobid_string;
orte_ns_base_module_convert_jobid_to_string_fn_t convert_jobid_to_string;
orte_ns_base_module_convert_string_to_jobid_fn_t convert_string_to_jobid;
orte_ns_base_module_get_cellid_string_fn_t get_cellid_string;
orte_ns_base_module_convert_cellid_to_string_fn_t convert_cellid_to_string;
orte_ns_base_module_convert_string_to_cellid_fn_t convert_string_to_cellid;
orte_ns_base_module_get_vpid_fn_t get_vpid;
orte_ns_base_module_get_jobid_fn_t get_jobid;
orte_ns_base_module_get_cellid_fn_t get_cellid;
orte_ns_base_module_compare_fn_t compare; orte_ns_base_module_compare_fn_t compare;
orte_ns_base_module_derive_vpid_fn_t derive_vpid; /* peer functions */
orte_ns_base_module_assign_rml_tag_fn_t assign_rml_tag;
orte_ns_base_module_define_data_type_fn_t define_data_type;
orte_ns_base_module_get_peers_fn_t get_peers; orte_ns_base_module_get_peers_fn_t get_peers;
orte_ns_base_module_get_job_peers_fn_t get_job_peers;
/* tag server functions */
orte_ns_base_module_assign_rml_tag_fn_t assign_rml_tag;
/* data type functions */
orte_ns_base_module_define_data_type_fn_t define_data_type;
/* diagnostic functions */
orte_ns_base_module_dump_cells_fn_t dump_cells;
orte_ns_base_module_dump_jobs_fn_t dump_jobs;
orte_ns_base_module_dump_tags_fn_t dump_tags;
orte_ns_base_module_dump_datatypes_fn_t dump_datatypes;
}; };
typedef struct mca_ns_base_module_1_0_0_t mca_ns_base_module_1_0_0_t; typedef struct mca_ns_base_module_1_0_0_t mca_ns_base_module_1_0_0_t;

Просмотреть файл

@ -21,12 +21,13 @@
#include <string.h> #include <string.h>
#include "include/orte_constants.h" #include "orte/include/orte_constants.h"
#include "include/orte_types.h" #include "orte/include/orte_types.h"
#include "mca/mca.h" #include "opal/mca/mca.h"
#include "dps/dps.h" #include "opal/util/output.h"
#include "mca/errmgr/errmgr.h" #include "orte/dps/dps.h"
#include "mca/rml/rml.h" #include "orte/mca/errmgr/errmgr.h"
#include "orte/mca/rml/rml.h"
#include "ns_proxy.h" #include "ns_proxy.h"
@ -43,19 +44,13 @@ int orte_ns_proxy_create_cellid(orte_cellid_t *cellid, char *site, char *resourc
orte_buffer_t* cmd; orte_buffer_t* cmd;
orte_buffer_t* answer; orte_buffer_t* answer;
orte_ns_cmd_flag_t command; orte_ns_cmd_flag_t command;
size_t count; size_t count, index;
int rc; int rc;
orte_ns_proxy_cell_info_t *cptr; orte_ns_proxy_cell_info_t *new_cell;
/* set the default value of error */ /* set the default value of error */
*cellid = ORTE_CELLID_MAX; *cellid = ORTE_CELLID_MAX;
/* check for errors */
if (NULL == site || NULL == resource) {
ORTE_ERROR_LOG(ORTE_ERR_BAD_PARAM);
return ORTE_ERR_BAD_PARAM;
}
command = ORTE_NS_CREATE_CELLID_CMD; command = ORTE_NS_CREATE_CELLID_CMD;
cmd = OBJ_NEW(orte_buffer_t); cmd = OBJ_NEW(orte_buffer_t);
@ -82,7 +77,7 @@ int orte_ns_proxy_create_cellid(orte_cellid_t *cellid, char *site, char *resourc
return rc; return rc;
} }
if (0 > orte_rml.send_buffer(orte_ns_my_replica, cmd, MCA_OOB_TAG_NS, 0)) { if (0 > orte_rml.send_buffer(orte_ns_proxy.my_replica, cmd, MCA_OOB_TAG_NS, 0)) {
ORTE_ERROR_LOG(ORTE_ERR_COMM_FAILURE); ORTE_ERROR_LOG(ORTE_ERR_COMM_FAILURE);
OBJ_RELEASE(cmd); OBJ_RELEASE(cmd);
return ORTE_ERR_COMM_FAILURE; return ORTE_ERR_COMM_FAILURE;
@ -95,7 +90,7 @@ int orte_ns_proxy_create_cellid(orte_cellid_t *cellid, char *site, char *resourc
return ORTE_ERR_OUT_OF_RESOURCE; return ORTE_ERR_OUT_OF_RESOURCE;
} }
if (0 > orte_rml.recv_buffer(orte_ns_my_replica, answer, ORTE_RML_TAG_NS)) { if (0 > orte_rml.recv_buffer(orte_ns_proxy.my_replica, answer, ORTE_RML_TAG_NS)) {
ORTE_ERROR_LOG(ORTE_ERR_COMM_FAILURE); ORTE_ERROR_LOG(ORTE_ERR_COMM_FAILURE);
OBJ_RELEASE(answer); OBJ_RELEASE(answer);
return ORTE_ERR_COMM_FAILURE; return ORTE_ERR_COMM_FAILURE;
@ -123,17 +118,31 @@ int orte_ns_proxy_create_cellid(orte_cellid_t *cellid, char *site, char *resourc
OBJ_RELEASE(answer); OBJ_RELEASE(answer);
/* store the info locally for later retrieval */ /* store the info locally for later retrieval */
cptr = OBJ_NEW(orte_ns_proxy_cell_info_t); OPAL_THREAD_LOCK(&orte_ns_proxy.mutex);
if (NULL == cptr) { new_cell = OBJ_NEW(orte_ns_proxy_cell_info_t);
if (NULL == new_cell) {
ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE); ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE);
OPAL_THREAD_UNLOCK(&orte_ns_proxy.mutex);
return ORTE_ERR_OUT_OF_RESOURCE; return ORTE_ERR_OUT_OF_RESOURCE;
} }
if (ORTE_SUCCESS != (rc = orte_pointer_array_add(&index,
orte_ns_proxy.cells, new_cell))) {
ORTE_ERROR_LOG(rc);
OPAL_THREAD_UNLOCK(&orte_ns_proxy.mutex);
return rc;
}
if (NULL != site) {
new_cell->site = strdup(site);
}
if (NULL != resource) {
new_cell->resource = strdup(resource);
}
new_cell->cellid = orte_ns_proxy.num_cells;
*cellid = new_cell->cellid;
(orte_ns_proxy.num_cells)++;
cptr->cellid = *cellid; OPAL_THREAD_UNLOCK(&orte_ns_proxy.mutex);
cptr->site = strdup(site);
cptr->resource = strdup(resource);
opal_list_append(&orte_ns_proxy_cell_info_list, &cptr->item);
return ORTE_SUCCESS; return ORTE_SUCCESS;
} }
@ -141,23 +150,29 @@ int orte_ns_proxy_create_cellid(orte_cellid_t *cellid, char *site, char *resourc
int orte_ns_proxy_get_cell_info(orte_cellid_t cellid, int orte_ns_proxy_get_cell_info(orte_cellid_t cellid,
char **site, char **resource) char **site, char **resource)
{ {
opal_list_item_t *item; size_t i, j;
orte_ns_proxy_cell_info_t *cell; orte_ns_proxy_cell_info_t **cell;
*site = NULL; /* see if we already have the info locally */
*resource = NULL; OPAL_THREAD_LOCK(&orte_ns_proxy.mutex);
for (item = opal_list_get_first(&orte_ns_proxy_cell_info_list); cell = (orte_ns_proxy_cell_info_t**)(orte_ns_proxy.cells)->addr;
item != opal_list_get_end(&orte_ns_proxy_cell_info_list); for (i=0, j=0; j < orte_ns_proxy.num_cells &&
item = opal_list_get_next(item)) { i < (orte_ns_proxy.cells)->size; i++) {
cell = (orte_ns_proxy_cell_info_t*)item; if (NULL != cell[i]) {
if (cellid == cell->cellid) { j++;
*site = strdup(cell->site); if (cellid == cell[i]->cellid) {
*resource = strdup(cell->resource); *site = strdup(cell[i]->site);
return ORTE_SUCCESS; *resource = strdup(cell[i]->resource);
} OPAL_THREAD_UNLOCK(&orte_ns_proxy.mutex);
} return ORTE_SUCCESS;
return ORTE_ERR_NOT_FOUND; }
}
}
/* okay, don't have it locally - go ask for it */
OPAL_THREAD_UNLOCK(&orte_ns_proxy.mutex);
return ORTE_SUCCESS;
} }
int orte_ns_proxy_create_jobid(orte_jobid_t *job) int orte_ns_proxy_create_jobid(orte_jobid_t *job)
@ -183,7 +198,7 @@ int orte_ns_proxy_create_jobid(orte_jobid_t *job)
return rc; return rc;
} }
if (0 > orte_rml.send_buffer(orte_ns_my_replica, cmd, ORTE_RML_TAG_NS, 0)) { if (0 > orte_rml.send_buffer(orte_ns_proxy.my_replica, cmd, ORTE_RML_TAG_NS, 0)) {
ORTE_ERROR_LOG(ORTE_ERR_COMM_FAILURE); ORTE_ERROR_LOG(ORTE_ERR_COMM_FAILURE);
OBJ_RELEASE(cmd); OBJ_RELEASE(cmd);
return ORTE_ERR_COMM_FAILURE; return ORTE_ERR_COMM_FAILURE;
@ -195,7 +210,7 @@ int orte_ns_proxy_create_jobid(orte_jobid_t *job)
OBJ_RELEASE(answer); OBJ_RELEASE(answer);
return ORTE_ERR_OUT_OF_RESOURCE; return ORTE_ERR_OUT_OF_RESOURCE;
} }
if (0 > orte_rml.recv_buffer(orte_ns_my_replica, answer, ORTE_RML_TAG_NS)) { if (0 > orte_rml.recv_buffer(orte_ns_proxy.my_replica, answer, ORTE_RML_TAG_NS)) {
ORTE_ERROR_LOG(ORTE_ERR_COMM_FAILURE); ORTE_ERROR_LOG(ORTE_ERR_COMM_FAILURE);
OBJ_RELEASE(answer); OBJ_RELEASE(answer);
return ORTE_ERR_COMM_FAILURE; return ORTE_ERR_COMM_FAILURE;
@ -261,7 +276,7 @@ int orte_ns_proxy_reserve_range(orte_jobid_t job, orte_vpid_t range, orte_vpid_t
return rc; return rc;
} }
if (0 > orte_rml.send_buffer(orte_ns_my_replica, cmd, ORTE_RML_TAG_NS, 0)) { if (0 > orte_rml.send_buffer(orte_ns_proxy.my_replica, cmd, ORTE_RML_TAG_NS, 0)) {
ORTE_ERROR_LOG(ORTE_ERR_COMM_FAILURE); ORTE_ERROR_LOG(ORTE_ERR_COMM_FAILURE);
OBJ_RELEASE(cmd); OBJ_RELEASE(cmd);
return ORTE_ERR_COMM_FAILURE; return ORTE_ERR_COMM_FAILURE;
@ -274,7 +289,7 @@ int orte_ns_proxy_reserve_range(orte_jobid_t job, orte_vpid_t range, orte_vpid_t
return ORTE_ERR_OUT_OF_RESOURCE; return ORTE_ERR_OUT_OF_RESOURCE;
} }
if (0 > orte_rml.recv_buffer(orte_ns_my_replica, answer, ORTE_RML_TAG_NS)) { if (0 > orte_rml.recv_buffer(orte_ns_proxy.my_replica, answer, ORTE_RML_TAG_NS)) {
ORTE_ERROR_LOG(ORTE_ERR_COMM_FAILURE); ORTE_ERROR_LOG(ORTE_ERR_COMM_FAILURE);
OBJ_RELEASE(answer); OBJ_RELEASE(answer);
return ORTE_ERR_COMM_FAILURE; return ORTE_ERR_COMM_FAILURE;
@ -299,32 +314,122 @@ int orte_ns_proxy_reserve_range(orte_jobid_t job, orte_vpid_t range, orte_vpid_t
} }
/*
* PEER functions
*/
int orte_ns_proxy_get_job_peers(orte_process_name_t **procs,
size_t *num_procs, orte_jobid_t job)
{
orte_buffer_t* cmd;
orte_buffer_t* answer;
orte_ns_cmd_flag_t command;
size_t count;
int rc;
/* set default value */
*procs = NULL;
*num_procs = 0;
if ((cmd = OBJ_NEW(orte_buffer_t)) == NULL) {
ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE);
return ORTE_ERR_OUT_OF_RESOURCE;
}
command = ORTE_NS_GET_JOB_PEERS_CMD;
if (ORTE_SUCCESS != (rc = orte_dps.pack(cmd, (void*)&command, 1, ORTE_NS_CMD))) { /* got a problem */
ORTE_ERROR_LOG(rc);
OBJ_RELEASE(cmd);
return rc;
}
if (0 > orte_rml.send_buffer(orte_ns_proxy.my_replica, cmd, ORTE_RML_TAG_NS, 0)) {
ORTE_ERROR_LOG(ORTE_ERR_COMM_FAILURE);
OBJ_RELEASE(cmd);
return ORTE_ERR_COMM_FAILURE;
}
OBJ_RELEASE(cmd);
if ((answer = OBJ_NEW(orte_buffer_t)) == NULL) {
ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE);
OBJ_RELEASE(answer);
return ORTE_ERR_OUT_OF_RESOURCE;
}
if (0 > orte_rml.recv_buffer(orte_ns_proxy.my_replica, answer, ORTE_RML_TAG_NS)) {
ORTE_ERROR_LOG(ORTE_ERR_COMM_FAILURE);
OBJ_RELEASE(answer);
return ORTE_ERR_COMM_FAILURE;
}
count = 1;
if (ORTE_SUCCESS != (rc = orte_dps.unpack(answer, &command, &count, ORTE_NS_CMD))) {
ORTE_ERROR_LOG(rc);
OBJ_RELEASE(answer);
return rc;
}
if (ORTE_NS_GET_JOB_PEERS_CMD != command) {
ORTE_ERROR_LOG(ORTE_ERR_COMM_FAILURE);
OBJ_RELEASE(answer);
return ORTE_ERR_COMM_FAILURE;
}
count = 1;
if (ORTE_SUCCESS != (rc = orte_dps.unpack(answer, &num_procs, &count, ORTE_SIZE))) {
ORTE_ERROR_LOG(rc);
OBJ_RELEASE(answer);
return rc;
}
/* allocate space for array of proc names */
if (0 < *num_procs) {
*procs = (orte_process_name_t*)malloc((*num_procs) * sizeof(orte_process_name_t));
if (NULL == *procs) {
ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE);
OBJ_RELEASE(answer);
return ORTE_ERR_OUT_OF_RESOURCE;
}
if (ORTE_SUCCESS != (rc = orte_dps.unpack(answer, procs, num_procs, ORTE_NAME))) {
ORTE_ERROR_LOG(rc);
OBJ_RELEASE(answer);
return rc;
}
}
OBJ_RELEASE(answer);
return ORTE_SUCCESS;
}
int orte_ns_proxy_assign_rml_tag(orte_rml_tag_t *tag, int orte_ns_proxy_assign_rml_tag(orte_rml_tag_t *tag,
char *name) char *name)
{ {
orte_buffer_t* cmd; orte_buffer_t* cmd;
orte_buffer_t* answer; orte_buffer_t* answer;
orte_ns_cmd_flag_t command; orte_ns_cmd_flag_t command;
orte_ns_proxy_tagitem_t* tagitem; orte_ns_proxy_tagitem_t* tagitem, **tags;
size_t count; size_t count, i, j;
int rc; int rc;
OPAL_THREAD_LOCK(&orte_ns_proxy_mutex); OPAL_THREAD_LOCK(&orte_ns_proxy.mutex);
if (NULL != name) { if (NULL != name) {
/* first, check to see if name is already on local list /* see if this name is already in list - if so, return tag */
* if so, return tag tags = (orte_ns_proxy_tagitem_t**)orte_ns_proxy.tags->addr;
*/ for (i=0, j=0; j < orte_ns_proxy.num_tags &&
for (tagitem = (orte_ns_proxy_tagitem_t*)opal_list_get_first(&orte_ns_proxy_taglist); i < (orte_ns_proxy.tags)->size; i++) {
tagitem != (orte_ns_proxy_tagitem_t*)opal_list_get_end(&orte_ns_proxy_taglist); if (NULL != tags[i]) {
tagitem = (orte_ns_proxy_tagitem_t*)opal_list_get_next(tagitem)) { j++;
if (0 == strcmp(name, tagitem->name)) { /* found name on list */ if (tags[i]->name != NULL &&
*tag = tagitem->tag; 0 == strcmp(name, tags[i]->name)) { /* found name on list */
OPAL_THREAD_UNLOCK(&orte_ns_proxy_mutex); *tag = tags[i]->tag;
return ORTE_SUCCESS; OPAL_THREAD_UNLOCK(&orte_ns_proxy.mutex);
return ORTE_SUCCESS;
}
} }
} }
} }
/* okay, not on local list - so go get one from tag server */ /* okay, not on local list - so go get one from tag server */
command = ORTE_NS_ASSIGN_OOB_TAG_CMD; command = ORTE_NS_ASSIGN_OOB_TAG_CMD;
@ -332,14 +437,14 @@ int orte_ns_proxy_assign_rml_tag(orte_rml_tag_t *tag,
if ((cmd = OBJ_NEW(orte_buffer_t)) == NULL) { if ((cmd = OBJ_NEW(orte_buffer_t)) == NULL) {
ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE); ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE);
OPAL_THREAD_UNLOCK(&orte_ns_proxy_mutex); OPAL_THREAD_UNLOCK(&orte_ns_proxy.mutex);
return ORTE_ERR_OUT_OF_RESOURCE; return ORTE_ERR_OUT_OF_RESOURCE;
} }
if (ORTE_SUCCESS != (rc = orte_dps.pack(cmd, (void*)&command, 1, ORTE_NS_CMD))) { if (ORTE_SUCCESS != (rc = orte_dps.pack(cmd, (void*)&command, 1, ORTE_NS_CMD))) {
ORTE_ERROR_LOG(rc); ORTE_ERROR_LOG(rc);
OBJ_RELEASE(cmd); OBJ_RELEASE(cmd);
OPAL_THREAD_UNLOCK(&orte_ns_proxy_mutex); OPAL_THREAD_UNLOCK(&orte_ns_proxy.mutex);
return rc; return rc;
} }
@ -350,28 +455,28 @@ int orte_ns_proxy_assign_rml_tag(orte_rml_tag_t *tag,
if (0 > (rc = orte_dps.pack(cmd, &name, 1, ORTE_STRING))) { if (0 > (rc = orte_dps.pack(cmd, &name, 1, ORTE_STRING))) {
ORTE_ERROR_LOG(rc); ORTE_ERROR_LOG(rc);
OBJ_RELEASE(cmd); OBJ_RELEASE(cmd);
OPAL_THREAD_UNLOCK(&orte_ns_proxy_mutex); OPAL_THREAD_UNLOCK(&orte_ns_proxy.mutex);
return rc; return rc;
} }
if (0 > orte_rml.send_buffer(orte_ns_my_replica, cmd, ORTE_RML_TAG_NS, 0)) { if (0 > orte_rml.send_buffer(orte_ns_proxy.my_replica, cmd, ORTE_RML_TAG_NS, 0)) {
ORTE_ERROR_LOG(ORTE_ERR_COMM_FAILURE); ORTE_ERROR_LOG(ORTE_ERR_COMM_FAILURE);
OBJ_RELEASE(cmd); OBJ_RELEASE(cmd);
OPAL_THREAD_UNLOCK(&orte_ns_proxy_mutex); OPAL_THREAD_UNLOCK(&orte_ns_proxy.mutex);
return ORTE_ERR_COMM_FAILURE; return ORTE_ERR_COMM_FAILURE;
} }
OBJ_RELEASE(cmd); OBJ_RELEASE(cmd);
if ((answer = OBJ_NEW(orte_buffer_t)) == NULL) { if ((answer = OBJ_NEW(orte_buffer_t)) == NULL) {
ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE); ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE);
OPAL_THREAD_UNLOCK(&orte_ns_proxy_mutex); OPAL_THREAD_UNLOCK(&orte_ns_proxy.mutex);
return ORTE_ERR_OUT_OF_RESOURCE; return ORTE_ERR_OUT_OF_RESOURCE;
} }
if (0 > orte_rml.recv_buffer(orte_ns_my_replica, answer, ORTE_RML_TAG_NS)) { if (0 > orte_rml.recv_buffer(orte_ns_proxy.my_replica, answer, ORTE_RML_TAG_NS)) {
ORTE_ERROR_LOG(ORTE_ERR_COMM_FAILURE); ORTE_ERROR_LOG(ORTE_ERR_COMM_FAILURE);
OBJ_RELEASE(answer); OBJ_RELEASE(answer);
OPAL_THREAD_UNLOCK(&orte_ns_proxy_mutex); OPAL_THREAD_UNLOCK(&orte_ns_proxy.mutex);
return ORTE_ERR_COMM_FAILURE; return ORTE_ERR_COMM_FAILURE;
} }
@ -379,14 +484,14 @@ int orte_ns_proxy_assign_rml_tag(orte_rml_tag_t *tag,
if (ORTE_SUCCESS != (rc = orte_dps.unpack(answer, &command, &count, ORTE_NS_CMD))) { if (ORTE_SUCCESS != (rc = orte_dps.unpack(answer, &command, &count, ORTE_NS_CMD))) {
ORTE_ERROR_LOG(rc); ORTE_ERROR_LOG(rc);
OBJ_RELEASE(answer); OBJ_RELEASE(answer);
OPAL_THREAD_UNLOCK(&orte_ns_proxy_mutex); OPAL_THREAD_UNLOCK(&orte_ns_proxy.mutex);
return rc; return rc;
} }
if (ORTE_NS_ASSIGN_OOB_TAG_CMD != command) { if (ORTE_NS_ASSIGN_OOB_TAG_CMD != command) {
ORTE_ERROR_LOG(ORTE_ERR_COMM_FAILURE); ORTE_ERROR_LOG(ORTE_ERR_COMM_FAILURE);
OBJ_RELEASE(answer); OBJ_RELEASE(answer);
OPAL_THREAD_UNLOCK(&orte_ns_proxy_mutex); OPAL_THREAD_UNLOCK(&orte_ns_proxy.mutex);
return ORTE_ERR_COMM_FAILURE; return ORTE_ERR_COMM_FAILURE;
} }
@ -394,7 +499,7 @@ int orte_ns_proxy_assign_rml_tag(orte_rml_tag_t *tag,
if (ORTE_SUCCESS != (rc = orte_dps.unpack(answer, tag, &count, ORTE_UINT32))) { if (ORTE_SUCCESS != (rc = orte_dps.unpack(answer, tag, &count, ORTE_UINT32))) {
ORTE_ERROR_LOG(rc); ORTE_ERROR_LOG(rc);
OBJ_RELEASE(answer); OBJ_RELEASE(answer);
OPAL_THREAD_UNLOCK(&orte_ns_proxy_mutex); OPAL_THREAD_UNLOCK(&orte_ns_proxy.mutex);
return rc; return rc;
} }
@ -404,18 +509,23 @@ int orte_ns_proxy_assign_rml_tag(orte_rml_tag_t *tag,
tagitem = OBJ_NEW(orte_ns_proxy_tagitem_t); tagitem = OBJ_NEW(orte_ns_proxy_tagitem_t);
if (NULL == tagitem) { /* out of memory */ if (NULL == tagitem) { /* out of memory */
ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE); ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE);
*tag = ORTE_RML_TAG_MAX; OPAL_THREAD_UNLOCK(&orte_ns_proxy.mutex);
OPAL_THREAD_UNLOCK(&orte_ns_proxy_mutex);
return ORTE_ERR_OUT_OF_RESOURCE; return ORTE_ERR_OUT_OF_RESOURCE;
} }
if (ORTE_SUCCESS != (rc = orte_pointer_array_add(&i,
orte_ns_proxy.tags, tagitem))) {
ORTE_ERROR_LOG(rc);
OPAL_THREAD_UNLOCK(&orte_ns_proxy.mutex);
return rc;
}
tagitem->tag = *tag; tagitem->tag = *tag;
if (NULL != name) { (orte_ns_proxy.num_tags)++;
if (NULL != name) { /* provided - can look it up later */
tagitem->name = strdup(name); tagitem->name = strdup(name);
} else { } else {
tagitem->name = NULL; tagitem->name = NULL;
} }
opal_list_append(&orte_ns_proxy_taglist, &tagitem->item); OPAL_THREAD_UNLOCK(&orte_ns_proxy.mutex);
OPAL_THREAD_UNLOCK(&orte_ns_proxy_mutex);
/* all done */ /* all done */
return ORTE_SUCCESS; return ORTE_SUCCESS;
@ -428,8 +538,8 @@ int orte_ns_proxy_define_data_type(const char *name,
orte_buffer_t* cmd; orte_buffer_t* cmd;
orte_buffer_t* answer; orte_buffer_t* answer;
orte_ns_cmd_flag_t command; orte_ns_cmd_flag_t command;
orte_ns_proxy_dti_t *dti; orte_ns_proxy_dti_t **dti, *dtip;
size_t count; size_t count, i, j;
int rc=ORTE_SUCCESS; int rc=ORTE_SUCCESS;
if (NULL == name || 0 < *type) { if (NULL == name || 0 < *type) {
@ -437,20 +547,25 @@ int orte_ns_proxy_define_data_type(const char *name,
return ORTE_ERR_BAD_PARAM; return ORTE_ERR_BAD_PARAM;
} }
OPAL_THREAD_LOCK(&orte_ns_proxy_mutex); OPAL_THREAD_LOCK(&orte_ns_proxy.mutex);
/* first, check to see if name is already on local list /* first, check to see if name is already on local list
* if so, return id, ensure registered with dps * if so, return id, ensure registered with dps
*/ */
for (dti = (orte_ns_proxy_dti_t*)opal_list_get_first(&orte_ns_proxy_dtlist); dti = (orte_ns_proxy_dti_t**)orte_ns_proxy.dts->addr;
dti != (orte_ns_proxy_dti_t*)opal_list_get_end(&orte_ns_proxy_dtlist); for (i=0, j=0; j < orte_ns_proxy.num_dts &&
dti = (orte_ns_proxy_dti_t*)opal_list_get_next(dti)) { i < orte_ns_proxy.dts->size; i++) {
if (0 == strcmp(name, dti->name)) { /* found name on list */ if (NULL != dti[i]) {
*type = dti->id; j++;
OPAL_THREAD_UNLOCK(&orte_ns_proxy_mutex); if (dti[i]->name != NULL &&
return ORTE_SUCCESS; 0 == strcmp(name, dti[i]->name)) { /* found name on list */
*type = dti[i]->id;
OPAL_THREAD_UNLOCK(&orte_ns_proxy.mutex);
return ORTE_SUCCESS;
}
} }
} }
/* okay, not on local list - so go get one from tag server */ /* okay, not on local list - so go get one from tag server */
command = ORTE_NS_DEFINE_DATA_TYPE_CMD; command = ORTE_NS_DEFINE_DATA_TYPE_CMD;
@ -458,42 +573,42 @@ int orte_ns_proxy_define_data_type(const char *name,
if ((cmd = OBJ_NEW(orte_buffer_t)) == NULL) { if ((cmd = OBJ_NEW(orte_buffer_t)) == NULL) {
ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE); ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE);
OPAL_THREAD_UNLOCK(&orte_ns_proxy_mutex); OPAL_THREAD_UNLOCK(&orte_ns_proxy.mutex);
return ORTE_ERR_OUT_OF_RESOURCE; return ORTE_ERR_OUT_OF_RESOURCE;
} }
if (ORTE_SUCCESS != (rc = orte_dps.pack(cmd, (void*)&command, 1, ORTE_NS_CMD))) { if (ORTE_SUCCESS != (rc = orte_dps.pack(cmd, (void*)&command, 1, ORTE_NS_CMD))) {
ORTE_ERROR_LOG(rc); ORTE_ERROR_LOG(rc);
OBJ_RELEASE(cmd); OBJ_RELEASE(cmd);
OPAL_THREAD_UNLOCK(&orte_ns_proxy_mutex); OPAL_THREAD_UNLOCK(&orte_ns_proxy.mutex);
return rc; return rc;
} }
if (ORTE_SUCCESS != (rc = orte_dps.pack(cmd, &name, 1, ORTE_STRING))) { if (ORTE_SUCCESS != (rc = orte_dps.pack(cmd, &name, 1, ORTE_STRING))) {
ORTE_ERROR_LOG(rc); ORTE_ERROR_LOG(rc);
OBJ_RELEASE(cmd); OBJ_RELEASE(cmd);
OPAL_THREAD_UNLOCK(&orte_ns_proxy_mutex); OPAL_THREAD_UNLOCK(&orte_ns_proxy.mutex);
return rc; return rc;
} }
if (0 > orte_rml.send_buffer(orte_ns_my_replica, cmd, ORTE_RML_TAG_NS, 0)) { if (0 > orte_rml.send_buffer(orte_ns_proxy.my_replica, cmd, ORTE_RML_TAG_NS, 0)) {
ORTE_ERROR_LOG(ORTE_ERR_COMM_FAILURE); ORTE_ERROR_LOG(ORTE_ERR_COMM_FAILURE);
OBJ_RELEASE(cmd); OBJ_RELEASE(cmd);
OPAL_THREAD_UNLOCK(&orte_ns_proxy_mutex); OPAL_THREAD_UNLOCK(&orte_ns_proxy.mutex);
return ORTE_ERR_COMM_FAILURE; return ORTE_ERR_COMM_FAILURE;
} }
OBJ_RELEASE(cmd); OBJ_RELEASE(cmd);
if ((answer = OBJ_NEW(orte_buffer_t)) == NULL) { if ((answer = OBJ_NEW(orte_buffer_t)) == NULL) {
ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE); ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE);
OPAL_THREAD_UNLOCK(&orte_ns_proxy_mutex); OPAL_THREAD_UNLOCK(&orte_ns_proxy.mutex);
return ORTE_ERR_OUT_OF_RESOURCE; return ORTE_ERR_OUT_OF_RESOURCE;
} }
if (0 > orte_rml.recv_buffer(orte_ns_my_replica, answer, ORTE_RML_TAG_NS)) { if (0 > orte_rml.recv_buffer(orte_ns_proxy.my_replica, answer, ORTE_RML_TAG_NS)) {
ORTE_ERROR_LOG(ORTE_ERR_COMM_FAILURE); ORTE_ERROR_LOG(ORTE_ERR_COMM_FAILURE);
OBJ_RELEASE(answer); OBJ_RELEASE(answer);
OPAL_THREAD_UNLOCK(&orte_ns_proxy_mutex); OPAL_THREAD_UNLOCK(&orte_ns_proxy.mutex);
return ORTE_ERR_COMM_FAILURE; return ORTE_ERR_COMM_FAILURE;
} }
@ -501,14 +616,14 @@ int orte_ns_proxy_define_data_type(const char *name,
if (ORTE_SUCCESS != (rc = orte_dps.unpack(answer, &command, &count, ORTE_NS_CMD))) { if (ORTE_SUCCESS != (rc = orte_dps.unpack(answer, &command, &count, ORTE_NS_CMD))) {
ORTE_ERROR_LOG(rc); ORTE_ERROR_LOG(rc);
OBJ_RELEASE(answer); OBJ_RELEASE(answer);
OPAL_THREAD_UNLOCK(&orte_ns_proxy_mutex); OPAL_THREAD_UNLOCK(&orte_ns_proxy.mutex);
return rc; return rc;
} }
if (ORTE_NS_ASSIGN_OOB_TAG_CMD != command) { if (ORTE_NS_ASSIGN_OOB_TAG_CMD != command) {
ORTE_ERROR_LOG(ORTE_ERR_COMM_FAILURE); ORTE_ERROR_LOG(ORTE_ERR_COMM_FAILURE);
OBJ_RELEASE(answer); OBJ_RELEASE(answer);
OPAL_THREAD_UNLOCK(&orte_ns_proxy_mutex); OPAL_THREAD_UNLOCK(&orte_ns_proxy.mutex);
return ORTE_ERR_COMM_FAILURE; return ORTE_ERR_COMM_FAILURE;
} }
@ -516,23 +631,29 @@ int orte_ns_proxy_define_data_type(const char *name,
if (ORTE_SUCCESS != (rc = orte_dps.unpack(answer, type, &count, ORTE_DATA_TYPE))) { if (ORTE_SUCCESS != (rc = orte_dps.unpack(answer, type, &count, ORTE_DATA_TYPE))) {
ORTE_ERROR_LOG(ORTE_ERR_UNPACK_FAILURE); ORTE_ERROR_LOG(ORTE_ERR_UNPACK_FAILURE);
OBJ_RELEASE(answer); OBJ_RELEASE(answer);
OPAL_THREAD_UNLOCK(&orte_ns_proxy_mutex); OPAL_THREAD_UNLOCK(&orte_ns_proxy.mutex);
return ORTE_ERR_UNPACK_FAILURE; return ORTE_ERR_UNPACK_FAILURE;
} }
OBJ_RELEASE(answer); OBJ_RELEASE(answer);
/* add the new id to the local list so we don't have to get it again */ /* add the new id to the local list so we don't have to get it again */
dti = OBJ_NEW(orte_ns_proxy_dti_t); dtip = OBJ_NEW(orte_ns_proxy_dti_t);
if (NULL == dti) { /* out of memory */ if (NULL == dtip) { /* out of memory */
ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE); ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE);
*type = ORTE_DPS_ID_MAX; OPAL_THREAD_UNLOCK(&orte_ns_proxy.mutex);
OPAL_THREAD_UNLOCK(&orte_ns_proxy_mutex);
return ORTE_ERR_OUT_OF_RESOURCE; return ORTE_ERR_OUT_OF_RESOURCE;
} }
dti->id = *type; dtip->name = strdup(name);
dti->name = strdup(name); if (ORTE_SUCCESS != (rc = orte_pointer_array_add(&i,
opal_list_append(&orte_ns_proxy_taglist, &dti->item); orte_ns_proxy.dts, dtip))) {
OPAL_THREAD_UNLOCK(&orte_ns_proxy_mutex); ORTE_ERROR_LOG(rc);
OPAL_THREAD_UNLOCK(&orte_ns_proxy.mutex);
return rc;
}
dtip->id = *type;
(orte_ns_proxy.num_dts)++;
OPAL_THREAD_UNLOCK(&orte_ns_proxy.mutex);
/* all done */ /* all done */
return rc; return rc;
@ -565,7 +686,7 @@ int orte_ns_proxy_create_my_name(void)
return rc; return rc;
} }
if (0 > orte_rml.send_buffer(orte_ns_my_replica, cmd, MCA_OOB_TAG_NS, 0)) { if (0 > orte_rml.send_buffer(orte_ns_proxy.my_replica, cmd, MCA_OOB_TAG_NS, 0)) {
ORTE_ERROR_LOG(ORTE_ERR_COMM_FAILURE); ORTE_ERROR_LOG(ORTE_ERR_COMM_FAILURE);
OBJ_RELEASE(cmd); OBJ_RELEASE(cmd);
return ORTE_ERR_COMM_FAILURE; return ORTE_ERR_COMM_FAILURE;
@ -575,3 +696,235 @@ int orte_ns_proxy_create_my_name(void)
return ORTE_SUCCESS; return ORTE_SUCCESS;
} }
/*
* DIAGNOSTIC functions
*/
int orte_ns_proxy_dump_cells(int output_id)
{
orte_buffer_t cmd;
orte_buffer_t answer;
orte_ns_cmd_flag_t command;
size_t i, j;
orte_ns_proxy_cell_info_t **ptr;
int rc;
command = ORTE_NS_DUMP_CELLS_CMD;
OPAL_THREAD_LOCK(&orte_ns_proxy.mutex);
/* dump name service replica cell tracker */
OBJ_CONSTRUCT(&cmd, orte_buffer_t);
if (ORTE_SUCCESS != (rc = orte_dps.pack(&cmd, &command, 1, ORTE_NS_CMD))) {
ORTE_ERROR_LOG(rc);
OPAL_THREAD_UNLOCK(&orte_ns_proxy.mutex);
OBJ_DESTRUCT(&cmd);
return rc;
}
if (0 > orte_rml.send_buffer(orte_ns_proxy.my_replica, &cmd, MCA_OOB_TAG_NS, 0)) {
ORTE_ERROR_LOG(ORTE_ERR_COMM_FAILURE);
OBJ_DESTRUCT(&cmd);
OPAL_THREAD_UNLOCK(&orte_ns_proxy.mutex);
return ORTE_ERR_COMM_FAILURE;
}
OBJ_DESTRUCT(&cmd);
OBJ_CONSTRUCT(&answer, orte_buffer_t);
if (0 > orte_rml.recv_buffer(orte_ns_proxy.my_replica, &answer, ORTE_RML_TAG_NS)) {
ORTE_ERROR_LOG(ORTE_ERR_COMM_FAILURE);
OBJ_DESTRUCT(&answer);
OPAL_THREAD_UNLOCK(&orte_ns_proxy.mutex);
return ORTE_ERR_COMM_FAILURE;
}
if (ORTE_SUCCESS != (rc = orte_ns_base_print_dump(&answer, output_id))) {
ORTE_ERROR_LOG(rc);
OBJ_DESTRUCT(&answer);
return rc;
}
/* dump local cell tracker */
opal_output(output_id, "\n\n[%lu,%lu,%lu] Dump of Local Cell Tracker\n",
ORTE_NAME_ARGS(orte_process_info.my_name));
ptr = (orte_ns_proxy_cell_info_t**)(orte_ns_proxy.cells)->addr;
for (i=0, j=0; j < orte_ns_proxy.num_cells &&
i < (orte_ns_proxy.cells)->size; i++) {
if (NULL != ptr[i]) {
j++;
opal_output(output_id, "Num: %lu\tCell: %lu\n",
(unsigned long)j, (unsigned long)ptr[i]->cellid);
}
}
return ORTE_SUCCESS;
}
int orte_ns_proxy_dump_jobs(int output_id)
{
orte_buffer_t cmd;
orte_buffer_t answer;
orte_ns_cmd_flag_t command;
int rc;
command = ORTE_NS_DUMP_JOBIDS_CMD;
OPAL_THREAD_LOCK(&orte_ns_proxy.mutex);
/* dump name service replica jobid tracker */
OBJ_CONSTRUCT(&cmd, orte_buffer_t);
if (ORTE_SUCCESS != (rc = orte_dps.pack(&cmd, &command, 1, ORTE_NS_CMD))) {
ORTE_ERROR_LOG(rc);
OPAL_THREAD_UNLOCK(&orte_ns_proxy.mutex);
OBJ_DESTRUCT(&cmd);
return rc;
}
if (0 > orte_rml.send_buffer(orte_ns_proxy.my_replica, &cmd, MCA_OOB_TAG_NS, 0)) {
ORTE_ERROR_LOG(ORTE_ERR_COMM_FAILURE);
OBJ_DESTRUCT(&cmd);
OPAL_THREAD_UNLOCK(&orte_ns_proxy.mutex);
return ORTE_ERR_COMM_FAILURE;
}
OBJ_DESTRUCT(&cmd);
OBJ_CONSTRUCT(&answer, orte_buffer_t);
if (0 > orte_rml.recv_buffer(orte_ns_proxy.my_replica, &answer, ORTE_RML_TAG_NS)) {
ORTE_ERROR_LOG(ORTE_ERR_COMM_FAILURE);
OBJ_DESTRUCT(&answer);
OPAL_THREAD_UNLOCK(&orte_ns_proxy.mutex);
return ORTE_ERR_COMM_FAILURE;
}
if (ORTE_SUCCESS != (rc = orte_ns_base_print_dump(&answer, output_id))) {
ORTE_ERROR_LOG(rc);
OBJ_DESTRUCT(&answer);
return rc;
}
return ORTE_SUCCESS;
}
int orte_ns_proxy_dump_tags(int output_id)
{
orte_buffer_t cmd;
orte_buffer_t answer;
orte_ns_cmd_flag_t command;
size_t i, j;
orte_ns_proxy_tagitem_t **ptr;
int rc;
command = ORTE_NS_DUMP_TAGS_CMD;
OPAL_THREAD_LOCK(&orte_ns_proxy.mutex);
/* dump name service replica tag tracker */
OBJ_CONSTRUCT(&cmd, orte_buffer_t);
if (ORTE_SUCCESS != (rc = orte_dps.pack(&cmd, &command, 1, ORTE_NS_CMD))) {
ORTE_ERROR_LOG(rc);
OPAL_THREAD_UNLOCK(&orte_ns_proxy.mutex);
OBJ_DESTRUCT(&cmd);
return rc;
}
if (0 > orte_rml.send_buffer(orte_ns_proxy.my_replica, &cmd, MCA_OOB_TAG_NS, 0)) {
ORTE_ERROR_LOG(ORTE_ERR_COMM_FAILURE);
OBJ_DESTRUCT(&cmd);
OPAL_THREAD_UNLOCK(&orte_ns_proxy.mutex);
return ORTE_ERR_COMM_FAILURE;
}
OBJ_DESTRUCT(&cmd);
OBJ_CONSTRUCT(&answer, orte_buffer_t);
if (0 > orte_rml.recv_buffer(orte_ns_proxy.my_replica, &answer, ORTE_RML_TAG_NS)) {
ORTE_ERROR_LOG(ORTE_ERR_COMM_FAILURE);
OBJ_DESTRUCT(&answer);
OPAL_THREAD_UNLOCK(&orte_ns_proxy.mutex);
return ORTE_ERR_COMM_FAILURE;
}
if (ORTE_SUCCESS != (rc = orte_ns_base_print_dump(&answer, output_id))) {
ORTE_ERROR_LOG(rc);
OBJ_DESTRUCT(&answer);
return rc;
}
/* dump local tag tracker */
opal_output(output_id, "\n\n[%lu,%lu,%lu] Dump of Local Tag Tracker\n",
ORTE_NAME_ARGS(orte_process_info.my_name));
ptr = (orte_ns_proxy_tagitem_t**)(orte_ns_proxy.tags)->addr;
for (i=0, j=0; j < orte_ns_proxy.num_tags &&
i < (orte_ns_proxy.tags)->size; i++) {
if (NULL != ptr[i]) {
j++;
opal_output(output_id, "Num: %lu\tTag: %lu\tTag name: %s\n",
(unsigned long)j, (unsigned long)ptr[i]->tag, ptr[i]->name);
}
}
return ORTE_SUCCESS;
}
int orte_ns_proxy_dump_datatypes(int output_id)
{
orte_buffer_t cmd;
orte_buffer_t answer;
orte_ns_cmd_flag_t command;
size_t i, j;
orte_ns_proxy_dti_t **ptr;
int rc;
command = ORTE_NS_DUMP_DATATYPES_CMD;
OPAL_THREAD_LOCK(&orte_ns_proxy.mutex);
/* dump name service replica datatype tracker */
OBJ_CONSTRUCT(&cmd, orte_buffer_t);
if (ORTE_SUCCESS != (rc = orte_dps.pack(&cmd, &command, 1, ORTE_NS_CMD))) {
ORTE_ERROR_LOG(rc);
OPAL_THREAD_UNLOCK(&orte_ns_proxy.mutex);
OBJ_DESTRUCT(&cmd);
return rc;
}
if (0 > orte_rml.send_buffer(orte_ns_proxy.my_replica, &cmd, MCA_OOB_TAG_NS, 0)) {
ORTE_ERROR_LOG(ORTE_ERR_COMM_FAILURE);
OBJ_DESTRUCT(&cmd);
OPAL_THREAD_UNLOCK(&orte_ns_proxy.mutex);
return ORTE_ERR_COMM_FAILURE;
}
OBJ_DESTRUCT(&cmd);
OBJ_CONSTRUCT(&answer, orte_buffer_t);
if (0 > orte_rml.recv_buffer(orte_ns_proxy.my_replica, &answer, ORTE_RML_TAG_NS)) {
ORTE_ERROR_LOG(ORTE_ERR_COMM_FAILURE);
OBJ_DESTRUCT(&answer);
OPAL_THREAD_UNLOCK(&orte_ns_proxy.mutex);
return ORTE_ERR_COMM_FAILURE;
}
if (ORTE_SUCCESS != (rc = orte_ns_base_print_dump(&answer, output_id))) {
ORTE_ERROR_LOG(rc);
OBJ_DESTRUCT(&answer);
return rc;
}
/* dump local datatype tracker */
opal_output(output_id, "\n\n[%lu,%lu,%lu] Dump of Local Datatype Tracker\n",
ORTE_NAME_ARGS(orte_process_info.my_name));
ptr = (orte_ns_proxy_dti_t**)(orte_ns_proxy.dts)->addr;
for (i=0, j=0; j < orte_ns_proxy.num_dts &&
i < (orte_ns_proxy.dts)->size; i++) {
if (NULL != ptr[i]) {
j++;
opal_output(output_id, "Num: %lu\tDatatype id: %lu\tDatatype name: %s\n",
(unsigned long)j, (unsigned long)ptr[i]->id, ptr[i]->name);
}
}
return ORTE_SUCCESS;
}

Просмотреть файл

@ -32,7 +32,7 @@ extern "C" {
#endif #endif
struct orte_ns_proxy_cell_info_t { struct orte_ns_proxy_cell_info_t {
opal_list_item_t item; /**< Allows this item to be placed on a list */ opal_object_t super;
orte_cellid_t cellid; orte_cellid_t cellid;
char *site; char *site;
char *resource; char *resource;
@ -42,7 +42,7 @@ typedef struct orte_ns_proxy_cell_info_t orte_ns_proxy_cell_info_t;
OBJ_CLASS_DECLARATION(orte_ns_proxy_cell_info_t); OBJ_CLASS_DECLARATION(orte_ns_proxy_cell_info_t);
struct orte_ns_proxy_tagitem_t { struct orte_ns_proxy_tagitem_t {
opal_list_item_t item; /**< Allows this item to be placed on a list */ opal_object_t super;
orte_rml_tag_t tag; /**< OOB tag */ orte_rml_tag_t tag; /**< OOB tag */
char *name; /**< Name associated with tag */ char *name; /**< Name associated with tag */
}; };
@ -51,7 +51,7 @@ typedef struct orte_ns_proxy_tagitem_t orte_ns_proxy_tagitem_t;
OBJ_CLASS_DECLARATION(orte_ns_proxy_tagitem_t); OBJ_CLASS_DECLARATION(orte_ns_proxy_tagitem_t);
struct orte_ns_proxy_dti_t { struct orte_ns_proxy_dti_t {
opal_list_item_t item; /**< Allows this item to be placed on a list */ opal_object_t super;
orte_data_type_t id; /**< data type id */ orte_data_type_t id; /**< data type id */
char *name; /**< Name associated with data type */ char *name; /**< Name associated with data type */
}; };
@ -77,13 +77,20 @@ int orte_ns_proxy_finalize(void);
/* /*
* globals used within proxy component * globals used within proxy component
*/ */
typedef struct {
size_t max_size, block_size;
orte_process_name_t *my_replica;
int debug;
orte_cellid_t num_cells;
orte_pointer_array_t *cells;
orte_pointer_array_t *tags;
orte_rml_tag_t num_tags;
orte_pointer_array_t *dts;
orte_data_type_t num_dts;
opal_mutex_t mutex;
} orte_ns_proxy_globals_t;
extern orte_process_name_t *orte_ns_my_replica; extern orte_ns_proxy_globals_t orte_ns_proxy;
extern int orte_ns_proxy_debug;
extern opal_list_t orte_ns_proxy_cell_info_list;
extern opal_list_t orte_ns_proxy_taglist;
extern opal_list_t orte_ns_proxy_dtlist;
extern opal_mutex_t orte_ns_proxy_mutex;
/* /*
* proxy function prototypes * proxy function prototypes
@ -97,6 +104,9 @@ int orte_ns_proxy_create_jobid(orte_jobid_t *jobid);
int orte_ns_proxy_reserve_range(orte_jobid_t job, orte_vpid_t range, int orte_ns_proxy_reserve_range(orte_jobid_t job, orte_vpid_t range,
orte_vpid_t *startvpid); orte_vpid_t *startvpid);
int orte_ns_proxy_get_job_peers(orte_process_name_t **procs,
size_t *num_procs, orte_jobid_t job);
int orte_ns_proxy_assign_rml_tag(orte_rml_tag_t *tag, char *name); int orte_ns_proxy_assign_rml_tag(orte_rml_tag_t *tag, char *name);
int orte_ns_proxy_define_data_type(const char *name, int orte_ns_proxy_define_data_type(const char *name,
@ -104,6 +114,19 @@ int orte_ns_proxy_define_data_type(const char *name,
int orte_ns_proxy_create_my_name(void); int orte_ns_proxy_create_my_name(void);
/*
* Diagnostic functions
*/
int orte_ns_proxy_dump_cells(int output_id);
int orte_ns_proxy_dump_jobs(int output_id);
int orte_ns_proxy_dump_tags(int output_id);
int orte_ns_proxy_dump_datatypes(int output_id);
#if defined(c_plusplus) || defined(__cplusplus) #if defined(c_plusplus) || defined(__cplusplus)
} }
#endif #endif

Просмотреть файл

@ -64,36 +64,49 @@ OMPI_COMP_EXPORT mca_ns_base_component_t mca_ns_proxy_component = {
/* /*
* setup the function pointers for the module * setup the function pointers for the module
*/ */
static mca_ns_base_module_t orte_ns_proxy = { static mca_ns_base_module_t orte_ns_proxy_module = {
/* init */
orte_ns_proxy_module_init, orte_ns_proxy_module_init,
/* cell functions */
orte_ns_proxy_create_cellid, orte_ns_proxy_create_cellid,
orte_ns_base_get_cellid,
orte_ns_proxy_get_cell_info, orte_ns_proxy_get_cell_info,
orte_ns_base_assign_cellid_to_process, orte_ns_base_assign_cellid_to_process,
orte_ns_base_get_cellid_string,
orte_ns_base_convert_cellid_to_string,
orte_ns_base_convert_string_to_cellid,
/* jobid functions */
orte_ns_proxy_create_jobid, orte_ns_proxy_create_jobid,
orte_ns_base_get_jobid,
orte_ns_base_get_jobid_string,
orte_ns_base_convert_jobid_to_string,
orte_ns_base_convert_string_to_jobid,
/* vpid functions */
orte_ns_proxy_reserve_range,
orte_ns_base_get_vpid,
orte_ns_base_get_vpid_string,
orte_ns_base_convert_vpid_to_string,
orte_ns_base_convert_string_to_vpid,
/* name functions */
orte_ns_base_create_process_name, orte_ns_base_create_process_name,
orte_ns_proxy_create_my_name, orte_ns_proxy_create_my_name,
orte_ns_base_copy_process_name, orte_ns_base_copy_process_name,
orte_ns_base_convert_string_to_process_name, orte_ns_base_convert_string_to_process_name,
orte_ns_proxy_reserve_range,
orte_ns_base_free_name, orte_ns_base_free_name,
orte_ns_base_get_proc_name_string, orte_ns_base_get_proc_name_string,
orte_ns_base_get_vpid_string,
orte_ns_base_convert_vpid_to_string,
orte_ns_base_convert_string_to_vpid,
orte_ns_base_get_jobid_string,
orte_ns_base_convert_jobid_to_string,
orte_ns_base_convert_string_to_jobid,
orte_ns_base_get_cellid_string,
orte_ns_base_convert_cellid_to_string,
orte_ns_base_convert_string_to_cellid,
orte_ns_base_get_vpid,
orte_ns_base_get_jobid,
orte_ns_base_get_cellid,
orte_ns_base_compare, orte_ns_base_compare,
orte_ns_base_derive_vpid, /* peer functions */
orte_ns_base_get_peers,
orte_ns_proxy_get_job_peers,
/* tag server functions */
orte_ns_proxy_assign_rml_tag, orte_ns_proxy_assign_rml_tag,
/* data type functions */
orte_ns_proxy_define_data_type, orte_ns_proxy_define_data_type,
orte_ns_base_get_peers /* diagnostic functions */
orte_ns_proxy_dump_cells,
orte_ns_proxy_dump_jobs,
orte_ns_proxy_dump_tags,
orte_ns_proxy_dump_datatypes
}; };
/* /*
@ -122,7 +135,7 @@ static void orte_ns_proxy_cell_info_destructor(orte_ns_proxy_cell_info_t* ptr)
/* define instance of opal_class_t */ /* define instance of opal_class_t */
OBJ_CLASS_INSTANCE( OBJ_CLASS_INSTANCE(
orte_ns_proxy_cell_info_t, /* type name */ orte_ns_proxy_cell_info_t, /* type name */
opal_list_item_t, /* parent "class" name */ opal_object_t, /* parent "class" name */
orte_ns_proxy_cell_info_construct, /* constructor */ orte_ns_proxy_cell_info_construct, /* constructor */
orte_ns_proxy_cell_info_destructor); /* destructor */ orte_ns_proxy_cell_info_destructor); /* destructor */
@ -144,7 +157,7 @@ static void orte_ns_proxy_tagitem_destructor(orte_ns_proxy_tagitem_t* tagitem)
/* define instance of opal_class_t */ /* define instance of opal_class_t */
OBJ_CLASS_INSTANCE( OBJ_CLASS_INSTANCE(
orte_ns_proxy_tagitem_t, /* type name */ orte_ns_proxy_tagitem_t, /* type name */
opal_list_item_t, /* parent "class" name */ opal_object_t, /* parent "class" name */
orte_ns_proxy_tagitem_construct, /* constructor */ orte_ns_proxy_tagitem_construct, /* constructor */
orte_ns_proxy_tagitem_destructor); /* destructor */ orte_ns_proxy_tagitem_destructor); /* destructor */
@ -166,7 +179,7 @@ static void orte_ns_proxy_dti_destructor(orte_ns_proxy_dti_t* dti)
/* define instance of opal_class_t */ /* define instance of opal_class_t */
OBJ_CLASS_INSTANCE( OBJ_CLASS_INSTANCE(
orte_ns_proxy_dti_t, /* type name */ orte_ns_proxy_dti_t, /* type name */
opal_list_item_t, /* parent "class" name */ opal_object_t, /* parent "class" name */
orte_ns_proxy_dti_construct, /* constructor */ orte_ns_proxy_dti_construct, /* constructor */
orte_ns_proxy_dti_destructor); /* destructor */ orte_ns_proxy_dti_destructor); /* destructor */
@ -174,22 +187,28 @@ OBJ_CLASS_INSTANCE(
* globals needed within proxy component * globals needed within proxy component
*/ */
orte_process_name_t* orte_ns_my_replica=NULL; orte_ns_proxy_globals_t orte_ns_proxy;
int orte_ns_proxy_debug=0;
opal_list_t orte_ns_proxy_cell_info_list;
opal_list_t orte_ns_proxy_taglist;
opal_list_t orte_ns_proxy_dtlist;
opal_mutex_t orte_ns_proxy_mutex;
/* /*
* Open the proxy component and obtain the name of my replica. * Open the proxy component and obtain the name of my proxy.
*/ */
int orte_ns_proxy_open(void) int orte_ns_proxy_open(void)
{ {
int id; int id, param;
id = mca_base_param_register_int("ns", "proxy", "debug", NULL, 0); id = mca_base_param_register_int("ns", "proxy", "debug", NULL, 0);
mca_base_param_lookup_int(id, &orte_ns_proxy_debug); mca_base_param_lookup_int(id, &orte_ns_proxy.debug);
id = mca_base_param_register_int("ns", "proxy", "maxsize", NULL,
ORTE_NS_ARRAY_MAX_SIZE);
mca_base_param_lookup_int(id, &param);
orte_ns_proxy.max_size = (size_t)param;
id = mca_base_param_register_int("ns", "proxy", "blocksize", NULL,
ORTE_NS_ARRAY_BLOCK_SIZE);
mca_base_param_lookup_int(id, &param);
orte_ns_proxy.block_size = (size_t)param;
return ORTE_SUCCESS; return ORTE_SUCCESS;
} }
@ -205,9 +224,9 @@ int orte_ns_proxy_close(void)
mca_ns_base_module_t* orte_ns_proxy_init(int *priority) mca_ns_base_module_t* orte_ns_proxy_init(int *priority)
{ {
orte_process_name_t name; orte_process_name_t name;
int ret; int ret, rc;
/* If we are NOT to host a replica, then we want to be selected, so do all /* If we are NOT to host a proxy, then we want to be selected, so do all
the setup and return the module */ the setup and return the module */
/* opal_output(mca_ns_base_output, "ns_proxy: entered init\n"); */ /* opal_output(mca_ns_base_output, "ns_proxy: entered init\n"); */
if (NULL != orte_process_info.ns_replica_uri) { if (NULL != orte_process_info.ns_replica_uri) {
@ -219,7 +238,7 @@ mca_ns_base_module_t* orte_ns_proxy_init(int *priority)
*priority = 10; *priority = 10;
/* define the replica for us to use */ /* define the proxy for us to use */
if(ORTE_SUCCESS != (ret = orte_rml.parse_uris(orte_process_info.ns_replica_uri, &name, NULL))) { if(ORTE_SUCCESS != (ret = orte_rml.parse_uris(orte_process_info.ns_replica_uri, &name, NULL))) {
ORTE_ERROR_LOG(ret); ORTE_ERROR_LOG(ret);
return NULL; return NULL;
@ -228,24 +247,51 @@ mca_ns_base_module_t* orte_ns_proxy_init(int *priority)
ORTE_ERROR_LOG(ret); ORTE_ERROR_LOG(ret);
return NULL; return NULL;
} }
if (ORTE_SUCCESS != orte_ns_base_copy_process_name(&orte_ns_my_replica, if (ORTE_SUCCESS != orte_ns_base_copy_process_name(&orte_ns_proxy.my_replica,
orte_process_info.ns_replica)) { /* can't operate */ orte_process_info.ns_replica)) { /* can't operate */
return NULL; return NULL;
} }
/* initialize the cell info list */ /* initialize the cell info tracker */
OBJ_CONSTRUCT(&orte_ns_proxy_cell_info_list, opal_list_t); if (ORTE_SUCCESS != (rc = orte_pointer_array_init(&(orte_ns_proxy.cells),
orte_ns_proxy.block_size,
orte_ns_proxy.max_size,
orte_ns_proxy.block_size))) {
ORTE_ERROR_LOG(rc);
return NULL;
}
orte_ns_proxy.num_cells = 0;
/* initialize the taglist */ /* initialize the taglist */
OBJ_CONSTRUCT(&orte_ns_proxy_taglist, opal_list_t);
/* initialize the dtlist */ if (ORTE_SUCCESS != (rc = orte_pointer_array_init(&(orte_ns_proxy.tags),
OBJ_CONSTRUCT(&orte_ns_proxy_dtlist, opal_list_t); orte_ns_proxy.block_size,
orte_ns_proxy.max_size,
orte_ns_proxy.block_size))) {
ORTE_ERROR_LOG(rc);
return NULL;
}
orte_ns_proxy.num_tags = 0;
/* initialize the dtlist */
if (ORTE_SUCCESS != (rc = orte_pointer_array_init(&(orte_ns_proxy.dts),
orte_ns_proxy.block_size,
orte_ns_proxy.max_size,
orte_ns_proxy.block_size))) {
ORTE_ERROR_LOG(rc);
return NULL;
}
orte_ns_proxy.num_dts = 0;
/* setup the thread lock */
OBJ_CONSTRUCT(&orte_ns_proxy.mutex, opal_mutex_t);
/* Return the module */ /* Return the module */
initialized = true; initialized = true;
return &orte_ns_proxy; return &orte_ns_proxy_module;
} else { } else {
return NULL; return NULL;
@ -267,24 +313,34 @@ int orte_ns_proxy_module_init(void)
*/ */
int orte_ns_proxy_finalize(void) int orte_ns_proxy_finalize(void)
{ {
orte_ns_proxy_tagitem_t *tagitem; orte_ns_proxy_cell_info_t **cptr;
orte_ns_proxy_cell_info_t *cptr; orte_ns_proxy_tagitem_t **tag;
orte_ns_proxy_dti_t **dti;
size_t i;
if (orte_ns_proxy_debug) { /* free all tracking storage, but only if this component was initialized */
opal_output(0, "finalizing ns proxy");
}
/* free the storage, but only if this component was initialized */
if (initialized) { if (initialized) {
while (NULL != (cptr = (orte_ns_proxy_cell_info_t*)opal_list_remove_first(&orte_ns_proxy_cell_info_list))) { cptr = (orte_ns_proxy_cell_info_t**)(orte_ns_proxy.cells)->addr;
OBJ_RELEASE(cptr); for (i=0; i < (orte_ns_proxy.cells)->size; i++) {
if (NULL != cptr[i]) {
OBJ_RELEASE(cptr[i]);
}
} }
OBJ_DESTRUCT(&orte_ns_proxy_cell_info_list); OBJ_RELEASE(orte_ns_proxy.cells);
while (NULL != (tagitem = (orte_ns_proxy_tagitem_t*)opal_list_remove_first(&orte_ns_proxy_taglist))) {
OBJ_RELEASE(tagitem); tag = (orte_ns_proxy_tagitem_t**)(orte_ns_proxy.tags)->addr;
for (i=0; i < (orte_ns_proxy.tags)->size; i++) {
if (NULL != tag[i]) OBJ_RELEASE(tag[i]);
} }
OBJ_DESTRUCT(&orte_ns_proxy_taglist); OBJ_RELEASE(orte_ns_proxy.tags);
dti = (orte_ns_proxy_dti_t**)(orte_ns_proxy.dts)->addr;
for (i=0; i < (orte_ns_proxy.dts)->size; i++) {
if (NULL != dti[i]) OBJ_RELEASE(dti[i]);
}
OBJ_RELEASE(orte_ns_proxy.dts);
initialized = false; initialized = false;
} }

Просмотреть файл

@ -32,6 +32,7 @@
/** /**
* globals * globals
*/ */
#define NS_REPLICA_MAX_STRING_SIZE 256
/* /*
* functions * functions
@ -40,225 +41,593 @@
int orte_ns_replica_create_cellid(orte_cellid_t *cellid, char *site, char *resource) int orte_ns_replica_create_cellid(orte_cellid_t *cellid, char *site, char *resource)
{ {
orte_ns_replica_cell_tracker_t *new_cell; orte_ns_replica_cell_tracker_t *new_cell;
int rc;
size_t index;
OPAL_THREAD_LOCK(&orte_ns_replica_mutex); OPAL_THREAD_LOCK(&orte_ns_replica.mutex);
if (ORTE_CELLID_MAX > orte_ns_replica_next_cellid) {
*cellid = orte_ns_replica_next_cellid;
orte_ns_replica_next_cellid++;
new_cell = OBJ_NEW(orte_ns_replica_cell_tracker_t);
if (NULL == new_cell) {
*cellid = ORTE_CELLID_MAX;
OPAL_THREAD_UNLOCK(&orte_ns_replica_mutex);
ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE);
return ORTE_ERR_OUT_OF_RESOURCE;
}
new_cell->cell = *cellid;
new_cell->site = strdup(site);
new_cell->resource = strdup(resource);
opal_list_append(&orte_ns_replica_cell_tracker, &new_cell->item);
OPAL_THREAD_UNLOCK(&orte_ns_replica_mutex);
return ORTE_SUCCESS;
}
*cellid = ORTE_CELLID_MAX; *cellid = ORTE_CELLID_MAX;
OPAL_THREAD_UNLOCK(&orte_ns_replica_mutex);
ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE); /* check if cellid is available. NOTE: need to reserve
return ORTE_ERR_OUT_OF_RESOURCE; * ORTE_CELLID_MAX as an invalid value, so can't allow
* num_cells to get there
*/
if (ORTE_CELLID_MAX-2 < orte_ns_replica.num_cells) {
ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE);
OPAL_THREAD_UNLOCK(&orte_ns_replica.mutex);
return ORTE_ERR_OUT_OF_RESOURCE;
}
new_cell = OBJ_NEW(orte_ns_replica_cell_tracker_t);
if (NULL == new_cell) {
ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE);
OPAL_THREAD_UNLOCK(&orte_ns_replica.mutex);
return ORTE_ERR_OUT_OF_RESOURCE;
}
if (ORTE_SUCCESS != (rc = orte_pointer_array_add(&index,
orte_ns_replica.cells, new_cell))) {
ORTE_ERROR_LOG(rc);
OPAL_THREAD_UNLOCK(&orte_ns_replica.mutex);
return rc;
}
new_cell->site = strdup(site);
new_cell->resource = strdup(resource);
new_cell->cell = orte_ns_replica.num_cells;
*cellid = new_cell->cell;
(orte_ns_replica.num_cells)++;
OPAL_THREAD_UNLOCK(&orte_ns_replica.mutex);
return ORTE_SUCCESS;
} }
int orte_ns_replica_get_cell_info(orte_cellid_t cellid, int orte_ns_replica_get_cell_info(orte_cellid_t cellid,
char **site, char **resource) char **site, char **resource)
{ {
opal_list_item_t *item; size_t i, j;
orte_ns_replica_cell_tracker_t *cell; orte_ns_replica_cell_tracker_t **cell;
for (item = opal_list_get_first(&orte_ns_replica_cell_tracker); OPAL_THREAD_LOCK(&orte_ns_replica.mutex);
item != opal_list_get_end(&orte_ns_replica_cell_tracker);
item = opal_list_get_next(item)) { cell = (orte_ns_replica_cell_tracker_t**)(orte_ns_replica.cells)->addr;
cell = (orte_ns_replica_cell_tracker_t*)item; for (i=0, j=0; j < orte_ns_replica.num_cells &&
if (cellid == cell->cell) { i < (orte_ns_replica.cells)->size; i++) {
*site = strdup(cell->site); if (NULL != cell[i]) {
*resource = strdup(cell->resource); j++;
return ORTE_SUCCESS; if (cellid == cell[i]->cell) {
} *site = strdup(cell[i]->site);
} *resource = strdup(cell[i]->resource);
OPAL_THREAD_UNLOCK(&orte_ns_replica.mutex);
return ORTE_SUCCESS;
}
}
}
ORTE_ERROR_LOG(ORTE_ERR_NOT_FOUND);
OPAL_THREAD_UNLOCK(&orte_ns_replica.mutex);
return ORTE_ERR_NOT_FOUND; return ORTE_ERR_NOT_FOUND;
} }
/*
* JOBID functions
*/
int orte_ns_replica_create_jobid(orte_jobid_t *jobid) int orte_ns_replica_create_jobid(orte_jobid_t *jobid)
{ {
orte_ns_replica_name_tracker_t *new_nt; orte_ns_replica_jobid_tracker_t *ptr;
int rc;
OPAL_THREAD_LOCK(&orte_ns_replica_mutex); size_t index;
if (ORTE_JOBID_MAX > orte_ns_replica_next_jobid) {
*jobid = orte_ns_replica_next_jobid;
orte_ns_replica_next_jobid++;
new_nt = OBJ_NEW(orte_ns_replica_name_tracker_t);
if (NULL == new_nt) { /* out of memory */
*jobid = ORTE_JOBID_MAX;
OPAL_THREAD_UNLOCK(&orte_ns_replica_mutex);
ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE);
return ORTE_ERR_OUT_OF_RESOURCE;
}
new_nt->job = *jobid;
new_nt->last_used_vpid = 0;
opal_list_append(&orte_ns_replica_name_tracker, &new_nt->item);
OPAL_THREAD_UNLOCK(&orte_ns_replica_mutex);
return ORTE_SUCCESS;
}
OPAL_THREAD_LOCK(&orte_ns_replica.mutex);
*jobid = ORTE_JOBID_MAX; *jobid = ORTE_JOBID_MAX;
OPAL_THREAD_UNLOCK(&orte_ns_replica_mutex); /* check if a jobid is available. NOTE: need to
ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE); * reserve ORTE_JOBID_MAX as an invalid value, so can't let
return ORTE_ERR_OUT_OF_RESOURCE; * num_jobids get there
} */
if (ORTE_JOBID_MAX-2 < orte_ns_replica.num_jobids) {
ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE);
int orte_ns_replica_reserve_range(orte_jobid_t job, orte_vpid_t range, orte_vpid_t *start) OPAL_THREAD_UNLOCK(&orte_ns_replica.mutex);
{ return ORTE_ERR_OUT_OF_RESOURCE;
orte_ns_replica_name_tracker_t *ptr; }
OPAL_THREAD_LOCK(&orte_ns_replica_mutex); ptr = OBJ_NEW(orte_ns_replica_jobid_tracker_t);
if (NULL == ptr) {
for (ptr = (orte_ns_replica_name_tracker_t*)opal_list_get_first(&orte_ns_replica_name_tracker); ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE);
ptr != (orte_ns_replica_name_tracker_t*)opal_list_get_end(&orte_ns_replica_name_tracker); OPAL_THREAD_UNLOCK(&orte_ns_replica.mutex);
ptr = (orte_ns_replica_name_tracker_t*)opal_list_get_next(ptr)) { return ORTE_ERR_OUT_OF_RESOURCE;
if (job == ptr->job) { /* found the specified job */ }
if ((ORTE_VPID_MAX-range) >= ptr->last_used_vpid) { /* requested range available */ if (ORTE_SUCCESS != (rc = orte_pointer_array_add(&index,
*start = ptr->last_used_vpid; orte_ns_replica.jobids, ptr))) {
if (0 == job && *start == 0) { /* vpid=0 reserved for job=0 */ ORTE_ERROR_LOG(rc);
*start = 1; OBJ_RELEASE(ptr);
} OPAL_THREAD_UNLOCK(&orte_ns_replica.mutex);
ptr->last_used_vpid = *start + range; return rc;
OPAL_THREAD_UNLOCK(&orte_ns_replica_mutex);
return ORTE_SUCCESS;
} else { /* range not available */
*start = ORTE_VPID_MAX;
OPAL_THREAD_UNLOCK(&orte_ns_replica_mutex);
ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE);
return ORTE_ERR_OUT_OF_RESOURCE;
}
}
} }
/* get here if the job couldn't be found */ ptr->jobid = orte_ns_replica.num_jobids;
OPAL_THREAD_UNLOCK(&orte_ns_replica_mutex); *jobid = ptr->jobid;
ORTE_ERROR_LOG(ORTE_ERR_NOT_FOUND); (orte_ns_replica.num_jobids)++;
return ORTE_ERR_NOT_FOUND;
OPAL_THREAD_UNLOCK(&orte_ns_replica.mutex);
return ORTE_SUCCESS;
} }
int orte_ns_replica_reserve_range(orte_jobid_t job, orte_vpid_t range,
orte_vpid_t *start)
{
orte_ns_replica_jobid_tracker_t **ptr;
size_t j;
orte_jobid_t k;
OPAL_THREAD_LOCK(&orte_ns_replica.mutex);
/* find the jobid */
ptr = (orte_ns_replica_jobid_tracker_t**)(orte_ns_replica.jobids)->addr;
for (j=0, k=0; k < orte_ns_replica.num_jobids &&
j < (orte_ns_replica.jobids)->size; j++) {
if (NULL != ptr[j]) {
k++;
if (job == ptr[j]->jobid) {
goto PROCESS;
}
}
}
/* didn't find the specified jobid - error */
ORTE_ERROR_LOG(ORTE_ERR_NOT_FOUND);
OPAL_THREAD_UNLOCK(&orte_ns_replica.mutex);
return ORTE_ERR_NOT_FOUND;
PROCESS:
if ((ORTE_VPID_MAX-range-(ptr[j]->next_vpid)) > 0) {
*start = ptr[j]->next_vpid;
ptr[j]->next_vpid += range;
OPAL_THREAD_UNLOCK(&orte_ns_replica.mutex);
return ORTE_SUCCESS;
}
/* get here if the range isn't available */
ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE);
OPAL_THREAD_UNLOCK(&orte_ns_replica.mutex);
return ORTE_ERR_OUT_OF_RESOURCE;
}
int orte_ns_replica_get_job_peers(orte_process_name_t **procs,
size_t *num_procs, orte_jobid_t job)
{
orte_ns_replica_jobid_tracker_t **ptr;
orte_process_name_t *nptr;
size_t j;
orte_jobid_t k;
OPAL_THREAD_LOCK(&orte_ns_replica.mutex);
/* find the jobid */
ptr = (orte_ns_replica_jobid_tracker_t**)(orte_ns_replica.jobids)->addr;
for (j=0, k=0; k < orte_ns_replica.num_jobids &&
j < (orte_ns_replica.jobids)->size; j++) {
if (NULL != ptr[j]) {
k++;
if (job == ptr[j]->jobid) {
goto PROCESS;
}
}
}
/* didn't find the specified jobid - error */
ORTE_ERROR_LOG(ORTE_ERR_NOT_FOUND);
OPAL_THREAD_UNLOCK(&orte_ns_replica.mutex);
return ORTE_ERR_NOT_FOUND;
PROCESS:
/* the field next_vpid contains the value of the next unassigned
* vpid, so the job extends from vpid=0 to that value. create
* an array of process names containing those values
*/
*procs = (orte_process_name_t*)malloc(ptr[j]->next_vpid * sizeof(orte_process_name_t));
if (NULL == *procs) {
ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE);
OPAL_THREAD_UNLOCK(&orte_ns_replica.mutex);
return ORTE_ERR_OUT_OF_RESOURCE;
}
nptr = *procs;
for (k=0; k < ptr[j]->next_vpid; k++) {
nptr->cellid = 0;
nptr->jobid = job;
nptr->vpid = (orte_vpid_t)k;
}
*num_procs = (size_t)ptr[j]->next_vpid;
OPAL_THREAD_UNLOCK(&orte_ns_replica.mutex);
return ORTE_SUCCESS;
}
/*
* DIAGNOSTIC functions
*/
int orte_ns_replica_dump_cells(int output_id)
{
orte_buffer_t buffer;
int rc;
OBJ_CONSTRUCT(&buffer, orte_buffer_t);
if (ORTE_SUCCESS != (rc = orte_ns_replica_dump_cells_fn(&buffer))) {
ORTE_ERROR_LOG(rc);
return rc;
}
if (ORTE_SUCCESS != (rc = orte_ns_base_print_dump(&buffer, output_id))) {
ORTE_ERROR_LOG(rc);
OBJ_DESTRUCT(&buffer);
return rc;
}
OBJ_DESTRUCT(&buffer);
return ORTE_SUCCESS;
}
int orte_ns_replica_dump_cells_fn(orte_buffer_t *buffer)
{
size_t i, j;
orte_ns_replica_cell_tracker_t **cell;
char tmp_out[NS_REPLICA_MAX_STRING_SIZE], *tmp;
int rc;
OPAL_THREAD_LOCK(&orte_ns_replica.mutex);
tmp = tmp_out;
snprintf(tmp, NS_REPLICA_MAX_STRING_SIZE, "Dump of Name Service Cell Tracker\n");
if (ORTE_SUCCESS != (rc = orte_dps.pack(buffer, &tmp, 1, ORTE_STRING))) {
ORTE_ERROR_LOG(rc);
OPAL_THREAD_UNLOCK(&orte_ns_replica.mutex);
return rc;
}
cell = (orte_ns_replica_cell_tracker_t**)(orte_ns_replica.cells)->addr;
for (i=0, j=0; j < orte_ns_replica.num_cells &&
i < (orte_ns_replica.cells)->size; i++) {
if (NULL != cell[i]) {
j++;
snprintf(tmp, NS_REPLICA_MAX_STRING_SIZE, "Num: %lu\tCell: %lu\n",
(unsigned long)j, (unsigned long)cell[i]->cell);
if (ORTE_SUCCESS != (rc = orte_dps.pack(buffer, &tmp, 1, ORTE_STRING))) {
ORTE_ERROR_LOG(rc);
OPAL_THREAD_UNLOCK(&orte_ns_replica.mutex);
return rc;
}
snprintf(tmp, NS_REPLICA_MAX_STRING_SIZE, "\tSite: %s\n\tResource: %s\n",
cell[i]->site, cell[i]->resource);
if (ORTE_SUCCESS != (rc = orte_dps.pack(buffer, &tmp, 1, ORTE_STRING))) {
ORTE_ERROR_LOG(rc);
OPAL_THREAD_UNLOCK(&orte_ns_replica.mutex);
return rc;
}
}
}
OPAL_THREAD_UNLOCK(&orte_ns_replica.mutex);
return ORTE_SUCCESS;
}
int orte_ns_replica_dump_jobs(int output_id)
{
orte_buffer_t buffer;
int rc;
OBJ_CONSTRUCT(&buffer, orte_buffer_t);
if (ORTE_SUCCESS != (rc = orte_ns_replica_dump_jobs_fn(&buffer))) {
ORTE_ERROR_LOG(rc);
return rc;
}
if (ORTE_SUCCESS != (rc = orte_ns_base_print_dump(&buffer, output_id))) {
ORTE_ERROR_LOG(rc);
OBJ_DESTRUCT(&buffer);
return rc;
}
OBJ_DESTRUCT(&buffer);
return ORTE_SUCCESS;
}
int orte_ns_replica_dump_jobs_fn(orte_buffer_t *buffer)
{
size_t i, j;
orte_ns_replica_jobid_tracker_t **ptr;
char tmp_out[NS_REPLICA_MAX_STRING_SIZE], *tmp;
int rc;
OPAL_THREAD_LOCK(&orte_ns_replica.mutex);
tmp = tmp_out;
snprintf(tmp, NS_REPLICA_MAX_STRING_SIZE, "Dump of Name Service Jobid Tracker\n");
if (ORTE_SUCCESS != (rc = orte_dps.pack(buffer, &tmp, 1, ORTE_STRING))) {
ORTE_ERROR_LOG(rc);
OPAL_THREAD_UNLOCK(&orte_ns_replica.mutex);
return rc;
}
ptr = (orte_ns_replica_jobid_tracker_t**)(orte_ns_replica.jobids)->addr;
for (i=0, j=0; j < orte_ns_replica.num_jobids &&
i < (orte_ns_replica.jobids)->size; i++) {
if (NULL != ptr[i]) {
j++;
snprintf(tmp, NS_REPLICA_MAX_STRING_SIZE, "Num: %lu\tJobid: %lu\tNext vpid: %lu\n",
(unsigned long)j, (unsigned long)ptr[i]->jobid,
(unsigned long)ptr[i]->next_vpid);
if (ORTE_SUCCESS != (rc = orte_dps.pack(buffer, &tmp, 1, ORTE_STRING))) {
ORTE_ERROR_LOG(rc);
OPAL_THREAD_UNLOCK(&orte_ns_replica.mutex);
return rc;
}
}
}
OPAL_THREAD_UNLOCK(&orte_ns_replica.mutex);
return ORTE_SUCCESS;
}
int orte_ns_replica_dump_tags(int output_id)
{
orte_buffer_t buffer;
int rc;
OBJ_CONSTRUCT(&buffer, orte_buffer_t);
if (ORTE_SUCCESS != (rc = orte_ns_replica_dump_tags_fn(&buffer))) {
ORTE_ERROR_LOG(rc);
return rc;
}
if (ORTE_SUCCESS != (rc = orte_ns_base_print_dump(&buffer, output_id))) {
ORTE_ERROR_LOG(rc);
OBJ_DESTRUCT(&buffer);
return rc;
}
OBJ_DESTRUCT(&buffer);
return ORTE_SUCCESS;
}
int orte_ns_replica_dump_tags_fn(orte_buffer_t *buffer)
{
size_t i, j;
orte_ns_replica_tagitem_t **ptr;
char tmp_out[NS_REPLICA_MAX_STRING_SIZE], *tmp;
int rc;
OPAL_THREAD_LOCK(&orte_ns_replica.mutex);
tmp = tmp_out;
snprintf(tmp, NS_REPLICA_MAX_STRING_SIZE, "Dump of Name Service RML Tag Tracker\n");
if (ORTE_SUCCESS != (rc = orte_dps.pack(buffer, &tmp, 1, ORTE_STRING))) {
ORTE_ERROR_LOG(rc);
OPAL_THREAD_UNLOCK(&orte_ns_replica.mutex);
return rc;
}
ptr = (orte_ns_replica_tagitem_t**)(orte_ns_replica.tags)->addr;
for (i=0, j=0; j < orte_ns_replica.num_tags &&
i < (orte_ns_replica.tags)->size; i++) {
if (NULL != ptr[i]) {
j++;
snprintf(tmp, NS_REPLICA_MAX_STRING_SIZE, "Num: %lu\tTag id: %lu\tName: %s\n",
(unsigned long)j, (unsigned long)ptr[i]->tag, ptr[i]->name);
if (ORTE_SUCCESS != (rc = orte_dps.pack(buffer, &tmp, 1, ORTE_STRING))) {
ORTE_ERROR_LOG(rc);
OPAL_THREAD_UNLOCK(&orte_ns_replica.mutex);
return rc;
}
}
}
OPAL_THREAD_UNLOCK(&orte_ns_replica.mutex);
return ORTE_SUCCESS;
}
int orte_ns_replica_dump_datatypes(int output_id)
{
orte_buffer_t buffer;
int rc;
OBJ_CONSTRUCT(&buffer, orte_buffer_t);
if (ORTE_SUCCESS != (rc = orte_ns_replica_dump_datatypes_fn(&buffer))) {
ORTE_ERROR_LOG(rc);
return rc;
}
if (ORTE_SUCCESS != (rc = orte_ns_base_print_dump(&buffer, output_id))) {
ORTE_ERROR_LOG(rc);
OBJ_DESTRUCT(&buffer);
return rc;
}
OBJ_DESTRUCT(&buffer);
return ORTE_SUCCESS;
}
int orte_ns_replica_dump_datatypes_fn(orte_buffer_t *buffer)
{
size_t i, j;
orte_ns_replica_dti_t **ptr;
char tmp_out[NS_REPLICA_MAX_STRING_SIZE], *tmp;
int rc;
OPAL_THREAD_LOCK(&orte_ns_replica.mutex);
tmp = tmp_out;
snprintf(tmp, NS_REPLICA_MAX_STRING_SIZE, "Dump of Name Service Datatype Tracker\n");
if (ORTE_SUCCESS != (rc = orte_dps.pack(buffer, &tmp, 1, ORTE_STRING))) {
ORTE_ERROR_LOG(rc);
OPAL_THREAD_UNLOCK(&orte_ns_replica.mutex);
return rc;
}
ptr = (orte_ns_replica_dti_t**)(orte_ns_replica.dts)->addr;
for (i=0, j=0; j < orte_ns_replica.num_dts &&
i < (orte_ns_replica.dts)->size; i++) {
if (NULL != ptr[i]) {
j++;
snprintf(tmp, NS_REPLICA_MAX_STRING_SIZE, "Num: %lu\tDatatype id: %lu\tName: %s\n",
(unsigned long)j, (unsigned long)ptr[i]->id, ptr[i]->name);
if (ORTE_SUCCESS != (rc = orte_dps.pack(buffer, &tmp, 1, ORTE_STRING))) {
ORTE_ERROR_LOG(rc);
OPAL_THREAD_UNLOCK(&orte_ns_replica.mutex);
return rc;
}
}
}
OPAL_THREAD_UNLOCK(&orte_ns_replica.mutex);
return ORTE_SUCCESS;
}
/*
* TAG SERVER functions
*/
int orte_ns_replica_assign_rml_tag(orte_rml_tag_t *tag, int orte_ns_replica_assign_rml_tag(orte_rml_tag_t *tag,
char *name) char *name)
{ {
orte_ns_replica_tagitem_t *tagitem; orte_ns_replica_tagitem_t *tagitem, **tags;
size_t i, j;
int rc;
OPAL_THREAD_LOCK(&orte_ns_replica_mutex); OPAL_THREAD_LOCK(&orte_ns_replica.mutex);
if (NULL != name) { if (NULL != name) {
/* see if this name is already in list - if so, return tag */ /* see if this name is already in list - if so, return tag */
for (tagitem = (orte_ns_replica_tagitem_t*)opal_list_get_first(&orte_ns_replica_taglist); tags = (orte_ns_replica_tagitem_t**)orte_ns_replica.tags->addr;
tagitem != (orte_ns_replica_tagitem_t*)opal_list_get_end(&orte_ns_replica_taglist); for (i=0, j=0; j < orte_ns_replica.num_tags &&
tagitem = (orte_ns_replica_tagitem_t*)opal_list_get_next(tagitem)) { i < (orte_ns_replica.tags)->size; i++) {
if (tagitem->name != NULL && 0 == strcmp(name, tagitem->name)) { /* found name on list */ if (NULL != tags[i]) {
*tag = tagitem->tag; j++;
OPAL_THREAD_UNLOCK(&orte_ns_replica_mutex); if (tags[i]->name != NULL &&
return ORTE_SUCCESS; 0 == strcmp(name, tags[i]->name)) { /* found name on list */
*tag = tags[i]->tag;
OPAL_THREAD_UNLOCK(&orte_ns_replica.mutex);
return ORTE_SUCCESS;
}
} }
} }
} }
/* not in list or not provided, so allocate next tag /* not in list or not provided, so allocate next tag */
* first check to see if one available - else error *tag = ORTE_RML_TAG_MAX;
*/
if (ORTE_RML_TAG_MAX > orte_ns_replica_next_rml_tag) {
/* okay, one available - assign it */
tagitem = OBJ_NEW(orte_ns_replica_tagitem_t);
if (NULL == tagitem) { /* out of memory */
*tag = ORTE_RML_TAG_MAX;
OPAL_THREAD_UNLOCK(&orte_ns_replica_mutex);
ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE);
return ORTE_ERR_OUT_OF_RESOURCE;
}
tagitem->tag = orte_ns_replica_next_rml_tag;
if (NULL != name) { /* provided - can look it up later */
tagitem->name = strdup(name);
} else {
tagitem->name = NULL;
}
orte_ns_replica_next_rml_tag++;
opal_list_append(&orte_ns_replica_taglist, &tagitem->item);
*tag = tagitem->tag; /* check if tag is available - need to do this since the tag type
OPAL_THREAD_UNLOCK(&orte_ns_replica_mutex); * is probably not going to be a size_t, so we cannot just rely
return ORTE_SUCCESS; * on the pointer_array's size limits to protect us. NOTE: need to
* reserve ORTE_RML_TAG_MAX as an invalid value, so can't let
* num_tags get there
*/
if (ORTE_RML_TAG_MAX-2 < orte_ns_replica.num_tags) {
ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE);
OPAL_THREAD_UNLOCK(&orte_ns_replica.mutex);
return ORTE_ERR_OUT_OF_RESOURCE;
} }
/* no tag available */ tagitem = OBJ_NEW(orte_ns_replica_tagitem_t);
*tag = ORTE_RML_TAG_MAX; if (NULL == tagitem) { /* out of memory */
OPAL_THREAD_UNLOCK(&orte_ns_replica_mutex); ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE);
ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE); OPAL_THREAD_UNLOCK(&orte_ns_replica.mutex);
return ORTE_ERR_OUT_OF_RESOURCE; return ORTE_ERR_OUT_OF_RESOURCE;
}
if (ORTE_SUCCESS != (rc = orte_pointer_array_add(&i,
orte_ns_replica.tags, tagitem))) {
ORTE_ERROR_LOG(rc);
OPAL_THREAD_UNLOCK(&orte_ns_replica.mutex);
return rc;
}
tagitem->tag = orte_ns_replica.num_tags;
(orte_ns_replica.num_tags)++;
if (NULL != name) { /* provided - can look it up later */
tagitem->name = strdup(name);
} else {
tagitem->name = NULL;
}
*tag = tagitem->tag;
OPAL_THREAD_UNLOCK(&orte_ns_replica.mutex);
return ORTE_SUCCESS;
} }
/*
* DATA TYPE SERVER functions
*/
int orte_ns_replica_define_data_type(const char *name, int orte_ns_replica_define_data_type(const char *name,
orte_data_type_t *type) orte_data_type_t *type)
{ {
orte_ns_replica_dti_t *dti; orte_ns_replica_dti_t **dti, *dtip;
size_t i, j;
int rc;
if (NULL == name || 0 < *type) { if (NULL == name || 0 < *type) {
ORTE_ERROR_LOG(ORTE_ERR_BAD_PARAM); ORTE_ERROR_LOG(ORTE_ERR_BAD_PARAM);
return ORTE_ERR_BAD_PARAM; return ORTE_ERR_BAD_PARAM;
} }
OPAL_THREAD_LOCK(&orte_ns_replica_mutex); OPAL_THREAD_LOCK(&orte_ns_replica.mutex);
/* see if this name is already in list - if so, return id */ dti = (orte_ns_replica_dti_t**)orte_ns_replica.dts->addr;
for (dti = (orte_ns_replica_dti_t*)opal_list_get_first(&orte_ns_replica_dtlist); for (i=0, j=0; j < orte_ns_replica.num_dts &&
dti != (orte_ns_replica_dti_t*)opal_list_get_end(&orte_ns_replica_dtlist); i < orte_ns_replica.dts->size; i++) {
dti = (orte_ns_replica_dti_t*)opal_list_get_next(dti)) { if (NULL != dti[i]) {
if (dti->name != NULL && 0 == strcmp(name, dti->name)) { /* found name on list */ j++;
*type = dti->id; if (dti[i]->name != NULL &&
OPAL_THREAD_UNLOCK(&orte_ns_replica_mutex); 0 == strcmp(name, dti[i]->name)) { /* found name on list */
return ORTE_SUCCESS; *type = dti[i]->id;
OPAL_THREAD_UNLOCK(&orte_ns_replica.mutex);
return ORTE_SUCCESS;
}
} }
} }
/* not in list or not provided, so allocate next id /* not in list or not provided, so allocate next id */
* first check to see if one available - else error
*/
if (ORTE_DPS_ID_MAX > orte_ns_replica_next_dti) {
/* okay, one available - assign it */
dti = OBJ_NEW(orte_ns_replica_dti_t);
if (NULL == dti) { /* out of memory */
*type = ORTE_DPS_ID_MAX;
OPAL_THREAD_UNLOCK(&orte_ns_replica_mutex);
ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE);
return ORTE_ERR_OUT_OF_RESOURCE;
}
dti->id = orte_ns_replica_next_dti;
dti->name = strdup(name);
orte_ns_replica_next_dti++;
opal_list_append(&orte_ns_replica_dtlist, &dti->item);
*type = dti->id;
OPAL_THREAD_UNLOCK(&orte_ns_replica_mutex);
return ORTE_SUCCESS;
}
/* no id available */
*type = ORTE_DPS_ID_MAX; *type = ORTE_DPS_ID_MAX;
OPAL_THREAD_UNLOCK(&orte_ns_replica_mutex);
ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE); /* check if id is available - need to do this since the data type
return ORTE_ERR_OUT_OF_RESOURCE; * is probably not going to be a size_t, so we cannot just rely
* on the pointer_array's size limits to protect us.
*/
if (ORTE_DPS_ID_MAX-2 < orte_ns_replica.num_dts) {
ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE);
OPAL_THREAD_UNLOCK(&orte_ns_replica.mutex);
return ORTE_ERR_OUT_OF_RESOURCE;
}
dtip = OBJ_NEW(orte_ns_replica_dti_t);
if (NULL == dtip) { /* out of memory */
ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE);
OPAL_THREAD_UNLOCK(&orte_ns_replica.mutex);
return ORTE_ERR_OUT_OF_RESOURCE;
}
dtip->name = strdup(name);
if (ORTE_SUCCESS != (rc = orte_pointer_array_add(&i,
orte_ns_replica.dts, dtip))) {
ORTE_ERROR_LOG(rc);
OPAL_THREAD_UNLOCK(&orte_ns_replica.mutex);
return rc;
}
dtip->id = orte_ns_replica.num_dts;
(orte_ns_replica.num_dts)++;
*type = dtip->id;
OPAL_THREAD_UNLOCK(&orte_ns_replica.mutex);
return ORTE_SUCCESS;
} }
/*
* NAME functions
*/
int orte_ns_replica_create_my_name(void) int orte_ns_replica_create_my_name(void)
{ {
orte_jobid_t jobid; orte_jobid_t jobid;

Просмотреть файл

@ -19,13 +19,14 @@
#define NS_REPLICA_H #define NS_REPLICA_H
#include "orte_config.h" #include "orte_config.h"
#include "include/types.h" #include "orte/include/orte_types.h"
#include "include/orte_constants.h" #include "include/orte_constants.h"
#include "opal/threads/mutex.h" #include "opal/threads/mutex.h"
#include "opal/class/opal_list.h" #include "opal/class/opal_object.h"
#include "dps/dps.h" #include "orte/class/orte_pointer_array.h"
#include "mca/oob/oob_types.h" #include "orte/dps/dps.h"
#include "mca/ns/base/base.h" #include "orte/mca/oob/oob_types.h"
#include "orte/mca/ns/base/base.h"
#if defined(c_plusplus) || defined(__cplusplus) #if defined(c_plusplus) || defined(__cplusplus)
extern "C" { extern "C" {
@ -34,7 +35,7 @@ extern "C" {
/* list class for tracking cellid's /* list class for tracking cellid's
*/ */
struct orte_ns_replica_cell_tracker_t { struct orte_ns_replica_cell_tracker_t {
opal_list_item_t item; opal_object_t super;
orte_cellid_t cell; orte_cellid_t cell;
char *site; char *site;
char *resource; char *resource;
@ -45,21 +46,21 @@ OBJ_CLASS_DECLARATION(orte_ns_replica_cell_tracker_t);
/* /*
* list class for tracking vpids/jobid * object for tracking vpids/jobids
* This structure is used to create a linked list of jobid-max vpid pairs. Basically, we * This structure is used to track jobid-max vpid pairs. Basically, we
* are tracking the max used vpid for each jobid that has been created. * are tracking the max used vpid for each jobid that has been created.
*/ */
struct orte_ns_replica_name_tracker_t { struct orte_ns_replica_jobid_tracker_t {
opal_list_item_t item; /**< Allows this item to be placed on a list */ opal_object_t super;
orte_jobid_t job; /**< Job id */ orte_jobid_t jobid; /**< Job id */
orte_vpid_t last_used_vpid; /**< Tracks the vpid last given out */ orte_vpid_t next_vpid;
}; };
typedef struct orte_ns_replica_name_tracker_t orte_ns_replica_name_tracker_t; typedef struct orte_ns_replica_jobid_tracker_t orte_ns_replica_jobid_tracker_t;
OBJ_CLASS_DECLARATION(orte_ns_replica_name_tracker_t); OBJ_CLASS_DECLARATION(orte_ns_replica_jobid_tracker_t);
struct orte_ns_replica_tagitem_t { struct orte_ns_replica_tagitem_t {
opal_list_item_t item; /**< Allows this item to be placed on a list */ opal_object_t super;
orte_rml_tag_t tag; /**< OOB tag */ orte_rml_tag_t tag; /**< OOB tag */
char *name; /**< Name associated with tag */ char *name; /**< Name associated with tag */
}; };
@ -68,7 +69,7 @@ typedef struct orte_ns_replica_tagitem_t orte_ns_replica_tagitem_t;
OBJ_CLASS_DECLARATION(orte_ns_replica_tagitem_t); OBJ_CLASS_DECLARATION(orte_ns_replica_tagitem_t);
struct orte_ns_replica_dti_t { struct orte_ns_replica_dti_t {
opal_list_item_t item; /**< Allows this item to be placed on a list */ opal_object_t super;
orte_data_type_t id; /**< data type id */ orte_data_type_t id; /**< data type id */
char *name; /**< Name associated with data type */ char *name; /**< Name associated with data type */
}; };
@ -79,16 +80,26 @@ OBJ_CLASS_DECLARATION(orte_ns_replica_dti_t);
/* /*
* globals needed within component * globals needed within component
*/ */
extern orte_cellid_t orte_ns_replica_next_cellid; typedef struct {
extern orte_jobid_t orte_ns_replica_next_jobid; size_t max_size, block_size;
extern opal_list_t orte_ns_replica_cell_tracker; orte_cellid_t num_cells;
extern opal_list_t orte_ns_replica_name_tracker; orte_pointer_array_t *cells;
extern orte_rml_tag_t orte_ns_replica_next_rml_tag; #if 0
extern orte_data_type_t orte_ns_replica_next_dti; orte_jobgrp_t num_jobgrps;
extern opal_list_t orte_ns_replica_taglist; orte_pointer_array_t *jobgrps;
extern opal_list_t orte_ns_replica_dtlist; #endif
extern int orte_ns_replica_debug; orte_jobid_t num_jobids;
extern opal_mutex_t orte_ns_replica_mutex; orte_pointer_array_t *jobids;
orte_pointer_array_t *tags;
orte_rml_tag_t num_tags;
orte_pointer_array_t *dts;
orte_data_type_t num_dts;
int debug;
bool isolate;
opal_mutex_t mutex;
} orte_ns_replica_globals_t;
extern orte_ns_replica_globals_t orte_ns_replica;
/* /*
* Module open / close * Module open / close
@ -135,6 +146,29 @@ int orte_ns_replica_reserve_range(orte_jobid_t job,
orte_vpid_t range, orte_vpid_t range,
orte_vpid_t *startvpid); orte_vpid_t *startvpid);
/*
* Peer functions
*/
int orte_ns_replica_get_job_peers(orte_process_name_t **procs,
size_t *num_procs, orte_jobid_t job);
/*
* Diagnostic functions
*/
int orte_ns_replica_dump_cells(int output_id);
int orte_ns_replica_dump_cells_fn(orte_buffer_t *buffer);
int orte_ns_replica_dump_jobs(int output_id);
int orte_ns_replica_dump_jobs_fn(orte_buffer_t *buffer);
int orte_ns_replica_dump_tags(int output_id);
int orte_ns_replica_dump_tags_fn(orte_buffer_t *buffer);
int orte_ns_replica_dump_datatypes(int output_id);
int orte_ns_replica_dump_datatypes_fn(orte_buffer_t *buffer);
/* /*
* Implementation of assign rml tag * Implementation of assign rml tag
*/ */

Просмотреть файл

@ -67,36 +67,49 @@ OMPI_COMP_EXPORT mca_ns_base_component_t mca_ns_replica_component = {
/* /*
* setup the function pointers for the module * setup the function pointers for the module
*/ */
static mca_ns_base_module_t orte_ns_replica = { static mca_ns_base_module_t orte_ns_replica_module = {
/* init */
orte_ns_replica_module_init, orte_ns_replica_module_init,
/* cell functions */
orte_ns_replica_create_cellid, orte_ns_replica_create_cellid,
orte_ns_base_get_cellid,
orte_ns_replica_get_cell_info, orte_ns_replica_get_cell_info,
orte_ns_base_assign_cellid_to_process, orte_ns_base_assign_cellid_to_process,
orte_ns_base_get_cellid_string,
orte_ns_base_convert_cellid_to_string,
orte_ns_base_convert_string_to_cellid,
/* jobid functions */
orte_ns_replica_create_jobid, orte_ns_replica_create_jobid,
orte_ns_base_get_jobid,
orte_ns_base_get_jobid_string,
orte_ns_base_convert_jobid_to_string,
orte_ns_base_convert_string_to_jobid,
/* vpid functions */
orte_ns_replica_reserve_range,
orte_ns_base_get_vpid,
orte_ns_base_get_vpid_string,
orte_ns_base_convert_vpid_to_string,
orte_ns_base_convert_string_to_vpid,
/* name functions */
orte_ns_base_create_process_name, orte_ns_base_create_process_name,
orte_ns_replica_create_my_name, orte_ns_replica_create_my_name,
orte_ns_base_copy_process_name, orte_ns_base_copy_process_name,
orte_ns_base_convert_string_to_process_name, orte_ns_base_convert_string_to_process_name,
orte_ns_replica_reserve_range,
orte_ns_base_free_name, orte_ns_base_free_name,
orte_ns_base_get_proc_name_string, orte_ns_base_get_proc_name_string,
orte_ns_base_get_vpid_string,
orte_ns_base_convert_vpid_to_string,
orte_ns_base_convert_string_to_vpid,
orte_ns_base_get_jobid_string,
orte_ns_base_convert_jobid_to_string,
orte_ns_base_convert_string_to_jobid,
orte_ns_base_get_cellid_string,
orte_ns_base_convert_cellid_to_string,
orte_ns_base_convert_string_to_cellid,
orte_ns_base_get_vpid,
orte_ns_base_get_jobid,
orte_ns_base_get_cellid,
orte_ns_base_compare, orte_ns_base_compare,
orte_ns_base_derive_vpid, /* peer functions */
orte_ns_base_get_peers,
orte_ns_replica_get_job_peers,
/* tag server functions */
orte_ns_replica_assign_rml_tag, orte_ns_replica_assign_rml_tag,
/* data type functions */
orte_ns_replica_define_data_type, orte_ns_replica_define_data_type,
orte_ns_base_get_peers /* diagnostic functions */
orte_ns_replica_dump_cells,
orte_ns_replica_dump_jobs,
orte_ns_replica_dump_tags,
orte_ns_replica_dump_datatypes
}; };
/* /*
@ -123,29 +136,28 @@ static void orte_ns_replica_cell_tracker_destructor(orte_ns_replica_cell_tracker
/* define instance of opal_class_t */ /* define instance of opal_class_t */
OBJ_CLASS_INSTANCE( OBJ_CLASS_INSTANCE(
orte_ns_replica_cell_tracker_t, /* type name */ orte_ns_replica_cell_tracker_t, /* type name */
opal_list_item_t, /* parent "class" name */ opal_object_t, /* parent "class" name */
orte_ns_replica_cell_tracker_construct, /* constructor */ orte_ns_replica_cell_tracker_construct, /* constructor */
orte_ns_replica_cell_tracker_destructor); /* destructor */ orte_ns_replica_cell_tracker_destructor); /* destructor */
/* constructor - used to initialize state of name_tracker instance */ /* constructor - used to initialize state of jobid_tracker instance */
static void orte_ns_replica_tracker_construct(orte_ns_replica_name_tracker_t* name_tracker) static void orte_ns_replica_jobid_tracker_construct(orte_ns_replica_jobid_tracker_t* jobid_tracker)
{ {
name_tracker->job = 0; jobid_tracker->jobid = ORTE_JOBID_MAX;
name_tracker->last_used_vpid = 0; jobid_tracker->next_vpid = 0;
} }
/* destructor - used to free any resources held by instance */ /* destructor - used to free any resources held by instance */
static void orte_ns_replica_tracker_destructor(orte_ns_replica_name_tracker_t* name_tracker) static void orte_ns_replica_jobid_tracker_destructor(orte_ns_replica_jobid_tracker_t* jobid_tracker){
{
} }
/* define instance of opal_class_t */ /* define instance of opal_class_t */
OBJ_CLASS_INSTANCE( OBJ_CLASS_INSTANCE(
orte_ns_replica_name_tracker_t, /* type name */ orte_ns_replica_jobid_tracker_t, /* type name */
opal_list_item_t, /* parent "class" name */ opal_object_t, /* parent "class" name */
orte_ns_replica_tracker_construct, /* constructor */ orte_ns_replica_jobid_tracker_construct, /* constructor */
orte_ns_replica_tracker_destructor); /* destructor */ orte_ns_replica_jobid_tracker_destructor); /* destructor */
/* constructor - used to initialize state of taglist instance */ /* constructor - used to initialize state of taglist instance */
@ -166,7 +178,7 @@ static void orte_ns_replica_tagitem_destructor(orte_ns_replica_tagitem_t* tagite
/* define instance of opal_class_t */ /* define instance of opal_class_t */
OBJ_CLASS_INSTANCE( OBJ_CLASS_INSTANCE(
orte_ns_replica_tagitem_t, /* type name */ orte_ns_replica_tagitem_t, /* type name */
opal_list_item_t, /* parent "class" name */ opal_object_t, /* parent "class" name */
orte_ns_replica_tagitem_construct, /* constructor */ orte_ns_replica_tagitem_construct, /* constructor */
orte_ns_replica_tagitem_destructor); /* destructor */ orte_ns_replica_tagitem_destructor); /* destructor */
@ -189,24 +201,14 @@ static void orte_ns_replica_dti_destructor(orte_ns_replica_dti_t* dti)
/* define instance of opal_class_t */ /* define instance of opal_class_t */
OBJ_CLASS_INSTANCE( OBJ_CLASS_INSTANCE(
orte_ns_replica_dti_t, /* type name */ orte_ns_replica_dti_t, /* type name */
opal_list_item_t, /* parent "class" name */ opal_object_t, /* parent "class" name */
orte_ns_replica_dti_construct, /* constructor */ orte_ns_replica_dti_construct, /* constructor */
orte_ns_replica_dti_destructor); /* destructor */ orte_ns_replica_dti_destructor); /* destructor */
/* /*
* globals needed within replica component * globals needed within replica component
*/ */
orte_cellid_t orte_ns_replica_next_cellid; orte_ns_replica_globals_t orte_ns_replica;
orte_jobid_t orte_ns_replica_next_jobid;
opal_list_t orte_ns_replica_cell_tracker;
opal_list_t orte_ns_replica_name_tracker;
orte_rml_tag_t orte_ns_replica_next_rml_tag;
orte_data_type_t orte_ns_replica_next_dti;
opal_list_t orte_ns_replica_taglist;
opal_list_t orte_ns_replica_dtlist;
int orte_ns_replica_debug;
opal_mutex_t orte_ns_replica_mutex;
int orte_ns_replica_isolate;
/* /*
* don't really need this function - could just put NULL in the above structure * don't really need this function - could just put NULL in the above structure
@ -214,13 +216,28 @@ int orte_ns_replica_isolate;
*/ */
int orte_ns_replica_open(void) int orte_ns_replica_open(void)
{ {
int id; int id, param;
id = mca_base_param_register_int("ns", "replica", "debug", NULL, (int)false); id = mca_base_param_register_int("ns", "replica", "debug", NULL, (int)false);
mca_base_param_lookup_int(id, &orte_ns_replica_debug); mca_base_param_lookup_int(id, &orte_ns_replica.debug);
id = mca_base_param_register_int("ns", "replica", "isolate", NULL, (int)false); id = mca_base_param_register_int("ns", "replica", "isolate", NULL, (int)false);
mca_base_param_lookup_int(id, &orte_ns_replica_isolate); mca_base_param_lookup_int(id, &param);
if (param) {
orte_ns_replica.isolate = true;
} else {
orte_ns_replica.isolate = false;
}
id = mca_base_param_register_int("ns", "replica", "maxsize", NULL,
ORTE_NS_ARRAY_MAX_SIZE);
mca_base_param_lookup_int(id, &param);
orte_ns_replica.max_size = (size_t)param;
id = mca_base_param_register_int("ns", "replica", "blocksize", NULL,
ORTE_NS_ARRAY_BLOCK_SIZE);
mca_base_param_lookup_int(id, &param);
orte_ns_replica.block_size = (size_t)param;
return ORTE_SUCCESS; return ORTE_SUCCESS;
} }
@ -235,16 +252,13 @@ int orte_ns_replica_close(void)
mca_ns_base_module_t* orte_ns_replica_init(int *priority) mca_ns_base_module_t* orte_ns_replica_init(int *priority)
{ {
orte_ns_replica_name_tracker_t *new_nt; int rc;
/* If we are to host a replica, then we want to be selected, so do all the /* If we are to host a replica, then we want to be selected, so do all the
setup and return the module */ setup and return the module */
if (NULL == orte_process_info.ns_replica_uri) { if (NULL == orte_process_info.ns_replica_uri) {
orte_ns_replica_next_cellid = 0;
orte_ns_replica_next_jobid = 1; /* jobid 0 reserved for universe */
/* Return a module (choose an arbitrary, positive priority -- /* Return a module (choose an arbitrary, positive priority --
it's only relevant compared to other ns components). If it's only relevant compared to other ns components). If
we're not the seed, then we don't want to be selected, so we're not the seed, then we don't want to be selected, so
@ -252,43 +266,55 @@ mca_ns_base_module_t* orte_ns_replica_init(int *priority)
*priority = 50; *priority = 50;
/* initialize the cell tracker */ /* initialize the cell info tracker */
if (ORTE_SUCCESS != (rc = orte_pointer_array_init(&(orte_ns_replica.cells),
OBJ_CONSTRUCT(&orte_ns_replica_cell_tracker, opal_list_t); orte_ns_replica.block_size,
orte_ns_replica_next_cellid = 0; orte_ns_replica.max_size,
orte_ns_replica.block_size))) {
/* initialize the name tracker */ ORTE_ERROR_LOG(rc);
return NULL;
OBJ_CONSTRUCT(&orte_ns_replica_name_tracker, opal_list_t); }
orte_ns_replica.num_cells = 0;
/* initialize the job id tracker */
if (ORTE_SUCCESS != (rc = orte_pointer_array_init(&(orte_ns_replica.jobids),
orte_ns_replica.block_size,
orte_ns_replica.max_size,
orte_ns_replica.block_size))) {
ORTE_ERROR_LOG(rc);
return NULL;
}
orte_ns_replica.num_jobids = 0;
/* initialize the taglist */ /* initialize the taglist */
OBJ_CONSTRUCT(&orte_ns_replica_taglist, opal_list_t); if (ORTE_SUCCESS != (rc = orte_pointer_array_init(&(orte_ns_replica.tags),
orte_ns_replica_next_rml_tag = ORTE_RML_TAG_DYNAMIC; orte_ns_replica.block_size,
orte_ns_replica.max_size,
orte_ns_replica.block_size))) {
ORTE_ERROR_LOG(rc);
return NULL;
}
orte_ns_replica.num_tags = 0;
/* initialize the dtlist */ /* initialize the dtlist */
OBJ_CONSTRUCT(&orte_ns_replica_dtlist, opal_list_t); if (ORTE_SUCCESS != (rc = orte_pointer_array_init(&(orte_ns_replica.dts),
orte_ns_replica_next_dti = ORTE_DPS_ID_DYNAMIC; orte_ns_replica.block_size,
orte_ns_replica.max_size,
orte_ns_replica.block_size))) {
ORTE_ERROR_LOG(rc);
return NULL;
}
orte_ns_replica.num_dts = 0;
/* setup the thread lock */ /* setup the thread lock */
OBJ_CONSTRUCT(&orte_ns_replica_mutex, opal_mutex_t); OBJ_CONSTRUCT(&orte_ns_replica.mutex, opal_mutex_t);
/* setup the "0" job counter - this is the default one that belongs to
* all daemons. Seed must automatically have it.
*/
new_nt = OBJ_NEW(orte_ns_replica_name_tracker_t);
if (NULL == new_nt) { /* out of memory */
return NULL;
}
new_nt->job = 0;
new_nt->last_used_vpid = 0;
opal_list_append(&orte_ns_replica_name_tracker, &new_nt->item);
/* Return the module */ /* Return the module */
initialized = true; initialized = true;
return &orte_ns_replica; return &orte_ns_replica_module;
} else { } else {
return NULL; return NULL;
} }
@ -297,7 +323,7 @@ mca_ns_base_module_t* orte_ns_replica_init(int *priority)
int orte_ns_replica_module_init(void) int orte_ns_replica_module_init(void)
{ {
int rc; int rc;
if (orte_ns_replica_isolate) { if (orte_ns_replica.isolate) {
return ORTE_SUCCESS; return ORTE_SUCCESS;
} }
@ -316,32 +342,48 @@ int orte_ns_replica_module_init(void)
*/ */
int orte_ns_replica_finalize(void) int orte_ns_replica_finalize(void)
{ {
orte_ns_replica_tagitem_t *tagitem; orte_ns_replica_cell_tracker_t **cptr;
orte_ns_replica_dti_t *dti; orte_ns_replica_jobid_tracker_t **jptr;
orte_ns_replica_tagitem_t **tag;
orte_ns_replica_dti_t **dti;
size_t i;
if (orte_ns_replica_debug) {
opal_output(0, "finalizing ns replica");
}
/* free all tracking storage, but only if this component was initialized */ /* free all tracking storage, but only if this component was initialized */
if (initialized) { if (initialized) {
/* OBJ_DESTRUCT(&orte_ns_replica_name_tracker); */ cptr = (orte_ns_replica_cell_tracker_t**)(orte_ns_replica.cells)->addr;
while (NULL != (tagitem = (orte_ns_replica_tagitem_t*)opal_list_remove_first(&orte_ns_replica_taglist))) { for (i=0; i < (orte_ns_replica.cells)->size; i++) {
OBJ_RELEASE(tagitem); if (NULL != cptr[i]) {
OBJ_RELEASE(cptr[i]);
}
} }
OBJ_DESTRUCT(&orte_ns_replica_taglist); OBJ_RELEASE(orte_ns_replica.cells);
while (NULL != (dti = (orte_ns_replica_dti_t*)opal_list_remove_first(&orte_ns_replica_dtlist))) {
OBJ_RELEASE(dti); jptr = (orte_ns_replica_jobid_tracker_t**)(orte_ns_replica.jobids)->addr;
for (i=0; i < (orte_ns_replica.jobids)->size; i++) {
if (NULL != jptr[i]) {
OBJ_RELEASE(jptr[i]);
}
} }
OBJ_DESTRUCT(&orte_ns_replica_dtlist); OBJ_RELEASE(orte_ns_replica.jobids);
OBJ_DESTRUCT(&orte_ns_replica_mutex);
tag = (orte_ns_replica_tagitem_t**)(orte_ns_replica.tags)->addr;
for (i=0; i < (orte_ns_replica.tags)->size; i++) {
if (NULL != tag[i]) OBJ_RELEASE(tag[i]);
}
OBJ_RELEASE(orte_ns_replica.tags);
dti = (orte_ns_replica_dti_t**)(orte_ns_replica.dts)->addr;
for (i=0; i < (orte_ns_replica.dts)->size; i++) {
if (NULL != dti[i]) OBJ_RELEASE(dti[i]);
}
OBJ_RELEASE(orte_ns_replica.dts);
initialized = false; initialized = false;
} }
/* All done */ /* All done */
if (orte_ns_replica_isolate) { if (orte_ns_replica.isolate) {
return ORTE_SUCCESS; return ORTE_SUCCESS;
} }
@ -527,7 +569,50 @@ void orte_ns_replica_recv(int status, orte_process_name_t* sender,
case ORTE_NS_CREATE_MY_NAME_CMD: case ORTE_NS_CREATE_MY_NAME_CMD:
/* ignore this command */ /* ignore this command */
goto CLEANUP; break;
case ORTE_NS_DUMP_CELLS_CMD:
if (ORTE_SUCCESS != (rc = orte_ns_replica_dump_cells_fn(&answer))) {
ORTE_ERROR_LOG(rc);
goto RETURN_ERROR;
}
if (0 > orte_rml.send_buffer(sender, &answer, tag, 0)) {
ORTE_ERROR_LOG(ORTE_ERR_COMM_FAILURE);
goto RETURN_ERROR;
}
break;
case ORTE_NS_DUMP_JOBIDS_CMD:
if (ORTE_SUCCESS != (rc = orte_ns_replica_dump_jobs_fn(&answer))) {
ORTE_ERROR_LOG(rc);
goto RETURN_ERROR;
}
if (0 > orte_rml.send_buffer(sender, &answer, tag, 0)) {
ORTE_ERROR_LOG(ORTE_ERR_COMM_FAILURE);
goto RETURN_ERROR;
}
break;
case ORTE_NS_DUMP_TAGS_CMD:
if (ORTE_SUCCESS != (rc = orte_ns_replica_dump_tags_fn(&answer))) {
ORTE_ERROR_LOG(rc);
goto RETURN_ERROR;
}
if (0 > orte_rml.send_buffer(sender, &answer, tag, 0)) {
ORTE_ERROR_LOG(ORTE_ERR_COMM_FAILURE);
goto RETURN_ERROR;
}
break;
case ORTE_NS_DUMP_DATATYPES_CMD:
if (ORTE_SUCCESS != (rc = orte_ns_replica_dump_datatypes_fn(&answer))) {
ORTE_ERROR_LOG(rc);
goto RETURN_ERROR;
}
if (0 > orte_rml.send_buffer(sender, &answer, tag, 0)) {
ORTE_ERROR_LOG(ORTE_ERR_COMM_FAILURE);
goto RETURN_ERROR;
}
break; break;
default: default: