diff --git a/orte/mca/ns/base/Makefile.am b/orte/mca/ns/base/Makefile.am index d2b2ede6f9..4068d26bc3 100644 --- a/orte/mca/ns/base/Makefile.am +++ b/orte/mca/ns/base/Makefile.am @@ -34,10 +34,9 @@ libmca_ns_base_la_SOURCES = \ ns_base_close.c \ ns_base_select.c \ ns_base_open.c \ - ns_base_nds.c \ ns_base_local_fns.c \ - data_type_support/ns_data_type_packing_fns.c \ - data_type_support/ns_data_type_unpacking_fns.c + data_type_support/ns_data_type_packing_fns.c \ + data_type_support/ns_data_type_unpacking_fns.c # Conditionally install the header files diff --git a/orte/mca/ns/base/base.h b/orte/mca/ns/base/base.h index f3e1320bcb..e15e061d71 100644 --- a/orte/mca/ns/base/base.h +++ b/orte/mca/ns/base/base.h @@ -39,6 +39,9 @@ extern "C" { #endif +/* default limits */ +#define ORTE_NS_ARRAY_MAX_SIZE INT_MAX +#define ORTE_NS_ARRAY_BLOCK_SIZE 512 /* * Internal definitions */ @@ -55,12 +58,18 @@ typedef uint8_t orte_ns_cmd_bitmask_t; /* * define flag values for remote commands - only used internally */ -#define ORTE_NS_CREATE_CELLID_CMD 0x01 -#define ORTE_NS_CREATE_JOBID_CMD 0x02 -#define ORTE_NS_RESERVE_RANGE_CMD 0x04 -#define ORTE_NS_ASSIGN_OOB_TAG_CMD 0x08 -#define ORTE_NS_DEFINE_DATA_TYPE_CMD 0x10 -#define ORTE_NS_CREATE_MY_NAME_CMD 0x20 +#define ORTE_NS_CREATE_CELLID_CMD (int8_t)0x01 +#define ORTE_NS_CREATE_JOBID_CMD (int8_t)0x02 +#define ORTE_NS_RESERVE_RANGE_CMD (int8_t)0x04 +#define ORTE_NS_ASSIGN_OOB_TAG_CMD (int8_t)0x08 +#define ORTE_NS_GET_JOB_PEERS_CMD (int8_t)0x0A +#define ORTE_NS_DEFINE_DATA_TYPE_CMD (int8_t)0x10 +#define ORTE_NS_CREATE_MY_NAME_CMD (int8_t)0x20 +#define ORTE_NS_DUMP_CELLS_CMD (int8_t)0x21 +#define ORTE_NS_DUMP_JOBIDS_CMD (int8_t)0x22 +#define ORTE_NS_DUMP_TAGS_CMD (int8_t)0x23 +#define ORTE_NS_DUMP_DATATYPES_CMD (int8_t)0x24 + /* * function definitions @@ -119,6 +128,9 @@ OMPI_DECLSPEC int orte_ns_base_compare(orte_ns_cmp_bitmask_t fields, OMPI_DECLSPEC int orte_ns_base_free_name(orte_process_name_t **name); +OMPI_DECLSPEC int orte_ns_base_print_dump(orte_buffer_t *buffer, int output_id); + + /* not available functions */ OMPI_DECLSPEC int orte_ns_base_module_init_not_available(void); @@ -146,6 +158,14 @@ OMPI_DECLSPEC int orte_ns_base_define_data_type_not_available( OMPI_DECLSPEC int orte_ns_base_create_my_name_not_available(void); +OMPI_DECLSPEC int orte_ns_base_get_job_peers_not_available(orte_process_name_t **procs, + size_t *num_procs, orte_jobid_t job); + +OMPI_DECLSPEC int orte_ns_base_dump_cells_not_available(int output_id); +OMPI_DECLSPEC int orte_ns_base_dump_jobs_not_available(int output_id); +OMPI_DECLSPEC int orte_ns_base_dump_tags_not_available(int output_id); +OMPI_DECLSPEC int orte_ns_base_dump_datatypes_not_available(int output_id); + /* Base functions used everywhere */ OMPI_DECLSPEC int orte_ns_base_get_peers(orte_process_name_t **procs, size_t *num_procs, size_t *self); diff --git a/orte/mca/ns/base/ns_base_local_fns.c b/orte/mca/ns/base/ns_base_local_fns.c index d8dd0d1294..fe6840f364 100644 --- a/orte/mca/ns/base/ns_base_local_fns.c +++ b/orte/mca/ns/base/ns_base_local_fns.c @@ -111,6 +111,43 @@ orte_ns_base_create_my_name_not_available(void) return ORTE_ERR_UNREACH; } +int orte_ns_base_get_job_peers_not_available(orte_process_name_t **procs, + size_t *num_procs, orte_jobid_t job) +{ + *procs = NULL; + *num_procs = 0; + ORTE_ERROR_LOG(ORTE_ERR_UNREACH); + return ORTE_ERR_UNREACH; +} + +int +orte_ns_base_dump_cells_not_available(int output_id) +{ + ORTE_ERROR_LOG(ORTE_ERR_UNREACH); + return ORTE_ERR_UNREACH; +} + +int +orte_ns_base_dump_jobs_not_available(int output_id) +{ + ORTE_ERROR_LOG(ORTE_ERR_UNREACH); + return ORTE_ERR_UNREACH; +} + +int +orte_ns_base_dump_tags_not_available(int output_id) +{ + ORTE_ERROR_LOG(ORTE_ERR_UNREACH); + return ORTE_ERR_UNREACH; +} + +int +orte_ns_base_dump_datatypes_not_available(int output_id) +{ + ORTE_ERROR_LOG(ORTE_ERR_UNREACH); + return ORTE_ERR_UNREACH; +} + /* @@ -566,3 +603,72 @@ int orte_ns_base_free_name(orte_process_name_t **name) return ORTE_SUCCESS; } + +int orte_ns_base_get_peers(orte_process_name_t **procs, + size_t *num_procs, size_t *self) +{ + size_t i; + int rc; + orte_cellid_t mycellid; + orte_jobid_t myjobid; + orte_vpid_t myvpid; + + *procs = (orte_process_name_t*)malloc(orte_process_info.num_procs * + sizeof(orte_process_name_t)); + if (NULL == *procs) { + return ORTE_ERR_OUT_OF_RESOURCE; + } + + if (ORTE_SUCCESS != (rc = orte_ns.get_cellid(&mycellid, orte_process_info.my_name))) { + ORTE_ERROR_LOG(rc); + return rc; + } + + if (ORTE_SUCCESS != orte_ns.get_jobid(&myjobid, orte_process_info.my_name)) { + ORTE_ERROR_LOG(rc); + return rc; + } + + if (ORTE_SUCCESS != orte_ns.get_vpid(&myvpid, orte_process_info.my_name)) { + ORTE_ERROR_LOG(rc); + return rc; + } + + for (i=0; i < orte_process_info.num_procs; i++) { + (*procs)[i].cellid = mycellid; + (*procs)[i].jobid = myjobid; + (*procs)[i].vpid = orte_process_info.vpid_start + i; + } + + *num_procs = orte_process_info.num_procs; + *self = (size_t)(myvpid - orte_process_info.vpid_start); + + return ORTE_SUCCESS; +} + + +/* + * DIAGNOSTIC FUNCTIONS + */ +int orte_ns_base_print_dump(orte_buffer_t *buffer, int output_id) +{ + char *line; + size_t n; + orte_data_type_t type; + int rc; + + n = 1; + while (ORTE_SUCCESS == orte_dps.peek(buffer, &type, &n)) { + if (ORTE_SUCCESS != + (rc = orte_dps.unpack(buffer, &line, &n, ORTE_STRING))) { + ORTE_ERROR_LOG(rc); + return rc; + } + opal_output(output_id, "%s", line); + free(line); + n=1; + } + + return ORTE_SUCCESS; +} + diff --git a/orte/mca/ns/base/ns_base_open.c b/orte/mca/ns/base/ns_base_open.c index b5aac5a7ae..4bb3c6dc8b 100644 --- a/orte/mca/ns/base/ns_base_open.c +++ b/orte/mca/ns/base/ns_base_open.c @@ -48,36 +48,50 @@ orte_process_name_t orte_name_all = {ORTE_CELLID_MAX, ORTE_JOBID_MAX, ORTE_VPID_ */ int mca_ns_base_output = -1; OMPI_DECLSPEC mca_ns_base_module_t orte_ns = { + /* init */ orte_ns_base_module_init_not_available, + /* cell functions */ orte_ns_base_create_cellid_not_available, + orte_ns_base_get_cellid, orte_ns_base_get_cell_info_not_available, orte_ns_base_assign_cellid_to_process, + orte_ns_base_get_cellid_string, + orte_ns_base_convert_cellid_to_string, + orte_ns_base_convert_string_to_cellid, + /* jobid functions */ orte_ns_base_create_jobid_not_available, + orte_ns_base_get_jobid, + orte_ns_base_get_jobid_string, + orte_ns_base_convert_jobid_to_string, + orte_ns_base_convert_string_to_jobid, + /* vpid functions */ + orte_ns_base_get_vpid_range_not_available, + orte_ns_base_get_vpid, + orte_ns_base_get_vpid_string, + orte_ns_base_convert_vpid_to_string, + orte_ns_base_convert_string_to_vpid, + /* name functions */ orte_ns_base_create_process_name, orte_ns_base_create_my_name_not_available, orte_ns_base_copy_process_name, orte_ns_base_convert_string_to_process_name, - orte_ns_base_get_vpid_range_not_available, orte_ns_base_free_name, orte_ns_base_get_proc_name_string, - orte_ns_base_get_vpid_string, - orte_ns_base_convert_vpid_to_string, - orte_ns_base_convert_string_to_vpid, - orte_ns_base_get_jobid_string, - orte_ns_base_convert_jobid_to_string, - orte_ns_base_convert_string_to_jobid, - orte_ns_base_get_cellid_string, - orte_ns_base_convert_cellid_to_string, - orte_ns_base_convert_string_to_cellid, - orte_ns_base_get_vpid, - orte_ns_base_get_jobid, - orte_ns_base_get_cellid, orte_ns_base_compare, - orte_ns_base_derive_vpid, + /* peer functions */ + orte_ns_base_get_peers, + orte_ns_base_get_job_peers_not_available, + /* tag server functions */ orte_ns_base_assign_rml_tag_not_available, + /* data type functions */ orte_ns_base_define_data_type_not_available, - orte_ns_base_get_peers + /* diagnostic functions */ + orte_ns_base_dump_cells_not_available, + orte_ns_base_dump_jobs_not_available, + orte_ns_base_dump_tags_not_available, + orte_ns_base_dump_datatypes_not_available }; + bool mca_ns_base_selected = false; opal_list_t mca_ns_base_components_available; mca_ns_base_component_t mca_ns_base_selected_component; diff --git a/orte/mca/ns/ns.h b/orte/mca/ns/ns.h index 857e1a23cd..018f4aa234 100644 --- a/orte/mca/ns/ns.h +++ b/orte/mca/ns/ns.h @@ -54,6 +54,7 @@ extern "C" { */ typedef int (*orte_ns_base_module_init_fn_t)(void); +/**** CELL FUNCTIONS ****/ /** * Create a new cell id. * The create_cellid() function allocates a new cell id for use by the caller. @@ -122,6 +123,81 @@ typedef int (*orte_ns_base_module_get_cell_info_fn_t)(orte_cellid_t cellid, */ typedef int (*orte_ns_base_module_assign_cellid_to_process_fn_t)(orte_process_name_t* name); +/** + * Get the cell id as a character string. + * The get_cellid_string() function returns the cell id in a character string + * representation. The string is created by expressing the field in hexadecimal. Memory + * for the string is allocated by the function - releasing that allocation is the + * responsibility of the calling program. + * + * @param *name A pointer to the name structure containing the name to be + * "translated" to a string. + * + * @retval *name_string A pointer to the character string representation of the + * cell id. + * @retval NULL Indicates an error occurred - either no memory could be allocated + * or the caller provided an incorrect name pointer (e.g., NULL). + * + * @code + * cellid-string = ompi_name_server.get_cellid_string(&name) + * @endcode + */ +typedef int (*orte_ns_base_module_get_cellid_string_fn_t)(char **cellid_string, const orte_process_name_t* name); + +/** + * Convert cellid to character string + * Returns the cellid in a character string representation. The string is created + * by expressing the provided cellid in hexadecimal. Memory for the string is + * allocated by the function - releasing that allocation is the responsibility of + * the calling program. + * + * @param cellid The cellid to be converted. + * + * @retval *cellid_string A pointer to a character string representation of the cellid. + * @retval NULL Indicates an error occurred - probably no memory could be allocated. + * + * @code + * cellid-string = ompi_name_server.convert_cellid_to_string(cellid); + * @endcode + */ + typedef int (*orte_ns_base_module_convert_cellid_to_string_fn_t)(char **cellid_string, const orte_cellid_t cellid); + + /** + * Convert a string to a cellid. + * Converts a characters string into a cellid. The character string must be a + * hexadecimal representation of a valid cellid. + * + * @param cellidstring The string to be converted. + * + * @retval cellid The resulting cellid + * @retval MCA_NS_BASE_CELLID_MAX String could not be converted. + * + * @code + * cellid = ompi_name_server.convert_string_to_cellid(cellidstring); + * @endcode + */ +typedef int (*orte_ns_base_module_convert_string_to_cellid_fn_t)(orte_cellid_t *cellid, const char *cellidstring); + + +/** + * Get the cell id as a numberic value. + * The get_cellid() function returns the cell id in a numeric representation - + * i.e., in an integer form. + * + * @param *name A pointer to the name structure containing the name. + * + * @retval cellid The cell id field of the provided name. + * @retval MCA_NS_BASE_CELLID_MAX Indicates that an error occurred - in this case, that + * the name variable provided was NULL. + * + * @code + * cellid = ompi_name_server.get_cellid(&name) + * @endcode + */ +typedef int (*orte_ns_base_module_get_cellid_fn_t)(orte_cellid_t *cellid, const orte_process_name_t* name); + + +/**** JOB ID FUNCTIONS ****/ /** * Create a new job id. * The create_jobid() function allocates a new job id for use by the caller. @@ -148,91 +224,6 @@ typedef int (*orte_ns_base_module_assign_cellid_to_process_fn_t)(orte_process_na */ typedef int (*orte_ns_base_module_create_jobid_fn_t)(orte_jobid_t *jobid); -/** - * Obtain a single new process name. - * The create_process_name() function creates a single process name structure and fills the - * fields with the provided values. - * - * @param cell The cell for which the process name is intended. Usually, this is - * the id of the cell where the process is initially planning to be spawned. - * @param job The id of the job to which the process will belong. Process id's are - * tracked according to jobid, but not cellid. Thus, two processes - * can have the same process id if and only if they have different jobid's. However, - * two processes in the same jobid cannot have the same process id, regardless - * of whether or not they are in the same cell. - * @param vpid The virtual process id for the name. Note that no check is made for uniqueness - - * the caller is responsible for ensuring that the requested name is, in fact, unique - * by first requesting reservation of an appropriate range of virtual process id's. - * - * @retval *name Pointer to an ompi_process_name_t structure containing the name. - * @retval NULL Indicates an error, probably due to inability to allocate memory for - * the name structure. - * - * @code - * new_name = ompi_name_server.create_process_name(cell, job, vpid); - * @endcode - */ -typedef int (*orte_ns_base_module_create_proc_name_fn_t)(orte_process_name_t **name, - orte_cellid_t cell, - orte_jobid_t job, - orte_vpid_t vpid); - -/* - * Create my name - * If a process is a singleton, then it needs to create a name for itself. When - * a persistent daemon is present, this requires a communication to that daemon. - * Since the RML uses process names as its index into the RML communicator table, - * the RML automatically assigns a name to each process when it first attempts - * to communicate. This function takes advantage of that behavior to ensure that - * one, and ONLY one, name gets assigned to the process - */ -typedef int (*orte_ns_base_module_create_my_name_fn_t)(void); - - -/** - * Derive a process vpid. - * Given a base vpid and an offset, return the computed equivalent vpid. This function - * is required because the vpid may not be an integer - need to provide a means for - * computing the resulting vpid in case it isn't. - */ -typedef int (*orte_ns_base_module_derive_vpid_fn_t)(orte_vpid_t *vpid, - orte_vpid_t base_vpid, - int offset); - -/** - * Make a copy of a process name. - * Given a process name, this function creates a copy of it and returns a pointer - * to the duplicate structure. - * - * @param *name Pointer to an existing process name structure. - * - * @retval *newname Pointer to the duplicate structure, with all fields transferred. - * @retval NULL Indicates an error - most likely due to a NULL process name - * pointer being supplied as input. - */ -typedef int (*orte_ns_base_module_copy_proc_name_fn_t)(orte_process_name_t **dest, - orte_process_name_t* src); - -/** - * Convert a string representation to a process name. - * The convert_string_to_process_name() function converts a string representation of a process - * name into an Open MPI name structure. The string must be of the proper form - i.e., it - * must be in the form "cellid.jobid.vpid", where each field is expressed in hexadecimal form. - * - * @param *name_string A character string representation of a process name. - * - * @retval *name Pointer to an ompi_process_name_t structure containing the name. - * @retval NULL Indicates an error, probably due to inability to allocate memory for - * the name structure. - * - * @code - * name = ompi_name_server.convert_string_to_process_name(name_string); - * @endcode - */ -typedef int (*orte_ns_base_module_convert_string_to_process_name_fn_t)(orte_process_name_t **name, - const char* name_string); - - /** * Reserve a range of process id's. * The reserve_range() function reserves a range of vpid's for the given jobid. @@ -255,112 +246,6 @@ typedef int (*orte_ns_base_module_reserve_range_fn_t)(orte_jobid_t job, orte_vpid_t range, orte_vpid_t *startvpid); - -/** - * Free (release) a process name. - * The free_name() function releases the process name from the "used" list - * maintained within the name server for the jobid contained in the specified - * name. The memory for the name is also released at that time. - * - * Name values are currently \em not re-used. Hence, free-ing a name - * does not provide any noticeable benefit other than releasing the memory. In - * the future, names may be re-used if this becomes desirable. - * - * @param *name A pointer to the name structure containing the name being released. - * - * @retval OMPI_SUCCESS Indicates the release was succesfully accomplished. - * @retval OMPI_ERROR Indicates the release failed - most likely due to an - * error when free-ing the memory allocation. - * - * @code - * if (OMPI_ERROR == ompi_name_server.free_name(&name) { - * report error - * } - * @endcode - */ -typedef int (*orte_ns_base_module_free_name_fn_t)(orte_process_name_t **name); - -/** - * Get the process name as a character string. - * The get_proc_name_string() function returns the entire process name in a - * character string representation. The string is created by expressing each - * field in hexadecimal separated by periods, as follows: - * - * sprintf(string_name, "%x.%x.%x", cellid, jobid, vpid) - * - * The memory required for the string is allocated by the function - releasing - * that allocation is the responsibility of the calling program. - * - * @param *name A pointer to the name structure containing the name to be - * "translated" to a string. - * - * @retval *name_string A pointer to the character string representation of the - * full name. - * @retval NULL Indicates an error occurred - either no memory could be allocated - * or the caller provided an incorrect name pointer (e.g., NULL). - * - * @code - * name-string = ompi_name_server.get_proc_name_string(&name) - * @endcode - */ -typedef int (*orte_ns_base_module_get_proc_name_string_fn_t)(char **name_string, - const orte_process_name_t* name); - -/** - * Get the virtual process id as a character string. - * The get_vpid_string() function returns the vpid in a character string - * representation. The string is created by expressing the field in hexadecimal. Memory - * for the string is allocated by the function - releasing that allocation is the - * responsibility of the calling program. - * - * @param *name A pointer to the name structure containing the name to be - * "translated" to a string. - * - * @retval *name_string A pointer to the character string representation of the - * vpid. - * @retval NULL Indicates an error occurred - either no memory could be allocated - * or the caller provided an incorrect name pointer (e.g., NULL). - * - * @code - * vpid-string = ompi_name_server.get_vpid_string(&name) - * @endcode - */ -typedef int (*orte_ns_base_module_get_vpid_string_fn_t)(char **vpid_string, const orte_process_name_t* name); - -/** - * Convert vpid to character string - * Returns the vpid in a character string representation. The string is created - * by expressing the provided vpid in hexadecimal. Memory for the string is - * allocated by the function - releasing that allocation is the responsibility of - * the calling program. - * - * @param vpid The vpid to be converted. - * - * @retval *vpid_string A pointer to a character string representation of the vpid. - * @retval NULL Indicates an error occurred - probably no memory could be allocated. - * - * @code - * vpid-string = ompi_name_server.convert_vpid_to_string(vpid); - * @endcode - */ - typedef int (*orte_ns_base_module_convert_vpid_to_string_fn_t)(char **vpid_string, const orte_vpid_t vpid); - - /** - * Convert a string to a vpid. - * Converts a characters string into a vpid. The character string must be a - * hexadecimal representation of a valid vpid. - * - * @param vpidstring The string to be converted. - * - * @retval vpid The resulting vpid - * @retval MCA_NS_BASE_VPID_MAX String could not be converted. - * - * @code - * vpid = ompi_name_server.convert_string_to_vpid(vpidstring); - * @endcode - */ -typedef int (*orte_ns_base_module_convert_string_to_vpid_fn_t)(orte_vpid_t *vpid, const char* vpidstring); - /** * Get the job id as a character string. * The get_jobid_string() function returns the job id in a character string @@ -418,78 +303,6 @@ typedef int (*orte_ns_base_module_convert_jobid_to_string_fn_t)(char **jobid_str */ typedef int (*orte_ns_base_module_convert_string_to_jobid_fn_t)(orte_jobid_t *jobid, const char* jobidstring); -/** - * Get the cell id as a character string. - * The get_cellid_string() function returns the cell id in a character string - * representation. The string is created by expressing the field in hexadecimal. Memory - * for the string is allocated by the function - releasing that allocation is the - * responsibility of the calling program. - * - * @param *name A pointer to the name structure containing the name to be - * "translated" to a string. - * - * @retval *name_string A pointer to the character string representation of the - * cell id. - * @retval NULL Indicates an error occurred - either no memory could be allocated - * or the caller provided an incorrect name pointer (e.g., NULL). - * - * @code - * cellid-string = ompi_name_server.get_cellid_string(&name) - * @endcode - */ -typedef int (*orte_ns_base_module_get_cellid_string_fn_t)(char **cellid_string, const orte_process_name_t* name); - -/** - * Convert cellid to character string - * Returns the cellid in a character string representation. The string is created - * by expressing the provided cellid in hexadecimal. Memory for the string is - * allocated by the function - releasing that allocation is the responsibility of - * the calling program. - * - * @param cellid The cellid to be converted. - * - * @retval *cellid_string A pointer to a character string representation of the cellid. - * @retval NULL Indicates an error occurred - probably no memory could be allocated. - * - * @code - * cellid-string = ompi_name_server.convert_cellid_to_string(cellid); - * @endcode - */ - typedef int (*orte_ns_base_module_convert_cellid_to_string_fn_t)(char **cellid_string, const orte_cellid_t cellid); - - /** - * Convert a string to a cellid. - * Converts a characters string into a cellid. The character string must be a - * hexadecimal representation of a valid cellid. - * - * @param cellidstring The string to be converted. - * - * @retval cellid The resulting cellid - * @retval MCA_NS_BASE_CELLID_MAX String could not be converted. - * - * @code - * cellid = ompi_name_server.convert_string_to_cellid(cellidstring); - * @endcode - */ -typedef int (*orte_ns_base_module_convert_string_to_cellid_fn_t)(orte_cellid_t *cellid, const char *cellidstring); - -/** - * Get the virtual process id as a numeric value. - * The get_vpid() function returns the vpid in a numeric representation - - * i.e., in an integer form. - * - * @param *name A pointer to the name structure containing the name. - * - * @retval vpid The vpid field of the provided name. - * @retval MCA_NS_BASE_VPID_MAX Indicates that an error occurred - in this case, that - * the name variable provided was NULL. - * - * @code - * vpid = ompi_name_server.get_vpid(&name) - * @endcode - */ -typedef int (*orte_ns_base_module_get_vpid_fn_t)(orte_vpid_t *vpid, const orte_process_name_t *name); - /** * Get the job id as a numeric value. * The get_jobid() function returns the job id in a numeric representation - @@ -507,22 +320,132 @@ typedef int (*orte_ns_base_module_get_vpid_fn_t)(orte_vpid_t *vpid, const orte_p */ typedef int (*orte_ns_base_module_get_jobid_fn_t)(orte_jobid_t *jobid, const orte_process_name_t* name); + + +/**** NAME FUNCTIONS ****/ /** - * Get the cell id as a numberic value. - * The get_cellid() function returns the cell id in a numeric representation - - * i.e., in an integer form. + * Obtain a single new process name. + * The create_process_name() function creates a single process name structure and fills the + * fields with the provided values. * - * @param *name A pointer to the name structure containing the name. + * @param cell The cell for which the process name is intended. Usually, this is + * the id of the cell where the process is initially planning to be spawned. + * @param job The id of the job to which the process will belong. Process id's are + * tracked according to jobid, but not cellid. Thus, two processes + * can have the same process id if and only if they have different jobid's. However, + * two processes in the same jobid cannot have the same process id, regardless + * of whether or not they are in the same cell. + * @param vpid The virtual process id for the name. Note that no check is made for uniqueness - + * the caller is responsible for ensuring that the requested name is, in fact, unique + * by first requesting reservation of an appropriate range of virtual process id's. * - * @retval cellid The cell id field of the provided name. - * @retval MCA_NS_BASE_CELLID_MAX Indicates that an error occurred - in this case, that - * the name variable provided was NULL. + * @retval *name Pointer to an ompi_process_name_t structure containing the name. + * @retval NULL Indicates an error, probably due to inability to allocate memory for + * the name structure. * * @code - * cellid = ompi_name_server.get_cellid(&name) + * new_name = ompi_name_server.create_process_name(cell, job, vpid); * @endcode */ -typedef int (*orte_ns_base_module_get_cellid_fn_t)(orte_cellid_t *cellid, const orte_process_name_t* name); +typedef int (*orte_ns_base_module_create_proc_name_fn_t)(orte_process_name_t **name, + orte_cellid_t cell, + orte_jobid_t job, + orte_vpid_t vpid); + +/* + * Create my name + * If a process is a singleton, then it needs to create a name for itself. When + * a persistent daemon is present, this requires a communication to that daemon. + * Since the RML uses process names as its index into the RML communicator table, + * the RML automatically assigns a name to each process when it first attempts + * to communicate. This function takes advantage of that behavior to ensure that + * one, and ONLY one, name gets assigned to the process + */ +typedef int (*orte_ns_base_module_create_my_name_fn_t)(void); + +/** + * Make a copy of a process name. + * Given a process name, this function creates a copy of it and returns a pointer + * to the duplicate structure. + * + * @param *name Pointer to an existing process name structure. + * + * @retval *newname Pointer to the duplicate structure, with all fields transferred. + * @retval NULL Indicates an error - most likely due to a NULL process name + * pointer being supplied as input. + */ +typedef int (*orte_ns_base_module_copy_proc_name_fn_t)(orte_process_name_t **dest, + orte_process_name_t* src); + +/** + * Convert a string representation to a process name. + * The convert_string_to_process_name() function converts a string representation of a process + * name into an Open MPI name structure. The string must be of the proper form - i.e., it + * must be in the form "cellid.jobid.vpid", where each field is expressed in hexadecimal form. + * + * @param *name_string A character string representation of a process name. + * + * @retval *name Pointer to an ompi_process_name_t structure containing the name. + * @retval NULL Indicates an error, probably due to inability to allocate memory for + * the name structure. + * + * @code + * name = ompi_name_server.convert_string_to_process_name(name_string); + * @endcode + */ +typedef int (*orte_ns_base_module_convert_string_to_process_name_fn_t)(orte_process_name_t **name, + const char* name_string); + + +/** + * Free (release) a process name. + * The free_name() function releases the process name from the "used" list + * maintained within the name server for the jobid contained in the specified + * name. The memory for the name is also released at that time. + * + * Name values are currently \em not re-used. Hence, free-ing a name + * does not provide any noticeable benefit other than releasing the memory. In + * the future, names may be re-used if this becomes desirable. + * + * @param *name A pointer to the name structure containing the name being released. + * + * @retval OMPI_SUCCESS Indicates the release was succesfully accomplished. + * @retval OMPI_ERROR Indicates the release failed - most likely due to an + * error when free-ing the memory allocation. + * + * @code + * if (OMPI_ERROR == ompi_name_server.free_name(&name) { + * report error + * } + * @endcode + */ +typedef int (*orte_ns_base_module_free_name_fn_t)(orte_process_name_t **name); + +/** + * Get the process name as a character string. + * The get_proc_name_string() function returns the entire process name in a + * character string representation. The string is created by expressing each + * field in hexadecimal separated by periods, as follows: + * + * sprintf(string_name, "%x.%x.%x", cellid, jobid, vpid) + * + * The memory required for the string is allocated by the function - releasing + * that allocation is the responsibility of the calling program. + * + * @param *name A pointer to the name structure containing the name to be + * "translated" to a string. + * + * @retval *name_string A pointer to the character string representation of the + * full name. + * @retval NULL Indicates an error occurred - either no memory could be allocated + * or the caller provided an incorrect name pointer (e.g., NULL). + * + * @code + * name-string = ompi_name_server.get_proc_name_string(&name) + * @endcode + */ +typedef int (*orte_ns_base_module_get_proc_name_string_fn_t)(char **name_string, + const orte_process_name_t* name); /** * Compare two name values. @@ -557,6 +480,81 @@ typedef int (*orte_ns_base_module_compare_fn_t)(orte_ns_cmp_bitmask_t fields, const orte_process_name_t* name1, const orte_process_name_t* name2); + +/**** VPID FUNCTIONS ****/ +/** + * Get the virtual process id as a character string. + * The get_vpid_string() function returns the vpid in a character string + * representation. The string is created by expressing the field in hexadecimal. Memory + * for the string is allocated by the function - releasing that allocation is the + * responsibility of the calling program. + * + * @param *name A pointer to the name structure containing the name to be + * "translated" to a string. + * + * @retval *name_string A pointer to the character string representation of the + * vpid. + * @retval NULL Indicates an error occurred - either no memory could be allocated + * or the caller provided an incorrect name pointer (e.g., NULL). + * + * @code + * vpid-string = ompi_name_server.get_vpid_string(&name) + * @endcode + */ +typedef int (*orte_ns_base_module_get_vpid_string_fn_t)(char **vpid_string, const orte_process_name_t* name); + +/** + * Convert vpid to character string + * Returns the vpid in a character string representation. The string is created + * by expressing the provided vpid in hexadecimal. Memory for the string is + * allocated by the function - releasing that allocation is the responsibility of + * the calling program. + * + * @param vpid The vpid to be converted. + * + * @retval *vpid_string A pointer to a character string representation of the vpid. + * @retval NULL Indicates an error occurred - probably no memory could be allocated. + * + * @code + * vpid-string = ompi_name_server.convert_vpid_to_string(vpid); + * @endcode + */ + typedef int (*orte_ns_base_module_convert_vpid_to_string_fn_t)(char **vpid_string, const orte_vpid_t vpid); + + /** + * Convert a string to a vpid. + * Converts a characters string into a vpid. The character string must be a + * hexadecimal representation of a valid vpid. + * + * @param vpidstring The string to be converted. + * + * @retval vpid The resulting vpid + * @retval MCA_NS_BASE_VPID_MAX String could not be converted. + * + * @code + * vpid = ompi_name_server.convert_string_to_vpid(vpidstring); + * @endcode + */ +typedef int (*orte_ns_base_module_convert_string_to_vpid_fn_t)(orte_vpid_t *vpid, const char* vpidstring); + +/** + * Get the virtual process id as a numeric value. + * The get_vpid() function returns the vpid in a numeric representation - + * i.e., in an integer form. + * + * @param *name A pointer to the name structure containing the name. + * + * @retval vpid The vpid field of the provided name. + * @retval MCA_NS_BASE_VPID_MAX Indicates that an error occurred - in this case, that + * the name variable provided was NULL. + * + * @code + * vpid = ompi_name_server.get_vpid(&name) + * @endcode + */ +typedef int (*orte_ns_base_module_get_vpid_fn_t)(orte_vpid_t *vpid, const orte_process_name_t *name); + +/**** TAG SERVER ****/ /* * Allocate a tag * If name is NULL, tag server provides next unique tag but cannot look @@ -565,6 +563,7 @@ typedef int (*orte_ns_base_module_compare_fn_t)(orte_ns_cmp_bitmask_t fields, typedef int (*orte_ns_base_module_assign_rml_tag_fn_t)(orte_rml_tag_t *tag, char *name); +/**** DATA TYPE SERVER ****/ /* This function defines a new data type and gives it a system-wide unique * identifier for use in the data packing subsystem. Only called from the * dps when needing a new identifier. @@ -573,6 +572,8 @@ typedef int (*orte_ns_base_module_define_data_type_fn_t)( const char *name, orte_data_type_t *type); + +/**** PEER RETRIEVAL ****/ /* * Get my peers * @@ -582,40 +583,75 @@ typedef int (*orte_ns_base_module_define_data_type_fn_t)( typedef int (*orte_ns_base_module_get_peers_fn_t)(orte_process_name_t **procs, size_t *num_procs, size_t *self); - +/* + * Get the list of peers from a specified job + * + * THIS FUNCTION MAY BE ELIMINATED IN FUTURE VERSIONS TO REMOVE MULTIPLE STORAGE + * OF O(N) ARRAYS IN THE SYSTEM + */ +typedef int (*orte_ns_base_module_get_job_peers_fn_t)(orte_process_name_t **procs, + size_t *num_procs, orte_jobid_t job); + + + +/* + * DIAGNOSTIC INTERFACES + */ +typedef int (*orte_ns_base_module_dump_cells_fn_t)(int output_id); + +typedef int (*orte_ns_base_module_dump_jobs_fn_t)(int output_id); + +typedef int (*orte_ns_base_module_dump_tags_fn_t)(int output_id); + +typedef int (*orte_ns_base_module_dump_datatypes_fn_t)(int output_id); + + /* * Ver 1.0.0 */ struct mca_ns_base_module_1_0_0_t { + /* init */ orte_ns_base_module_init_fn_t init; + /* cell functions */ orte_ns_base_module_create_cellid_fn_t create_cellid; + orte_ns_base_module_get_cellid_fn_t get_cellid; orte_ns_base_module_get_cell_info_fn_t get_cell_info; orte_ns_base_module_assign_cellid_to_process_fn_t assign_cellid_to_process; + orte_ns_base_module_get_cellid_string_fn_t get_cellid_string; + orte_ns_base_module_convert_cellid_to_string_fn_t convert_cellid_to_string; + orte_ns_base_module_convert_string_to_cellid_fn_t convert_string_to_cellid; + /* jobid functions */ orte_ns_base_module_create_jobid_fn_t create_jobid; + orte_ns_base_module_get_jobid_fn_t get_jobid; + orte_ns_base_module_get_jobid_string_fn_t get_jobid_string; + orte_ns_base_module_convert_jobid_to_string_fn_t convert_jobid_to_string; + orte_ns_base_module_convert_string_to_jobid_fn_t convert_string_to_jobid; + /* vpid functions */ + orte_ns_base_module_reserve_range_fn_t reserve_range; + orte_ns_base_module_get_vpid_fn_t get_vpid; + orte_ns_base_module_get_vpid_string_fn_t get_vpid_string; + orte_ns_base_module_convert_vpid_to_string_fn_t convert_vpid_to_string; + orte_ns_base_module_convert_string_to_vpid_fn_t convert_string_to_vpid; + /* name functions */ orte_ns_base_module_create_proc_name_fn_t create_process_name; orte_ns_base_module_create_my_name_fn_t create_my_name; orte_ns_base_module_copy_proc_name_fn_t copy_process_name; orte_ns_base_module_convert_string_to_process_name_fn_t convert_string_to_process_name; - orte_ns_base_module_reserve_range_fn_t reserve_range; orte_ns_base_module_free_name_fn_t free_name; orte_ns_base_module_get_proc_name_string_fn_t get_proc_name_string; - orte_ns_base_module_get_vpid_string_fn_t get_vpid_string; - orte_ns_base_module_convert_vpid_to_string_fn_t convert_vpid_to_string; - orte_ns_base_module_convert_string_to_vpid_fn_t convert_string_to_vpid; - orte_ns_base_module_get_jobid_string_fn_t get_jobid_string; - orte_ns_base_module_convert_jobid_to_string_fn_t convert_jobid_to_string; - orte_ns_base_module_convert_string_to_jobid_fn_t convert_string_to_jobid; - orte_ns_base_module_get_cellid_string_fn_t get_cellid_string; - orte_ns_base_module_convert_cellid_to_string_fn_t convert_cellid_to_string; - orte_ns_base_module_convert_string_to_cellid_fn_t convert_string_to_cellid; - orte_ns_base_module_get_vpid_fn_t get_vpid; - orte_ns_base_module_get_jobid_fn_t get_jobid; - orte_ns_base_module_get_cellid_fn_t get_cellid; orte_ns_base_module_compare_fn_t compare; - orte_ns_base_module_derive_vpid_fn_t derive_vpid; - orte_ns_base_module_assign_rml_tag_fn_t assign_rml_tag; - orte_ns_base_module_define_data_type_fn_t define_data_type; + /* peer functions */ orte_ns_base_module_get_peers_fn_t get_peers; + orte_ns_base_module_get_job_peers_fn_t get_job_peers; + /* tag server functions */ + orte_ns_base_module_assign_rml_tag_fn_t assign_rml_tag; + /* data type functions */ + orte_ns_base_module_define_data_type_fn_t define_data_type; + /* diagnostic functions */ + orte_ns_base_module_dump_cells_fn_t dump_cells; + orte_ns_base_module_dump_jobs_fn_t dump_jobs; + orte_ns_base_module_dump_tags_fn_t dump_tags; + orte_ns_base_module_dump_datatypes_fn_t dump_datatypes; }; typedef struct mca_ns_base_module_1_0_0_t mca_ns_base_module_1_0_0_t; diff --git a/orte/mca/ns/proxy/src/ns_proxy.c b/orte/mca/ns/proxy/src/ns_proxy.c index 3055fd49b2..b0782397b2 100644 --- a/orte/mca/ns/proxy/src/ns_proxy.c +++ b/orte/mca/ns/proxy/src/ns_proxy.c @@ -21,12 +21,13 @@ #include -#include "include/orte_constants.h" -#include "include/orte_types.h" -#include "mca/mca.h" -#include "dps/dps.h" -#include "mca/errmgr/errmgr.h" -#include "mca/rml/rml.h" +#include "orte/include/orte_constants.h" +#include "orte/include/orte_types.h" +#include "opal/mca/mca.h" +#include "opal/util/output.h" +#include "orte/dps/dps.h" +#include "orte/mca/errmgr/errmgr.h" +#include "orte/mca/rml/rml.h" #include "ns_proxy.h" @@ -43,19 +44,13 @@ int orte_ns_proxy_create_cellid(orte_cellid_t *cellid, char *site, char *resourc orte_buffer_t* cmd; orte_buffer_t* answer; orte_ns_cmd_flag_t command; - size_t count; + size_t count, index; int rc; - orte_ns_proxy_cell_info_t *cptr; + orte_ns_proxy_cell_info_t *new_cell; /* set the default value of error */ *cellid = ORTE_CELLID_MAX; - /* check for errors */ - if (NULL == site || NULL == resource) { - ORTE_ERROR_LOG(ORTE_ERR_BAD_PARAM); - return ORTE_ERR_BAD_PARAM; - } - command = ORTE_NS_CREATE_CELLID_CMD; cmd = OBJ_NEW(orte_buffer_t); @@ -82,7 +77,7 @@ int orte_ns_proxy_create_cellid(orte_cellid_t *cellid, char *site, char *resourc return rc; } - if (0 > orte_rml.send_buffer(orte_ns_my_replica, cmd, MCA_OOB_TAG_NS, 0)) { + if (0 > orte_rml.send_buffer(orte_ns_proxy.my_replica, cmd, MCA_OOB_TAG_NS, 0)) { ORTE_ERROR_LOG(ORTE_ERR_COMM_FAILURE); OBJ_RELEASE(cmd); return ORTE_ERR_COMM_FAILURE; @@ -95,7 +90,7 @@ int orte_ns_proxy_create_cellid(orte_cellid_t *cellid, char *site, char *resourc return ORTE_ERR_OUT_OF_RESOURCE; } - if (0 > orte_rml.recv_buffer(orte_ns_my_replica, answer, ORTE_RML_TAG_NS)) { + if (0 > orte_rml.recv_buffer(orte_ns_proxy.my_replica, answer, ORTE_RML_TAG_NS)) { ORTE_ERROR_LOG(ORTE_ERR_COMM_FAILURE); OBJ_RELEASE(answer); return ORTE_ERR_COMM_FAILURE; @@ -123,17 +118,31 @@ int orte_ns_proxy_create_cellid(orte_cellid_t *cellid, char *site, char *resourc OBJ_RELEASE(answer); /* store the info locally for later retrieval */ - cptr = OBJ_NEW(orte_ns_proxy_cell_info_t); - if (NULL == cptr) { + OPAL_THREAD_LOCK(&orte_ns_proxy.mutex); + new_cell = OBJ_NEW(orte_ns_proxy_cell_info_t); + if (NULL == new_cell) { ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE); + OPAL_THREAD_UNLOCK(&orte_ns_proxy.mutex); return ORTE_ERR_OUT_OF_RESOURCE; } + if (ORTE_SUCCESS != (rc = orte_pointer_array_add(&index, + orte_ns_proxy.cells, new_cell))) { + ORTE_ERROR_LOG(rc); + OPAL_THREAD_UNLOCK(&orte_ns_proxy.mutex); + return rc; + } + if (NULL != site) { + new_cell->site = strdup(site); + } + if (NULL != resource) { + new_cell->resource = strdup(resource); + } + + new_cell->cellid = orte_ns_proxy.num_cells; + *cellid = new_cell->cellid; + (orte_ns_proxy.num_cells)++; - cptr->cellid = *cellid; - cptr->site = strdup(site); - cptr->resource = strdup(resource); - opal_list_append(&orte_ns_proxy_cell_info_list, &cptr->item); - + OPAL_THREAD_UNLOCK(&orte_ns_proxy.mutex); return ORTE_SUCCESS; } @@ -141,23 +150,29 @@ int orte_ns_proxy_create_cellid(orte_cellid_t *cellid, char *site, char *resourc int orte_ns_proxy_get_cell_info(orte_cellid_t cellid, char **site, char **resource) { - opal_list_item_t *item; - orte_ns_proxy_cell_info_t *cell; + size_t i, j; + orte_ns_proxy_cell_info_t **cell; - *site = NULL; - *resource = NULL; - - for (item = opal_list_get_first(&orte_ns_proxy_cell_info_list); - item != opal_list_get_end(&orte_ns_proxy_cell_info_list); - item = opal_list_get_next(item)) { - cell = (orte_ns_proxy_cell_info_t*)item; - if (cellid == cell->cellid) { - *site = strdup(cell->site); - *resource = strdup(cell->resource); - return ORTE_SUCCESS; - } - } - return ORTE_ERR_NOT_FOUND; + /* see if we already have the info locally */ + OPAL_THREAD_LOCK(&orte_ns_proxy.mutex); + + cell = (orte_ns_proxy_cell_info_t**)(orte_ns_proxy.cells)->addr; + for (i=0, j=0; j < orte_ns_proxy.num_cells && + i < (orte_ns_proxy.cells)->size; i++) { + if (NULL != cell[i]) { + j++; + if (cellid == cell[i]->cellid) { + *site = strdup(cell[i]->site); + *resource = strdup(cell[i]->resource); + OPAL_THREAD_UNLOCK(&orte_ns_proxy.mutex); + return ORTE_SUCCESS; + } + } + } + + /* okay, don't have it locally - go ask for it */ + OPAL_THREAD_UNLOCK(&orte_ns_proxy.mutex); + return ORTE_SUCCESS; } int orte_ns_proxy_create_jobid(orte_jobid_t *job) @@ -183,7 +198,7 @@ int orte_ns_proxy_create_jobid(orte_jobid_t *job) return rc; } - if (0 > orte_rml.send_buffer(orte_ns_my_replica, cmd, ORTE_RML_TAG_NS, 0)) { + if (0 > orte_rml.send_buffer(orte_ns_proxy.my_replica, cmd, ORTE_RML_TAG_NS, 0)) { ORTE_ERROR_LOG(ORTE_ERR_COMM_FAILURE); OBJ_RELEASE(cmd); return ORTE_ERR_COMM_FAILURE; @@ -195,7 +210,7 @@ int orte_ns_proxy_create_jobid(orte_jobid_t *job) OBJ_RELEASE(answer); return ORTE_ERR_OUT_OF_RESOURCE; } - if (0 > orte_rml.recv_buffer(orte_ns_my_replica, answer, ORTE_RML_TAG_NS)) { + if (0 > orte_rml.recv_buffer(orte_ns_proxy.my_replica, answer, ORTE_RML_TAG_NS)) { ORTE_ERROR_LOG(ORTE_ERR_COMM_FAILURE); OBJ_RELEASE(answer); return ORTE_ERR_COMM_FAILURE; @@ -261,7 +276,7 @@ int orte_ns_proxy_reserve_range(orte_jobid_t job, orte_vpid_t range, orte_vpid_t return rc; } - if (0 > orte_rml.send_buffer(orte_ns_my_replica, cmd, ORTE_RML_TAG_NS, 0)) { + if (0 > orte_rml.send_buffer(orte_ns_proxy.my_replica, cmd, ORTE_RML_TAG_NS, 0)) { ORTE_ERROR_LOG(ORTE_ERR_COMM_FAILURE); OBJ_RELEASE(cmd); return ORTE_ERR_COMM_FAILURE; @@ -274,7 +289,7 @@ int orte_ns_proxy_reserve_range(orte_jobid_t job, orte_vpid_t range, orte_vpid_t return ORTE_ERR_OUT_OF_RESOURCE; } - if (0 > orte_rml.recv_buffer(orte_ns_my_replica, answer, ORTE_RML_TAG_NS)) { + if (0 > orte_rml.recv_buffer(orte_ns_proxy.my_replica, answer, ORTE_RML_TAG_NS)) { ORTE_ERROR_LOG(ORTE_ERR_COMM_FAILURE); OBJ_RELEASE(answer); return ORTE_ERR_COMM_FAILURE; @@ -299,32 +314,122 @@ int orte_ns_proxy_reserve_range(orte_jobid_t job, orte_vpid_t range, orte_vpid_t } +/* + * PEER functions + */ +int orte_ns_proxy_get_job_peers(orte_process_name_t **procs, + size_t *num_procs, orte_jobid_t job) +{ + orte_buffer_t* cmd; + orte_buffer_t* answer; + orte_ns_cmd_flag_t command; + size_t count; + int rc; + + /* set default value */ + *procs = NULL; + *num_procs = 0; + + if ((cmd = OBJ_NEW(orte_buffer_t)) == NULL) { + ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE); + return ORTE_ERR_OUT_OF_RESOURCE; + } + + command = ORTE_NS_GET_JOB_PEERS_CMD; + if (ORTE_SUCCESS != (rc = orte_dps.pack(cmd, (void*)&command, 1, ORTE_NS_CMD))) { /* got a problem */ + ORTE_ERROR_LOG(rc); + OBJ_RELEASE(cmd); + return rc; + } + + if (0 > orte_rml.send_buffer(orte_ns_proxy.my_replica, cmd, ORTE_RML_TAG_NS, 0)) { + ORTE_ERROR_LOG(ORTE_ERR_COMM_FAILURE); + OBJ_RELEASE(cmd); + return ORTE_ERR_COMM_FAILURE; + } + OBJ_RELEASE(cmd); + + if ((answer = OBJ_NEW(orte_buffer_t)) == NULL) { + ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE); + OBJ_RELEASE(answer); + return ORTE_ERR_OUT_OF_RESOURCE; + } + + if (0 > orte_rml.recv_buffer(orte_ns_proxy.my_replica, answer, ORTE_RML_TAG_NS)) { + ORTE_ERROR_LOG(ORTE_ERR_COMM_FAILURE); + OBJ_RELEASE(answer); + return ORTE_ERR_COMM_FAILURE; + } + + count = 1; + if (ORTE_SUCCESS != (rc = orte_dps.unpack(answer, &command, &count, ORTE_NS_CMD))) { + ORTE_ERROR_LOG(rc); + OBJ_RELEASE(answer); + return rc; + } + + if (ORTE_NS_GET_JOB_PEERS_CMD != command) { + ORTE_ERROR_LOG(ORTE_ERR_COMM_FAILURE); + OBJ_RELEASE(answer); + return ORTE_ERR_COMM_FAILURE; + } + + count = 1; + if (ORTE_SUCCESS != (rc = orte_dps.unpack(answer, &num_procs, &count, ORTE_SIZE))) { + ORTE_ERROR_LOG(rc); + OBJ_RELEASE(answer); + return rc; + } + + /* allocate space for array of proc names */ + if (0 < *num_procs) { + *procs = (orte_process_name_t*)malloc((*num_procs) * sizeof(orte_process_name_t)); + if (NULL == *procs) { + ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE); + OBJ_RELEASE(answer); + return ORTE_ERR_OUT_OF_RESOURCE; + } + + if (ORTE_SUCCESS != (rc = orte_dps.unpack(answer, procs, num_procs, ORTE_NAME))) { + ORTE_ERROR_LOG(rc); + OBJ_RELEASE(answer); + return rc; + } + } + + OBJ_RELEASE(answer); + return ORTE_SUCCESS; +} + + int orte_ns_proxy_assign_rml_tag(orte_rml_tag_t *tag, char *name) { orte_buffer_t* cmd; orte_buffer_t* answer; orte_ns_cmd_flag_t command; - orte_ns_proxy_tagitem_t* tagitem; - size_t count; + orte_ns_proxy_tagitem_t* tagitem, **tags; + size_t count, i, j; int rc; - OPAL_THREAD_LOCK(&orte_ns_proxy_mutex); + OPAL_THREAD_LOCK(&orte_ns_proxy.mutex); if (NULL != name) { - /* first, check to see if name is already on local list - * if so, return tag - */ - for (tagitem = (orte_ns_proxy_tagitem_t*)opal_list_get_first(&orte_ns_proxy_taglist); - tagitem != (orte_ns_proxy_tagitem_t*)opal_list_get_end(&orte_ns_proxy_taglist); - tagitem = (orte_ns_proxy_tagitem_t*)opal_list_get_next(tagitem)) { - if (0 == strcmp(name, tagitem->name)) { /* found name on list */ - *tag = tagitem->tag; - OPAL_THREAD_UNLOCK(&orte_ns_proxy_mutex); - return ORTE_SUCCESS; + /* see if this name is already in list - if so, return tag */ + tags = (orte_ns_proxy_tagitem_t**)orte_ns_proxy.tags->addr; + for (i=0, j=0; j < orte_ns_proxy.num_tags && + i < (orte_ns_proxy.tags)->size; i++) { + if (NULL != tags[i]) { + j++; + if (tags[i]->name != NULL && + 0 == strcmp(name, tags[i]->name)) { /* found name on list */ + *tag = tags[i]->tag; + OPAL_THREAD_UNLOCK(&orte_ns_proxy.mutex); + return ORTE_SUCCESS; + } } } - } + } /* okay, not on local list - so go get one from tag server */ command = ORTE_NS_ASSIGN_OOB_TAG_CMD; @@ -332,14 +437,14 @@ int orte_ns_proxy_assign_rml_tag(orte_rml_tag_t *tag, if ((cmd = OBJ_NEW(orte_buffer_t)) == NULL) { ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE); - OPAL_THREAD_UNLOCK(&orte_ns_proxy_mutex); + OPAL_THREAD_UNLOCK(&orte_ns_proxy.mutex); return ORTE_ERR_OUT_OF_RESOURCE; } if (ORTE_SUCCESS != (rc = orte_dps.pack(cmd, (void*)&command, 1, ORTE_NS_CMD))) { ORTE_ERROR_LOG(rc); OBJ_RELEASE(cmd); - OPAL_THREAD_UNLOCK(&orte_ns_proxy_mutex); + OPAL_THREAD_UNLOCK(&orte_ns_proxy.mutex); return rc; } @@ -350,28 +455,28 @@ int orte_ns_proxy_assign_rml_tag(orte_rml_tag_t *tag, if (0 > (rc = orte_dps.pack(cmd, &name, 1, ORTE_STRING))) { ORTE_ERROR_LOG(rc); OBJ_RELEASE(cmd); - OPAL_THREAD_UNLOCK(&orte_ns_proxy_mutex); + OPAL_THREAD_UNLOCK(&orte_ns_proxy.mutex); return rc; } - if (0 > orte_rml.send_buffer(orte_ns_my_replica, cmd, ORTE_RML_TAG_NS, 0)) { + if (0 > orte_rml.send_buffer(orte_ns_proxy.my_replica, cmd, ORTE_RML_TAG_NS, 0)) { ORTE_ERROR_LOG(ORTE_ERR_COMM_FAILURE); OBJ_RELEASE(cmd); - OPAL_THREAD_UNLOCK(&orte_ns_proxy_mutex); + OPAL_THREAD_UNLOCK(&orte_ns_proxy.mutex); return ORTE_ERR_COMM_FAILURE; } OBJ_RELEASE(cmd); if ((answer = OBJ_NEW(orte_buffer_t)) == NULL) { ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE); - OPAL_THREAD_UNLOCK(&orte_ns_proxy_mutex); + OPAL_THREAD_UNLOCK(&orte_ns_proxy.mutex); return ORTE_ERR_OUT_OF_RESOURCE; } - if (0 > orte_rml.recv_buffer(orte_ns_my_replica, answer, ORTE_RML_TAG_NS)) { + if (0 > orte_rml.recv_buffer(orte_ns_proxy.my_replica, answer, ORTE_RML_TAG_NS)) { ORTE_ERROR_LOG(ORTE_ERR_COMM_FAILURE); OBJ_RELEASE(answer); - OPAL_THREAD_UNLOCK(&orte_ns_proxy_mutex); + OPAL_THREAD_UNLOCK(&orte_ns_proxy.mutex); return ORTE_ERR_COMM_FAILURE; } @@ -379,14 +484,14 @@ int orte_ns_proxy_assign_rml_tag(orte_rml_tag_t *tag, if (ORTE_SUCCESS != (rc = orte_dps.unpack(answer, &command, &count, ORTE_NS_CMD))) { ORTE_ERROR_LOG(rc); OBJ_RELEASE(answer); - OPAL_THREAD_UNLOCK(&orte_ns_proxy_mutex); + OPAL_THREAD_UNLOCK(&orte_ns_proxy.mutex); return rc; } if (ORTE_NS_ASSIGN_OOB_TAG_CMD != command) { ORTE_ERROR_LOG(ORTE_ERR_COMM_FAILURE); OBJ_RELEASE(answer); - OPAL_THREAD_UNLOCK(&orte_ns_proxy_mutex); + OPAL_THREAD_UNLOCK(&orte_ns_proxy.mutex); return ORTE_ERR_COMM_FAILURE; } @@ -394,7 +499,7 @@ int orte_ns_proxy_assign_rml_tag(orte_rml_tag_t *tag, if (ORTE_SUCCESS != (rc = orte_dps.unpack(answer, tag, &count, ORTE_UINT32))) { ORTE_ERROR_LOG(rc); OBJ_RELEASE(answer); - OPAL_THREAD_UNLOCK(&orte_ns_proxy_mutex); + OPAL_THREAD_UNLOCK(&orte_ns_proxy.mutex); return rc; } @@ -404,18 +509,23 @@ int orte_ns_proxy_assign_rml_tag(orte_rml_tag_t *tag, tagitem = OBJ_NEW(orte_ns_proxy_tagitem_t); if (NULL == tagitem) { /* out of memory */ ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE); - *tag = ORTE_RML_TAG_MAX; - OPAL_THREAD_UNLOCK(&orte_ns_proxy_mutex); + OPAL_THREAD_UNLOCK(&orte_ns_proxy.mutex); return ORTE_ERR_OUT_OF_RESOURCE; } + if (ORTE_SUCCESS != (rc = orte_pointer_array_add(&i, + orte_ns_proxy.tags, tagitem))) { + ORTE_ERROR_LOG(rc); + OPAL_THREAD_UNLOCK(&orte_ns_proxy.mutex); + return rc; + } tagitem->tag = *tag; - if (NULL != name) { + (orte_ns_proxy.num_tags)++; + if (NULL != name) { /* provided - can look it up later */ tagitem->name = strdup(name); } else { tagitem->name = NULL; } - opal_list_append(&orte_ns_proxy_taglist, &tagitem->item); - OPAL_THREAD_UNLOCK(&orte_ns_proxy_mutex); + OPAL_THREAD_UNLOCK(&orte_ns_proxy.mutex); /* all done */ return ORTE_SUCCESS; @@ -428,8 +538,8 @@ int orte_ns_proxy_define_data_type(const char *name, orte_buffer_t* cmd; orte_buffer_t* answer; orte_ns_cmd_flag_t command; - orte_ns_proxy_dti_t *dti; - size_t count; + orte_ns_proxy_dti_t **dti, *dtip; + size_t count, i, j; int rc=ORTE_SUCCESS; if (NULL == name || 0 < *type) { @@ -437,20 +547,25 @@ int orte_ns_proxy_define_data_type(const char *name, return ORTE_ERR_BAD_PARAM; } - OPAL_THREAD_LOCK(&orte_ns_proxy_mutex); + OPAL_THREAD_LOCK(&orte_ns_proxy.mutex); /* first, check to see if name is already on local list * if so, return id, ensure registered with dps */ - for (dti = (orte_ns_proxy_dti_t*)opal_list_get_first(&orte_ns_proxy_dtlist); - dti != (orte_ns_proxy_dti_t*)opal_list_get_end(&orte_ns_proxy_dtlist); - dti = (orte_ns_proxy_dti_t*)opal_list_get_next(dti)) { - if (0 == strcmp(name, dti->name)) { /* found name on list */ - *type = dti->id; - OPAL_THREAD_UNLOCK(&orte_ns_proxy_mutex); - return ORTE_SUCCESS; + dti = (orte_ns_proxy_dti_t**)orte_ns_proxy.dts->addr; + for (i=0, j=0; j < orte_ns_proxy.num_dts && + i < orte_ns_proxy.dts->size; i++) { + if (NULL != dti[i]) { + j++; + if (dti[i]->name != NULL && + 0 == strcmp(name, dti[i]->name)) { /* found name on list */ + *type = dti[i]->id; + OPAL_THREAD_UNLOCK(&orte_ns_proxy.mutex); + return ORTE_SUCCESS; + } } } + /* okay, not on local list - so go get one from tag server */ command = ORTE_NS_DEFINE_DATA_TYPE_CMD; @@ -458,42 +573,42 @@ int orte_ns_proxy_define_data_type(const char *name, if ((cmd = OBJ_NEW(orte_buffer_t)) == NULL) { ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE); - OPAL_THREAD_UNLOCK(&orte_ns_proxy_mutex); + OPAL_THREAD_UNLOCK(&orte_ns_proxy.mutex); return ORTE_ERR_OUT_OF_RESOURCE; } if (ORTE_SUCCESS != (rc = orte_dps.pack(cmd, (void*)&command, 1, ORTE_NS_CMD))) { ORTE_ERROR_LOG(rc); OBJ_RELEASE(cmd); - OPAL_THREAD_UNLOCK(&orte_ns_proxy_mutex); + OPAL_THREAD_UNLOCK(&orte_ns_proxy.mutex); return rc; } if (ORTE_SUCCESS != (rc = orte_dps.pack(cmd, &name, 1, ORTE_STRING))) { ORTE_ERROR_LOG(rc); OBJ_RELEASE(cmd); - OPAL_THREAD_UNLOCK(&orte_ns_proxy_mutex); + OPAL_THREAD_UNLOCK(&orte_ns_proxy.mutex); return rc; } - if (0 > orte_rml.send_buffer(orte_ns_my_replica, cmd, ORTE_RML_TAG_NS, 0)) { + if (0 > orte_rml.send_buffer(orte_ns_proxy.my_replica, cmd, ORTE_RML_TAG_NS, 0)) { ORTE_ERROR_LOG(ORTE_ERR_COMM_FAILURE); OBJ_RELEASE(cmd); - OPAL_THREAD_UNLOCK(&orte_ns_proxy_mutex); + OPAL_THREAD_UNLOCK(&orte_ns_proxy.mutex); return ORTE_ERR_COMM_FAILURE; } OBJ_RELEASE(cmd); if ((answer = OBJ_NEW(orte_buffer_t)) == NULL) { ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE); - OPAL_THREAD_UNLOCK(&orte_ns_proxy_mutex); + OPAL_THREAD_UNLOCK(&orte_ns_proxy.mutex); return ORTE_ERR_OUT_OF_RESOURCE; } - if (0 > orte_rml.recv_buffer(orte_ns_my_replica, answer, ORTE_RML_TAG_NS)) { + if (0 > orte_rml.recv_buffer(orte_ns_proxy.my_replica, answer, ORTE_RML_TAG_NS)) { ORTE_ERROR_LOG(ORTE_ERR_COMM_FAILURE); OBJ_RELEASE(answer); - OPAL_THREAD_UNLOCK(&orte_ns_proxy_mutex); + OPAL_THREAD_UNLOCK(&orte_ns_proxy.mutex); return ORTE_ERR_COMM_FAILURE; } @@ -501,14 +616,14 @@ int orte_ns_proxy_define_data_type(const char *name, if (ORTE_SUCCESS != (rc = orte_dps.unpack(answer, &command, &count, ORTE_NS_CMD))) { ORTE_ERROR_LOG(rc); OBJ_RELEASE(answer); - OPAL_THREAD_UNLOCK(&orte_ns_proxy_mutex); + OPAL_THREAD_UNLOCK(&orte_ns_proxy.mutex); return rc; } if (ORTE_NS_ASSIGN_OOB_TAG_CMD != command) { ORTE_ERROR_LOG(ORTE_ERR_COMM_FAILURE); OBJ_RELEASE(answer); - OPAL_THREAD_UNLOCK(&orte_ns_proxy_mutex); + OPAL_THREAD_UNLOCK(&orte_ns_proxy.mutex); return ORTE_ERR_COMM_FAILURE; } @@ -516,23 +631,29 @@ int orte_ns_proxy_define_data_type(const char *name, if (ORTE_SUCCESS != (rc = orte_dps.unpack(answer, type, &count, ORTE_DATA_TYPE))) { ORTE_ERROR_LOG(ORTE_ERR_UNPACK_FAILURE); OBJ_RELEASE(answer); - OPAL_THREAD_UNLOCK(&orte_ns_proxy_mutex); + OPAL_THREAD_UNLOCK(&orte_ns_proxy.mutex); return ORTE_ERR_UNPACK_FAILURE; } OBJ_RELEASE(answer); /* add the new id to the local list so we don't have to get it again */ - dti = OBJ_NEW(orte_ns_proxy_dti_t); - if (NULL == dti) { /* out of memory */ + dtip = OBJ_NEW(orte_ns_proxy_dti_t); + if (NULL == dtip) { /* out of memory */ ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE); - *type = ORTE_DPS_ID_MAX; - OPAL_THREAD_UNLOCK(&orte_ns_proxy_mutex); + OPAL_THREAD_UNLOCK(&orte_ns_proxy.mutex); return ORTE_ERR_OUT_OF_RESOURCE; } - dti->id = *type; - dti->name = strdup(name); - opal_list_append(&orte_ns_proxy_taglist, &dti->item); - OPAL_THREAD_UNLOCK(&orte_ns_proxy_mutex); + dtip->name = strdup(name); + if (ORTE_SUCCESS != (rc = orte_pointer_array_add(&i, + orte_ns_proxy.dts, dtip))) { + ORTE_ERROR_LOG(rc); + OPAL_THREAD_UNLOCK(&orte_ns_proxy.mutex); + return rc; + } + dtip->id = *type; + (orte_ns_proxy.num_dts)++; + + OPAL_THREAD_UNLOCK(&orte_ns_proxy.mutex); /* all done */ return rc; @@ -565,7 +686,7 @@ int orte_ns_proxy_create_my_name(void) return rc; } - if (0 > orte_rml.send_buffer(orte_ns_my_replica, cmd, MCA_OOB_TAG_NS, 0)) { + if (0 > orte_rml.send_buffer(orte_ns_proxy.my_replica, cmd, MCA_OOB_TAG_NS, 0)) { ORTE_ERROR_LOG(ORTE_ERR_COMM_FAILURE); OBJ_RELEASE(cmd); return ORTE_ERR_COMM_FAILURE; @@ -575,3 +696,235 @@ int orte_ns_proxy_create_my_name(void) return ORTE_SUCCESS; } +/* + * DIAGNOSTIC functions + */ +int orte_ns_proxy_dump_cells(int output_id) +{ + orte_buffer_t cmd; + orte_buffer_t answer; + orte_ns_cmd_flag_t command; + size_t i, j; + orte_ns_proxy_cell_info_t **ptr; + int rc; + + command = ORTE_NS_DUMP_CELLS_CMD; + + OPAL_THREAD_LOCK(&orte_ns_proxy.mutex); + + /* dump name service replica cell tracker */ + OBJ_CONSTRUCT(&cmd, orte_buffer_t); + if (ORTE_SUCCESS != (rc = orte_dps.pack(&cmd, &command, 1, ORTE_NS_CMD))) { + ORTE_ERROR_LOG(rc); + OPAL_THREAD_UNLOCK(&orte_ns_proxy.mutex); + OBJ_DESTRUCT(&cmd); + return rc; + } + + if (0 > orte_rml.send_buffer(orte_ns_proxy.my_replica, &cmd, MCA_OOB_TAG_NS, 0)) { + ORTE_ERROR_LOG(ORTE_ERR_COMM_FAILURE); + OBJ_DESTRUCT(&cmd); + OPAL_THREAD_UNLOCK(&orte_ns_proxy.mutex); + return ORTE_ERR_COMM_FAILURE; + } + OBJ_DESTRUCT(&cmd); + + OBJ_CONSTRUCT(&answer, orte_buffer_t); + if (0 > orte_rml.recv_buffer(orte_ns_proxy.my_replica, &answer, ORTE_RML_TAG_NS)) { + ORTE_ERROR_LOG(ORTE_ERR_COMM_FAILURE); + OBJ_DESTRUCT(&answer); + OPAL_THREAD_UNLOCK(&orte_ns_proxy.mutex); + return ORTE_ERR_COMM_FAILURE; + } + + if (ORTE_SUCCESS != (rc = orte_ns_base_print_dump(&answer, output_id))) { + ORTE_ERROR_LOG(rc); + OBJ_DESTRUCT(&answer); + return rc; + } + + /* dump local cell tracker */ + opal_output(output_id, "\n\n[%lu,%lu,%lu] Dump of Local Cell Tracker\n", + ORTE_NAME_ARGS(orte_process_info.my_name)); + ptr = (orte_ns_proxy_cell_info_t**)(orte_ns_proxy.cells)->addr; + for (i=0, j=0; j < orte_ns_proxy.num_cells && + i < (orte_ns_proxy.cells)->size; i++) { + if (NULL != ptr[i]) { + j++; + opal_output(output_id, "Num: %lu\tCell: %lu\n", + (unsigned long)j, (unsigned long)ptr[i]->cellid); + } + } + + return ORTE_SUCCESS; +} + + +int orte_ns_proxy_dump_jobs(int output_id) +{ + orte_buffer_t cmd; + orte_buffer_t answer; + orte_ns_cmd_flag_t command; + int rc; + + command = ORTE_NS_DUMP_JOBIDS_CMD; + + OPAL_THREAD_LOCK(&orte_ns_proxy.mutex); + + /* dump name service replica jobid tracker */ + OBJ_CONSTRUCT(&cmd, orte_buffer_t); + if (ORTE_SUCCESS != (rc = orte_dps.pack(&cmd, &command, 1, ORTE_NS_CMD))) { + ORTE_ERROR_LOG(rc); + OPAL_THREAD_UNLOCK(&orte_ns_proxy.mutex); + OBJ_DESTRUCT(&cmd); + return rc; + } + + if (0 > orte_rml.send_buffer(orte_ns_proxy.my_replica, &cmd, MCA_OOB_TAG_NS, 0)) { + ORTE_ERROR_LOG(ORTE_ERR_COMM_FAILURE); + OBJ_DESTRUCT(&cmd); + OPAL_THREAD_UNLOCK(&orte_ns_proxy.mutex); + return ORTE_ERR_COMM_FAILURE; + } + OBJ_DESTRUCT(&cmd); + + OBJ_CONSTRUCT(&answer, orte_buffer_t); + if (0 > orte_rml.recv_buffer(orte_ns_proxy.my_replica, &answer, ORTE_RML_TAG_NS)) { + ORTE_ERROR_LOG(ORTE_ERR_COMM_FAILURE); + OBJ_DESTRUCT(&answer); + OPAL_THREAD_UNLOCK(&orte_ns_proxy.mutex); + return ORTE_ERR_COMM_FAILURE; + } + + if (ORTE_SUCCESS != (rc = orte_ns_base_print_dump(&answer, output_id))) { + ORTE_ERROR_LOG(rc); + OBJ_DESTRUCT(&answer); + return rc; + } + + return ORTE_SUCCESS; +} + + +int orte_ns_proxy_dump_tags(int output_id) +{ + orte_buffer_t cmd; + orte_buffer_t answer; + orte_ns_cmd_flag_t command; + size_t i, j; + orte_ns_proxy_tagitem_t **ptr; + int rc; + + command = ORTE_NS_DUMP_TAGS_CMD; + + OPAL_THREAD_LOCK(&orte_ns_proxy.mutex); + + /* dump name service replica tag tracker */ + OBJ_CONSTRUCT(&cmd, orte_buffer_t); + if (ORTE_SUCCESS != (rc = orte_dps.pack(&cmd, &command, 1, ORTE_NS_CMD))) { + ORTE_ERROR_LOG(rc); + OPAL_THREAD_UNLOCK(&orte_ns_proxy.mutex); + OBJ_DESTRUCT(&cmd); + return rc; + } + + if (0 > orte_rml.send_buffer(orte_ns_proxy.my_replica, &cmd, MCA_OOB_TAG_NS, 0)) { + ORTE_ERROR_LOG(ORTE_ERR_COMM_FAILURE); + OBJ_DESTRUCT(&cmd); + OPAL_THREAD_UNLOCK(&orte_ns_proxy.mutex); + return ORTE_ERR_COMM_FAILURE; + } + OBJ_DESTRUCT(&cmd); + + OBJ_CONSTRUCT(&answer, orte_buffer_t); + if (0 > orte_rml.recv_buffer(orte_ns_proxy.my_replica, &answer, ORTE_RML_TAG_NS)) { + ORTE_ERROR_LOG(ORTE_ERR_COMM_FAILURE); + OBJ_DESTRUCT(&answer); + OPAL_THREAD_UNLOCK(&orte_ns_proxy.mutex); + return ORTE_ERR_COMM_FAILURE; + } + + if (ORTE_SUCCESS != (rc = orte_ns_base_print_dump(&answer, output_id))) { + ORTE_ERROR_LOG(rc); + OBJ_DESTRUCT(&answer); + return rc; + } + + /* dump local tag tracker */ + opal_output(output_id, "\n\n[%lu,%lu,%lu] Dump of Local Tag Tracker\n", + ORTE_NAME_ARGS(orte_process_info.my_name)); + ptr = (orte_ns_proxy_tagitem_t**)(orte_ns_proxy.tags)->addr; + for (i=0, j=0; j < orte_ns_proxy.num_tags && + i < (orte_ns_proxy.tags)->size; i++) { + if (NULL != ptr[i]) { + j++; + opal_output(output_id, "Num: %lu\tTag: %lu\tTag name: %s\n", + (unsigned long)j, (unsigned long)ptr[i]->tag, ptr[i]->name); + } + } + + return ORTE_SUCCESS; +} + + +int orte_ns_proxy_dump_datatypes(int output_id) +{ + orte_buffer_t cmd; + orte_buffer_t answer; + orte_ns_cmd_flag_t command; + size_t i, j; + orte_ns_proxy_dti_t **ptr; + int rc; + + command = ORTE_NS_DUMP_DATATYPES_CMD; + + OPAL_THREAD_LOCK(&orte_ns_proxy.mutex); + + /* dump name service replica datatype tracker */ + OBJ_CONSTRUCT(&cmd, orte_buffer_t); + if (ORTE_SUCCESS != (rc = orte_dps.pack(&cmd, &command, 1, ORTE_NS_CMD))) { + ORTE_ERROR_LOG(rc); + OPAL_THREAD_UNLOCK(&orte_ns_proxy.mutex); + OBJ_DESTRUCT(&cmd); + return rc; + } + + if (0 > orte_rml.send_buffer(orte_ns_proxy.my_replica, &cmd, MCA_OOB_TAG_NS, 0)) { + ORTE_ERROR_LOG(ORTE_ERR_COMM_FAILURE); + OBJ_DESTRUCT(&cmd); + OPAL_THREAD_UNLOCK(&orte_ns_proxy.mutex); + return ORTE_ERR_COMM_FAILURE; + } + OBJ_DESTRUCT(&cmd); + + OBJ_CONSTRUCT(&answer, orte_buffer_t); + if (0 > orte_rml.recv_buffer(orte_ns_proxy.my_replica, &answer, ORTE_RML_TAG_NS)) { + ORTE_ERROR_LOG(ORTE_ERR_COMM_FAILURE); + OBJ_DESTRUCT(&answer); + OPAL_THREAD_UNLOCK(&orte_ns_proxy.mutex); + return ORTE_ERR_COMM_FAILURE; + } + + if (ORTE_SUCCESS != (rc = orte_ns_base_print_dump(&answer, output_id))) { + ORTE_ERROR_LOG(rc); + OBJ_DESTRUCT(&answer); + return rc; + } + + /* dump local datatype tracker */ + opal_output(output_id, "\n\n[%lu,%lu,%lu] Dump of Local Datatype Tracker\n", + ORTE_NAME_ARGS(orte_process_info.my_name)); + ptr = (orte_ns_proxy_dti_t**)(orte_ns_proxy.dts)->addr; + for (i=0, j=0; j < orte_ns_proxy.num_dts && + i < (orte_ns_proxy.dts)->size; i++) { + if (NULL != ptr[i]) { + j++; + opal_output(output_id, "Num: %lu\tDatatype id: %lu\tDatatype name: %s\n", + (unsigned long)j, (unsigned long)ptr[i]->id, ptr[i]->name); + } + } + + return ORTE_SUCCESS; +} + + diff --git a/orte/mca/ns/proxy/src/ns_proxy.h b/orte/mca/ns/proxy/src/ns_proxy.h index 1a6bad2d19..a312b706c6 100644 --- a/orte/mca/ns/proxy/src/ns_proxy.h +++ b/orte/mca/ns/proxy/src/ns_proxy.h @@ -32,7 +32,7 @@ extern "C" { #endif struct orte_ns_proxy_cell_info_t { - opal_list_item_t item; /**< Allows this item to be placed on a list */ + opal_object_t super; orte_cellid_t cellid; char *site; char *resource; @@ -42,7 +42,7 @@ typedef struct orte_ns_proxy_cell_info_t orte_ns_proxy_cell_info_t; OBJ_CLASS_DECLARATION(orte_ns_proxy_cell_info_t); struct orte_ns_proxy_tagitem_t { - opal_list_item_t item; /**< Allows this item to be placed on a list */ + opal_object_t super; orte_rml_tag_t tag; /**< OOB tag */ char *name; /**< Name associated with tag */ }; @@ -51,7 +51,7 @@ typedef struct orte_ns_proxy_tagitem_t orte_ns_proxy_tagitem_t; OBJ_CLASS_DECLARATION(orte_ns_proxy_tagitem_t); struct orte_ns_proxy_dti_t { - opal_list_item_t item; /**< Allows this item to be placed on a list */ + opal_object_t super; orte_data_type_t id; /**< data type id */ char *name; /**< Name associated with data type */ }; @@ -77,13 +77,20 @@ int orte_ns_proxy_finalize(void); /* * globals used within proxy component */ +typedef struct { + size_t max_size, block_size; + orte_process_name_t *my_replica; + int debug; + orte_cellid_t num_cells; + orte_pointer_array_t *cells; + orte_pointer_array_t *tags; + orte_rml_tag_t num_tags; + orte_pointer_array_t *dts; + orte_data_type_t num_dts; + opal_mutex_t mutex; +} orte_ns_proxy_globals_t; -extern orte_process_name_t *orte_ns_my_replica; -extern int orte_ns_proxy_debug; -extern opal_list_t orte_ns_proxy_cell_info_list; -extern opal_list_t orte_ns_proxy_taglist; -extern opal_list_t orte_ns_proxy_dtlist; -extern opal_mutex_t orte_ns_proxy_mutex; +extern orte_ns_proxy_globals_t orte_ns_proxy; /* * proxy function prototypes @@ -97,6 +104,9 @@ int orte_ns_proxy_create_jobid(orte_jobid_t *jobid); int orte_ns_proxy_reserve_range(orte_jobid_t job, orte_vpid_t range, orte_vpid_t *startvpid); +int orte_ns_proxy_get_job_peers(orte_process_name_t **procs, + size_t *num_procs, orte_jobid_t job); + int orte_ns_proxy_assign_rml_tag(orte_rml_tag_t *tag, char *name); int orte_ns_proxy_define_data_type(const char *name, @@ -104,6 +114,19 @@ int orte_ns_proxy_define_data_type(const char *name, int orte_ns_proxy_create_my_name(void); +/* + * Diagnostic functions + */ +int orte_ns_proxy_dump_cells(int output_id); + +int orte_ns_proxy_dump_jobs(int output_id); + +int orte_ns_proxy_dump_tags(int output_id); + +int orte_ns_proxy_dump_datatypes(int output_id); + + + #if defined(c_plusplus) || defined(__cplusplus) } #endif diff --git a/orte/mca/ns/proxy/src/ns_proxy_component.c b/orte/mca/ns/proxy/src/ns_proxy_component.c index ead68ce16d..804eb95ff3 100644 --- a/orte/mca/ns/proxy/src/ns_proxy_component.c +++ b/orte/mca/ns/proxy/src/ns_proxy_component.c @@ -64,36 +64,49 @@ OMPI_COMP_EXPORT mca_ns_base_component_t mca_ns_proxy_component = { /* * setup the function pointers for the module */ -static mca_ns_base_module_t orte_ns_proxy = { +static mca_ns_base_module_t orte_ns_proxy_module = { + /* init */ orte_ns_proxy_module_init, + /* cell functions */ orte_ns_proxy_create_cellid, + orte_ns_base_get_cellid, orte_ns_proxy_get_cell_info, orte_ns_base_assign_cellid_to_process, + orte_ns_base_get_cellid_string, + orte_ns_base_convert_cellid_to_string, + orte_ns_base_convert_string_to_cellid, + /* jobid functions */ orte_ns_proxy_create_jobid, + orte_ns_base_get_jobid, + orte_ns_base_get_jobid_string, + orte_ns_base_convert_jobid_to_string, + orte_ns_base_convert_string_to_jobid, + /* vpid functions */ + orte_ns_proxy_reserve_range, + orte_ns_base_get_vpid, + orte_ns_base_get_vpid_string, + orte_ns_base_convert_vpid_to_string, + orte_ns_base_convert_string_to_vpid, + /* name functions */ orte_ns_base_create_process_name, orte_ns_proxy_create_my_name, orte_ns_base_copy_process_name, orte_ns_base_convert_string_to_process_name, - orte_ns_proxy_reserve_range, orte_ns_base_free_name, orte_ns_base_get_proc_name_string, - orte_ns_base_get_vpid_string, - orte_ns_base_convert_vpid_to_string, - orte_ns_base_convert_string_to_vpid, - orte_ns_base_get_jobid_string, - orte_ns_base_convert_jobid_to_string, - orte_ns_base_convert_string_to_jobid, - orte_ns_base_get_cellid_string, - orte_ns_base_convert_cellid_to_string, - orte_ns_base_convert_string_to_cellid, - orte_ns_base_get_vpid, - orte_ns_base_get_jobid, - orte_ns_base_get_cellid, orte_ns_base_compare, - orte_ns_base_derive_vpid, + /* peer functions */ + orte_ns_base_get_peers, + orte_ns_proxy_get_job_peers, + /* tag server functions */ orte_ns_proxy_assign_rml_tag, + /* data type functions */ orte_ns_proxy_define_data_type, - orte_ns_base_get_peers + /* diagnostic functions */ + orte_ns_proxy_dump_cells, + orte_ns_proxy_dump_jobs, + orte_ns_proxy_dump_tags, + orte_ns_proxy_dump_datatypes }; /* @@ -122,7 +135,7 @@ static void orte_ns_proxy_cell_info_destructor(orte_ns_proxy_cell_info_t* ptr) /* define instance of opal_class_t */ OBJ_CLASS_INSTANCE( orte_ns_proxy_cell_info_t, /* type name */ - opal_list_item_t, /* parent "class" name */ + opal_object_t, /* parent "class" name */ orte_ns_proxy_cell_info_construct, /* constructor */ orte_ns_proxy_cell_info_destructor); /* destructor */ @@ -144,7 +157,7 @@ static void orte_ns_proxy_tagitem_destructor(orte_ns_proxy_tagitem_t* tagitem) /* define instance of opal_class_t */ OBJ_CLASS_INSTANCE( orte_ns_proxy_tagitem_t, /* type name */ - opal_list_item_t, /* parent "class" name */ + opal_object_t, /* parent "class" name */ orte_ns_proxy_tagitem_construct, /* constructor */ orte_ns_proxy_tagitem_destructor); /* destructor */ @@ -166,7 +179,7 @@ static void orte_ns_proxy_dti_destructor(orte_ns_proxy_dti_t* dti) /* define instance of opal_class_t */ OBJ_CLASS_INSTANCE( orte_ns_proxy_dti_t, /* type name */ - opal_list_item_t, /* parent "class" name */ + opal_object_t, /* parent "class" name */ orte_ns_proxy_dti_construct, /* constructor */ orte_ns_proxy_dti_destructor); /* destructor */ @@ -174,22 +187,28 @@ OBJ_CLASS_INSTANCE( * globals needed within proxy component */ -orte_process_name_t* orte_ns_my_replica=NULL; -int orte_ns_proxy_debug=0; -opal_list_t orte_ns_proxy_cell_info_list; -opal_list_t orte_ns_proxy_taglist; -opal_list_t orte_ns_proxy_dtlist; -opal_mutex_t orte_ns_proxy_mutex; +orte_ns_proxy_globals_t orte_ns_proxy; + /* - * Open the proxy component and obtain the name of my replica. + * Open the proxy component and obtain the name of my proxy. */ int orte_ns_proxy_open(void) { - int id; + int id, param; id = mca_base_param_register_int("ns", "proxy", "debug", NULL, 0); - mca_base_param_lookup_int(id, &orte_ns_proxy_debug); + mca_base_param_lookup_int(id, &orte_ns_proxy.debug); + + id = mca_base_param_register_int("ns", "proxy", "maxsize", NULL, + ORTE_NS_ARRAY_MAX_SIZE); + mca_base_param_lookup_int(id, ¶m); + orte_ns_proxy.max_size = (size_t)param; + + id = mca_base_param_register_int("ns", "proxy", "blocksize", NULL, + ORTE_NS_ARRAY_BLOCK_SIZE); + mca_base_param_lookup_int(id, ¶m); + orte_ns_proxy.block_size = (size_t)param; return ORTE_SUCCESS; } @@ -205,9 +224,9 @@ int orte_ns_proxy_close(void) mca_ns_base_module_t* orte_ns_proxy_init(int *priority) { orte_process_name_t name; - int ret; + int ret, rc; - /* If we are NOT to host a replica, then we want to be selected, so do all + /* If we are NOT to host a proxy, then we want to be selected, so do all the setup and return the module */ /* opal_output(mca_ns_base_output, "ns_proxy: entered init\n"); */ if (NULL != orte_process_info.ns_replica_uri) { @@ -219,7 +238,7 @@ mca_ns_base_module_t* orte_ns_proxy_init(int *priority) *priority = 10; - /* define the replica for us to use */ + /* define the proxy for us to use */ if(ORTE_SUCCESS != (ret = orte_rml.parse_uris(orte_process_info.ns_replica_uri, &name, NULL))) { ORTE_ERROR_LOG(ret); return NULL; @@ -228,24 +247,51 @@ mca_ns_base_module_t* orte_ns_proxy_init(int *priority) ORTE_ERROR_LOG(ret); return NULL; } - if (ORTE_SUCCESS != orte_ns_base_copy_process_name(&orte_ns_my_replica, + if (ORTE_SUCCESS != orte_ns_base_copy_process_name(&orte_ns_proxy.my_replica, orte_process_info.ns_replica)) { /* can't operate */ return NULL; } - /* initialize the cell info list */ - OBJ_CONSTRUCT(&orte_ns_proxy_cell_info_list, opal_list_t); + /* initialize the cell info tracker */ + if (ORTE_SUCCESS != (rc = orte_pointer_array_init(&(orte_ns_proxy.cells), + orte_ns_proxy.block_size, + orte_ns_proxy.max_size, + orte_ns_proxy.block_size))) { + ORTE_ERROR_LOG(rc); + return NULL; + } + orte_ns_proxy.num_cells = 0; + - /* initialize the taglist */ - OBJ_CONSTRUCT(&orte_ns_proxy_taglist, opal_list_t); + /* initialize the taglist */ - /* initialize the dtlist */ - OBJ_CONSTRUCT(&orte_ns_proxy_dtlist, opal_list_t); + if (ORTE_SUCCESS != (rc = orte_pointer_array_init(&(orte_ns_proxy.tags), + orte_ns_proxy.block_size, + orte_ns_proxy.max_size, + orte_ns_proxy.block_size))) { + ORTE_ERROR_LOG(rc); + return NULL; + } + orte_ns_proxy.num_tags = 0; + /* initialize the dtlist */ + + if (ORTE_SUCCESS != (rc = orte_pointer_array_init(&(orte_ns_proxy.dts), + orte_ns_proxy.block_size, + orte_ns_proxy.max_size, + orte_ns_proxy.block_size))) { + ORTE_ERROR_LOG(rc); + return NULL; + } + orte_ns_proxy.num_dts = 0; + + /* setup the thread lock */ + OBJ_CONSTRUCT(&orte_ns_proxy.mutex, opal_mutex_t); + /* Return the module */ initialized = true; - return &orte_ns_proxy; + return &orte_ns_proxy_module; } else { return NULL; @@ -267,24 +313,34 @@ int orte_ns_proxy_module_init(void) */ int orte_ns_proxy_finalize(void) { - orte_ns_proxy_tagitem_t *tagitem; - orte_ns_proxy_cell_info_t *cptr; + orte_ns_proxy_cell_info_t **cptr; + orte_ns_proxy_tagitem_t **tag; + orte_ns_proxy_dti_t **dti; + size_t i; - if (orte_ns_proxy_debug) { - opal_output(0, "finalizing ns proxy"); - } - - /* free the storage, but only if this component was initialized */ + /* free all tracking storage, but only if this component was initialized */ if (initialized) { - while (NULL != (cptr = (orte_ns_proxy_cell_info_t*)opal_list_remove_first(&orte_ns_proxy_cell_info_list))) { - OBJ_RELEASE(cptr); + cptr = (orte_ns_proxy_cell_info_t**)(orte_ns_proxy.cells)->addr; + for (i=0; i < (orte_ns_proxy.cells)->size; i++) { + if (NULL != cptr[i]) { + OBJ_RELEASE(cptr[i]); + } } - OBJ_DESTRUCT(&orte_ns_proxy_cell_info_list); - while (NULL != (tagitem = (orte_ns_proxy_tagitem_t*)opal_list_remove_first(&orte_ns_proxy_taglist))) { - OBJ_RELEASE(tagitem); + OBJ_RELEASE(orte_ns_proxy.cells); + + tag = (orte_ns_proxy_tagitem_t**)(orte_ns_proxy.tags)->addr; + for (i=0; i < (orte_ns_proxy.tags)->size; i++) { + if (NULL != tag[i]) OBJ_RELEASE(tag[i]); } - OBJ_DESTRUCT(&orte_ns_proxy_taglist); + OBJ_RELEASE(orte_ns_proxy.tags); + + dti = (orte_ns_proxy_dti_t**)(orte_ns_proxy.dts)->addr; + for (i=0; i < (orte_ns_proxy.dts)->size; i++) { + if (NULL != dti[i]) OBJ_RELEASE(dti[i]); + } + OBJ_RELEASE(orte_ns_proxy.dts); + initialized = false; } diff --git a/orte/mca/ns/replica/src/ns_replica.c b/orte/mca/ns/replica/src/ns_replica.c index 52734ad572..f5a35bfcf3 100644 --- a/orte/mca/ns/replica/src/ns_replica.c +++ b/orte/mca/ns/replica/src/ns_replica.c @@ -32,6 +32,7 @@ /** * globals */ +#define NS_REPLICA_MAX_STRING_SIZE 256 /* * functions @@ -40,225 +41,593 @@ int orte_ns_replica_create_cellid(orte_cellid_t *cellid, char *site, char *resource) { orte_ns_replica_cell_tracker_t *new_cell; + int rc; + size_t index; - OPAL_THREAD_LOCK(&orte_ns_replica_mutex); + OPAL_THREAD_LOCK(&orte_ns_replica.mutex); - if (ORTE_CELLID_MAX > orte_ns_replica_next_cellid) { - *cellid = orte_ns_replica_next_cellid; - orte_ns_replica_next_cellid++; - new_cell = OBJ_NEW(orte_ns_replica_cell_tracker_t); - if (NULL == new_cell) { - *cellid = ORTE_CELLID_MAX; - OPAL_THREAD_UNLOCK(&orte_ns_replica_mutex); - ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE); - return ORTE_ERR_OUT_OF_RESOURCE; - } - new_cell->cell = *cellid; - new_cell->site = strdup(site); - new_cell->resource = strdup(resource); - opal_list_append(&orte_ns_replica_cell_tracker, &new_cell->item); - OPAL_THREAD_UNLOCK(&orte_ns_replica_mutex); - return ORTE_SUCCESS; - } - *cellid = ORTE_CELLID_MAX; - OPAL_THREAD_UNLOCK(&orte_ns_replica_mutex); - ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE); - return ORTE_ERR_OUT_OF_RESOURCE; + + /* check if cellid is available. NOTE: need to reserve + * ORTE_CELLID_MAX as an invalid value, so can't allow + * num_cells to get there + */ + if (ORTE_CELLID_MAX-2 < orte_ns_replica.num_cells) { + ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE); + OPAL_THREAD_UNLOCK(&orte_ns_replica.mutex); + return ORTE_ERR_OUT_OF_RESOURCE; + } + + new_cell = OBJ_NEW(orte_ns_replica_cell_tracker_t); + if (NULL == new_cell) { + ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE); + OPAL_THREAD_UNLOCK(&orte_ns_replica.mutex); + return ORTE_ERR_OUT_OF_RESOURCE; + } + if (ORTE_SUCCESS != (rc = orte_pointer_array_add(&index, + orte_ns_replica.cells, new_cell))) { + ORTE_ERROR_LOG(rc); + OPAL_THREAD_UNLOCK(&orte_ns_replica.mutex); + return rc; + } + new_cell->site = strdup(site); + new_cell->resource = strdup(resource); + + new_cell->cell = orte_ns_replica.num_cells; + *cellid = new_cell->cell; + (orte_ns_replica.num_cells)++; + + OPAL_THREAD_UNLOCK(&orte_ns_replica.mutex); + return ORTE_SUCCESS; } int orte_ns_replica_get_cell_info(orte_cellid_t cellid, char **site, char **resource) { - opal_list_item_t *item; - orte_ns_replica_cell_tracker_t *cell; + size_t i, j; + orte_ns_replica_cell_tracker_t **cell; - for (item = opal_list_get_first(&orte_ns_replica_cell_tracker); - item != opal_list_get_end(&orte_ns_replica_cell_tracker); - item = opal_list_get_next(item)) { - cell = (orte_ns_replica_cell_tracker_t*)item; - if (cellid == cell->cell) { - *site = strdup(cell->site); - *resource = strdup(cell->resource); - return ORTE_SUCCESS; - } - } + OPAL_THREAD_LOCK(&orte_ns_replica.mutex); + + cell = (orte_ns_replica_cell_tracker_t**)(orte_ns_replica.cells)->addr; + for (i=0, j=0; j < orte_ns_replica.num_cells && + i < (orte_ns_replica.cells)->size; i++) { + if (NULL != cell[i]) { + j++; + if (cellid == cell[i]->cell) { + *site = strdup(cell[i]->site); + *resource = strdup(cell[i]->resource); + OPAL_THREAD_UNLOCK(&orte_ns_replica.mutex); + return ORTE_SUCCESS; + } + } + } + ORTE_ERROR_LOG(ORTE_ERR_NOT_FOUND); + OPAL_THREAD_UNLOCK(&orte_ns_replica.mutex); return ORTE_ERR_NOT_FOUND; } +/* + * JOBID functions + */ int orte_ns_replica_create_jobid(orte_jobid_t *jobid) { - orte_ns_replica_name_tracker_t *new_nt; - - OPAL_THREAD_LOCK(&orte_ns_replica_mutex); - - if (ORTE_JOBID_MAX > orte_ns_replica_next_jobid) { - *jobid = orte_ns_replica_next_jobid; - orte_ns_replica_next_jobid++; - new_nt = OBJ_NEW(orte_ns_replica_name_tracker_t); - if (NULL == new_nt) { /* out of memory */ - *jobid = ORTE_JOBID_MAX; - OPAL_THREAD_UNLOCK(&orte_ns_replica_mutex); - ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE); - return ORTE_ERR_OUT_OF_RESOURCE; - } - new_nt->job = *jobid; - new_nt->last_used_vpid = 0; - opal_list_append(&orte_ns_replica_name_tracker, &new_nt->item); - OPAL_THREAD_UNLOCK(&orte_ns_replica_mutex); - return ORTE_SUCCESS; - } + orte_ns_replica_jobid_tracker_t *ptr; + int rc; + size_t index; + OPAL_THREAD_LOCK(&orte_ns_replica.mutex); + *jobid = ORTE_JOBID_MAX; - OPAL_THREAD_UNLOCK(&orte_ns_replica_mutex); - ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE); - return ORTE_ERR_OUT_OF_RESOURCE; -} - - -int orte_ns_replica_reserve_range(orte_jobid_t job, orte_vpid_t range, orte_vpid_t *start) -{ - orte_ns_replica_name_tracker_t *ptr; - - OPAL_THREAD_LOCK(&orte_ns_replica_mutex); - - for (ptr = (orte_ns_replica_name_tracker_t*)opal_list_get_first(&orte_ns_replica_name_tracker); - ptr != (orte_ns_replica_name_tracker_t*)opal_list_get_end(&orte_ns_replica_name_tracker); - ptr = (orte_ns_replica_name_tracker_t*)opal_list_get_next(ptr)) { - if (job == ptr->job) { /* found the specified job */ - if ((ORTE_VPID_MAX-range) >= ptr->last_used_vpid) { /* requested range available */ - *start = ptr->last_used_vpid; - if (0 == job && *start == 0) { /* vpid=0 reserved for job=0 */ - *start = 1; - } - ptr->last_used_vpid = *start + range; - OPAL_THREAD_UNLOCK(&orte_ns_replica_mutex); - return ORTE_SUCCESS; - } else { /* range not available */ - *start = ORTE_VPID_MAX; - OPAL_THREAD_UNLOCK(&orte_ns_replica_mutex); - ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE); - return ORTE_ERR_OUT_OF_RESOURCE; - } - } + /* check if a jobid is available. NOTE: need to + * reserve ORTE_JOBID_MAX as an invalid value, so can't let + * num_jobids get there + */ + if (ORTE_JOBID_MAX-2 < orte_ns_replica.num_jobids) { + ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE); + OPAL_THREAD_UNLOCK(&orte_ns_replica.mutex); + return ORTE_ERR_OUT_OF_RESOURCE; + } + + ptr = OBJ_NEW(orte_ns_replica_jobid_tracker_t); + if (NULL == ptr) { + ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE); + OPAL_THREAD_UNLOCK(&orte_ns_replica.mutex); + return ORTE_ERR_OUT_OF_RESOURCE; + } + if (ORTE_SUCCESS != (rc = orte_pointer_array_add(&index, + orte_ns_replica.jobids, ptr))) { + ORTE_ERROR_LOG(rc); + OBJ_RELEASE(ptr); + OPAL_THREAD_UNLOCK(&orte_ns_replica.mutex); + return rc; } - /* get here if the job couldn't be found */ - OPAL_THREAD_UNLOCK(&orte_ns_replica_mutex); - ORTE_ERROR_LOG(ORTE_ERR_NOT_FOUND); - return ORTE_ERR_NOT_FOUND; + ptr->jobid = orte_ns_replica.num_jobids; + *jobid = ptr->jobid; + (orte_ns_replica.num_jobids)++; + + OPAL_THREAD_UNLOCK(&orte_ns_replica.mutex); + return ORTE_SUCCESS; } + + +int orte_ns_replica_reserve_range(orte_jobid_t job, orte_vpid_t range, + orte_vpid_t *start) +{ + orte_ns_replica_jobid_tracker_t **ptr; + size_t j; + orte_jobid_t k; + + OPAL_THREAD_LOCK(&orte_ns_replica.mutex); + + /* find the jobid */ + ptr = (orte_ns_replica_jobid_tracker_t**)(orte_ns_replica.jobids)->addr; + for (j=0, k=0; k < orte_ns_replica.num_jobids && + j < (orte_ns_replica.jobids)->size; j++) { + if (NULL != ptr[j]) { + k++; + if (job == ptr[j]->jobid) { + goto PROCESS; + } + } + } + /* didn't find the specified jobid - error */ + ORTE_ERROR_LOG(ORTE_ERR_NOT_FOUND); + OPAL_THREAD_UNLOCK(&orte_ns_replica.mutex); + return ORTE_ERR_NOT_FOUND; + +PROCESS: + if ((ORTE_VPID_MAX-range-(ptr[j]->next_vpid)) > 0) { + *start = ptr[j]->next_vpid; + ptr[j]->next_vpid += range; + OPAL_THREAD_UNLOCK(&orte_ns_replica.mutex); + return ORTE_SUCCESS; + } + + /* get here if the range isn't available */ + ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE); + OPAL_THREAD_UNLOCK(&orte_ns_replica.mutex); + return ORTE_ERR_OUT_OF_RESOURCE; +} + +int orte_ns_replica_get_job_peers(orte_process_name_t **procs, + size_t *num_procs, orte_jobid_t job) +{ + orte_ns_replica_jobid_tracker_t **ptr; + orte_process_name_t *nptr; + size_t j; + orte_jobid_t k; + + OPAL_THREAD_LOCK(&orte_ns_replica.mutex); + + /* find the jobid */ + ptr = (orte_ns_replica_jobid_tracker_t**)(orte_ns_replica.jobids)->addr; + for (j=0, k=0; k < orte_ns_replica.num_jobids && + j < (orte_ns_replica.jobids)->size; j++) { + if (NULL != ptr[j]) { + k++; + if (job == ptr[j]->jobid) { + goto PROCESS; + } + } + } + /* didn't find the specified jobid - error */ + ORTE_ERROR_LOG(ORTE_ERR_NOT_FOUND); + OPAL_THREAD_UNLOCK(&orte_ns_replica.mutex); + return ORTE_ERR_NOT_FOUND; + +PROCESS: + /* the field next_vpid contains the value of the next unassigned + * vpid, so the job extends from vpid=0 to that value. create + * an array of process names containing those values + */ + *procs = (orte_process_name_t*)malloc(ptr[j]->next_vpid * sizeof(orte_process_name_t)); + if (NULL == *procs) { + ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE); + OPAL_THREAD_UNLOCK(&orte_ns_replica.mutex); + return ORTE_ERR_OUT_OF_RESOURCE; + } + nptr = *procs; + for (k=0; k < ptr[j]->next_vpid; k++) { + nptr->cellid = 0; + nptr->jobid = job; + nptr->vpid = (orte_vpid_t)k; + } + *num_procs = (size_t)ptr[j]->next_vpid; + + OPAL_THREAD_UNLOCK(&orte_ns_replica.mutex); + return ORTE_SUCCESS; +} + + +/* + * DIAGNOSTIC functions + */ +int orte_ns_replica_dump_cells(int output_id) +{ + orte_buffer_t buffer; + int rc; + + OBJ_CONSTRUCT(&buffer, orte_buffer_t); + if (ORTE_SUCCESS != (rc = orte_ns_replica_dump_cells_fn(&buffer))) { + ORTE_ERROR_LOG(rc); + return rc; + } + + if (ORTE_SUCCESS != (rc = orte_ns_base_print_dump(&buffer, output_id))) { + ORTE_ERROR_LOG(rc); + OBJ_DESTRUCT(&buffer); + return rc; + } + + OBJ_DESTRUCT(&buffer); + return ORTE_SUCCESS; +} + +int orte_ns_replica_dump_cells_fn(orte_buffer_t *buffer) +{ + size_t i, j; + orte_ns_replica_cell_tracker_t **cell; + char tmp_out[NS_REPLICA_MAX_STRING_SIZE], *tmp; + int rc; + + OPAL_THREAD_LOCK(&orte_ns_replica.mutex); + + tmp = tmp_out; + snprintf(tmp, NS_REPLICA_MAX_STRING_SIZE, "Dump of Name Service Cell Tracker\n"); + if (ORTE_SUCCESS != (rc = orte_dps.pack(buffer, &tmp, 1, ORTE_STRING))) { + ORTE_ERROR_LOG(rc); + OPAL_THREAD_UNLOCK(&orte_ns_replica.mutex); + return rc; + } + cell = (orte_ns_replica_cell_tracker_t**)(orte_ns_replica.cells)->addr; + for (i=0, j=0; j < orte_ns_replica.num_cells && + i < (orte_ns_replica.cells)->size; i++) { + if (NULL != cell[i]) { + j++; + snprintf(tmp, NS_REPLICA_MAX_STRING_SIZE, "Num: %lu\tCell: %lu\n", + (unsigned long)j, (unsigned long)cell[i]->cell); + if (ORTE_SUCCESS != (rc = orte_dps.pack(buffer, &tmp, 1, ORTE_STRING))) { + ORTE_ERROR_LOG(rc); + OPAL_THREAD_UNLOCK(&orte_ns_replica.mutex); + return rc; + } + snprintf(tmp, NS_REPLICA_MAX_STRING_SIZE, "\tSite: %s\n\tResource: %s\n", + cell[i]->site, cell[i]->resource); + if (ORTE_SUCCESS != (rc = orte_dps.pack(buffer, &tmp, 1, ORTE_STRING))) { + ORTE_ERROR_LOG(rc); + OPAL_THREAD_UNLOCK(&orte_ns_replica.mutex); + return rc; + } + } + } + + OPAL_THREAD_UNLOCK(&orte_ns_replica.mutex); + + return ORTE_SUCCESS; +} + + +int orte_ns_replica_dump_jobs(int output_id) +{ + orte_buffer_t buffer; + int rc; + + OBJ_CONSTRUCT(&buffer, orte_buffer_t); + + if (ORTE_SUCCESS != (rc = orte_ns_replica_dump_jobs_fn(&buffer))) { + ORTE_ERROR_LOG(rc); + return rc; + } + + if (ORTE_SUCCESS != (rc = orte_ns_base_print_dump(&buffer, output_id))) { + ORTE_ERROR_LOG(rc); + OBJ_DESTRUCT(&buffer); + return rc; + } + + OBJ_DESTRUCT(&buffer); + return ORTE_SUCCESS; +} + +int orte_ns_replica_dump_jobs_fn(orte_buffer_t *buffer) +{ + size_t i, j; + orte_ns_replica_jobid_tracker_t **ptr; + char tmp_out[NS_REPLICA_MAX_STRING_SIZE], *tmp; + int rc; + + OPAL_THREAD_LOCK(&orte_ns_replica.mutex); + + tmp = tmp_out; + snprintf(tmp, NS_REPLICA_MAX_STRING_SIZE, "Dump of Name Service Jobid Tracker\n"); + if (ORTE_SUCCESS != (rc = orte_dps.pack(buffer, &tmp, 1, ORTE_STRING))) { + ORTE_ERROR_LOG(rc); + OPAL_THREAD_UNLOCK(&orte_ns_replica.mutex); + return rc; + } + ptr = (orte_ns_replica_jobid_tracker_t**)(orte_ns_replica.jobids)->addr; + for (i=0, j=0; j < orte_ns_replica.num_jobids && + i < (orte_ns_replica.jobids)->size; i++) { + if (NULL != ptr[i]) { + j++; + snprintf(tmp, NS_REPLICA_MAX_STRING_SIZE, "Num: %lu\tJobid: %lu\tNext vpid: %lu\n", + (unsigned long)j, (unsigned long)ptr[i]->jobid, + (unsigned long)ptr[i]->next_vpid); + if (ORTE_SUCCESS != (rc = orte_dps.pack(buffer, &tmp, 1, ORTE_STRING))) { + ORTE_ERROR_LOG(rc); + OPAL_THREAD_UNLOCK(&orte_ns_replica.mutex); + return rc; + } + } + } + + OPAL_THREAD_UNLOCK(&orte_ns_replica.mutex); + + return ORTE_SUCCESS; +} + + +int orte_ns_replica_dump_tags(int output_id) +{ + orte_buffer_t buffer; + int rc; + + OBJ_CONSTRUCT(&buffer, orte_buffer_t); + if (ORTE_SUCCESS != (rc = orte_ns_replica_dump_tags_fn(&buffer))) { + ORTE_ERROR_LOG(rc); + return rc; + } + + if (ORTE_SUCCESS != (rc = orte_ns_base_print_dump(&buffer, output_id))) { + ORTE_ERROR_LOG(rc); + OBJ_DESTRUCT(&buffer); + return rc; + } + + OBJ_DESTRUCT(&buffer); + return ORTE_SUCCESS; +} + + +int orte_ns_replica_dump_tags_fn(orte_buffer_t *buffer) +{ + size_t i, j; + orte_ns_replica_tagitem_t **ptr; + char tmp_out[NS_REPLICA_MAX_STRING_SIZE], *tmp; + int rc; + + OPAL_THREAD_LOCK(&orte_ns_replica.mutex); + + tmp = tmp_out; + snprintf(tmp, NS_REPLICA_MAX_STRING_SIZE, "Dump of Name Service RML Tag Tracker\n"); + if (ORTE_SUCCESS != (rc = orte_dps.pack(buffer, &tmp, 1, ORTE_STRING))) { + ORTE_ERROR_LOG(rc); + OPAL_THREAD_UNLOCK(&orte_ns_replica.mutex); + return rc; + } + ptr = (orte_ns_replica_tagitem_t**)(orte_ns_replica.tags)->addr; + for (i=0, j=0; j < orte_ns_replica.num_tags && + i < (orte_ns_replica.tags)->size; i++) { + if (NULL != ptr[i]) { + j++; + snprintf(tmp, NS_REPLICA_MAX_STRING_SIZE, "Num: %lu\tTag id: %lu\tName: %s\n", + (unsigned long)j, (unsigned long)ptr[i]->tag, ptr[i]->name); + if (ORTE_SUCCESS != (rc = orte_dps.pack(buffer, &tmp, 1, ORTE_STRING))) { + ORTE_ERROR_LOG(rc); + OPAL_THREAD_UNLOCK(&orte_ns_replica.mutex); + return rc; + } + } + } + + OPAL_THREAD_UNLOCK(&orte_ns_replica.mutex); + + return ORTE_SUCCESS; +} + + +int orte_ns_replica_dump_datatypes(int output_id) +{ + orte_buffer_t buffer; + int rc; + + OBJ_CONSTRUCT(&buffer, orte_buffer_t); + if (ORTE_SUCCESS != (rc = orte_ns_replica_dump_datatypes_fn(&buffer))) { + ORTE_ERROR_LOG(rc); + return rc; + } + + if (ORTE_SUCCESS != (rc = orte_ns_base_print_dump(&buffer, output_id))) { + ORTE_ERROR_LOG(rc); + OBJ_DESTRUCT(&buffer); + return rc; + } + + OBJ_DESTRUCT(&buffer); + return ORTE_SUCCESS; +} + +int orte_ns_replica_dump_datatypes_fn(orte_buffer_t *buffer) +{ + size_t i, j; + orte_ns_replica_dti_t **ptr; + char tmp_out[NS_REPLICA_MAX_STRING_SIZE], *tmp; + int rc; + + OPAL_THREAD_LOCK(&orte_ns_replica.mutex); + + tmp = tmp_out; + snprintf(tmp, NS_REPLICA_MAX_STRING_SIZE, "Dump of Name Service Datatype Tracker\n"); + if (ORTE_SUCCESS != (rc = orte_dps.pack(buffer, &tmp, 1, ORTE_STRING))) { + ORTE_ERROR_LOG(rc); + OPAL_THREAD_UNLOCK(&orte_ns_replica.mutex); + return rc; + } + ptr = (orte_ns_replica_dti_t**)(orte_ns_replica.dts)->addr; + for (i=0, j=0; j < orte_ns_replica.num_dts && + i < (orte_ns_replica.dts)->size; i++) { + if (NULL != ptr[i]) { + j++; + snprintf(tmp, NS_REPLICA_MAX_STRING_SIZE, "Num: %lu\tDatatype id: %lu\tName: %s\n", + (unsigned long)j, (unsigned long)ptr[i]->id, ptr[i]->name); + if (ORTE_SUCCESS != (rc = orte_dps.pack(buffer, &tmp, 1, ORTE_STRING))) { + ORTE_ERROR_LOG(rc); + OPAL_THREAD_UNLOCK(&orte_ns_replica.mutex); + return rc; + } + } + } + + OPAL_THREAD_UNLOCK(&orte_ns_replica.mutex); + + return ORTE_SUCCESS; +} + + +/* + * TAG SERVER functions + */ int orte_ns_replica_assign_rml_tag(orte_rml_tag_t *tag, char *name) { - orte_ns_replica_tagitem_t *tagitem; + orte_ns_replica_tagitem_t *tagitem, **tags; + size_t i, j; + int rc; - OPAL_THREAD_LOCK(&orte_ns_replica_mutex); + OPAL_THREAD_LOCK(&orte_ns_replica.mutex); if (NULL != name) { /* see if this name is already in list - if so, return tag */ - for (tagitem = (orte_ns_replica_tagitem_t*)opal_list_get_first(&orte_ns_replica_taglist); - tagitem != (orte_ns_replica_tagitem_t*)opal_list_get_end(&orte_ns_replica_taglist); - tagitem = (orte_ns_replica_tagitem_t*)opal_list_get_next(tagitem)) { - if (tagitem->name != NULL && 0 == strcmp(name, tagitem->name)) { /* found name on list */ - *tag = tagitem->tag; - OPAL_THREAD_UNLOCK(&orte_ns_replica_mutex); - return ORTE_SUCCESS; + tags = (orte_ns_replica_tagitem_t**)orte_ns_replica.tags->addr; + for (i=0, j=0; j < orte_ns_replica.num_tags && + i < (orte_ns_replica.tags)->size; i++) { + if (NULL != tags[i]) { + j++; + if (tags[i]->name != NULL && + 0 == strcmp(name, tags[i]->name)) { /* found name on list */ + *tag = tags[i]->tag; + OPAL_THREAD_UNLOCK(&orte_ns_replica.mutex); + return ORTE_SUCCESS; + } } } } - /* not in list or not provided, so allocate next tag - * first check to see if one available - else error - */ - if (ORTE_RML_TAG_MAX > orte_ns_replica_next_rml_tag) { - /* okay, one available - assign it */ - tagitem = OBJ_NEW(orte_ns_replica_tagitem_t); - if (NULL == tagitem) { /* out of memory */ - *tag = ORTE_RML_TAG_MAX; - OPAL_THREAD_UNLOCK(&orte_ns_replica_mutex); - ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE); - return ORTE_ERR_OUT_OF_RESOURCE; - } - tagitem->tag = orte_ns_replica_next_rml_tag; - if (NULL != name) { /* provided - can look it up later */ - tagitem->name = strdup(name); - } else { - tagitem->name = NULL; - } - orte_ns_replica_next_rml_tag++; - opal_list_append(&orte_ns_replica_taglist, &tagitem->item); + /* not in list or not provided, so allocate next tag */ + *tag = ORTE_RML_TAG_MAX; - *tag = tagitem->tag; - OPAL_THREAD_UNLOCK(&orte_ns_replica_mutex); - return ORTE_SUCCESS; + /* check if tag is available - need to do this since the tag type + * is probably not going to be a size_t, so we cannot just rely + * on the pointer_array's size limits to protect us. NOTE: need to + * reserve ORTE_RML_TAG_MAX as an invalid value, so can't let + * num_tags get there + */ + if (ORTE_RML_TAG_MAX-2 < orte_ns_replica.num_tags) { + ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE); + OPAL_THREAD_UNLOCK(&orte_ns_replica.mutex); + return ORTE_ERR_OUT_OF_RESOURCE; } - /* no tag available */ - *tag = ORTE_RML_TAG_MAX; - OPAL_THREAD_UNLOCK(&orte_ns_replica_mutex); - ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE); - return ORTE_ERR_OUT_OF_RESOURCE; + tagitem = OBJ_NEW(orte_ns_replica_tagitem_t); + if (NULL == tagitem) { /* out of memory */ + ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE); + OPAL_THREAD_UNLOCK(&orte_ns_replica.mutex); + return ORTE_ERR_OUT_OF_RESOURCE; + } + if (ORTE_SUCCESS != (rc = orte_pointer_array_add(&i, + orte_ns_replica.tags, tagitem))) { + ORTE_ERROR_LOG(rc); + OPAL_THREAD_UNLOCK(&orte_ns_replica.mutex); + return rc; + } + tagitem->tag = orte_ns_replica.num_tags; + (orte_ns_replica.num_tags)++; + if (NULL != name) { /* provided - can look it up later */ + tagitem->name = strdup(name); + } else { + tagitem->name = NULL; + } + *tag = tagitem->tag; + OPAL_THREAD_UNLOCK(&orte_ns_replica.mutex); + return ORTE_SUCCESS; } - + +/* + * DATA TYPE SERVER functions + */ int orte_ns_replica_define_data_type(const char *name, orte_data_type_t *type) { - orte_ns_replica_dti_t *dti; + orte_ns_replica_dti_t **dti, *dtip; + size_t i, j; + int rc; if (NULL == name || 0 < *type) { ORTE_ERROR_LOG(ORTE_ERR_BAD_PARAM); return ORTE_ERR_BAD_PARAM; } - OPAL_THREAD_LOCK(&orte_ns_replica_mutex); + OPAL_THREAD_LOCK(&orte_ns_replica.mutex); - /* see if this name is already in list - if so, return id */ - for (dti = (orte_ns_replica_dti_t*)opal_list_get_first(&orte_ns_replica_dtlist); - dti != (orte_ns_replica_dti_t*)opal_list_get_end(&orte_ns_replica_dtlist); - dti = (orte_ns_replica_dti_t*)opal_list_get_next(dti)) { - if (dti->name != NULL && 0 == strcmp(name, dti->name)) { /* found name on list */ - *type = dti->id; - OPAL_THREAD_UNLOCK(&orte_ns_replica_mutex); - return ORTE_SUCCESS; + dti = (orte_ns_replica_dti_t**)orte_ns_replica.dts->addr; + for (i=0, j=0; j < orte_ns_replica.num_dts && + i < orte_ns_replica.dts->size; i++) { + if (NULL != dti[i]) { + j++; + if (dti[i]->name != NULL && + 0 == strcmp(name, dti[i]->name)) { /* found name on list */ + *type = dti[i]->id; + OPAL_THREAD_UNLOCK(&orte_ns_replica.mutex); + return ORTE_SUCCESS; + } } } - /* not in list or not provided, so allocate next id - * first check to see if one available - else error - */ - if (ORTE_DPS_ID_MAX > orte_ns_replica_next_dti) { - /* okay, one available - assign it */ - dti = OBJ_NEW(orte_ns_replica_dti_t); - if (NULL == dti) { /* out of memory */ - *type = ORTE_DPS_ID_MAX; - OPAL_THREAD_UNLOCK(&orte_ns_replica_mutex); - ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE); - return ORTE_ERR_OUT_OF_RESOURCE; - } - dti->id = orte_ns_replica_next_dti; - dti->name = strdup(name); - orte_ns_replica_next_dti++; - opal_list_append(&orte_ns_replica_dtlist, &dti->item); - - *type = dti->id; - OPAL_THREAD_UNLOCK(&orte_ns_replica_mutex); - return ORTE_SUCCESS; - } - - /* no id available */ + /* not in list or not provided, so allocate next id */ *type = ORTE_DPS_ID_MAX; - OPAL_THREAD_UNLOCK(&orte_ns_replica_mutex); - ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE); - return ORTE_ERR_OUT_OF_RESOURCE; + + /* check if id is available - need to do this since the data type + * is probably not going to be a size_t, so we cannot just rely + * on the pointer_array's size limits to protect us. + */ + if (ORTE_DPS_ID_MAX-2 < orte_ns_replica.num_dts) { + ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE); + OPAL_THREAD_UNLOCK(&orte_ns_replica.mutex); + return ORTE_ERR_OUT_OF_RESOURCE; + } + dtip = OBJ_NEW(orte_ns_replica_dti_t); + if (NULL == dtip) { /* out of memory */ + ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE); + OPAL_THREAD_UNLOCK(&orte_ns_replica.mutex); + return ORTE_ERR_OUT_OF_RESOURCE; + } + dtip->name = strdup(name); + if (ORTE_SUCCESS != (rc = orte_pointer_array_add(&i, + orte_ns_replica.dts, dtip))) { + ORTE_ERROR_LOG(rc); + OPAL_THREAD_UNLOCK(&orte_ns_replica.mutex); + return rc; + } + dtip->id = orte_ns_replica.num_dts; + (orte_ns_replica.num_dts)++; + + *type = dtip->id; + OPAL_THREAD_UNLOCK(&orte_ns_replica.mutex); + return ORTE_SUCCESS; } + +/* + * NAME functions + */ int orte_ns_replica_create_my_name(void) { orte_jobid_t jobid; diff --git a/orte/mca/ns/replica/src/ns_replica.h b/orte/mca/ns/replica/src/ns_replica.h index 07d33dd9c2..e04db4a791 100644 --- a/orte/mca/ns/replica/src/ns_replica.h +++ b/orte/mca/ns/replica/src/ns_replica.h @@ -19,13 +19,14 @@ #define NS_REPLICA_H #include "orte_config.h" -#include "include/types.h" +#include "orte/include/orte_types.h" #include "include/orte_constants.h" #include "opal/threads/mutex.h" -#include "opal/class/opal_list.h" -#include "dps/dps.h" -#include "mca/oob/oob_types.h" -#include "mca/ns/base/base.h" +#include "opal/class/opal_object.h" +#include "orte/class/orte_pointer_array.h" +#include "orte/dps/dps.h" +#include "orte/mca/oob/oob_types.h" +#include "orte/mca/ns/base/base.h" #if defined(c_plusplus) || defined(__cplusplus) extern "C" { @@ -34,7 +35,7 @@ extern "C" { /* list class for tracking cellid's */ struct orte_ns_replica_cell_tracker_t { - opal_list_item_t item; + opal_object_t super; orte_cellid_t cell; char *site; char *resource; @@ -45,21 +46,21 @@ OBJ_CLASS_DECLARATION(orte_ns_replica_cell_tracker_t); /* - * list class for tracking vpids/jobid - * This structure is used to create a linked list of jobid-max vpid pairs. Basically, we + * object for tracking vpids/jobids + * This structure is used to track jobid-max vpid pairs. Basically, we * are tracking the max used vpid for each jobid that has been created. */ -struct orte_ns_replica_name_tracker_t { - opal_list_item_t item; /**< Allows this item to be placed on a list */ - orte_jobid_t job; /**< Job id */ - orte_vpid_t last_used_vpid; /**< Tracks the vpid last given out */ +struct orte_ns_replica_jobid_tracker_t { + opal_object_t super; + orte_jobid_t jobid; /**< Job id */ + orte_vpid_t next_vpid; }; -typedef struct orte_ns_replica_name_tracker_t orte_ns_replica_name_tracker_t; +typedef struct orte_ns_replica_jobid_tracker_t orte_ns_replica_jobid_tracker_t; -OBJ_CLASS_DECLARATION(orte_ns_replica_name_tracker_t); +OBJ_CLASS_DECLARATION(orte_ns_replica_jobid_tracker_t); struct orte_ns_replica_tagitem_t { - opal_list_item_t item; /**< Allows this item to be placed on a list */ + opal_object_t super; orte_rml_tag_t tag; /**< OOB tag */ char *name; /**< Name associated with tag */ }; @@ -68,7 +69,7 @@ typedef struct orte_ns_replica_tagitem_t orte_ns_replica_tagitem_t; OBJ_CLASS_DECLARATION(orte_ns_replica_tagitem_t); struct orte_ns_replica_dti_t { - opal_list_item_t item; /**< Allows this item to be placed on a list */ + opal_object_t super; orte_data_type_t id; /**< data type id */ char *name; /**< Name associated with data type */ }; @@ -79,16 +80,26 @@ OBJ_CLASS_DECLARATION(orte_ns_replica_dti_t); /* * globals needed within component */ -extern orte_cellid_t orte_ns_replica_next_cellid; -extern orte_jobid_t orte_ns_replica_next_jobid; -extern opal_list_t orte_ns_replica_cell_tracker; -extern opal_list_t orte_ns_replica_name_tracker; -extern orte_rml_tag_t orte_ns_replica_next_rml_tag; -extern orte_data_type_t orte_ns_replica_next_dti; -extern opal_list_t orte_ns_replica_taglist; -extern opal_list_t orte_ns_replica_dtlist; -extern int orte_ns_replica_debug; -extern opal_mutex_t orte_ns_replica_mutex; +typedef struct { + size_t max_size, block_size; + orte_cellid_t num_cells; + orte_pointer_array_t *cells; +#if 0 + orte_jobgrp_t num_jobgrps; + orte_pointer_array_t *jobgrps; +#endif + orte_jobid_t num_jobids; + orte_pointer_array_t *jobids; + orte_pointer_array_t *tags; + orte_rml_tag_t num_tags; + orte_pointer_array_t *dts; + orte_data_type_t num_dts; + int debug; + bool isolate; + opal_mutex_t mutex; +} orte_ns_replica_globals_t; + +extern orte_ns_replica_globals_t orte_ns_replica; /* * Module open / close @@ -135,6 +146,29 @@ int orte_ns_replica_reserve_range(orte_jobid_t job, orte_vpid_t range, orte_vpid_t *startvpid); +/* + * Peer functions + */ +int orte_ns_replica_get_job_peers(orte_process_name_t **procs, + size_t *num_procs, orte_jobid_t job); + + +/* + * Diagnostic functions + */ +int orte_ns_replica_dump_cells(int output_id); +int orte_ns_replica_dump_cells_fn(orte_buffer_t *buffer); + +int orte_ns_replica_dump_jobs(int output_id); +int orte_ns_replica_dump_jobs_fn(orte_buffer_t *buffer); + +int orte_ns_replica_dump_tags(int output_id); +int orte_ns_replica_dump_tags_fn(orte_buffer_t *buffer); + +int orte_ns_replica_dump_datatypes(int output_id); +int orte_ns_replica_dump_datatypes_fn(orte_buffer_t *buffer); + + /* * Implementation of assign rml tag */ diff --git a/orte/mca/ns/replica/src/ns_replica_component.c b/orte/mca/ns/replica/src/ns_replica_component.c index 7639dc9bb2..7a37bc5a87 100644 --- a/orte/mca/ns/replica/src/ns_replica_component.c +++ b/orte/mca/ns/replica/src/ns_replica_component.c @@ -67,36 +67,49 @@ OMPI_COMP_EXPORT mca_ns_base_component_t mca_ns_replica_component = { /* * setup the function pointers for the module */ -static mca_ns_base_module_t orte_ns_replica = { +static mca_ns_base_module_t orte_ns_replica_module = { + /* init */ orte_ns_replica_module_init, + /* cell functions */ orte_ns_replica_create_cellid, + orte_ns_base_get_cellid, orte_ns_replica_get_cell_info, orte_ns_base_assign_cellid_to_process, + orte_ns_base_get_cellid_string, + orte_ns_base_convert_cellid_to_string, + orte_ns_base_convert_string_to_cellid, + /* jobid functions */ orte_ns_replica_create_jobid, + orte_ns_base_get_jobid, + orte_ns_base_get_jobid_string, + orte_ns_base_convert_jobid_to_string, + orte_ns_base_convert_string_to_jobid, + /* vpid functions */ + orte_ns_replica_reserve_range, + orte_ns_base_get_vpid, + orte_ns_base_get_vpid_string, + orte_ns_base_convert_vpid_to_string, + orte_ns_base_convert_string_to_vpid, + /* name functions */ orte_ns_base_create_process_name, orte_ns_replica_create_my_name, orte_ns_base_copy_process_name, orte_ns_base_convert_string_to_process_name, - orte_ns_replica_reserve_range, orte_ns_base_free_name, orte_ns_base_get_proc_name_string, - orte_ns_base_get_vpid_string, - orte_ns_base_convert_vpid_to_string, - orte_ns_base_convert_string_to_vpid, - orte_ns_base_get_jobid_string, - orte_ns_base_convert_jobid_to_string, - orte_ns_base_convert_string_to_jobid, - orte_ns_base_get_cellid_string, - orte_ns_base_convert_cellid_to_string, - orte_ns_base_convert_string_to_cellid, - orte_ns_base_get_vpid, - orte_ns_base_get_jobid, - orte_ns_base_get_cellid, orte_ns_base_compare, - orte_ns_base_derive_vpid, + /* peer functions */ + orte_ns_base_get_peers, + orte_ns_replica_get_job_peers, + /* tag server functions */ orte_ns_replica_assign_rml_tag, + /* data type functions */ orte_ns_replica_define_data_type, - orte_ns_base_get_peers + /* diagnostic functions */ + orte_ns_replica_dump_cells, + orte_ns_replica_dump_jobs, + orte_ns_replica_dump_tags, + orte_ns_replica_dump_datatypes }; /* @@ -123,29 +136,28 @@ static void orte_ns_replica_cell_tracker_destructor(orte_ns_replica_cell_tracker /* define instance of opal_class_t */ OBJ_CLASS_INSTANCE( orte_ns_replica_cell_tracker_t, /* type name */ - opal_list_item_t, /* parent "class" name */ + opal_object_t, /* parent "class" name */ orte_ns_replica_cell_tracker_construct, /* constructor */ orte_ns_replica_cell_tracker_destructor); /* destructor */ -/* constructor - used to initialize state of name_tracker instance */ -static void orte_ns_replica_tracker_construct(orte_ns_replica_name_tracker_t* name_tracker) +/* constructor - used to initialize state of jobid_tracker instance */ +static void orte_ns_replica_jobid_tracker_construct(orte_ns_replica_jobid_tracker_t* jobid_tracker) { - name_tracker->job = 0; - name_tracker->last_used_vpid = 0; + jobid_tracker->jobid = ORTE_JOBID_MAX; + jobid_tracker->next_vpid = 0; } /* destructor - used to free any resources held by instance */ -static void orte_ns_replica_tracker_destructor(orte_ns_replica_name_tracker_t* name_tracker) -{ +static void orte_ns_replica_jobid_tracker_destructor(orte_ns_replica_jobid_tracker_t* jobid_tracker){ } /* define instance of opal_class_t */ OBJ_CLASS_INSTANCE( - orte_ns_replica_name_tracker_t, /* type name */ - opal_list_item_t, /* parent "class" name */ - orte_ns_replica_tracker_construct, /* constructor */ - orte_ns_replica_tracker_destructor); /* destructor */ + orte_ns_replica_jobid_tracker_t, /* type name */ + opal_object_t, /* parent "class" name */ + orte_ns_replica_jobid_tracker_construct, /* constructor */ + orte_ns_replica_jobid_tracker_destructor); /* destructor */ /* constructor - used to initialize state of taglist instance */ @@ -166,7 +178,7 @@ static void orte_ns_replica_tagitem_destructor(orte_ns_replica_tagitem_t* tagite /* define instance of opal_class_t */ OBJ_CLASS_INSTANCE( orte_ns_replica_tagitem_t, /* type name */ - opal_list_item_t, /* parent "class" name */ + opal_object_t, /* parent "class" name */ orte_ns_replica_tagitem_construct, /* constructor */ orte_ns_replica_tagitem_destructor); /* destructor */ @@ -189,24 +201,14 @@ static void orte_ns_replica_dti_destructor(orte_ns_replica_dti_t* dti) /* define instance of opal_class_t */ OBJ_CLASS_INSTANCE( orte_ns_replica_dti_t, /* type name */ - opal_list_item_t, /* parent "class" name */ + opal_object_t, /* parent "class" name */ orte_ns_replica_dti_construct, /* constructor */ orte_ns_replica_dti_destructor); /* destructor */ /* * globals needed within replica component */ -orte_cellid_t orte_ns_replica_next_cellid; -orte_jobid_t orte_ns_replica_next_jobid; -opal_list_t orte_ns_replica_cell_tracker; -opal_list_t orte_ns_replica_name_tracker; -orte_rml_tag_t orte_ns_replica_next_rml_tag; -orte_data_type_t orte_ns_replica_next_dti; -opal_list_t orte_ns_replica_taglist; -opal_list_t orte_ns_replica_dtlist; -int orte_ns_replica_debug; -opal_mutex_t orte_ns_replica_mutex; -int orte_ns_replica_isolate; +orte_ns_replica_globals_t orte_ns_replica; /* * don't really need this function - could just put NULL in the above structure @@ -214,13 +216,28 @@ int orte_ns_replica_isolate; */ int orte_ns_replica_open(void) { - int id; + int id, param; id = mca_base_param_register_int("ns", "replica", "debug", NULL, (int)false); - mca_base_param_lookup_int(id, &orte_ns_replica_debug); + mca_base_param_lookup_int(id, &orte_ns_replica.debug); id = mca_base_param_register_int("ns", "replica", "isolate", NULL, (int)false); - mca_base_param_lookup_int(id, &orte_ns_replica_isolate); + mca_base_param_lookup_int(id, ¶m); + if (param) { + orte_ns_replica.isolate = true; + } else { + orte_ns_replica.isolate = false; + } + + id = mca_base_param_register_int("ns", "replica", "maxsize", NULL, + ORTE_NS_ARRAY_MAX_SIZE); + mca_base_param_lookup_int(id, ¶m); + orte_ns_replica.max_size = (size_t)param; + + id = mca_base_param_register_int("ns", "replica", "blocksize", NULL, + ORTE_NS_ARRAY_BLOCK_SIZE); + mca_base_param_lookup_int(id, ¶m); + orte_ns_replica.block_size = (size_t)param; return ORTE_SUCCESS; } @@ -235,16 +252,13 @@ int orte_ns_replica_close(void) mca_ns_base_module_t* orte_ns_replica_init(int *priority) { - orte_ns_replica_name_tracker_t *new_nt; + int rc; /* If we are to host a replica, then we want to be selected, so do all the setup and return the module */ if (NULL == orte_process_info.ns_replica_uri) { - orte_ns_replica_next_cellid = 0; - orte_ns_replica_next_jobid = 1; /* jobid 0 reserved for universe */ - /* Return a module (choose an arbitrary, positive priority -- it's only relevant compared to other ns components). If we're not the seed, then we don't want to be selected, so @@ -252,43 +266,55 @@ mca_ns_base_module_t* orte_ns_replica_init(int *priority) *priority = 50; - /* initialize the cell tracker */ - - OBJ_CONSTRUCT(&orte_ns_replica_cell_tracker, opal_list_t); - orte_ns_replica_next_cellid = 0; - - /* initialize the name tracker */ - - OBJ_CONSTRUCT(&orte_ns_replica_name_tracker, opal_list_t); + /* initialize the cell info tracker */ + if (ORTE_SUCCESS != (rc = orte_pointer_array_init(&(orte_ns_replica.cells), + orte_ns_replica.block_size, + orte_ns_replica.max_size, + orte_ns_replica.block_size))) { + ORTE_ERROR_LOG(rc); + return NULL; + } + orte_ns_replica.num_cells = 0; + + /* initialize the job id tracker */ + if (ORTE_SUCCESS != (rc = orte_pointer_array_init(&(orte_ns_replica.jobids), + orte_ns_replica.block_size, + orte_ns_replica.max_size, + orte_ns_replica.block_size))) { + ORTE_ERROR_LOG(rc); + return NULL; + } + orte_ns_replica.num_jobids = 0; /* initialize the taglist */ - OBJ_CONSTRUCT(&orte_ns_replica_taglist, opal_list_t); - orte_ns_replica_next_rml_tag = ORTE_RML_TAG_DYNAMIC; + if (ORTE_SUCCESS != (rc = orte_pointer_array_init(&(orte_ns_replica.tags), + orte_ns_replica.block_size, + orte_ns_replica.max_size, + orte_ns_replica.block_size))) { + ORTE_ERROR_LOG(rc); + return NULL; + } + orte_ns_replica.num_tags = 0; /* initialize the dtlist */ - OBJ_CONSTRUCT(&orte_ns_replica_dtlist, opal_list_t); - orte_ns_replica_next_dti = ORTE_DPS_ID_DYNAMIC; + if (ORTE_SUCCESS != (rc = orte_pointer_array_init(&(orte_ns_replica.dts), + orte_ns_replica.block_size, + orte_ns_replica.max_size, + orte_ns_replica.block_size))) { + ORTE_ERROR_LOG(rc); + return NULL; + } + orte_ns_replica.num_dts = 0; /* setup the thread lock */ - OBJ_CONSTRUCT(&orte_ns_replica_mutex, opal_mutex_t); + OBJ_CONSTRUCT(&orte_ns_replica.mutex, opal_mutex_t); - /* setup the "0" job counter - this is the default one that belongs to - * all daemons. Seed must automatically have it. - */ - new_nt = OBJ_NEW(orte_ns_replica_name_tracker_t); - if (NULL == new_nt) { /* out of memory */ - return NULL; - } - new_nt->job = 0; - new_nt->last_used_vpid = 0; - opal_list_append(&orte_ns_replica_name_tracker, &new_nt->item); - /* Return the module */ initialized = true; - return &orte_ns_replica; + return &orte_ns_replica_module; } else { return NULL; } @@ -297,7 +323,7 @@ mca_ns_base_module_t* orte_ns_replica_init(int *priority) int orte_ns_replica_module_init(void) { int rc; - if (orte_ns_replica_isolate) { + if (orte_ns_replica.isolate) { return ORTE_SUCCESS; } @@ -316,32 +342,48 @@ int orte_ns_replica_module_init(void) */ int orte_ns_replica_finalize(void) { - orte_ns_replica_tagitem_t *tagitem; - orte_ns_replica_dti_t *dti; + orte_ns_replica_cell_tracker_t **cptr; + orte_ns_replica_jobid_tracker_t **jptr; + orte_ns_replica_tagitem_t **tag; + orte_ns_replica_dti_t **dti; + size_t i; - if (orte_ns_replica_debug) { - opal_output(0, "finalizing ns replica"); - } - /* free all tracking storage, but only if this component was initialized */ if (initialized) { -/* OBJ_DESTRUCT(&orte_ns_replica_name_tracker); */ - while (NULL != (tagitem = (orte_ns_replica_tagitem_t*)opal_list_remove_first(&orte_ns_replica_taglist))) { - OBJ_RELEASE(tagitem); + cptr = (orte_ns_replica_cell_tracker_t**)(orte_ns_replica.cells)->addr; + for (i=0; i < (orte_ns_replica.cells)->size; i++) { + if (NULL != cptr[i]) { + OBJ_RELEASE(cptr[i]); + } } - OBJ_DESTRUCT(&orte_ns_replica_taglist); - while (NULL != (dti = (orte_ns_replica_dti_t*)opal_list_remove_first(&orte_ns_replica_dtlist))) { - OBJ_RELEASE(dti); + OBJ_RELEASE(orte_ns_replica.cells); + + jptr = (orte_ns_replica_jobid_tracker_t**)(orte_ns_replica.jobids)->addr; + for (i=0; i < (orte_ns_replica.jobids)->size; i++) { + if (NULL != jptr[i]) { + OBJ_RELEASE(jptr[i]); + } } - OBJ_DESTRUCT(&orte_ns_replica_dtlist); - OBJ_DESTRUCT(&orte_ns_replica_mutex); + OBJ_RELEASE(orte_ns_replica.jobids); + + tag = (orte_ns_replica_tagitem_t**)(orte_ns_replica.tags)->addr; + for (i=0; i < (orte_ns_replica.tags)->size; i++) { + if (NULL != tag[i]) OBJ_RELEASE(tag[i]); + } + OBJ_RELEASE(orte_ns_replica.tags); + + dti = (orte_ns_replica_dti_t**)(orte_ns_replica.dts)->addr; + for (i=0; i < (orte_ns_replica.dts)->size; i++) { + if (NULL != dti[i]) OBJ_RELEASE(dti[i]); + } + OBJ_RELEASE(orte_ns_replica.dts); initialized = false; } /* All done */ - if (orte_ns_replica_isolate) { + if (orte_ns_replica.isolate) { return ORTE_SUCCESS; } @@ -527,7 +569,50 @@ void orte_ns_replica_recv(int status, orte_process_name_t* sender, case ORTE_NS_CREATE_MY_NAME_CMD: /* ignore this command */ - goto CLEANUP; + break; + + case ORTE_NS_DUMP_CELLS_CMD: + if (ORTE_SUCCESS != (rc = orte_ns_replica_dump_cells_fn(&answer))) { + ORTE_ERROR_LOG(rc); + goto RETURN_ERROR; + } + if (0 > orte_rml.send_buffer(sender, &answer, tag, 0)) { + ORTE_ERROR_LOG(ORTE_ERR_COMM_FAILURE); + goto RETURN_ERROR; + } + break; + + case ORTE_NS_DUMP_JOBIDS_CMD: + if (ORTE_SUCCESS != (rc = orte_ns_replica_dump_jobs_fn(&answer))) { + ORTE_ERROR_LOG(rc); + goto RETURN_ERROR; + } + if (0 > orte_rml.send_buffer(sender, &answer, tag, 0)) { + ORTE_ERROR_LOG(ORTE_ERR_COMM_FAILURE); + goto RETURN_ERROR; + } + break; + + case ORTE_NS_DUMP_TAGS_CMD: + if (ORTE_SUCCESS != (rc = orte_ns_replica_dump_tags_fn(&answer))) { + ORTE_ERROR_LOG(rc); + goto RETURN_ERROR; + } + if (0 > orte_rml.send_buffer(sender, &answer, tag, 0)) { + ORTE_ERROR_LOG(ORTE_ERR_COMM_FAILURE); + goto RETURN_ERROR; + } + break; + + case ORTE_NS_DUMP_DATATYPES_CMD: + if (ORTE_SUCCESS != (rc = orte_ns_replica_dump_datatypes_fn(&answer))) { + ORTE_ERROR_LOG(rc); + goto RETURN_ERROR; + } + if (0 > orte_rml.send_buffer(sender, &answer, tag, 0)) { + ORTE_ERROR_LOG(ORTE_ERR_COMM_FAILURE); + goto RETURN_ERROR; + } break; default: