2013-09-10 19:34:09 +04:00
|
|
|
/*
|
|
|
|
* Copyright (c) 2013 Mellanox Technologies, Inc.
|
|
|
|
* All rights reserved.
|
|
|
|
* $COPYRIGHT$
|
|
|
|
*
|
|
|
|
* Additional copyrights may follow
|
|
|
|
*
|
|
|
|
* $HEADER$
|
|
|
|
*/
|
|
|
|
#ifndef OSHMEM_PROC_PROC_H
|
|
|
|
#define OSHMEM_PROC_PROC_H
|
|
|
|
|
|
|
|
#include "oshmem_config.h"
|
|
|
|
#include "oshmem/types.h"
|
|
|
|
#include "oshmem/constants.h"
|
|
|
|
|
|
|
|
#include "oshmem/mca/scoll/scoll.h"
|
|
|
|
|
|
|
|
#include "opal/class/opal_list.h"
|
George did the work and deserves all the credit for it. Ralph did the merge, and deserves whatever blame results from errors in it :-)
WHAT: Open our low-level communication infrastructure by moving all necessary components (btl/rcache/allocator/mpool) down in OPAL
All the components required for inter-process communications are currently deeply integrated in the OMPI layer. Several groups/institutions have express interest in having a more generic communication infrastructure, without all the OMPI layer dependencies. This communication layer should be made available at a different software level, available to all layers in the Open MPI software stack. As an example, our ORTE layer could replace the current OOB and instead use the BTL directly, gaining access to more reactive network interfaces than TCP. Similarly, external software libraries could take advantage of our highly optimized AM (active message) communication layer for their own purpose. UTK with support from Sandia, developped a version of Open MPI where the entire communication infrastucture has been moved down to OPAL (btl/rcache/allocator/mpool). Most of the moved components have been updated to match the new schema, with few exceptions (mainly BTLs where I have no way of compiling/testing them). Thus, the completion of this RFC is tied to being able to completing this move for all BTLs. For this we need help from the rest of the Open MPI community, especially those supporting some of the BTLs. A non-exhaustive list of BTLs that qualify here is: mx, portals4, scif, udapl, ugni, usnic.
This commit was SVN r32317.
2014-07-26 04:47:28 +04:00
|
|
|
#include "opal/util/proc.h"
|
2013-09-10 19:34:09 +04:00
|
|
|
#include "opal/dss/dss_types.h"
|
|
|
|
#include "opal/mca/hwloc/hwloc.h"
|
|
|
|
|
|
|
|
#include "orte/types.h"
|
|
|
|
#include "orte/runtime/orte_globals.h"
|
|
|
|
|
2015-09-17 18:20:37 +03:00
|
|
|
#include "ompi/proc/proc.h"
|
2014-02-25 19:01:10 +04:00
|
|
|
#include "ompi/communicator/communicator.h"
|
|
|
|
|
2013-09-10 19:34:09 +04:00
|
|
|
BEGIN_C_DECLS
|
|
|
|
|
|
|
|
/* ******************************************************************** */
|
|
|
|
|
|
|
|
struct oshmem_group_t;
|
|
|
|
|
|
|
|
#define OSHMEM_PE_INVALID (-1)
|
|
|
|
|
|
|
|
/**
|
|
|
|
* Remote Open SHMEM process structure
|
2015-06-24 06:59:57 +03:00
|
|
|
*
|
2013-09-10 19:34:09 +04:00
|
|
|
* Remote Open SHMEM process structure. Each process contains exactly
|
|
|
|
* one oshmem_proc_t structure for each remote process it knows about.
|
|
|
|
*/
|
|
|
|
struct oshmem_proc_t {
|
George did the work and deserves all the credit for it. Ralph did the merge, and deserves whatever blame results from errors in it :-)
WHAT: Open our low-level communication infrastructure by moving all necessary components (btl/rcache/allocator/mpool) down in OPAL
All the components required for inter-process communications are currently deeply integrated in the OMPI layer. Several groups/institutions have express interest in having a more generic communication infrastructure, without all the OMPI layer dependencies. This communication layer should be made available at a different software level, available to all layers in the Open MPI software stack. As an example, our ORTE layer could replace the current OOB and instead use the BTL directly, gaining access to more reactive network interfaces than TCP. Similarly, external software libraries could take advantage of our highly optimized AM (active message) communication layer for their own purpose. UTK with support from Sandia, developped a version of Open MPI where the entire communication infrastucture has been moved down to OPAL (btl/rcache/allocator/mpool). Most of the moved components have been updated to match the new schema, with few exceptions (mainly BTLs where I have no way of compiling/testing them). Thus, the completion of this RFC is tied to being able to completing this move for all BTLs. For this we need help from the rest of the Open MPI community, especially those supporting some of the BTLs. A non-exhaustive list of BTLs that qualify here is: mx, portals4, scif, udapl, ugni, usnic.
This commit was SVN r32317.
2014-07-26 04:47:28 +04:00
|
|
|
opal_proc_t super;
|
2013-09-10 19:34:09 +04:00
|
|
|
/* endpoint data */
|
|
|
|
void *proc_endpoints[OMPI_PROC_ENDPOINT_TAG_MAX];
|
2015-06-24 06:59:57 +03:00
|
|
|
/*
|
|
|
|
* All transport channels are globally ordered.
|
2013-09-10 19:34:09 +04:00
|
|
|
* pe(s) can talk to each other via subset of transports
|
|
|
|
* these holds indexes of each transport into global array
|
|
|
|
* proc -> id, where id can be btl id in yoda or mxm ptl id
|
2015-06-24 06:59:57 +03:00
|
|
|
* in ikrit
|
2013-09-10 19:34:09 +04:00
|
|
|
* spml is supposed to fill this during add_procs()
|
|
|
|
**/
|
|
|
|
int num_transports;
|
|
|
|
char *transport_ids;
|
|
|
|
};
|
|
|
|
|
|
|
|
typedef struct oshmem_proc_t oshmem_proc_t;
|
|
|
|
OBJ_CLASS_DECLARATION(oshmem_proc_t);
|
|
|
|
|
|
|
|
/**
|
|
|
|
* Group of Open SHMEM processes structure
|
2015-06-24 06:59:57 +03:00
|
|
|
*
|
2013-09-10 19:34:09 +04:00
|
|
|
* Set of processes used in collective operations.
|
|
|
|
*/
|
|
|
|
struct oshmem_group_t {
|
|
|
|
opal_object_t base;
|
|
|
|
int id; /**< index in global array */
|
|
|
|
int my_pe;
|
|
|
|
int proc_count; /**< number of processes in group */
|
|
|
|
int is_member; /* true if my_pe is part of the group, participate in collectives */
|
|
|
|
struct oshmem_proc_t **proc_array; /**< list of pointers to ompi_proc_t structures
|
|
|
|
for each process in the group */
|
|
|
|
opal_list_t peer_list;
|
|
|
|
|
|
|
|
/* Collectives module interface and data */
|
|
|
|
mca_scoll_base_group_scoll_t g_scoll;
|
2014-02-25 19:01:10 +04:00
|
|
|
ompi_communicator_t* ompi_comm;
|
2013-09-10 19:34:09 +04:00
|
|
|
};
|
|
|
|
typedef struct oshmem_group_t oshmem_group_t;
|
|
|
|
OSHMEM_DECLSPEC OBJ_CLASS_DECLARATION(oshmem_group_t);
|
|
|
|
|
|
|
|
OSHMEM_DECLSPEC extern oshmem_group_t* oshmem_group_all;
|
|
|
|
OSHMEM_DECLSPEC extern oshmem_group_t* oshmem_group_self;
|
|
|
|
OSHMEM_DECLSPEC extern oshmem_group_t* oshmem_group_null;
|
|
|
|
|
|
|
|
|
|
|
|
/* ******************************************************************** */
|
|
|
|
|
|
|
|
/**
|
|
|
|
* Initialize the OSHMEM process subsystem
|
|
|
|
*
|
|
|
|
* Initialize the Open SHMEM process subsystem. This function will
|
|
|
|
* query the run-time environment and build a list of the proc
|
|
|
|
* instances in the current pe set. The local information not
|
|
|
|
* easily determined by the run-time ahead of time (architecture and
|
|
|
|
* hostname) will be published during this call.
|
|
|
|
*
|
|
|
|
* @note While an oshmem_proc_t will exist with mostly valid information
|
|
|
|
* for each process in the pe set at the conclusion of this
|
|
|
|
* call, some information will not be immediately available. This
|
|
|
|
* includes the architecture and hostname, which will be available by
|
|
|
|
* the conclusion of the stage gate.
|
|
|
|
*
|
|
|
|
* @retval OSHMEM_SUCCESS System successfully initialized
|
|
|
|
* @retval OSHMEM_ERROR Initialization failed due to unspecified error
|
|
|
|
*/
|
|
|
|
OSHMEM_DECLSPEC int oshmem_proc_init(void);
|
|
|
|
|
|
|
|
/**
|
|
|
|
* Finalize the OSHMEM Process subsystem
|
|
|
|
*
|
|
|
|
* Finalize the Open SHMEM process subsystem. This function will
|
|
|
|
* release all memory created during the life of the application,
|
|
|
|
* including all oshmem_proc_t structures.
|
|
|
|
*
|
|
|
|
* @retval OSHMEM_SUCCESS System successfully finalized
|
|
|
|
*/
|
|
|
|
OSHMEM_DECLSPEC int oshmem_proc_finalize(void);
|
|
|
|
|
|
|
|
/**
|
|
|
|
* Returns a pointer to the local process
|
|
|
|
*
|
|
|
|
* Returns a pointer to the local process. Unlike oshmem_proc_self(),
|
|
|
|
* the reference count on the local proc instance is not modified by
|
|
|
|
* this function.
|
|
|
|
*
|
|
|
|
* @return Pointer to the local process structure
|
|
|
|
*/
|
2015-09-18 17:40:21 +03:00
|
|
|
static inline oshmem_proc_t *oshmem_proc_local(void)
|
2013-09-10 19:34:09 +04:00
|
|
|
{
|
2015-09-17 18:20:37 +03:00
|
|
|
return (oshmem_proc_t *)ompi_proc_local_proc;
|
2013-09-10 19:34:09 +04:00
|
|
|
}
|
|
|
|
|
|
|
|
/**
|
2015-06-24 06:59:57 +03:00
|
|
|
* Returns the proc instance for a given name
|
2013-09-10 19:34:09 +04:00
|
|
|
*
|
|
|
|
* Returns the proc instance for the specified process name. The
|
|
|
|
* reference count for the proc instance is not incremented by this
|
|
|
|
* function.
|
|
|
|
*
|
|
|
|
* @param[in] name The process name to look for
|
|
|
|
*
|
|
|
|
* @return Pointer to the process instance for \c name
|
|
|
|
*/
|
2015-09-18 17:40:21 +03:00
|
|
|
static inline oshmem_proc_t *oshmem_proc_for_find(const orte_process_name_t name)
|
2015-09-17 18:20:37 +03:00
|
|
|
{
|
2015-09-18 19:20:58 +03:00
|
|
|
return (oshmem_proc_t *)ompi_proc_for_name(name);
|
2015-09-18 17:40:21 +03:00
|
|
|
}
|
|
|
|
|
|
|
|
static inline oshmem_proc_t *oshmem_proc_find(int pe)
|
|
|
|
{
|
|
|
|
orte_process_name_t name;
|
|
|
|
|
|
|
|
name.jobid = ORTE_PROC_MY_NAME->jobid;
|
|
|
|
name.vpid = pe;
|
|
|
|
return oshmem_proc_for_find(name);
|
2015-09-17 18:20:37 +03:00
|
|
|
}
|
2013-09-10 19:34:09 +04:00
|
|
|
|
|
|
|
/**
 * Return the PE number of a proc.
 *
 * The PE is the vpid stored in the proc's process name
 * (super.proc_name, read as an orte_process_name_t).
 *
 * @param[in] proc  Proc to query; may be NULL
 *
 * @return The vpid converted to int, or OSHMEM_PE_INVALID (-1) when
 *         @p proc is NULL
 */
static inline int oshmem_proc_pe(oshmem_proc_t *proc)
{
    orte_process_name_t *name;

    if (!proc) {
        return OSHMEM_PE_INVALID;
    }

    name = (orte_process_name_t *) &proc->super.proc_name;
    return (int) name->vpid;
}
|
|
|
|
|
George did the work and deserves all the credit for it. Ralph did the merge, and deserves whatever blame results from errors in it :-)
WHAT: Open our low-level communication infrastructure by moving all necessary components (btl/rcache/allocator/mpool) down in OPAL
All the components required for inter-process communications are currently deeply integrated in the OMPI layer. Several groups/institutions have express interest in having a more generic communication infrastructure, without all the OMPI layer dependencies. This communication layer should be made available at a different software level, available to all layers in the Open MPI software stack. As an example, our ORTE layer could replace the current OOB and instead use the BTL directly, gaining access to more reactive network interfaces than TCP. Similarly, external software libraries could take advantage of our highly optimized AM (active message) communication layer for their own purpose. UTK with support from Sandia, developped a version of Open MPI where the entire communication infrastucture has been moved down to OPAL (btl/rcache/allocator/mpool). Most of the moved components have been updated to match the new schema, with few exceptions (mainly BTLs where I have no way of compiling/testing them). Thus, the completion of this RFC is tied to being able to completing this move for all BTLs. For this we need help from the rest of the Open MPI community, especially those supporting some of the BTLs. A non-exhaustive list of BTLs that qualify here is: mx, portals4, scif, udapl, ugni, usnic.
This commit was SVN r32317.
2014-07-26 04:47:28 +04:00
|
|
|
/* jobid field of PROC's process name (super.proc_name read as an orte_process_name_t). */
#define OSHMEM_PROC_JOBID(PROC) (((orte_process_name_t*)&((PROC)->super.proc_name))->jobid)
|
|
|
|
/* vpid field of PROC's process name (super.proc_name read as an orte_process_name_t). */
#define OSHMEM_PROC_VPID(PROC) (((orte_process_name_t*)&((PROC)->super.proc_name))->vpid)
|
|
|
|
|
2013-09-10 19:34:09 +04:00
|
|
|
/**
|
|
|
|
* Initialize the OSHMEM process predefined groups
|
|
|
|
*
|
|
|
|
* Initialize the Open SHMEM process predefined groups. This function will
|
|
|
|
* query the run-time environment and build a list of the proc
|
|
|
|
* instances in the current pe set. The local information not
|
|
|
|
* easily determined by the run-time ahead of time (architecture and
|
|
|
|
* hostname) will be published during this call.
|
|
|
|
*
|
|
|
|
* @note This is primarily used once during SHMEM setup.
|
|
|
|
*
|
|
|
|
* @retval OSHMEM_SUCCESS System successfully initialized
|
|
|
|
* @retval OSHMEM_ERROR Initialization failed due to unspecified error
|
|
|
|
*/
|
|
|
|
OSHMEM_DECLSPEC int oshmem_proc_group_init(void);
|
|
|
|
|
|
|
|
/**
|
|
|
|
* Finalize the OSHMEM process predefined groups
|
|
|
|
*
|
|
|
|
* Finalize the Open SHMEM process predefined groups. This is the
* counterpart of oshmem_proc_group_init().
*
* @note This is expected to be used once, during SHMEM shutdown.
*
* @retval OSHMEM_SUCCESS System successfully finalized
* @retval OSHMEM_ERROR Finalization failed due to unspecified error
|
|
|
|
*/
|
|
|
|
OSHMEM_DECLSPEC int oshmem_proc_group_finalize(void);
|
|
|
|
|
|
|
|
/**
|
|
|
|
* Create processes group.
|
|
|
|
*
|
|
|
|
* Creates a group describing the active set given by the parameters.
|
|
|
|
*
|
|
|
|
* @param[in] pe_start The lowest PE in the active set.
|
2015-06-24 06:59:57 +03:00
|
|
|
* @param[in] pe_stride The log (base 2) of the stride between consecutive
|
2013-09-10 19:34:09 +04:00
|
|
|
* PEs in the active set.
|
|
|
|
* @param[in] pe_size The number of PEs in the active set.
|
|
|
|
*
|
|
|
|
* @return Pointer to the created group, or NULL if there is an
* internal failure.
|
|
|
|
*/
|
2015-09-18 17:40:21 +03:00
|
|
|
OSHMEM_DECLSPEC oshmem_group_t *oshmem_proc_group_create(int pe_start,
|
2013-09-10 19:34:09 +04:00
|
|
|
int pe_stride,
|
|
|
|
size_t pe_size);
|
|
|
|
|
|
|
|
/**
|
|
|
|
* Destroy processes group.
|
|
|
|
*
|
|
|
|
*/
|
|
|
|
OSHMEM_DECLSPEC void oshmem_proc_group_destroy(oshmem_group_t* group);
|
|
|
|
|
|
|
|
static inline oshmem_proc_t *oshmem_proc_group_all(int pe)
|
|
|
|
{
|
|
|
|
return oshmem_group_all->proc_array[pe];
|
|
|
|
}
|
|
|
|
|
2015-09-18 17:40:21 +03:00
|
|
|
/**
 * Find the proc instance of a PE, optionally restricted to a group.
 *
 * Three cases, depending on @p group:
 *  - NULL: the proc is looked up by name, using the jobid of
 *    ORTE_PROC_MY_NAME and a vpid equal to @p pe.
 *  - oshmem_group_all: proc_array is indexed directly by @p pe (direct
 *    indexing is a feature of oshmem_group_all and avoids the search).
 *    A @p pe outside [0, proc_count) returns NULL instead of reading
 *    outside the array, matching the "not found" result of the search
 *    below. This assumes proc_array holds proc_count entries.
 *  - any other group: proc_array is searched linearly for an entry
 *    whose PE equals @p pe.
 *
 * @param[in] group  Group to search, or NULL for a lookup by name
 * @param[in] pe     PE number to look for
 *
 * @return The matching proc, or NULL when @p pe is not in @p group
 */
static inline oshmem_proc_t *oshmem_proc_group_find(oshmem_group_t* group,
                                                    int pe)
{
    int i;

    if (!OPAL_LIKELY(group)) {
        orte_process_name_t name;

        name.jobid = ORTE_PROC_MY_NAME->jobid;
        name.vpid = pe;
        return oshmem_proc_for_find(name);
    }

    if (OPAL_LIKELY(group == oshmem_group_all)) {
        /* Fast path: direct index, guarded so a bad PE cannot read
         * outside proc_array. */
        if (pe < 0 || pe >= group->proc_count) {
            return NULL;
        }
        return group->proc_array[pe];
    }

    for (i = 0; i < group->proc_count; i++) {
        if (pe == oshmem_proc_pe(group->proc_array[i])) {
            return group->proc_array[i];
        }
    }

    return NULL;
}
|
|
|
|
|
|
|
|
/**
 * Find the position of a PE inside a group.
 *
 * Scans group->proc_array in order and reports the index of the first
 * entry whose PE (as given by oshmem_proc_pe()) equals @p pe.
 *
 * @param[in] group  Group to search; may be NULL
 * @param[in] pe     PE number to look for
 *
 * @return Index of the PE within the group, or -1 when @p group is
 *         NULL or the PE is not found
 */
static inline int oshmem_proc_group_find_id(oshmem_group_t* group, int pe)
{
    int idx;

    if (!group) {
        return -1;
    }

    for (idx = 0; idx < group->proc_count; idx++) {
        if (oshmem_proc_pe(group->proc_array[idx]) == pe) {
            return idx;
        }
    }

    return -1;
}
|
|
|
|
|
|
|
|
/**
 * Report the is_member flag of a group.
 *
 * @param[in] group  Group to query; dereferenced without a NULL check
 *
 * @return The group's is_member value
 */
static inline int oshmem_proc_group_is_member(oshmem_group_t *group)
{
    int member = group->is_member;

    return member;
}
|
|
|
|
|
|
|
|
static inline int oshmem_num_procs(void)
|
|
|
|
{
|
2015-09-18 17:40:21 +03:00
|
|
|
return (oshmem_group_all ?
|
2015-09-18 19:20:58 +03:00
|
|
|
oshmem_group_all->proc_count : (int)opal_list_get_size(&ompi_proc_list));
|
2013-09-10 19:34:09 +04:00
|
|
|
}
|
|
|
|
|
|
|
|
static inline int oshmem_my_proc_id(void)
|
|
|
|
{
|
|
|
|
return oshmem_group_self->my_pe;
|
|
|
|
}
|
|
|
|
|
|
|
|
static inline int oshmem_get_transport_id(int pe)
|
|
|
|
{
|
|
|
|
oshmem_proc_t *proc;
|
|
|
|
|
|
|
|
proc = oshmem_proc_group_find(oshmem_group_all, pe);
|
|
|
|
|
|
|
|
return (int) proc->transport_ids[0];
|
|
|
|
}
|
|
|
|
|
|
|
|
static inline int oshmem_get_transport_count(int pe)
|
|
|
|
{
|
|
|
|
oshmem_proc_t *proc;
|
|
|
|
proc = oshmem_proc_group_find(oshmem_group_all, pe);
|
|
|
|
return proc->num_transports;
|
|
|
|
}
|
|
|
|
|
|
|
|
END_C_DECLS
|
|
|
|
|
|
|
|
#endif /* OSHMEM_PROC_PROC_H */
|