356 строки
14 KiB
C
356 строки
14 KiB
C
|
/*
|
||
|
* Copyright (c) 2007-2008 Cisco Systems, Inc. All rights reserved.
|
||
|
*
|
||
|
* $COPYRIGHT$
|
||
|
*
|
||
|
* Additional copyrights may follow
|
||
|
*
|
||
|
* $HEADER$
|
||
|
*/
|
||
|
|
||
|
/**
|
||
|
* @file
|
||
|
*
|
||
|
* This interface is designed to hide the back-end details of how IB
|
||
|
* RC connections are made from the rest of the wv BTL. There are
|
||
|
* module-like instances of the implemented functionality (dlopen and
|
||
|
* friends are not used, but all the functionality is accessed through
|
||
|
* struct's of function pointers, so you can swap between multiple
|
||
|
* different implementations at run time, just like real components).
|
||
|
* Hence, these entities are referred to as "Connect
|
||
|
* Pseudo-Components" (CPCs).
|
||
|
*
|
||
|
* The CPCs are referenced by their names (e.g., "oob", "rdma_cm").
|
||
|
*
|
||
|
* CPCs are split into components and modules, similar to all other
|
||
|
* MCA frameworks in this code base.
|
||
|
*
|
||
|
* Before diving into the CPC interface, let's discuss some
|
||
|
* terminology and mappings of data structures:
|
||
|
*
|
||
|
* - a BTL module represents a network port (in the case of the wv
|
||
|
* BTL, a LID)
|
||
|
* - a CPC module represents one way to make connections to a BTL module
|
||
|
* - hence, a BTL module has potentially multiple CPC modules
|
||
|
* associated with it
|
||
|
* - an endpoint represents a connection between a local BTL module and
|
||
|
* a remote BTL module (in the wv BTL, because of BSRQ, an
|
||
|
* endpoint can contain multiple QPs)
|
||
|
* - when an endpoint is created, one of the CPC modules associated
|
||
|
* with the local BTL is selected and associated with the endpoint
|
||
|
* (obviously, it is a CPC module that is common between the local
|
||
|
* and remote BTL modules)
|
||
|
* - endpoints may be created and destroyed during the MPI job
|
||
|
* - endpoints are created lazily, during the first communication
|
||
|
* between two peers
|
||
|
* - endpoints are destroyed when two MPI processes become
|
||
|
* disconnected (e.g., MPI-2 dynamics or MPI_FINALIZE)
|
||
|
* - hence, BTL modules and CPC modules outlive endpoints.
|
||
|
* Specifically, BTL modules and CPC modules live from MPI_INIT to
|
||
|
* MPI_FINALIZE. endpoints come and go as MPI semantics demand it.
|
||
|
* - therefore, CPC modules need to cache information on endpoints that
|
||
|
* are specific to that connection.
|
||
|
*
|
||
|
* Component interface:
|
||
|
*
|
||
|
* - component_register(): The wv BTL's component_open() function
|
||
|
* calls the connect_base_register() function, which scans all
|
||
|
* compiled-in CPC's. If they have component_register() functions,
|
||
|
* they are called (component_register() functions are only allowed to
|
||
|
* register MCA parameters).
|
||
|
*
|
||
|
* NOTE: The connect_base_register() function will process the
|
||
|
* btl_wv_cpc_include and btl_wv_cpc_exclude MCA parameters
|
||
|
* and automatically include/exclude CPCs as relevant. If a CPC is
|
||
|
* excluded, none of its other interface functions will be invoked for
|
||
|
* the duration of the process.
|
||
|
*
|
||
|
* - component_init(): The wv BTL's component_init() function
|
||
|
* calls connect_base_init(), which will invoke this query function on
|
||
|
* each CPC to see if it wants to run at all. CPCs can gracefully
|
||
|
* remove themselves from consideration in this process by returning
|
||
|
* OMPI_ERR_NOT_SUPPORTED.
|
||
|
*
|
||
|
* - component_query(): The wv BTL's init_one_port() calls the
|
||
|
* connect_base_select_for_local_port() function, which, for each LID
|
||
|
* on that port, calls the component_query() function on every
|
||
|
* available CPC on that LID. This function is intended to see if a
|
||
|
* CPC can run on a specific wv BTL module (i.e., LID). If it
|
||
|
* can, the CPC is supposed to create a CPC module that is specific to
|
||
|
* that BTL/LID and return it. If it cannot, it should return
|
||
|
* OMPI_ERR_NOT_SUPPORTED and be gracefully skipped for this
|
||
|
* OpenFabrics port.
|
||
|
*
|
||
|
* - component_finalize(): The wv BTL's component_close() function
|
||
|
* calls connect_base_finalize(), which, in turn, calls the
|
||
|
* component_finalize() function on all available CPCs. Note that all
|
||
|
* CPC modules will have been finalized by this point; the CPC
|
||
|
* component_finalize() function is a chance for the CPC to clean up
|
||
|
* any component-specific resources.
|
||
|
*
|
||
|
* Module interface:
|
||
|
*
|
||
|
* cbm_component member: A pointer pointing to the single, global
|
||
|
* instance of the CPC component. This member is used for creating a
|
||
|
* unique index representing the modules' component so that it can be
|
||
|
* shared with remote peer processes.
|
||
|
*
|
||
|
* cbm_priority member: An integer between 0 and 100, inclusive,
|
||
|
* representing the priority of this CPC.
|
||
|
*
|
||
|
* cbm_modex_message member: A pointer to a blob buffer that will be
|
||
|
* included in the modex message for this port for this CPC (it is
|
||
|
* assumed that this blob is a) only understandable by the
|
||
|
* corresponding CPC in the peer process, and b) contains specific
|
||
|
* addressing/contact information for *this* port's CPC module).
|
||
|
*
|
||
|
* cbm_modex_message_len member: The length of the cbm_modex_message
|
||
|
* blob, in bytes.
|
||
|
*
|
||
|
* cbm_endpoint_init(): Called during endpoint creation, allowing a
|
||
|
* CPC module to cache information on the endpoint. A pointer to the
|
||
|
* endpoint's CPC module is already cached on the endpoint.
|
||
|
*
|
||
|
* cbm_start_connect(): initiate a connection to a remote peer. The
|
||
|
* CPC is responsible for setting itself up for asynchronous operation
|
||
|
* for progressing the outgoing connection request.
|
||
|
*
|
||
|
* cbm_endpoint_finalize(): Called during the endpoint destruction,
|
||
|
* allowing the CPC module to destroy anything that it cached on the
|
||
|
* endpoint.
|
||
|
*
|
||
|
* cbm_finalize(): shut down all asynchronous handling and clean up
|
||
|
* any state that was setup for this CPC module/BTL. Some CPCs setup
|
||
|
* asynchronous support on a per-HCA/NIC basis (vs. per-port/LID). It
|
||
|
* is the responsibility of the CPC to figure out such issues (e.g.,
|
||
|
* via reference counting) -- there is no notification from the
|
||
|
* upper-level BTL about when an entire HCA/NIC is no longer being
|
||
|
* used. There is only this function, which tells when a specific
|
||
|
* CPC/BTL module is no longer being used.
|
||
|
*
|
||
|
* cbm_uses_cts: a bool that indicates whether the CPC will use the
|
||
|
* CTS protocol or not.
|
||
|
* - if true: the CPC will post the fragment on
|
||
|
* endpoint->endpoint_cts_frag as a receive buffer and will *not*
|
||
|
* call ompi_btl_wv_post_recvs().
|
||
|
* - if false: the CPC will call ompi_btl_wv_post_recvs() before
|
||
|
* calling ompi_btl_wv_cpc_complete().
|
||
|
*
|
||
|
* There are two functions in the main wv BTL that the CPC may
|
||
|
* call:
|
||
|
*
|
||
|
* - ompi_btl_wv_post_recvs(endpoint): once a QP is locally
|
||
|
* connected to the remote side (but we don't know if the remote side
|
||
|
* is connected to us yet), this function is invoked to post buffers
|
||
|
* on the QP, setup credits for the endpoint, etc. This function is
|
||
|
* *only* invoked if the CPC's cbm_uses_cts is false.
|
||
|
*
|
||
|
* - ompi_btl_wv_cpc_complete(endpoint): once a CPC knows
|
||
|
* that a QP is connected on *both* sides, this function is invoked to
|
||
|
* tell the main wv BTL "ok, you can use this connection now."
|
||
|
* (e.g., the main wv BTL will either invoke the CTS protocol or
|
||
|
* start sending out fragments that were queued while the connection
|
||
|
* was establishing, etc.).
|
||
|
*/
|
||
|
#ifndef BTL_WV_CONNECT_H
|
||
|
#define BTL_WV_CONNECT_H
|
||
|
|
||
|
BEGIN_C_DECLS
|
||
|
|
||
|
/** Size, in bytes, of the cbc_name buffer that holds a CPC component's name. */
#define BCF_MAX_NAME 64
|
||
|
|
||
|
/**
 * Must forward declare these structs to avoid include file loops.
 */
struct mca_btl_wv_hca_t;
struct mca_btl_wv_module_t;
struct mca_btl_base_endpoint_t;

/**
 * This struct is defined below.
 */
struct ompi_btl_wv_connect_base_module_t;
|
||
|
|
||
|
/************************************************************************/
|
||
|
|
||
|
/**
 * Function to register MCA params in the connect functions.  It
 * takes no arguments and returns no value, so it cannot fail.
 */
typedef void (*ompi_btl_wv_connect_base_component_register_fn_t)(void);
|
||
|
|
||
|
/**
 * This function is invoked once by the wv BTL component during
 * startup.  It is intended to have CPC component-wide startup.
 *
 * Return value:
 *
 * - OMPI_SUCCESS: this CPC component will be used in selection during
 *   this process.
 *
 * - OMPI_ERR_NOT_SUPPORTED: this CPC component will be silently
 *   ignored in this process.
 *
 * - Other OMPI_ERR_* values: the error will be propagated upwards,
 *   likely causing a fatal error (and/or the wv BTL component
 *   being ignored).
 */
typedef int (*ompi_btl_wv_connect_base_component_init_fn_t)(void);
|
||
|
|
||
|
/**
 * Query the CPC to see if it wants to run on a specific port (i.e., a
 * specific BTL module).  If the component init function previously
 * returned OMPI_SUCCESS, this function is invoked once per BTL module
 * creation (i.e., for each port found by an MPI process).  If this
 * CPC wants to be used on this BTL module, it returns a CPC module
 * that is specific to this BTL module.
 *
 * The BTL module in question is passed to the function; all of its
 * attributes can be used to query to see if it's eligible for this
 * CPC.
 *
 * If it is eligible, the CPC is responsible for creating a
 * corresponding CPC module, filling in all the relevant fields on the
 * module, and for setting itself up to run (per above) and returning
 * a CPC module (this is effectively the "module_init" function).
 * Note that the module priority must be between 0 and 100
 * (inclusive).  When multiple CPCs are eligible for a single module,
 * the CPC with the highest priority will be used.
 *
 * @param btl (IN)  The BTL module that the CPC is being asked about.
 * @param cpc (OUT) Receives the newly created CPC module when the
 *                  function returns OMPI_SUCCESS.
 *                  NOTE(review): what is stored in *cpc on any other
 *                  return value is not specified here -- confirm
 *                  against the CPC implementations.
 *
 * Return value:
 *
 * - OMPI_SUCCESS if this CPC is eligible for and was able to be setup
 *   for this BTL module.  It is assumed that the CPC is now completely
 *   setup to run on this wv module (per description above).
 *
 * - OMPI_ERR_NOT_SUPPORTED if this CPC cannot support this BTL
 *   module.  This is not an error; it's just the CPC saying "sorry, I
 *   cannot support this BTL module."
 *
 * - Other OMPI_ERR_* code: an error occurred.
 */
typedef int (*ompi_btl_wv_connect_base_func_component_query_t)
    (struct mca_btl_wv_module_t *btl,
     struct ompi_btl_wv_connect_base_module_t **cpc);
|
||
|
|
||
|
/**
 * This function is invoked once by the wv BTL component during
 * shutdown.  It is intended to have CPC component-wide shutdown.
 *
 * NOTE(review): the meaning of the int return value is not documented
 * here; presumably OMPI_SUCCESS or an OMPI_ERR_* code like the other
 * component functions -- confirm against the callers.
 */
typedef int (*ompi_btl_wv_connect_base_component_finalize_fn_t)(void);
|
||
|
|
||
|
/**
|
||
|
* CPC component struct
|
||
|
*/
|
||
|
struct ompi_btl_wv_connect_base_component_t {
|
||
|
/** Name of this set of connection functions */
|
||
|
char cbc_name[BCF_MAX_NAME];
|
||
|
|
||
|
/** Register function. Can be NULL. */
|
||
|
ompi_btl_wv_connect_base_component_register_fn_t cbc_register;
|
||
|
|
||
|
/** CPC component init function. Can be NULL. */
|
||
|
ompi_btl_wv_connect_base_component_init_fn_t cbc_init;
|
||
|
|
||
|
/** Query the CPC component to get a CPC module corresponding to
|
||
|
an wv BTL module. Cannot be NULL. */
|
||
|
ompi_btl_wv_connect_base_func_component_query_t cbc_query;
|
||
|
|
||
|
/** CPC component finalize function. Can be NULL. */
|
||
|
ompi_btl_wv_connect_base_component_finalize_fn_t cbc_finalize;
|
||
|
};
|
||
|
/**
 * Convenience typedef so the CPC component struct can be named
 * without the "struct" keyword.
 */
typedef struct ompi_btl_wv_connect_base_component_t ompi_btl_wv_connect_base_component_t;
|
||
|
|
||
|
/************************************************************************/
|
||
|
|
||
|
/**
 * Function called when an endpoint has been created and has been
 * associated with a CPC.
 *
 * @param endpoint (IN) The newly created endpoint.
 *
 * NOTE(review): the meaning of the int return value is not documented
 * here; presumably OMPI_SUCCESS or an OMPI_ERR_* code -- confirm.
 */
typedef int (*ompi_btl_wv_connect_base_module_endpoint_init_fn_t)
    (struct mca_btl_base_endpoint_t *endpoint);
|
||
|
|
||
|
/**
 * Function to initiate a connection to a remote process.
 *
 * @param cpc      (IN) The CPC module being asked to connect.
 * @param endpoint (IN) The endpoint to be connected.
 *
 * NOTE(review): the meaning of the int return value is not documented
 * here; presumably OMPI_SUCCESS or an OMPI_ERR_* code -- confirm.
 */
typedef int (*ompi_btl_wv_connect_base_module_start_connect_fn_t)
    (struct ompi_btl_wv_connect_base_module_t *cpc,
     struct mca_btl_base_endpoint_t *endpoint);
|
||
|
|
||
|
/**
 * Function called when an endpoint is being destroyed.
 *
 * @param endpoint (IN) The endpoint that is being destroyed.
 *
 * NOTE(review): the meaning of the int return value is not documented
 * here; presumably OMPI_SUCCESS or an OMPI_ERR_* code -- confirm.
 */
typedef int (*ompi_btl_wv_connect_base_module_endpoint_finalize_fn_t)
    (struct mca_btl_base_endpoint_t *endpoint);
|
||
|
|
||
|
/**
 * Function to finalize the CPC module.  It is called once when the
 * CPC module's corresponding wv BTL module is being finalized.
 *
 * @param btl (IN) The wv BTL module that is being finalized.
 * @param cpc (IN) The CPC module to finalize.
 *
 * NOTE(review): the meaning of the int return value is not documented
 * here; presumably OMPI_SUCCESS or an OMPI_ERR_* code -- confirm.
 */
typedef int (*ompi_btl_wv_connect_base_module_finalize_fn_t)
    (struct mca_btl_wv_module_t *btl,
     struct ompi_btl_wv_connect_base_module_t *cpc);
|
||
|
|
||
|
/**
 * Meta data about a CPC module.  This is in a standalone struct
 * because it is used in both the CPC module struct and the
 * wv_btl_proc_t struct to hold information received from the
 * modex.
 */
typedef struct ompi_btl_wv_connect_base_module_data_t {
    /** Pointer back to the component.  Used by the base and wv
        btl to calculate this module's index for the modex. */
    struct ompi_btl_wv_connect_base_component_t *cbm_component;

    /** Priority of the CPC module (must be >=0 and <=100) */
    uint8_t cbm_priority;

    /** Blob that the CPC wants to include in the wv modex message
        for a specific port, or NULL if the CPC does not want to
        include a message in the modex. */
    void *cbm_modex_message;

    /** Length, in bytes, of the cbm_modex_message blob (0 if
        cbm_modex_message==NULL).  The message is intended to be
        short, because the size of the modex broadcast is a function
        of sum(cbm_modex_message_len[i]) for
        i=(0...total_num_ports_in_MPI_job); e.g., IBCM imposes its
        own [very short] limits (per IBTA volume 1, chapter 12).
        Being a uint8_t, this field cannot describe a blob longer
        than 255 bytes. */
    uint8_t cbm_modex_message_len;
} ompi_btl_wv_connect_base_module_data_t;
|
||
|
|
||
|
/**
|
||
|
* Struct for holding CPC module and associated meta data
|
||
|
*/
|
||
|
typedef struct ompi_btl_wv_connect_base_module_t {
|
||
|
/** Meta data about the module */
|
||
|
ompi_btl_wv_connect_base_module_data_t data;
|
||
|
|
||
|
/** Endpoint initialization function */
|
||
|
ompi_btl_wv_connect_base_module_endpoint_init_fn_t cbm_endpoint_init;
|
||
|
|
||
|
/** Connect function */
|
||
|
ompi_btl_wv_connect_base_module_start_connect_fn_t cbm_start_connect;
|
||
|
|
||
|
/** Endpoint finalization function */
|
||
|
ompi_btl_wv_connect_base_module_endpoint_finalize_fn_t cbm_endpoint_finalize;
|
||
|
|
||
|
/** Finalize the cpc module */
|
||
|
ompi_btl_wv_connect_base_module_finalize_fn_t cbm_finalize;
|
||
|
|
||
|
/** Whether this module will use the CTS protocol or not. This
|
||
|
directly states whether this module will call
|
||
|
mca_btl_wv_endpoint_post_recvs() or not: true = this
|
||
|
module will *not* call _post_recvs() and instead will post the
|
||
|
receive buffer provided at endpoint->endpoint_cts_frag on qp
|
||
|
0. */
|
||
|
bool cbm_uses_cts;
|
||
|
} ompi_btl_wv_connect_base_module_t;
|
||
|
|
||
|
END_C_DECLS
|
||
|
|
||
|
#endif
|