b01ebf45c9
and compiler warning issues. Fixed threaded build issue. This commit was SVN r6819.
509 строки
18 KiB
C
509 строки
18 KiB
C
/*
|
|
* Copyright (c) 2004-2005 The Trustees of Indiana University.
|
|
* All rights reserved.
|
|
* Copyright (c) 2004-2005 The Trustees of the University of Tennessee.
|
|
* All rights reserved.
|
|
* Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
|
|
* University of Stuttgart. All rights reserved.
|
|
* Copyright (c) 2004-2005 The Regents of the University of California.
|
|
* All rights reserved.
|
|
* $COPYRIGHT$
|
|
*
|
|
* Additional copyrights may follow
|
|
*
|
|
* $HEADER$
|
|
*/
|
|
/**
|
|
* @file
|
|
*
|
|
* Byte Transfer Layer (BTL)
|
|
*
|
|
*
|
|
* BTL Initialization:
|
|
*
|
|
* During library initialization, all available BTL components are
|
|
* loaded and opened via their mca_base_open_component_fn_t
|
|
* function. The BTL open function should register any mca parameters
|
|
* used to tune/adjust the behaviour of the BTL (mca_base_param_register_int(),
|
|
* mca_base_param_register_string()). Note that the open function may fail
|
|
* if the resources (e.g. shared libraries, etc) required by the network
|
|
* transport are not available.
|
|
*
|
|
* The mca_btl_base_component_init_fn_t() is then called for each of the
|
|
* components that are successfully opened. The component init function may
|
|
* return either:
|
|
*
|
|
* (1) a NULL list of BTL modules if the transport is not available,
|
|
* (2) a list containing a single BTL module, where the BTL provides
|
|
* a layer of abstraction over multiple physical devices (e.g. NICs),
|
|
* (3) a list containing multiple BTL modules where each BTL module
|
|
* corresponds to a single physical device.
|
|
*
|
|
* During module initialization, the module should post any addressing
|
|
* information required by its peers. An example would be the TCP
|
|
* listen port opened by the TCP module for incoming connection
|
|
* requests. This information is published to peers via the
|
|
* mca_pml_base_modex_send() interface. Note that peer information is not
|
|
* guaranteed to be available via mca_pml_base_modex_recv() during the
|
|
* module's init function. However, it will be available during
|
|
* BTL selection (mca_btl_base_module_add_procs_fn_t()).
|
|
*
|
|
* BTL Selection:
|
|
*
|
|
* The upper layer builds an ordered list of the available BTL modules sorted
|
|
* by their exclusivity ranking. This is a relative ranking that is used
|
|
* to determine the set of BTLs that may be used to reach a given destination.
|
|
* During startup the BTL modules are queried via their
|
|
* mca_btl_base_module_add_procs_fn_t() to determine if they are able to reach
|
|
* a given destination. The BTL module with the highest ranking that
|
|
* returns success is selected. Subsequent BTL modules are selected only
|
|
* if they have the same exclusivity ranking.
|
|
*
|
|
* An example of how this might be used:
|
|
*
|
|
* BTL Exclusivity Comments
|
|
* -------- ----------- ------------------
|
|
* LO 100 Selected exclusively for local process
|
|
* SM 50 Selected exclusively for other processes on host
|
|
* IB 0 Selected based on network reachability
|
|
* IB 0 Selected based on network reachability
|
|
* TCP 0 Selected based on network reachability
|
|
* TCP 0 Selected based on network reachability
|
|
*
|
|
* When a BTL module is selected, it may choose to optionally return a
|
|
* pointer to an mca_btl_base_endpoint_t data structure to the PML.
|
|
* This pointer is treated as an opaque handle by the PML and is
|
|
* returned to the BTL on subsequent data transfer calls to the
|
|
* corresponding destination process. The actual contents of the
|
|
* data structure are defined on a per BTL basis, and may be used to
|
|
* cache addressing or connection information, such as a TCP socket
|
|
* or IB queue pair.
|
|
*
|
|
* Progress:
|
|
*
|
|
* By default, the library provides for polling based progress of outstanding
|
|
* requests. The BTL component exports an interface function (btl_progress)
|
|
* that is called in a polling mode by the PML during calls into the MPI
|
|
* library. Note that the btl_progress() function is called on the BTL component
|
|
* rather than each BTL module. This implies that the BTL author is responsible
|
|
* for iterating over the pending operations in each of the BTL modules associated
|
|
* with the component.
|
|
*
|
|
* On platforms where threading support is provided, the library provides the
|
|
* option of building with asynchronous threaded progress. In this case, the BTL
|
|
* author is responsible for providing a thread to progress pending operations.
|
|
* A thread is associated with the BTL component/module such that transport specific
|
|
* functionality/APIs may be used to block the thread until a pending operation
|
|
* completes. This thread MUST NOT poll for completion as this would oversubscribe
|
|
* the CPU.
|
|
*
|
|
* Note that in the threaded case the PML may choose to use a hybrid approach,
|
|
* such that polling is implemented from the user thread for a fixed number of
|
|
* cycles before relying on the background thread(s) to complete requests. If
|
|
* possible the BTL should support the use of both modes concurrently.
|
|
*
|
|
*/
|
|
|
|
#include "mca/mca.h"
|
|
|
|
#ifndef MCA_BTL_H
|
|
#define MCA_BTL_H
|
|
|
|
#include "include/types.h"
|
|
#include "class/ompi_free_list.h"
|
|
|
|
/*
 * BTL types
 *
 * Forward declarations of the structures that this header refers to by
 * pointer only.
 */

struct mca_btl_base_module_t;
struct mca_btl_base_endpoint_t;
struct mca_btl_base_descriptor_t;
struct mca_mpool_base_resources_t;
struct ompi_proc_t;

/*
 * These two tags are used only inside function-pointer parameter lists in
 * this header.  Declaring them at file scope keeps each tag from being
 * scoped to a single prototype, which would make the parameter type
 * incompatible with the real structure (and triggers a "declared inside
 * parameter list" warning).  Redundant if an included header already
 * declares them, but harmless.
 */
struct ompi_bitmap_t;
struct ompi_convertor_t;
|
|
|
|
|
|
/* send/recv operations require tag matching; a tag is a single byte */
typedef uint8_t mca_btl_base_tag_t;

/* reserved tag values */
#define MCA_BTL_TAG_BTL 0
#define MCA_BTL_TAG_PML 1
#define MCA_BTL_TAG_USR 2
/* 255 is also the largest value an 8-bit tag can hold */
#define MCA_BTL_TAG_MAX 255 /* 1 + highest allowed tag num */
|
|
|
|
/*
 * preferred protocol
 *
 * The two values occupy distinct bits.
 * NOTE(review): presumably stored in mca_btl_base_module_t::btl_flags --
 * confirm against the BTL implementations.
 */
#define MCA_BTL_FLAGS_SEND 1
#define MCA_BTL_FLAGS_RDMA 2
|
|
|
|
|
|
/**
 * Asynchronous callback function on completion of an operation.
 *
 * @param btl        BTL module (presumably the one that ran the operation)
 * @param endpoint   BTL endpoint (presumably the peer of the operation)
 * @param descriptor Descriptor (presumably the one passed to send/put/get)
 * @param status     Integer completion status
 */
typedef void (*mca_btl_base_completion_fn_t)(
    struct mca_btl_base_module_t *btl,
    struct mca_btl_base_endpoint_t *endpoint,
    struct mca_btl_base_descriptor_t *descriptor,
    int status);
|
|
|
|
|
|
/**
|
|
* Describes a region/segment of memory that is addressable
|
|
* by an BTL.
|
|
*/
|
|
|
|
struct mca_btl_base_segment_t {
|
|
ompi_ptr_t seg_addr;
|
|
uint32_t seg_len;
|
|
union {
|
|
uint32_t key32[2];
|
|
uint64_t key64;
|
|
uint8_t key8[8];
|
|
} seg_key;
|
|
};
|
|
typedef struct mca_btl_base_segment_t mca_btl_base_segment_t;
|
|
|
|
/**
|
|
* A descriptor that holds the parameters to a send/put/get
|
|
* operation along w/ a callback routine that is called on
|
|
* completion of the request.
|
|
*/
|
|
|
|
struct mca_btl_base_descriptor_t {
|
|
ompi_free_list_item_t super;
|
|
mca_btl_base_segment_t *des_src;
|
|
size_t des_src_cnt;
|
|
mca_btl_base_segment_t *des_dst;
|
|
size_t des_dst_cnt;
|
|
mca_btl_base_completion_fn_t des_cbfunc;
|
|
void* des_cbdata;
|
|
void* des_context;
|
|
int32_t des_flags;
|
|
};
|
|
typedef struct mca_btl_base_descriptor_t mca_btl_base_descriptor_t;
|
|
|
|
OBJ_CLASS_DECLARATION(mca_btl_base_descriptor_t);
|
|
|
|
|
|
/*
 * Descriptor flag bits; each value occupies its own bit so they can be
 * OR-ed together.
 * NOTE(review): presumably stored in mca_btl_base_descriptor_t::des_flags.
 */
#define MCA_BTL_DES_FLAGS_DEREGISTER 0x0001
#define MCA_BTL_DES_FLAGS_PRIORITY 0x0002
|
|
|
|
/**
 * Maximum number of allowed segments in src/dst fields of a descriptor.
 */
#define MCA_BTL_DES_MAX_SEGMENTS 16
|
|
|
|
|
|
/*
|
|
* BTL base header, stores the tag at a minimum
|
|
*/
|
|
struct mca_btl_base_header_t{
|
|
mca_btl_base_tag_t tag;
|
|
};
|
|
typedef struct mca_btl_base_header_t mca_btl_base_header_t;
|
|
|
|
/*
|
|
* BTL component interface functions and datatype.
|
|
*/
|
|
|
|
/**
 * MCA->BTL Initializes the BTL component and creates specific BTL
 * module(s).
 *
 * @param num_btls (OUT) Returns the number of btl modules created, or 0
 *                       if the transport is not available.
 *
 * @param enable_progress_threads (IN) Whether this component is
 * allowed to run a hidden/progress thread or not.
 *
 * @param enable_mpi_threads (IN) Whether support for multiple MPI
 * threads is enabled or not (i.e., MPI_THREAD_MULTIPLE), which
 * indicates whether multiple threads may invoke this component
 * simultaneously or not.
 *
 * @return Array of pointers to BTL modules, or NULL if the transport
 *         is not available.
 *
 * During component initialization, the BTL component should discover
 * the physical devices that are available for the given transport,
 * and create a BTL module to represent each device. Any addressing
 * information required by peers to reach the device should be published
 * during this function via the mca_pml_base_modex_send() interface.
 */
typedef struct mca_btl_base_module_t **(*mca_btl_base_component_init_fn_t)(
    int *num_btls,
    bool enable_progress_threads,
    bool enable_mpi_threads);
|
|
|
|
/**
 * MCA->BTL Called to progress outstanding requests for
 * non-threaded polling environments.
 *
 * The function takes no arguments.
 *
 * @return OMPI_SUCCESS or error code on failure.
 */
typedef int (*mca_btl_base_component_progress_fn_t)(void);
|
|
|
|
|
|
/**
|
|
* BTL component descriptor. Contains component version information
|
|
* and component open/close/init functions.
|
|
*/
|
|
|
|
struct mca_btl_base_component_1_0_0_t {
|
|
mca_base_component_t btl_version;
|
|
mca_base_component_data_1_0_0_t btl_data;
|
|
mca_btl_base_component_init_fn_t btl_init;
|
|
mca_btl_base_component_progress_fn_t btl_progress;
|
|
};
|
|
typedef struct mca_btl_base_component_1_0_0_t mca_btl_base_component_1_0_0_t;
|
|
typedef struct mca_btl_base_component_1_0_0_t mca_btl_base_component_t;
|
|
|
|
|
|
/*
|
|
* BTL module interface functions and datatype.
|
|
*/
|
|
|
|
/**
 * MCA->BTL Clean up any resources held by BTL module
 * before the module is unloaded.
 *
 * @param btl (IN) BTL module.
 * @return Integer status (presumably OMPI_SUCCESS or an error code -- confirm).
 *
 * Prior to unloading a BTL module, the MCA framework will call
 * the BTL finalize method of the module. Any resources held by
 * the BTL should be released and if required the memory corresponding
 * to the BTL module freed.
 */
typedef int (*mca_btl_base_module_finalize_fn_t)(
    struct mca_btl_base_module_t *btl);
|
|
|
|
/**
 * PML->BTL notification of change in the process list.
 *
 * @param btl (IN)        BTL module
 * @param nprocs (IN)     Number of processes
 * @param procs (IN)      Set of processes
 * @param endpoints (OUT) Set of (optional) mca_btl_base_endpoint_t structures by BTL.
 * @param reachable (OUT) Bitmask indicating set of peer processes that are reachable by this BTL.
 * @return OMPI_SUCCESS or error status on failure.
 *
 * The mca_btl_base_module_add_procs_fn_t() is called by the PML to
 * determine the set of BTLs that should be used to reach each process.
 * Any addressing information exported by the peer via the mca_pml_base_modex_send()
 * function should be available during this call via the corresponding
 * mca_pml_base_modex_recv() function. The BTL may utilize this information to
 * determine reachability of each peer process.
 *
 * For each process that is reachable by the BTL, the bit corresponding to the index
 * into the proc array (nprocs) should be set in the reachable bitmask. The PML
 * provides the BTL the option to return a pointer to a data structure defined
 * by the BTL that is returned to the BTL on subsequent calls to the BTL data
 * transfer functions (e.g. btl_send). This may be used by the BTL to cache any addressing
 * or connection information (e.g. TCP socket, IB queue pair).
 */
typedef int (*mca_btl_base_module_add_procs_fn_t)(
    struct mca_btl_base_module_t *btl,
    size_t nprocs,
    struct ompi_proc_t **procs,
    struct mca_btl_base_endpoint_t **endpoints,
    struct ompi_bitmap_t *reachable);
|
|
|
|
/**
 * Notification of change to the process list.
 *
 * @param btl (IN)     BTL module
 * @param nprocs (IN)  Number of processes
 * @param procs (IN)   Set of processes
 * @param peers (IN)   Set of peer addressing information.
 * @return             Status indicating if cleanup was successful
 *
 * When the process list changes, the PML notifies the BTL of the
 * change, to provide the opportunity to cleanup or release any
 * resources associated with the peer.
 */
typedef int (*mca_btl_base_module_del_procs_fn_t)(
    struct mca_btl_base_module_t *btl,
    size_t nprocs,
    struct ompi_proc_t **procs,
    struct mca_btl_base_endpoint_t **peers);
|
|
|
|
/**
|
|
* Callback function that is called asynchronously on receipt
|
|
* of data by the transport layer.
|
|
*/
|
|
|
|
typedef void (*mca_btl_base_module_recv_cb_fn_t)(
|
|
struct mca_btl_base_module_t* btl,
|
|
mca_btl_base_tag_t tag,
|
|
mca_btl_base_descriptor_t* descriptor,
|
|
void* cbdata
|
|
);
|
|
|
|
|
|
|
|
/**
|
|
* Register a callback function that is called on receipt
|
|
* of a fragment.
|
|
*
|
|
* @param btl (IN) BTL module
|
|
* @return Status indicating if cleanup was successful
|
|
*
|
|
* When the process list changes, the PML notifies the BTL of the
|
|
* change, to provide the opportunity to cleanup or release any
|
|
* resources associated with the peer.
|
|
*/
|
|
typedef int (*mca_btl_base_module_register_fn_t)(
|
|
struct mca_btl_base_module_t* btl,
|
|
mca_btl_base_tag_t tag,
|
|
mca_btl_base_module_recv_cb_fn_t cbfunc,
|
|
void* cbdata
|
|
);
|
|
|
|
|
|
/**
|
|
* Allocate a descriptor with a segment of the requested size.
|
|
* Note that the BTL layer may choose to return a smaller size
|
|
* if it cannot support the request.
|
|
*
|
|
* @param btl (IN) BTL module
|
|
* @param size (IN) Request segment size.
|
|
*/
|
|
typedef mca_btl_base_descriptor_t* (*mca_btl_base_module_alloc_fn_t)(
|
|
struct mca_btl_base_module_t* btl,
|
|
size_t size
|
|
);
|
|
|
|
/**
|
|
* Return a descriptor allocated from this BTL via alloc/prepare.
|
|
*
|
|
* @param btl (IN) BTL module
|
|
* @param segment (IN) Descriptor allocated from the BTL
|
|
*/
|
|
typedef int (*mca_btl_base_module_free_fn_t)(
|
|
struct mca_btl_base_module_t* btl,
|
|
mca_btl_base_descriptor_t* descriptor
|
|
);
|
|
|
|
|
|
/**
|
|
* Prepare a descriptor for send/rdma using the supplied
|
|
* convertor. If the convertor references data that is contigous,
|
|
* the descriptor may simply point to the user buffer. Otherwise,
|
|
* this routine is responsible for allocating buffer space and
|
|
* packing if required.
|
|
*
|
|
* @param btl (IN) BTL module
|
|
* @param endpoint (IN) BTL peer addressing
|
|
* @param convertor (IN) Data type convertor
|
|
* @param reserve (IN) Additional bytes requested by upper layer to precede user data
|
|
* @param size (IN/OUT) Number of bytes to prepare (IN), number of bytes actually prepared (OUT)
|
|
*/
|
|
typedef struct mca_btl_base_descriptor_t* (*mca_btl_base_module_prepare_fn_t)(
|
|
struct mca_btl_base_module_t* btl,
|
|
struct mca_btl_base_endpoint_t* endpoint,
|
|
mca_mpool_base_registration_t* registration,
|
|
struct ompi_convertor_t* convertor,
|
|
size_t reserve,
|
|
size_t* size
|
|
);
|
|
|
|
/**
|
|
* Initiate an asynchronous send.
|
|
*
|
|
* @param btl (IN) BTL module
|
|
* @param endpoint (IN) BTL addressing information
|
|
* @param descriptor (IN) Description of the data to be transfered
|
|
* @param tag (IN) The tag value used to notify the peer.
|
|
*/
|
|
typedef int (*mca_btl_base_module_send_fn_t)(
|
|
struct mca_btl_base_module_t* btl,
|
|
struct mca_btl_base_endpoint_t* endpoint,
|
|
struct mca_btl_base_descriptor_t* descriptor,
|
|
mca_btl_base_tag_t tag
|
|
);
|
|
|
|
|
|
/**
 * Initiate an asynchronous put.
 *
 * @param btl (IN)         BTL module
 * @param endpoint (IN)    BTL addressing information
 * @param descriptor (IN)  Description of the data to be transferred
 * @return                 Integer status (presumably OMPI_SUCCESS or an
 *                         error code -- confirm)
 */
typedef int (*mca_btl_base_module_put_fn_t)(
    struct mca_btl_base_module_t *btl,
    struct mca_btl_base_endpoint_t *endpoint,
    struct mca_btl_base_descriptor_t *descriptor);
|
|
|
|
/**
 * Initiate an asynchronous get.
 *
 * @param btl (IN)         BTL module
 * @param endpoint (IN)    BTL addressing information
 * @param descriptor (IN)  Description of the data to be transferred
 * @return                 Integer status (presumably OMPI_SUCCESS or an
 *                         error code -- confirm)
 */
typedef int (*mca_btl_base_module_get_fn_t)(
    struct mca_btl_base_module_t *btl,
    struct mca_btl_base_endpoint_t *endpoint,
    struct mca_btl_base_descriptor_t *descriptor);
|
|
|
|
|
|
/**
|
|
* BTL module interface functions and attributes.
|
|
*/
|
|
struct mca_btl_base_module_t {
|
|
|
|
/* BTL common attributes */
|
|
mca_btl_base_component_t* btl_component; /**< pointer back to the BTL component structure */
|
|
size_t btl_eager_limit; /**< maximum size of first fragment -- eager send */
|
|
size_t btl_min_send_size; /**< threshold below which the BTL should not fragment */
|
|
size_t btl_max_send_size; /**< maximum send fragment size supported by the BTL */
|
|
size_t btl_min_rdma_size; /**< threshold below which the BTL should not fragment */
|
|
size_t btl_max_rdma_size; /**< maximum rdma fragment size supported by the BTL */
|
|
uint32_t btl_exclusivity; /**< indicates this BTL should be used exclusively */
|
|
uint32_t btl_latency; /**< relative ranking of latency used to prioritize btls */
|
|
uint32_t btl_bandwidth; /**< bandwidth (Mbytes/sec) supported by each endpoint */
|
|
uint32_t btl_flags; /**< flags (put/get...) */
|
|
|
|
/* BTL function table */
|
|
mca_btl_base_module_add_procs_fn_t btl_add_procs;
|
|
mca_btl_base_module_del_procs_fn_t btl_del_procs;
|
|
mca_btl_base_module_register_fn_t btl_register;
|
|
mca_btl_base_module_finalize_fn_t btl_finalize;
|
|
|
|
mca_btl_base_module_alloc_fn_t btl_alloc;
|
|
mca_btl_base_module_free_fn_t btl_free;
|
|
mca_btl_base_module_prepare_fn_t btl_prepare_src;
|
|
mca_btl_base_module_prepare_fn_t btl_prepare_dst;
|
|
mca_btl_base_module_send_fn_t btl_send;
|
|
mca_btl_base_module_put_fn_t btl_put;
|
|
mca_btl_base_module_get_fn_t btl_get;
|
|
};
|
|
typedef struct mca_btl_base_module_t mca_btl_base_module_t;
|
|
|
|
/*
 * Macro for use in modules that are of type btl v1.0.0
 *
 * Expands to a comma-separated initializer fragment: the MCA base version
 * fields, then the framework name "btl" and the three version numbers
 * 1, 0, 0.
 */
#define MCA_BTL_BASE_VERSION_1_0_0 \
    /* btl v1.0 is chained to MCA v1.0 */ \
    MCA_BASE_VERSION_1_0_0, \
    /* btl v1.0 */ \
    "btl", 1, 0, 0
|
|
|
|
#endif /* MCA_BTL_H */
|