/*
 * Copyright (c) 2004-2005 The Trustees of Indiana University.
 *                         All rights reserved.
 * Copyright (c) 2004-2005 The Trustees of the University of Tennessee.
 *                         All rights reserved.
 * Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
 *                         University of Stuttgart.  All rights reserved.
 * Copyright (c) 2004-2005 The Regents of the University of California.
 *                         All rights reserved.
 * $COPYRIGHT$
 *
 * Additional copyrights may follow
 *
 * $HEADER$
 */
/**
 * @file
 *
 * Byte Transfer Layer (BTL)
 *
 *
 * BTL Initialization:
 *
 * During library initialization, all available BTL components are
 * loaded and opened via their mca_base_open_component_fn_t
 * function. The BTL open function should register any mca parameters
 * used to tune/adjust the behaviour of the BTL (mca_base_param_register_int(),
 * mca_base_param_register_string()). Note that the open function may fail
 * if the resources (e.g. shared libraries, etc) required by the network
 * transport are not available.
 *
 * The mca_btl_base_component_init_fn_t() is then called for each of the
 * components that are successfully opened. The component init function may
 * return either:
 *
 * (1) a NULL list of BTL modules if the transport is not available,
 * (2) a list containing a single BTL module, where the BTL provides
 *     a layer of abstraction over multiple physical devices (e.g. NICs),
 * (3) a list containing multiple BTL modules where each BTL module
 *     corresponds to a single physical device.
 *
 * During module initialization, the module should post any addressing
 * information required by its peers. An example would be the TCP
 * listen port opened by the TCP module for incoming connection
 * requests. This information is published to peers via the
 * mca_base_modex_send() interface. Note that peer information is not
 * guaranteed to be available via mca_base_modex_recv() during the
 * module's init function. However, it will be available during
 * BTL selection (mca_btl_base_module_add_procs_fn_t()).
 *
 * BTL Selection:
 *
 * The upper layer builds an ordered list of the available BTL modules sorted
 * by their exclusivity ranking. This is a relative ranking that is used
 * to determine the set of BTLs that may be used to reach a given destination.
 * During startup the BTL modules are queried via their
 * mca_btl_base_module_add_procs_fn_t() to determine if they are able to reach
 * a given destination. The BTL module with the highest ranking that
 * returns success is selected. Subsequent BTL modules are selected only
 * if they have the same exclusivity ranking.
 *
 * An example of how this might be used:
 *
 *   BTL       Exclusivity   Comments
 *   --------  -----------   ------------------
 *   LO        100           Selected exclusively for local process
 *   SM        50            Selected exclusively for other processes on host
 *   IB        0             Selected based on network reachability
 *   IB        0             Selected based on network reachability
 *   TCP       0             Selected based on network reachability
 *   TCP       0             Selected based on network reachability
 *
 * When a BTL module is selected, it may choose to optionally return a
 * pointer to an mca_btl_base_endpoint_t data structure to the PML.
 * This pointer is treated as an opaque handle by the PML and is
 * returned to the BTL on subsequent data transfer calls to the
 * corresponding destination process. The actual contents of the
 * data structure are defined on a per BTL basis, and may be used to
 * cache addressing or connection information, such as a TCP socket
 * or IB queue pair.
 *
 * Progress:
 *
 * By default, the library provides for polling based progress of outstanding
 * requests. The BTL component exports an interface function (btl_progress)
 * that is called in a polling mode by the PML during calls into the MPI
 * library. Note that the btl_progress() function is called on the BTL component
 * rather than each BTL module. This implies that the BTL author is responsible
 * for iterating over the pending operations in each of the BTL modules associated
 * with the component.
* * On platforms where threading support is provided, the library provides the * option of building with asynchronous threaded progress. In this case, the BTL * author is responsible for providing a thread to progress pending operations. * A thread is associated with the BTL component/module such that transport specific * functionality/APIs may be used to block the thread ubtll a pending operation * completes. This thread MUST NOT poll for completion as this would oversubscribe * the CPU. * * Note that in the threaded case the PML may choose to use a hybrid approach, * such that polling is implemented from the user thread for a fixed number of * cycles before relying on the background thread(s) to complete requests. If * possible the BTL should support the use of both modes concurrebtly. * */ #include "mca/mca.h" #ifndef MCA_BTL_H #define MCA_BTL_H #include "include/types.h" #include "class/ompi_free_list.h" /* * BTL types */ struct mca_btl_base_module_t; struct mca_btl_base_endpoint_t; struct mca_btl_base_descriptor_t; struct mca_mpool_base_resources_t; /* send/recv operations require tag matching */ typedef uint8_t mca_btl_base_tag_t; /* reserved tag values */ #define MCA_BTL_TAG_BTL 0 #define MCA_BTL_TAG_PML 1 #define MCA_BTL_TAG_USR 2 #define MCA_BTL_TAG_MAX 256 /* 1 + highest allowed tag num */ /* prefered protocol */ #define MCA_BTL_FLAGS_SEND 1 #define MCA_BTL_FLAGS_RDMA 2 /** * Asynchronous callback function on completion of an operation. */ typedef void (*mca_btl_base_completion_fn_t)( struct mca_btl_base_module_t*, struct mca_btl_base_endpoint_t*, struct mca_btl_base_descriptor_t*, int status); /** * Describes a region/segment of memory that is addressable * by an BTL. 
*/
struct mca_btl_base_segment_t {
    ompi_ptr_t seg_addr;   /**< address of the segment (ompi_ptr_t is defined elsewhere) */
    uint32_t seg_len;      /**< length of the segment -- presumably in bytes, confirm */
    /* One 8-byte key that can be viewed as 2 x 32 bit, 1 x 64 bit or
     * 8 x 8 bit. NOTE(review): presumably a transport specific memory
     * key -- this header does not say what it holds; confirm. */
    union {
        uint32_t key32[2];
        uint64_t key64;
        uint8_t key8[8];
    } seg_key;
};
typedef struct mca_btl_base_segment_t mca_btl_base_segment_t;

/**
 * A descriptor that holds the parameters to a send/put/get
 * operation along with a callback routine that is called on
 * completion of the request.
 */
struct mca_btl_base_descriptor_t {
    ompi_free_list_item_t super;              /**< base type: a free list item */
    mca_btl_base_segment_t *des_src;          /**< source segment(s) */
    size_t des_src_cnt;                       /**< number of entries in des_src (see MCA_BTL_DES_MAX_SEGMENTS) */
    mca_btl_base_segment_t *des_dst;          /**< destination segment(s) */
    size_t des_dst_cnt;                       /**< number of entries in des_dst (see MCA_BTL_DES_MAX_SEGMENTS) */
    mca_btl_base_completion_fn_t des_cbfunc;  /**< completion callback */
    void* des_cbdata;                         /**< opaque data kept next to the callback; the callback is
                                                   handed the descriptor and can read it from there */
    int32_t des_flags;                        /**< flags -- presumably a mask of MCA_BTL_DES_FLAGS_*; confirm */
};
typedef struct mca_btl_base_descriptor_t mca_btl_base_descriptor_t;

OBJ_CLASS_DECLARATION(mca_btl_base_descriptor_t);

/* Descriptor flag bits. NOTE(review): their exact effect is not described
 * in this header; see the users of des_flags. */
#define MCA_BTL_DES_FLAGS_DEREGISTER 0x0001
#define MCA_BTL_DES_FLAGS_PRIORITY 0x0002

/**
 * Maximum number of allowed segments in src/dst fields of a descriptor.
 */
#define MCA_BTL_DES_MAX_SEGMENTS 16

/*
 * BTL base header, stores the tag at a minimum
 */
struct mca_btl_base_header_t{
    mca_btl_base_tag_t tag;
};
typedef struct mca_btl_base_header_t mca_btl_base_header_t;

/*
 * BTL component interface functions and datatype.
 */

/**
 * MCA->BTL Initializes the BTL component and creates specific BTL
 * module(s).
 *
 * @param num_btls (OUT) Returns the number of btl modules created, or 0
 *                       if the transport is not available.
 *
 * @param enable_progress_threads (IN) Whether this component is
 * allowed to run a hidden/progress thread or not.
 *
 * @param enable_mpi_threads (IN) Whether support for multiple MPI
 * threads is enabled or not (i.e., MPI_THREAD_MULTIPLE), which
 * indicates whether multiple threads may invoke this component
 * simultaneously or not.
 *
 * @return Array of pointers to BTL modules, or NULL if the transport
 *         is not available.
 *
 * During component initialization, the BTL component should discover
 * the physical devices that are available for the given transport,
 * and create a BTL module to represent each device.
Any addressing
 * information required by peers to reach the device should be published
 * during this function via the mca_base_modex_send() interface.
 *
 */
typedef struct mca_btl_base_module_t** (*mca_btl_base_component_init_fn_t)(
    int *num_btls,
    bool enable_progress_threads,
    bool enable_mpi_threads
);

/**
 * MCA->BTL Called to progress outstanding requests for
 * non-threaded polling environments.
 *
 * The function takes no arguments.
 *
 * @return OMPI_SUCCESS or error code on failure.
 */
typedef int (*mca_btl_base_component_progress_fn_t)(void);

/**
 * BTL component descriptor. Contains component version information
 * and component open/close/init functions.
 */
struct mca_btl_base_component_1_0_0_t {
    mca_base_component_t btl_version;                  /**< MCA base component descriptor */
    mca_base_component_data_1_0_0_t btl_data;          /**< MCA base component data (type defined elsewhere) */
    mca_btl_base_component_init_fn_t btl_init;         /**< component init: creates the BTL module(s) */
    mca_btl_base_component_progress_fn_t btl_progress; /**< polling progress function */
};
typedef struct mca_btl_base_component_1_0_0_t mca_btl_base_component_1_0_0_t;
/* Unversioned alias for the component type above. */
typedef struct mca_btl_base_component_1_0_0_t mca_btl_base_component_t;

/*
 * BTL module interface functions and datatype.
 */

/**
 * MCA->BTL Clean up any resources held by BTL module
 * before the module is unloaded.
 *
 * @param btl (IN) BTL module.
 * @return int status. NOTE(review): not documented originally; presumably
 *         OMPI_SUCCESS or an error code like the other entry points -- confirm.
 *
 * Prior to unloading a BTL module, the MCA framework will call
 * the BTL finalize method of the module. Any resources held by
 * the BTL should be released and if required the memory corresponding
 * to the BTL module freed.
 *
 */
typedef int (*mca_btl_base_module_finalize_fn_t)(
    struct mca_btl_base_module_t* btl
);

/**
 * PML->BTL notification of change in the process list.
 *
 * @param btl (IN) BTL module
 * @param nprocs (IN) Number of processes
 * @param procs (IN) Set of processes
 * @param endpoints (OUT) Set of (optional) mca_btl_base_endpoint_t structures returned by the BTL.
 * @param reachable (OUT) Bitmask indicating set of peer processes that are reachable by this BTL.
 * @return OMPI_SUCCESS or error status on failure.
* * The mca_btl_base_module_add_procs_fn_t() is called by the PML to * determine the set of BTLs that should be used to reach each process. * Any addressing information exported by the peer via the mca_base_modex_send() * function should be available during this call via the corresponding * mca_base_modex_recv() function. The BTL may utilize this information to * determine reachability of each peer process. * * For each process that is reachable by the BTL, the bit corresponding to the index * into the proc array (nprocs) should be set in the reachable bitmask. The PML * provides the BTL the option to return a pointer to a data structure defined * by the BTL that is returned to the BTL on subsequent calls to the BTL data * transfer functions (e.g btl_send). This may be used by the BTL to cache any addressing * or connection information (e.g. TCP socket, IP queue pair). */ typedef int (*mca_btl_base_module_add_procs_fn_t)( struct mca_btl_base_module_t* btl, size_t nprocs, struct ompi_proc_t** procs, struct mca_btl_base_endpoint_t** endpoints, struct ompi_bitmap_t* reachable ); /** * Notification of change to the process list. * * @param btl (IN) BTL module * @param nprocs (IN) Number of processes * @param proc (IN) Set of processes * @param peer (IN) Set of peer addressing information. * @return Status indicating if cleanup was successful * * When the process list changes, the PML notifies the BTL of the * change, to provide the opportunity to cleanup or release any * resources associated with the peer. */ typedef int (*mca_btl_base_module_del_procs_fn_t)( struct mca_btl_base_module_t* btl, size_t nprocs, struct ompi_proc_t** procs, struct mca_btl_base_endpoint_t** ); /** * Callback function that is called asynchronously on receipt * of data by the transport layer. 
*/
/*
 * Parameters of the receive callback, as far as the signature shows:
 *   btl         BTL module
 *   tag         tag of the received data
 *   descriptor  descriptor for the received data
 *   cbdata      opaque pointer -- presumably the cbdata value given when
 *               the callback was registered; confirm against the BTLs
 */
typedef void (*mca_btl_base_module_recv_cb_fn_t)(
    struct mca_btl_base_module_t* btl,
    mca_btl_base_tag_t tag,
    mca_btl_base_descriptor_t* descriptor,
    void* cbdata
);

/**
 * Register a callback function that is called on receipt
 * of a fragment.
 *
 * @param btl (IN) BTL module
 * @param tag (IN) Tag the callback is registered for
 * @param cbfunc (IN) Receive callback
 * @param cbdata (IN) Opaque pointer stored with the callback
 *                    (presumably passed back as the callback's cbdata
 *                    argument -- confirm)
 * @return int status. NOTE(review): the previous text here was copied
 *         from the del_procs documentation; the actual status convention
 *         is presumably OMPI_SUCCESS or an error code -- confirm.
 */
typedef int (*mca_btl_base_module_register_fn_t)(
    struct mca_btl_base_module_t* btl,
    mca_btl_base_tag_t tag,
    mca_btl_base_module_recv_cb_fn_t cbfunc,
    void* cbdata
);

/**
 * Allocate a descriptor with a segment of the requested size.
 * Note that the BTL layer may choose to return a smaller size
 * if it cannot support the request.
 *
 * @param btl (IN) BTL module
 * @param size (IN) Request segment size.
 * @return Pointer to the allocated descriptor. NOTE(review): the failure
 *         value is not documented here (presumably NULL) -- confirm.
 */
typedef mca_btl_base_descriptor_t* (*mca_btl_base_module_alloc_fn_t)(
    struct mca_btl_base_module_t* btl,
    size_t size
);

/**
 * Return a descriptor allocated from this BTL via alloc/prepare.
 *
 * @param btl (IN) BTL module
 * @param descriptor (IN) Descriptor allocated from the BTL
 */
typedef int (*mca_btl_base_module_free_fn_t)(
    struct mca_btl_base_module_t* btl,
    mca_btl_base_descriptor_t* descriptor
);

/**
 * Prepare a descriptor for send/rdma using the supplied
 * convertor. If the convertor references data that is contiguous,
 * the descriptor may simply point to the user buffer. Otherwise,
 * this routine is responsible for allocating buffer space and
 * packing if required.
*
 * @param btl (IN) BTL module
 * @param endpoint (IN) BTL peer addressing
 * @param convertor (IN) Data type convertor
 * @param reserve (IN) Additional bytes requested by upper layer to precede user data
 * @param size (IN/OUT) Number of bytes to prepare (IN), number of bytes actually prepared (OUT)
 */
/*
 * NOTE(review): the `registration` parameter is not covered by the
 * parameter list above. It is an mca_mpool_base_registration_t pointer;
 * presumably an existing memory pool registration for the user buffer --
 * confirm its meaning and whether NULL is allowed.
 * The same signature serves both btl_prepare_src and btl_prepare_dst in
 * the module function table.
 */
typedef struct mca_btl_base_descriptor_t* (*mca_btl_base_module_prepare_fn_t)(
    struct mca_btl_base_module_t* btl,
    struct mca_btl_base_endpoint_t* endpoint,
    mca_mpool_base_registration_t* registration,
    struct ompi_convertor_t* convertor,
    size_t reserve,
    size_t* size
);

/**
 * Initiate an asynchronous send.
 *
 * @param btl (IN) BTL module
 * @param endpoint (IN) BTL addressing information
 * @param descriptor (IN) Description of the data to be transferred
 * @param tag (IN) The tag value used to notify the peer.
 * @return int status. NOTE(review): not documented originally; presumably
 *         OMPI_SUCCESS or an error code -- confirm.
 */
typedef int (*mca_btl_base_module_send_fn_t)(
    struct mca_btl_base_module_t* btl,
    struct mca_btl_base_endpoint_t* endpoint,
    struct mca_btl_base_descriptor_t* descriptor,
    mca_btl_base_tag_t tag
);

/**
 * Initiate an asynchronous put.
 *
 * @param btl (IN) BTL module
 * @param endpoint (IN) BTL addressing information
 * @param descriptor (IN) Description of the data to be transferred
 * @return int status. NOTE(review): not documented originally; presumably
 *         OMPI_SUCCESS or an error code -- confirm.
 */
typedef int (*mca_btl_base_module_put_fn_t)(
    struct mca_btl_base_module_t* btl,
    struct mca_btl_base_endpoint_t* endpoint,
    struct mca_btl_base_descriptor_t* descriptor
);

/**
 * Initiate an asynchronous get.
 *
 * @param btl (IN) BTL module
 * @param endpoint (IN) BTL addressing information
 * @param descriptor (IN) Description of the data to be transferred
 * @return int status. NOTE(review): not documented originally; presumably
 *         OMPI_SUCCESS or an error code -- confirm.
 *
 */
typedef int (*mca_btl_base_module_get_fn_t)(
    struct mca_btl_base_module_t* btl,
    struct mca_btl_base_endpoint_t* endpoint,
    struct mca_btl_base_descriptor_t* descriptor
);

/**
 * BTL module interface functions and attributes.
*/
struct mca_btl_base_module_t {

    /* BTL common attributes */
    mca_btl_base_component_t* btl_component; /**< pointer back to the BTL component structure */
    size_t btl_eager_limit;      /**< maximum size of first fragment -- eager send */
    size_t btl_min_send_size;    /**< threshold below which the BTL should not fragment */
    size_t btl_max_send_size;    /**< maximum send fragment size supported by the BTL */
    size_t btl_min_rdma_size;    /**< threshold below which the BTL should not fragment
                                      (NOTE(review): same wording as btl_min_send_size;
                                      presumably applies to rdma transfers -- confirm) */
    size_t btl_max_rdma_size;    /**< maximum rdma fragment size supported by the BTL */
    uint32_t btl_exclusivity;    /**< indicates this BTL should be used exclusively */
    uint32_t btl_latency;        /**< relative ranking of latency used to prioritize btls */
    uint32_t btl_bandwidth;      /**< bandwidth (Mbytes/sec) supported by each endpoint */
    uint32_t btl_flags;          /**< flags (put/get...) */

    /* BTL function table */
    mca_btl_base_module_add_procs_fn_t btl_add_procs;   /**< process list additions / reachability */
    mca_btl_base_module_del_procs_fn_t btl_del_procs;   /**< process list removals */
    mca_btl_base_module_register_fn_t btl_register;     /**< register a receive callback for a tag */
    mca_btl_base_module_finalize_fn_t btl_finalize;     /**< release module resources */

    mca_btl_base_module_alloc_fn_t btl_alloc;           /**< allocate a descriptor */
    mca_btl_base_module_free_fn_t btl_free;             /**< return a descriptor */
    mca_btl_base_module_prepare_fn_t btl_prepare_src;   /**< prepare a descriptor (same signature as btl_prepare_dst) */
    mca_btl_base_module_prepare_fn_t btl_prepare_dst;   /**< prepare a descriptor (same signature as btl_prepare_src) */
    mca_btl_base_module_send_fn_t btl_send;             /**< initiate an asynchronous send */
    mca_btl_base_module_put_fn_t btl_put;               /**< initiate an asynchronous put */
    mca_btl_base_module_get_fn_t btl_get;               /**< initiate an asynchronous get */
};
typedef struct mca_btl_base_module_t mca_btl_base_module_t;

/*
 * Macro for use in modules that are of type btl v1.0.0
 */
#define MCA_BTL_BASE_VERSION_1_0_0 \
  /* btl v1.0 is chained to MCA v1.0 */ \
  MCA_BASE_VERSION_1_0_0, \
  /* btl v1.0 */ \
  "btl", 1, 0, 0

#endif /* MCA_BTL_H */