42ec26e640
This commit was SVN r7999.
717 строки
31 KiB
C
717 строки
31 KiB
C
/*
|
|
* Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
|
|
* University Research and Technology
|
|
* Corporation. All rights reserved.
|
|
* Copyright (c) 2004-2005 The University of Tennessee and The University
|
|
* of Tennessee Research Foundation. All rights
|
|
* reserved.
|
|
* Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
|
|
* University of Stuttgart. All rights reserved.
|
|
* Copyright (c) 2004-2005 The Regents of the University of California.
|
|
* All rights reserved.
|
|
* $COPYRIGHT$
|
|
*
|
|
* Additional copyrights may follow
|
|
*
|
|
* $HEADER$
|
|
*/
|
|
/**
|
|
* @file
|
|
*
|
|
* P2P Transport Layer (PTL)
|
|
*
|
|
* An MCA component type that allows the PML (mca_pml_t) to support a
|
|
* variety of network transports concurrently. The PTL layer is
|
|
* responsible for the reliable delivery of message fragments, while
|
|
* the assignment and scheduling of fragments to PTLs is handled by
|
|
* the upper layer.
|
|
*
|
|
* PTL Initialization:
|
|
*
|
|
* During library initialization, all available PTL components are
|
|
* loaded and opened via their mca_base_open_component_fn_t
|
|
* function. The PTL open function should register any mca parameters
|
|
* used to tune/adjust the behaviour of the ptl (mca_base_param_register_int(),
|
|
* mca_base_param_register_string()). Note that the open function may fail
|
|
* if the resources (e.g. shared libraries, etc) required by the network
|
|
* transport are not available.
|
|
*
|
|
* The mca_ptl_base_component_init_fn_t() is then called for each of the
|
|
* components that are successfully opened. The component init function may
|
|
* return either:
|
|
*
|
|
* (1) a NULL list of PTL instances if the transport is not available,
|
|
* (2) a list containing a single PTL instance, where the PTL provides
|
|
* a layer of abstraction over multiple physical devices (e.g. NICs),
|
|
* (3) a list containing multiple PTL instances where each PTL instance
|
|
* corresponds to a single physical device.
|
|
*
|
|
* If multiple network devices are available for a given transport,
|
|
* the preferred approach is (3) above. In this case, the PML layer
|
|
* will handle scheduling across the available resources, and
|
|
* fail-over in the event of a PTL failure. If the second approach is
|
|
* used, and a single PTL instance abstracts multiple physical
|
|
* devices, the PTL assumes all responsibility for scheduling/failover
|
|
* within those devices.
|
|
*
|
|
* During module initialization, the module should post any addressing
|
|
* information required by its peers. An example would be the TCP
|
|
* listen port opened by the TCP module for incoming connection
|
|
* requests. This information is published to peers via the
|
|
* mca_pml_base_modex_send() interface. Note that peer information is not
|
|
* guaranteed to be available via mca_pml_base_modex_recv() during the
|
|
* module's init function. However, it will be available during
|
|
* PTL selection (mca_ptl_base_module_add_procs_fn_t()).
|
|
*
|
|
* PTL Selection:
|
|
*
|
|
* The PML builds an ordered list of the available PTL instances sorted
|
|
* by their exclusivity ranking. This is a relative ranking that is used
|
|
* to determine the set of PTLs that may be used to reach a given destination.
|
|
* During startup the PTL modules are queried via their
|
|
* mca_ptl_base_module_add_procs_fn_t() to determine if they are able to reach
|
|
* a given destination. The PTL module with the highest ranking that
|
|
* returns success is selected. Subsequent PTL modules are selected only
|
|
* if they have the same exclusivity ranking.
|
|
*
|
|
* An example of how this might be used:
|
|
*
|
|
* PTL Exclusivity Comments
|
|
* -------- ----------- ------------------
|
|
* LO 100 Selected exclusively for local process
|
|
* SM 50 Selected exclusively for other processes on host
|
|
* IB 0 Selected based on network reachability
|
|
* IB 0 Selected based on network reachability
|
|
* TCP 0 Selected based on network reachability
|
|
* TCP 0 Selected based on network reachability
|
|
*
|
|
* When a PTL module is selected, it may choose to optionally return a
|
|
* pointer to an mca_ptl_base_peer_t data structure to the PML.
|
|
* This pointer is treated as an opaque handle by the PML and is
|
|
* returned to the PTL on subsequent data transfer calls to the
|
|
* corresponding destination process. The actual contents of the
|
|
* data structure are defined on a per PTL basis, and may be used to
|
|
* cache addressing or connection information, such as a TCP socket
|
|
* or IB queue pair.
|
|
*
|
|
* Send Path:
|
|
*
|
|
* When multiple PTLs are available to reach a given destination,
|
|
* a single request (that is large enough) will be split across the
|
|
* available PTLs. For each destination process, the PML maintains two
|
|
* lists of PTLs, one set of PTLs that exhibit the lowest latency, and
|
|
* a second set that are used for bulk data transfer. The set of low
|
|
* latency PTLs are used in a round-robin fashion to schedule the first
|
|
* fragment of a message, while the remainder of the message will be
|
|
* scheduled across the second set based on the bandwidth of the available
|
|
* PTLs.
|
|
*
|
|
* The PML is responsible for managing the state (allocation, initialization,
|
|
* and release) of send request descriptors (mca_ptl_base_send_request_t).
|
|
* However, to minimize the latency associated with allocating resources to
|
|
* a request, the PML provides the capability to cache send requests
|
|
* descriptors on a per-PTL basis. Each PTL exports two variables
|
|
* (ptl_cache_size and ptl_cache_bytes) that control this behaviour. The
|
|
* variable ptl_cache_size specifies the maximum size of the cache. If a
|
|
* request cannot be provided from the cache, a request descriptor from the
|
|
* global pool will be used instead, and the req_cached attribute of the
|
|
* request set to false. The request cache initially starts off empty and
|
|
* is grown by the PML up to the specified limit. The PTL variable,
|
|
* ptl_cache_bytes, can be used to specify that additional memory should be
|
|
* allocated by the PML in one contiguous block along with the base send request
|
|
* (mca_ptl_base_send_request_t) for use by the PTL. The PTLs ptl_request_init()
|
|
* method is then called to initialize this additional memory and associating
|
|
* any PTL specific resources with the request. If a request is removed from
|
|
* the cache, the ptl_request_fini() method will be called to allow the PTL
|
|
* to release any resources associated with the request descriptor.
|
|
*
|
|
* When the request is started, the PML will call the selected PTL's
|
|
* ptl_send() method with up to the PTL's threshold (ptl_first_frag_size)
|
|
* bytes of the request. The PTL should attempt to deliver the requested
|
|
* number of bytes. However, this may not be possible due to resource
|
|
* constraints or datatype alignment/offset. The PTL is responsible for
|
|
* updating the number of bytes actually fragmented and queued for delivery
|
|
* on the send request (mca_ptl_base_send_request_t.req_offset) to reflect
|
|
* the current offset into the send buffer.
|
|
*
|
|
* If the request is larger than ptl_first_frag_size, the remainder of
|
|
* the request will be scheduled upon an acknowledgment from the peer
|
|
* that the request has been matched on the receive side. The PTL receiving
|
|
* the fragment is responsible for generating an acknowledgment when the
|
|
* MCA_PTL_FLAGS_ACK bit is set in the flags field of the fragment
|
|
* header. The PTL receiving an ack is responsible for updating the
|
|
* send request descriptor to point to the matched recv descriptor
|
|
* and the destination buffer address at the remote process. The address of
|
|
* the recv descriptor is sent back in the header of subsequent fragments
|
|
* to avoid the cost of matching the additional fragments at the receiver
|
|
* while the remote address of the destination buffer may be used in
|
|
* subsequent data transfer operations to support RDMA put operations.
|
|
*
|
|
* On receipt of an acknowledgment the PTL should call the ptl_send_progress()
|
|
* function to update the status (number of bytes delivered) of the send request.
|
|
* Note that although this function is associated with the PTL, it is provided/
|
|
* set by the PML during initialization.
|
|
*
|
|
* If this was the initial fragment of a large message, the PML will schedule
|
|
* the remaining fragments during this callback. For subsequent fragments
|
|
* the PML will call the PTLs ptl_put() interface function. Since the destination
|
|
* address in the remote process is available, RDMA put operations could be used
|
|
* if supported by the underlying network transport. Note that currently the PML
|
|
* makes no other distinction between ptl_send/ptl_put.
|
|
*
|
|
* As subsequent fragments are completed by the PTLs, the ptl_send_progress()
|
|
* function should be called to update the status of request. Note that this
|
|
* may be based on local completion semantics or could require a PTL specific
|
|
* acknowledgment based on the underlying transfer protocol. Upon completion,
|
|
* the PTL is responsible for managing all resources associated with send
|
|
* fragments and their return to internal caches/free lists.
|
|
*
|
|
* Recv Path:
|
|
*
|
|
* The PML sets two additional callback functions on the PTL during
|
|
* initialization. These callbacks are used by the PTL to notify the
|
|
* PML of receipt of the initial fragment of a new message (ptl_match)
|
|
* and to update the status of a pending receive as fragment(s) complete
|
|
* (ptl_recv_progress).
|
|
*
|
|
* The first fragment of a message is sent with a header type of
|
|
* mca_ptl_base_match_header_t. When a header of this type is received,
|
|
* the PTL should call the ptl_match() function as soon as the entire header
|
|
* is available, to determine if a matching receive has been posted. When a
|
|
* matching receive is posted the PTLs ptl_matched() function is called to
|
|
* process the fragment and if required generate an acknowledgment. Note that
|
|
* this call (ptl_matched()) may occur during the call to ptl_match() or at
|
|
* a later point in time if a matching recv has not yet been posted or MPI
|
|
* ordering constraints are not satisfied.
|
|
*
|
|
* Prior to calling ptl_matched(), the PML updates the recv fragment descriptor
|
|
* (mca_ptl_base_recv_frag_t) to point to the matching recv request. If the data
|
|
* associated with the fragment has been received prior to the ptl_matched()
|
|
* call, the PTL should utilize the datatype convertor associated with the
|
|
* recv fragment to copy the data into the users buffer. Note that the datatype
|
|
* convertor provides the capability to unpack the fragment at an arbitrary
|
|
* (e.g. fragment based) offset into the destination buffer. On completion of
|
|
* the data copy, the PTL should call the ptl_recv_progress() function, to update
|
|
* the request completion status.
|
|
*
|
|
* If the initial fragment is matched prior to receiving any data associated
|
|
* with the fragment, or in the case of subsequent fragments, the datatype
|
|
* convertor may be used to generate an iovec array of contiguous blocks
|
|
* pointing into the destination buffer, which can be used for zero-copy
|
|
* receives if the underlying transport supports scatter/gather operations.
|
|
*
|
|
* The ptl_matched() function should additionally generate, if required, an
|
|
* ack to the source process. An ack is required if the MCA_PTL_FLAGS_ACK
|
|
* bit is set by the source in the flags field of the initial message header.
|
|
* As described above, the generated ack should contain a pointer to the matched
|
|
* receive request, along with the pointer to the destination buffer.
|
|
*
|
|
* On receipt of the ack, the source will schedule any remaining fragments.
|
|
* The selected PTLs should generate the remaining fragments with an
|
|
* mca_ptl_base_frag_header_t, which contains a placeholder for a pointer
|
|
* to the matched receive request. This allows the receiver to avoid calling the
|
|
* matching logic for subsequent fragments. On completion of these fragments,
|
|
* the PTL should call the ptl_recv_progress() function to update the
|
|
* request completion status. As fragments are completed, the PTL is responsible
|
|
* for freeing any resources associated with recv fragment descriptors and/or
|
|
* returning them to internal free lists/caches.
|
|
*
|
|
* Progress:
|
|
*
|
|
* By default, the library provides for polling based progress of outstanding
|
|
* requests. The PTL component exports an interface function (ptlm_progress)
|
|
* that is called in a polling mode by the PML during calls into the MPI
|
|
* library. Note that the ptlm_progress() function is called on the PTL component
|
|
* rather than each PTL instance. This implies that the PTL author is responsible
|
|
* for iterating over the pending operations in each of the PTL modules associated
|
|
* with the component.
|
|
*
|
|
* On platforms where threading support is provided, the library provides the
|
|
* option of building with asynchronous threaded progress. In this case, the PTL
|
|
* author is responsible for providing a thread to progress pending operations.
|
|
* A thread is associated with the PTL component/module such that transport specific
|
|
* functionality/APIs may be used to block the thread until a pending operation
|
|
* completes. This thread MUST NOT poll for completion as this would oversubscribe
|
|
* the CPU.
|
|
*
|
|
* Note that in the threaded case the PML may choose to use a hybrid approach,
|
|
* such that polling is implemented from the user thread for a fixed number of
|
|
* cycles before relying on the background thread(s) to complete requests. If
|
|
* possible the PTL should support the use of both modes concurrently.
|
|
*
|
|
*/
|
|
|
|
/* These are unprotected because if the pml is direct called, pml.h
|
|
has a dependency on ptl.h and must have ptl.h fully included
|
|
before pml.h is parsed. It's weird, but there isn't a better way
|
|
without doing some strange type forward declarations. */
|
|
#include "mca/mca.h"
|
|
#include "mca/pml/pml.h"
|
|
|
|
#ifndef MCA_PTL_H
|
|
#define MCA_PTL_H
|
|
|
|
#include "include/types.h"
|
|
|
|
/*
|
|
* PTL types
|
|
*/
|
|
|
|
/* Forward declarations so the interface typedefs in this header can
 * refer to these types by pointer before (or without) their definition. */
struct mca_ptl_base_module_t;
struct mca_ptl_base_peer_t;
struct mca_ptl_base_fragment_t;
struct mca_ptl_base_recv_request_t;
struct mca_ptl_base_send_request_t;
struct mca_ptl_base_recv_frag_t;
struct mca_ptl_base_send_frag_t;
struct mca_ptl_base_match_header_t;
|
|
|
|
typedef uint64_t mca_ptl_sequence_t;        /* 64-bit unsigned sequence number */
typedef uint64_t mca_ptl_tstamp_t;          /* 64-bit unsigned timestamp (see ptlm_progress) */
typedef struct opal_list_t mca_ptl_queue_t; /* queue type, an alias for opal_list_t */
|
|
|
|
/* Parameter identifiers for the component control function;
 * MCA_PTL_ENABLE is the only one defined. */
typedef enum {
    MCA_PTL_ENABLE
} mca_ptl_control_t;
|
|
|
|
/**
|
|
* PTL flags
|
|
*/
|
|
#define MCA_PTL_PUT 1 /* put flag value -- presumably stored in ptl_flags; confirm */
#define MCA_PTL_GET 2 /* get flag value -- presumably stored in ptl_flags; confirm */
|
|
|
|
/*
|
|
* PTL component interface functions and datatype.
|
|
*/
|
|
|
|
/**
|
|
* MCA->PTL Initializes the PTL component and creates specific PTL
|
|
* module(s).
|
|
*
|
|
* @param num_ptls (OUT) Returns the number of ptl instances created, or 0
|
|
* if the transport is not available.
|
|
*
|
|
* @param enable_progress_threads (IN) Whether this component is
|
|
* allowed to run a hidden/progress thread or not.
|
|
*
|
|
* @param enable_mpi_threads (IN) Whether support for multiple MPI
|
|
* threads is enabled or not (i.e., MPI_THREAD_MULTIPLE), which
|
|
* indicates whether multiple threads may invoke this component
|
|
* simultaneously or not.
|
|
*
|
|
* @return Array of pointers to PTL modules, or NULL if the transport
|
|
* is not available.
|
|
*
|
|
* During component initialization, the PTL component should discover
|
|
* the physical devices that are available for the given transport,
|
|
* and create a PTL instance to represent each device. Any addressing
|
|
* information required by peers to reach the device should be published
|
|
* during this function via the mca_pml_base_modex_send() interface.
|
|
*
|
|
*/
|
|
typedef struct mca_ptl_base_module_t** (*mca_ptl_base_component_init_fn_t)(
    int *num_ptls,                /* OUT: number of PTL instances created, 0 if unavailable */
    bool enable_progress_threads, /* IN: component may run a hidden/progress thread */
    bool enable_mpi_threads       /* IN: multiple MPI threads may invoke the component */
);
|
|
|
|
|
|
/**
|
|
* MCA->PTL Called to dynamically change a component parameter.
|
|
*
|
|
* @param param (IN) Parameter to change.
|
|
* @param value (IN) Optional parameter value.
* @param size (IN) Size of the optional parameter value (presumably in bytes).
|
|
*
|
|
* @return OMPI_SUCCESS or error code on failure.
|
|
*
|
|
* The only supported parameter is currently MCA_PTL_ENABLE,
|
|
* which can be used by the PML to enable/disable forwarding
|
|
* by the PTL.
|
|
*/
|
|
typedef int (*mca_ptl_base_component_control_fn_t)(
    int param,   /* IN: parameter to change (currently only MCA_PTL_ENABLE) */
    void* value, /* IN: optional parameter value */
    size_t size  /* IN: size of value -- presumably in bytes; confirm with callers */
);
|
|
|
|
|
|
/**
|
|
* MCA->PTL Called to progress outstanding requests for
|
|
* non-threaded polling environments.
|
|
*
|
|
* @param tstamp Current time.
|
|
* @return OMPI_SUCCESS or error code on failure.
|
|
*/
|
|
typedef int (*mca_ptl_base_component_progress_fn_t)(
|
|
mca_ptl_tstamp_t tstamp
|
|
);
|
|
|
|
|
|
/**
|
|
* PTL component descriptor. Contains component version information
|
|
* and component open/close/init functions.
|
|
*/
|
|
|
|
struct mca_ptl_base_component_1_0_0_t {
|
|
mca_base_component_t ptlm_version;
|
|
mca_base_component_data_1_0_0_t ptlm_data;
|
|
|
|
mca_ptl_base_component_init_fn_t ptlm_init;
|
|
mca_ptl_base_component_control_fn_t ptlm_control;
|
|
mca_ptl_base_component_progress_fn_t ptlm_progress;
|
|
};
|
|
typedef struct mca_ptl_base_component_1_0_0_t mca_ptl_base_component_1_0_0_t;
|
|
typedef struct mca_ptl_base_component_1_0_0_t mca_ptl_base_component_t;
|
|
|
|
|
|
/*
|
|
* PTL instance interface functions and datatype.
|
|
*/
|
|
|
|
|
|
/**
|
|
* MCA->PTL Clean up any resources held by PTL instance before the
|
|
* module is unloaded.
|
|
*
|
|
* @param ptl (IN) PTL instance.
|
|
*
|
|
* Prior to unloading a PTL module, the MCA framework will call
|
|
* the PTL finalize method of the module. Any resources held by
|
|
* the PTL should be released and if required the memory corresponding
|
|
* to the PTL module freed.
|
|
*
|
|
*/
|
|
typedef int (*mca_ptl_base_module_finalize_fn_t)(
    struct mca_ptl_base_module_t* ptl /* IN: PTL instance being finalized */
);
|
|
|
|
/**
|
|
* PML->PTL notification of change in the process list.
|
|
*
|
|
* @param ptl (IN) PTL instance
|
|
* @param nprocs (IN) Number of processes
|
|
* @param procs (IN) Set of processes
|
|
* @param peer (OUT) Set of (optional) mca_ptl_base_peer_t instances returned by PTL.
|
|
* @param reachable (OUT) Bitmask indicating set of peer processes that are reachable by this PTL.
|
|
* @return OMPI_SUCCESS or error status on failure.
|
|
*
|
|
* The mca_ptl_base_module_add_procs_fn_t() is called by the PML to
|
|
* determine the set of PTLs that should be used to reach each process.
|
|
* Any addressing information exported by the peer via the mca_pml_base_modex_send()
|
|
* function should be available during this call via the corresponding
|
|
* mca_pml_base_modex_recv() function. The PTL may utilize this information to
|
|
* determine reachability of each peer process.
|
|
*
|
|
* For each process that is reachable by the PTL, the bit corresponding to the index
|
|
* into the proc array (nprocs) should be set in the reachable bitmask. The PML
|
|
* provides the PTL the option to return a pointer to a data structure defined
|
|
* by the PTL that is returned to the PTL on subsequent calls to the PTL data
|
|
* transfer functions (e.g ptl_send). This may be used by the PTL to cache any addressing
|
|
* or connection information (e.g. TCP socket, IB queue pair).
|
|
*/
|
|
typedef int (*mca_ptl_base_module_add_procs_fn_t)(
    struct mca_ptl_base_module_t* ptl, /* IN: PTL instance */
    size_t nprocs,                     /* IN: number of processes */
    struct ompi_proc_t** procs,        /* IN: set of processes */
    struct mca_ptl_base_peer_t** peer, /* OUT: optional per-process peer instances */
    struct ompi_bitmap_t* reachable    /* OUT: bit i set if procs[i] is reachable */
);
|
|
|
|
/**
|
|
* PML->PTL notification of change to the process list.
|
|
*
|
|
* @param ptl (IN) PTL instance
|
|
* @param nprocs (IN) Number of processes
|
|
* @param procs (IN) Set of processes
|
|
* @param peer (IN) Set of peer addressing information.
|
|
* @return Status indicating if cleanup was successful
|
|
*
|
|
* When the process list changes, the PML notifies the PTL of the
|
|
* change, to provide the opportunity to cleanup or release any
|
|
* resources associated with the peer.
|
|
*/
|
|
typedef int (*mca_ptl_base_module_del_procs_fn_t)(
    struct mca_ptl_base_module_t* ptl, /* IN: PTL instance */
    size_t nprocs,                     /* IN: number of processes */
    struct ompi_proc_t** procs,        /* IN: set of processes */
    struct mca_ptl_base_peer_t** peer  /* IN: set of peer addressing information */
);
|
|
|
|
/**
|
|
* PML->PTL Initialize a send request for use by the PTL.
|
|
*
|
|
* @param ptl (IN) PTL instance
|
|
* @param request (IN) Pointer to allocated request.
|
|
*
|
|
* To reduce latency (number of required allocations), the PML allocates up
|
|
* to ptl_cache_bytes of additional space contiguous w/ the base send request.
|
|
* This space may be used by the PTL for additional control information (e.g.
|
|
* first fragment descriptor).
|
|
*
|
|
* The ptl_request_init() function is called by the PML when requests are
|
|
* allocated to the PTLs cache. These requests will be cached by the PML
|
|
* on completion and re-used by the same PTL w/out additional calls to
|
|
* ptl_request_init().
|
|
*
|
|
* If the cache size is exceeded, the PML may pass requests to ptl_send/ptl_put
|
|
* that have been taken from the global pool and have not been initialized by the
|
|
* PTL. These requests will have the req_cached attribute set to false.
|
|
*
|
|
*/
|
|
typedef int (*mca_ptl_base_module_request_init_fn_t)(
    struct mca_ptl_base_module_t* ptl,          /* IN: PTL instance */
    struct mca_ptl_base_send_request_t* request /* IN: allocated send request to initialize */
);
|
|
|
|
|
|
/**
|
|
* PML->PTL Cleanup any resources that may have been associated with the
|
|
* request by the PTL.
|
|
*
|
|
* @param ptl (IN) PTL instance
|
|
* @param request (IN) Pointer to allocated request.
|
|
*
|
|
* The ptl_request_fini function is called when the PML removes a request
|
|
* from the PTLs cache (due to resource constraints). This routine provides
|
|
* the PTL the chance to cleanup/release any resources cached on the send
|
|
* descriptor by the PTL.
|
|
*/
|
|
|
|
typedef void (*mca_ptl_base_module_request_fini_fn_t)(
    struct mca_ptl_base_module_t* ptl,          /* IN: PTL instance */
    struct mca_ptl_base_send_request_t* request /* IN: send request being removed from the cache */
);
|
|
|
|
/**
|
|
* PML->PTL Initiate a send to the peer.
|
|
*
|
|
* @param ptl (IN) PTL instance
|
|
* @param ptl_base_peer (IN) PTL peer addressing
|
|
* @param request (IN) Send request
|
|
* @param offset Current offset into packed/contiguous buffer.
|
|
* @param size (IN) Number of bytes PML is requesting PTL to deliver,
|
|
* @param flags (IN) Flags that should be passed to the peer via the message header.
|
|
* @return OMPI_SUCCESS if the PTL was able to queue one or more fragments
|
|
*
|
|
* The PML implements a rendezvous protocol, with up to the PTL threshold
|
|
* (ptl_first_frag_size) bytes of the message sent in eager send mode. The ptl_send()
|
|
* function is called by the PML to initiate the send of the first message fragment.
|
|
*
|
|
* The PTL is responsible for updating the current data offset (req_offset) in the
|
|
* request to reflect the actual number of bytes fragmented. This may be less than
|
|
* the requested size, due to resource constraints or datatype alignment/offset. If
|
|
* an acknowledgment is required, the MCA_PTL_FLAGS_ACK bit will be set in the
|
|
* flags parameter. In this case, the PTL should not call ptl_send_progress() function
|
|
* to indicate completion of the fragment until the ack is received. For all other
|
|
* fragments ptl_send_progress() may be called based on local completion semantics.
|
|
*/
|
|
typedef int (*mca_ptl_base_module_send_fn_t)(
    struct mca_ptl_base_module_t* ptl,           /* IN: PTL instance */
    struct mca_ptl_base_peer_t* ptl_base_peer,   /* IN: PTL peer addressing */
    struct mca_ptl_base_send_request_t* request, /* IN: send request */
    size_t offset,                               /* current offset into packed/contiguous buffer */
    size_t size,                                 /* IN: number of bytes the PML asks the PTL to deliver */
    int flags                                    /* IN: flags passed to the peer via the message header */
);
|
|
|
|
/**
|
|
* PML->PTL Initiate a put to the peer.
|
|
*
|
|
* @param ptl (IN) PTL instance
|
|
* @param ptl_base_peer (IN) PTL peer addressing
|
|
* @param request (IN) Send request
|
|
* @param offset Current offset into packed/contiguous buffer.
|
|
* @param size (IN) Number of bytes PML is requesting PTL to deliver; the number of
* bytes actually fragmented is reported by the PTL via req_offset.
|
|
* @param flags (IN) Flags that should be passed to the peer via the message header.
|
|
* @return OMPI_SUCCESS if the PTL was able to queue one or more fragments
|
|
*
|
|
* When the message exceeds the PTLs initial fragment size (ptl_first_frag_size),
|
|
* the PML schedules the remainder of the message after an ack is received for
|
|
* the first fragment. When the remaining fragments are scheduled the PML calls the
|
|
* ptl_put() I/F function rather than ptl_send(), to indicate that the address of
|
|
* the destination buffer at the remote process is available, allowing for an RDMA put
|
|
* if supported by the underlying transport.
|
|
*
|
|
* The PTL is responsible for updating the current data offset (req_offset) in the
|
|
* request to reflect the actual number of bytes fragmented. This may be less than
|
|
* the requested size, due to resource constraints or datatype alignment/offset.
|
|
* The PTL must call the ptl_send_progress() function to indicate completion of each
|
|
* fragment.
|
|
*/
|
|
|
|
typedef int (*mca_ptl_base_module_put_fn_t)(
    struct mca_ptl_base_module_t* ptl,           /* IN: PTL instance */
    struct mca_ptl_base_peer_t* ptl_base_peer,   /* IN: PTL peer addressing */
    struct mca_ptl_base_send_request_t* request, /* IN: send request */
    size_t offset,                               /* current offset into packed/contiguous buffer */
    size_t size,                                 /* IN: number of bytes the PML asks the PTL to deliver */
    int flags                                    /* IN: flags passed to the peer via the message header */
);
|
|
|
|
|
|
/**
|
|
* PML->PTL Initiate a get from a peer. (NOT IMPLEMENTED)
|
|
*
|
|
* @param ptl (IN) PTL instance
|
|
* @param ptl_base_peer (IN) PTL peer addressing
|
|
* @param request (IN) Recv request
|
|
* @param offset Current offset into packed/contiguous buffer.
|
|
* @param size (IN/OUT) Number of bytes PML is requesting PTL to pull from peer,
|
|
* PTL returns number of bytes successfully fragmented.
|
|
* @param flags (IN)
|
|
* @return OMPI_SUCCESS if the PTL was able to queue one or more fragments
|
|
*
|
|
* The PML does NOT currently utilize this I/F.
|
|
*/
|
|
|
|
typedef int (*mca_ptl_base_module_get_fn_t)(
    struct mca_ptl_base_module_t* ptl,           /* IN: PTL instance */
    struct mca_ptl_base_peer_t* ptl_base_peer,   /* IN: PTL peer addressing */
    struct mca_ptl_base_recv_request_t* request, /* IN: recv request */
    size_t offset,                               /* current offset into packed/contiguous buffer */
    size_t size,                                 /* number of bytes the PML asks the PTL to pull from the peer */
    int flags                                    /* IN: flags (meaning not documented in this header) */
);
|
|
|
|
/**
|
|
* PTL->PML Notification from the PTL to the PML that a new fragment
|
|
* has arrived and can be matched against posted receives.
|
|
*
|
|
* @param ptl (IN) PTL instance
|
|
* @param recv_frag Receive fragment
|
|
* @param header (IN) Message header
|
|
*
|
|
* The ptl_match() function is called by the PTL on receipt of an
|
|
* initial fragment of a new message. The PML sets a default
|
|
* matching function on the PTL (ptl_match) when the PTL is initialized.
|
|
* This function attempts to match the header corresponding to the
|
|
* receive fragment to posted receives. When a match is made, the
|
|
* PTLs ptl_matched() function is called. Note that this may occur
|
|
* during the call to ptl_match(), or later in time if a matching receive
|
|
* has not yet been posted or the receive fragment is out-of-order.
|
|
*/
|
|
typedef bool (*mca_ptl_base_module_match_fn_t)(
    struct mca_ptl_base_module_t* ptl,          /* IN: PTL instance */
    struct mca_ptl_base_recv_frag_t* recv_frag, /* receive fragment to be matched */
    struct mca_ptl_base_match_header_t* header  /* IN: message header of the fragment */
);
|
|
|
|
|
|
/**
|
|
* PML->PTL Notification from the PML to the PTL that a receive
|
|
* has been posted and matched against the indicated fragment.
|
|
*
|
|
* @param ptl (IN) PTL instance
|
|
* @param recv_frag Matched fragment
|
|
*
|
|
* The ptl_matched() function is called by the PML when a fragment
|
|
* is matched to a posted receive. This may occur during a call to
|
|
* ptl_match() if the receive is matched, or at a later point in time
|
|
* when a matching receive is posted.
|
|
*
|
|
* When this routine is called, the PTL is responsible for generating
|
|
* an acknowledgment to the peer if the MCA_PTL_FLAGS_ACK
|
|
* bit is set in the original fragment header. Additionally, the PTL
|
|
* is responsible for transferring any data associated with the fragment
|
|
* into the users buffer utilizing the datatype engine, and notifying
|
|
* the PML that the fragment has completed via the ptl_recv_progress()
|
|
* function.
|
|
*/
|
|
|
|
typedef void (*mca_ptl_base_module_matched_fn_t)(
    struct mca_ptl_base_module_t* ptl,         /* IN: PTL instance */
    struct mca_ptl_base_recv_frag_t* recv_frag /* fragment that was matched to a posted receive */
);
|
|
|
|
/**
|
|
* PTL->PML Notification from the PTL to the PML that a fragment
|
|
* has completed (e.g. been successfully delivered into users buffer)
|
|
*
|
|
* @param ptl (IN) PTL instance
|
|
* @param recv_request (IN) Receive Request
|
|
* @param bytes_received (IN) Number of bytes received from peer.
|
|
* @param bytes_delivered (IN) Number of bytes delivered to application.
|
|
*
|
|
* The PML sets this function pointer during module initialization
|
|
* to allow the PTL to make upcalls back into the PML as fragments
|
|
* complete.
|
|
*/
|
|
typedef void (*mca_ptl_base_module_recv_progress_fn_t)(
    struct mca_ptl_base_module_t* ptl,                /* IN: PTL instance */
    struct mca_ptl_base_recv_request_t* recv_request, /* IN: receive request */
    size_t bytes_received,                            /* IN: number of bytes received from peer */
    size_t bytes_delivered                            /* IN: number of bytes delivered to application */
);
|
|
|
|
/**
|
|
* PTL->PML Notification from the PTL to the PML that a fragment
|
|
* has completed (e.g. been successfully delivered to peer)
|
|
*
|
|
* @param ptl (IN) PTL instance
|
|
* @param send_request (IN) Send Request
|
|
* @param bytes_sent (IN) Number of bytes sent to peer.
|
|
*
|
|
* The PML sets this function pointer during module initialization
|
|
* to allow the PTL to make upcalls back into the PML as fragments
|
|
* complete.
|
|
*/
|
|
|
|
typedef void (*mca_ptl_base_module_send_progress_fn_t)(
    struct mca_ptl_base_module_t* ptl,                /* IN: PTL instance */
    struct mca_ptl_base_send_request_t* send_request, /* IN: send request */
    size_t bytes_sent                                 /* IN: number of bytes sent to peer */
);
|
|
|
|
/**
|
|
* PTL instance interface functions and attributes.
|
|
*/
|
|
struct mca_ptl_base_module_t {
|
|
|
|
/* PTL common attributes */
|
|
mca_ptl_base_component_t* ptl_component; /**< pointer back to the PTL component structure */
|
|
size_t ptl_cache_size; /**< maximum size of request cache for this PTL */
|
|
size_t ptl_cache_bytes; /**< number of bytes required by PTL for request cache */
|
|
size_t ptl_first_frag_size; /**< maximum size of first fragment -- eager send */
|
|
size_t ptl_min_frag_size; /**< threshold below which the PTL will not fragment */
|
|
size_t ptl_max_frag_size; /**< maximum fragment size supported by the PTL */
|
|
uint32_t ptl_exclusivity; /**< indicates this PTL should be used exclusively */
|
|
uint32_t ptl_latency; /**< relative ranking of latency used to prioritize ptls */
|
|
uint32_t ptl_bandwidth; /**< bandwidth (Mbytes/sec) supported by each endpoint */
|
|
uint32_t ptl_flags; /**< flags (put/get...) */
|
|
|
|
/* PML->PTL function table */
|
|
mca_ptl_base_module_add_procs_fn_t ptl_add_procs;
|
|
mca_ptl_base_module_del_procs_fn_t ptl_del_procs;
|
|
mca_ptl_base_module_finalize_fn_t ptl_finalize;
|
|
mca_ptl_base_module_send_fn_t ptl_send;
|
|
mca_ptl_base_module_put_fn_t ptl_put;
|
|
mca_ptl_base_module_get_fn_t ptl_get;
|
|
mca_ptl_base_module_matched_fn_t ptl_matched;
|
|
mca_ptl_base_module_request_init_fn_t ptl_request_init;
|
|
mca_ptl_base_module_request_fini_fn_t ptl_request_fini;
|
|
|
|
/* PTL->PML function table - filled in by PML during module init */
|
|
mca_ptl_base_module_match_fn_t ptl_match;
|
|
mca_ptl_base_module_send_progress_fn_t ptl_send_progress;
|
|
mca_ptl_base_module_recv_progress_fn_t ptl_recv_progress;
|
|
|
|
/* Allow the canibalization of the PTL */
|
|
struct mca_ptl_base_module_t* ptl_stack;
|
|
|
|
/* for use by PML only */
|
|
struct mca_pml_base_ptl_t* ptl_base;
|
|
};
|
|
typedef struct mca_ptl_base_module_t mca_ptl_base_module_t;
|
|
|
|
/*
|
|
* Macro for use in modules that are of type ptl v1.0.0
|
|
*/
|
|
#define MCA_PTL_BASE_VERSION_1_0_0 \
    /* ptl v1.0 is chained to MCA v1.0 */ \
    MCA_BASE_VERSION_1_0_0, \
    /* ptl v1.0 */ \
    "ptl", 1, 0, 0
|
|
|
|
#endif /* MCA_PTL_H */
|