6c5532072a
OMPI and a language agnostic part in OPAL. The convertor is completely moved into OPAL. This offers several benefits as described in RFC http://www.open-mpi.org/community/lists/devel/2009/07/6387.php namely: - Fewer basic types (int* and float* types, boolean and wchar - Fixing naming scheme to ompi-nomenclature. - Usability outside of the ompi-layer. - Due to the fixed nature of simple opal types, their information is completely known at compile time and therefore constified - With fewer datatypes (22), the actual sizes of bit-field types may be reduced from 64 to 32 bits, allowing reorganizing the opal_datatype structure, eliminating holes and keeping data required in convertor (upon send/recv) in one cacheline... This has implications to the convertor-datastructure and other parts of the code. - Several performance tests have been run, the netpipe latency does not change with this patch on Linux/x86-64 on the smoky cluster. - Extensive tests have been done to verify correctness (no new regressions) using: 1. mpi_test_suite on linux/x86-64 using clean ompi-trunk and ompi-ddt: a. running both trunk and ompi-ddt resulted in no differences (except for MPI_SHORT_INT and MPI_TYPE_MIX_LB_UB do now run correctly). b. with --enable-memchecker and running under valgrind (one buglet when run with static found in test-suite, commited) 2. ibm testsuite on linux/x86-64 using clean ompi-trunk and ompi-ddt: all passed (except for the dynamic/ tests failed!! as trunk/MTT) 3. compilation and usage of HDF5 tests on Jaguar using PGI and PathScale compilers. 4. compilation and usage on Scicortex. - Please note, that for the heterogeneous case, (-m32 compiled binaries/ompi), neither ompi-trunk, nor ompi-ddt branch would successfully launch. This commit was SVN r21641.
412 строки
16 KiB
C
412 строки
16 KiB
C
/*
|
|
* Copyright (c) 2004-2006 The Regents of the University of California.
|
|
* All rights reserved.
|
|
* $COPYRIGHT$
|
|
*
|
|
* Additional copyrights may follow
|
|
*
|
|
* $HEADER$
|
|
*/
|
|
|
|
/**
|
|
* @file
|
|
*
|
|
* Matching Transport Layer
|
|
*
|
|
* The Matching Transport Layer (MTL) provides device-layer support
|
|
* for transfer of MPI point-to-point messages over devices that
|
|
* support hardware / library message matching. This layer is used
|
|
* with the MTL PML component to provide lowest latency and highest
|
|
* bandwidth on given architectures. Features found in other PML
|
|
* interfaces, such as message fragmenting, multi-device support, and
|
|
* NIC failover are not provided by the upper layers.
|
|
*
|
|
* In general, this interface should not be used for transport layer
|
|
* support. Instead, the BTL interface should be used. The BTL
|
|
* interface allows for multiplexing between multiple users
|
|
* (point-to-point, one-sided, etc.) and provides many features not
|
|
* found in this interface (RDMA from arbitrary buffers, active
|
|
* messaging, reasonable pinned memory caching, etc.)
|
|
*/
|
|
|
|
#ifndef OMPI_MTL_H
|
|
#define OMPI_MTL_H
|
|
|
|
#include "ompi_config.h"
|
|
#include "mpi.h" /* needed for MPI_ANY_TAG */
|
|
#include "opal/mca/mca.h"
|
|
#include "ompi/mca/pml/pml.h" /* for send_mode enum */
|
|
#include "ompi/request/request.h"
|
|
|
|
#if defined(c_plusplus) || defined(__cplusplus)
|
|
extern "C" {
|
|
#endif
|
|
|
|
struct ompi_request_t;
|
|
struct opal_convertor_t;
|
|
|
|
struct mca_mtl_base_module_t;
|
|
|
|
struct mca_mtl_base_endpoint_t;
|
|
|
|
struct mca_mtl_request_t {
|
|
/** pointer to associated ompi_request_t */
|
|
struct ompi_request_t *ompi_req;
|
|
void (*completion_callback)(struct mca_mtl_request_t* mtl_request);
|
|
};
|
|
typedef struct mca_mtl_request_t mca_mtl_request_t;
|
|
|
|
/**
|
|
* Initialization routine for MTL component
|
|
*
|
|
* Initialization routine for MTL component. This function should
|
|
* allocate resources for communication and try to do all local setup.
|
|
* It should not attempt to contract it's peers, as that should be
|
|
* done at add_procs time. Contact information should be published
|
|
* during this initialization function. It will be made available
|
|
* during add_procs().
|
|
*
|
|
* @param enable_progress_threads (IN) Progress threads have been
|
|
* enabled by the user and the component must be
|
|
* capable of making asycnhronous progress (either
|
|
* with its own thread, with the kernel, or with
|
|
* the event library.
|
|
* @param enable_mpi_threads (IN) MPI threads have been enabled by the
|
|
* user and the component must be capable of coping
|
|
* with threads. If the component can cope with
|
|
* MPI_THREAD_MULTIPLE, enable_mpi_thread_multiple
|
|
* should be set to true. Otherwise, it is assumed
|
|
* that only THREAD_FUNNELLED and THREAD_SERIALIZED
|
|
* can be used.
|
|
* @param enable_mpi_thread_multiple (OUT) Component does / does not
|
|
* support MPI_THREAD_MULTIPLE. This variable only
|
|
* needs to be set if enable_mpi_threads is true.
|
|
* Otherwise, the return value will be ignored.
|
|
*
|
|
* @retval NULL component can not operate on the current machine
|
|
* @retval non-NULL component interface function
|
|
*/
|
|
typedef struct mca_mtl_base_module_t*
|
|
(*mca_mtl_base_component_init_fn_t)(bool enable_progress_threads,
|
|
bool enable_mpi_threads);
|
|
|
|
|
|
struct mca_mtl_base_component_2_0_0_t {
|
|
mca_base_component_t mtl_version;
|
|
mca_base_component_data_t mtl_data;
|
|
mca_mtl_base_component_init_fn_t mtl_init;
|
|
};
|
|
typedef struct mca_mtl_base_component_2_0_0_t mca_mtl_base_component_2_0_0_t;
|
|
typedef struct mca_mtl_base_component_2_0_0_t mca_mtl_base_component_t;
|
|
|
|
|
|
/**
|
|
* MCA->MTL Clean up any resources held by MTL module
|
|
*
|
|
* Opposite of module_init. Called when communication will no longer
|
|
* be necessary. ussually this is during MPI_FINALIZE, but it can be
|
|
* earlier if the component was not selected to run. Assuming
|
|
* module_init was called, finalize will always be called before the
|
|
* component_close function is called.
|
|
*
|
|
* @param mtl (IN) MTL module returned from call to initialize
|
|
*
|
|
* @retval OMPI_SUCCESS cleanup finished successfully
|
|
* @retval other failure during cleanup
|
|
*
|
|
*/
|
|
typedef int (*mca_mtl_base_module_finalize_fn_t)(struct mca_mtl_base_module_t* mtl);
|
|
|
|
|
|
/**
|
|
* PML->MTL notification of change in the process list.
|
|
*
|
|
* The mca_mtl_base_module_add_procs_fn_t() is used by the PML to
|
|
* notify the MTL that new processes are connected to the current
|
|
* process. Any addressing information exported by the peer via the
|
|
* ompi_modex_send() function should be available during this
|
|
* call via the corresponding ompi_modex_recv() function. The
|
|
* MTL may utilize this information to determine reachability of each
|
|
* peer process.
|
|
*
|
|
* It is an error for a proc to not be reachable by the given MTL, and
|
|
* an error should be returned if that case is detected. The PML
|
|
* provides the MTL the option to return a pointer to a data structure
|
|
* defined by the MTL that is passed in with all communication
|
|
* functions. The array of procinfo pointers will be allocated by the
|
|
* PML, but it is up to the MTL module to create the memory for the
|
|
* procinfo structure itself. The procinfo structure is opaque to the
|
|
* PML and is only used internally by the MTL.
|
|
*
|
|
* @param mtl (IN) MTL module
|
|
* @param nprocs (IN) Number of processes
|
|
* @param procs (IN) Set of processes
|
|
* @param endpoint (OUT) Array of (optional) mca_mtl_base_procinfo_t
|
|
* structures, one per proc in procs
|
|
*
|
|
* @retval OMPI_SUCCESS successfully connected to processes
|
|
* @retval other failure during setup
|
|
*/
|
|
typedef int (*mca_mtl_base_module_add_procs_fn_t)(
|
|
struct mca_mtl_base_module_t* mtl,
|
|
size_t nprocs,
|
|
struct ompi_proc_t** procs,
|
|
struct mca_mtl_base_endpoint_t **mtl_peer_data);
|
|
|
|
|
|
/**
|
|
* Notification of change to the process list.
|
|
*
|
|
* When the process list changes, the PML notifies the MTL of the
|
|
* change, to provide the opportunity to cleanup or release any
|
|
* resources associated with the peer.
|
|
*
|
|
* @param mtl (IN) MTL module
|
|
* @param nprocs (IN) Number of processes
|
|
* @param proc (IN) Set of processes
|
|
* @param peer (IN) Set of peer addressing information.
|
|
*
|
|
* @return Status indicating if cleanup was successful
|
|
*/
|
|
typedef int (*mca_mtl_base_module_del_procs_fn_t)(
|
|
struct mca_mtl_base_module_t* mtl,
|
|
size_t nprocs,
|
|
struct ompi_proc_t** procs,
|
|
struct mca_mtl_base_endpoint_t **mtl_peer_data);
|
|
|
|
|
|
/**
|
|
* Blocking send to peer
|
|
*
|
|
* Blocking send (Call should not return until the user buffer may be
|
|
* used again). Standard MPI semantics must be met by this call, as
|
|
* mandated in the mode argument. There is one special mode argument,
|
|
* MCA_PML_BASE_SEND_COMPLETE, which requires local completion before
|
|
* the function can return. This is an optimization for coillective
|
|
* routines that can otherwise lead to degenerate performance for
|
|
* broadcast-based collectives.
|
|
*
|
|
* @param comm (IN) Communicator used for operation
|
|
* @param dest (IN) Destination rank for send (relative to comm)
|
|
* @param tag (IN) MPI tag used for sending. See note below.
|
|
* @param convertor (IN) Datatype convertor describing send datatype.
|
|
* Already prepared for send.
|
|
* @param mode (IN) Mode for send operation
|
|
*
|
|
* @return OMPI_SUCCESS or error value
|
|
*
|
|
* \note Open MPI is built around non-blocking operations. This
|
|
* function is provided for networks where progressing events outside
|
|
* of point-to-point (for example, collectives, I/O, one-sided) can
|
|
* occur without a progress function regularily being triggered. If
|
|
* this is not the case for the given network, this function pointer
|
|
* should be set to NULL and non-blocking sends be used.
|
|
*
|
|
* \note While MPI does not allow users to specify negative tags, they
|
|
* are used internally in Open MPI to provide a unique channel for
|
|
* collective operations. Therefore, the MTL can *not* cause an error
|
|
* if a negative tag is used.
|
|
*/
|
|
typedef int (*mca_mtl_base_module_send_fn_t)(
|
|
struct mca_mtl_base_module_t* mtl,
|
|
struct ompi_communicator_t *comm,
|
|
int dest,
|
|
int tag,
|
|
struct opal_convertor_t *convertor,
|
|
mca_pml_base_send_mode_t mode);
|
|
|
|
|
|
/**
|
|
* Non-blocking send to peer
|
|
*
|
|
* Non-blocking send to peer. Standard MPI semantics must be met by
|
|
* this call, as mandated in the mode argument. There is one special
|
|
* mode argument, MCA_PML_BASE_SEND_COMPLETE, which requires local
|
|
* completion before the request is marked as complete.
|
|
*
|
|
* The PML will handle creation of the request, leaving the number of
|
|
* bytes requested in the module structure available for the MTL
|
|
* directly after the ompi_request_t structure. The PML will handle
|
|
* proper destruction of the request once it can safely be destructed
|
|
* (it has been completed and freeed by a call to REQUEST_FReE or
|
|
* TEST/WAIT). The MTL should remove all resources associated with
|
|
* the request when it is marked as completed.
|
|
*
|
|
* @param comm (IN) Communicator used for operation
|
|
* @param dest (IN) Destination rank for send (relative to comm)
|
|
* @param tag (IN) MPI tag used for sending. See note below.
|
|
* @param convertor (IN) Datatype convertor describing send datatype.
|
|
* Already prepared for send.
|
|
* @param mode (IN) Mode for send operation (see pml.h)
|
|
* @param blocking (IN) True if the call originated from a blocking
|
|
* call, but the PML decided to use a
|
|
* non-blocking operation. This is either for
|
|
* internal performance decisions or because the
|
|
* blocking send function is NULL. This is an
|
|
* optimization flag and is not needed for
|
|
* correctness.
|
|
* @param mtl_request (IN) Pointer to mtl_request. The ompi_req field
|
|
* will be populated with an initialized
|
|
* ompi_request_t before calling.
|
|
*
|
|
* @return OMPI_SUCCESS or error value
|
|
*
|
|
* \note While MPI does not allow users to specify negative tags, they
|
|
* are used internally in Open MPI to provide a unique channel for
|
|
* collective operations. Therefore, the MTL can *not* cause an error
|
|
* if a negative tag is used.
|
|
*/
|
|
typedef int (*mca_mtl_base_module_isend_fn_t)(
|
|
struct mca_mtl_base_module_t* mtl,
|
|
struct ompi_communicator_t *comm,
|
|
int dest,
|
|
int tag,
|
|
struct opal_convertor_t *convertor,
|
|
mca_pml_base_send_mode_t mode,
|
|
bool blocking,
|
|
mca_mtl_request_t *mtl_request);
|
|
|
|
|
|
/**
|
|
* Non-blocking receive
|
|
*
|
|
* Non-blocking receive function. Standard MPI semantics for
|
|
* MPI_Irecv must be implemented by this call.
|
|
*
|
|
* The PML will handle creation of the request, leaving the number of
|
|
* bytes requested in teh module structure available for the MTL,
|
|
* directly after the ompi_request_t structure. The PML will handle
|
|
* proper destruction of the request once it can safely be destroyed
|
|
* (it has been completed and free'ed by a call to REQUEST_FREE or
|
|
* TEST/WAIT). The MTL should remove all resources associated with
|
|
* the request when it is marked as completed.
|
|
*
|
|
* @param comm (IN) Communicator used for operation
|
|
* @param src (IN) Source rank for send (relative to comm)
|
|
* @param tag (IN) MPI tag used for sending. See note below.
|
|
* @param convertor (IN) Datatype convertor describing receive datatype.
|
|
* Already prepared for receive.
|
|
* @param mtl_request (IN) Pointer to mtl_request. The ompi_req field
|
|
* will be populated with an initialized
|
|
* ompi_request_t before calling.
|
|
*
|
|
* @return OMPI_SUCCESS or error value
|
|
*
|
|
* \note While MPI does not allow users to specify negative tags, they
|
|
* are used internally in Open MPI to provide a unique channel for
|
|
* collective operations. Therefore, the MTL can *not* cause an error
|
|
* if a negative tag is used. Further, MPI_ANY_TAG should *not* match
|
|
* against negative tags.
|
|
*/
|
|
typedef int (*mca_mtl_base_module_irecv_fn_t)(
|
|
struct mca_mtl_base_module_t* mtl,
|
|
struct ompi_communicator_t *comm,
|
|
int src,
|
|
int tag,
|
|
struct opal_convertor_t *convertor,
|
|
struct mca_mtl_request_t *mtl_request);
|
|
|
|
|
|
/**
|
|
* Non-blocking probe
|
|
*
|
|
* Non-blocking probe function. Standard MPI semantics for MPI_IPROBE
|
|
* must be implemented by this call.
|
|
*
|
|
* @param comm (IN) Communicator used for operation
|
|
* @param src (IN) Source rank for send (relative to comm)
|
|
* @param tag (IN) MPI tag used for sending. See note below.
|
|
* @param flag (OUT) true if message available, false otherwise
|
|
* @param status (OUT) Status structure for information on
|
|
* available message
|
|
*
|
|
* \note While MPI does not allow users to specify negative tags, they
|
|
* are used internally in Open MPI to provide a unique channel for
|
|
* collective operations. Therefore, the MTL can *not* cause an error
|
|
* if a negative tag is used. Further, MPI_ANY_TAG should *not* match
|
|
* against negative tags.
|
|
*/
|
|
typedef int (*mca_mtl_base_module_iprobe_fn_t)(
|
|
struct mca_mtl_base_module_t* mtl,
|
|
struct ompi_communicator_t *comm,
|
|
int src,
|
|
int tag,
|
|
int *flag,
|
|
struct ompi_status_public_t *status);
|
|
|
|
|
|
/**
|
|
* Cancel an existing request
|
|
*
|
|
* Attempt to cancel an existing request. The (poorly defined)
|
|
* semantics for MPI_CANCEL must be implemented by this call. This,
|
|
* of course, allows the MTL module to do nothing at all.
|
|
* Implementations of the MTL should make a good faith effort to
|
|
* cancel receive requests that have not been started, as the "post a
|
|
* receive for control messages" paradigm is a common one in loosely
|
|
* coupled MPI applications.
|
|
*
|
|
* @param request(IN) Request that should be cancelled
|
|
* @param flag Unknown exactly what this does.
|
|
*
|
|
*/
|
|
typedef int (*mca_mtl_base_module_cancel_fn_t)(
|
|
struct mca_mtl_base_module_t* mtl,
|
|
mca_mtl_request_t *mtl_request,
|
|
int flag);
|
|
|
|
|
|
/**
|
|
* MTL module interface functions and attributes.
|
|
*/
|
|
struct mca_mtl_base_module_t {
|
|
int mtl_max_contextid; /**< maximum allowable contextid */
|
|
int mtl_max_tag; /**< maximum tag value. note that negative tags must be allowed */
|
|
size_t mtl_request_size; /**< number of bytes to reserve with request structure */
|
|
|
|
uint32_t mtl_flags; /**< flags (put/get...) */
|
|
|
|
/* MTL function table */
|
|
mca_mtl_base_module_add_procs_fn_t mtl_add_procs;
|
|
mca_mtl_base_module_del_procs_fn_t mtl_del_procs;
|
|
mca_mtl_base_module_finalize_fn_t mtl_finalize;
|
|
|
|
mca_mtl_base_module_send_fn_t mtl_send;
|
|
mca_mtl_base_module_isend_fn_t mtl_isend;
|
|
mca_mtl_base_module_irecv_fn_t mtl_irecv;
|
|
mca_mtl_base_module_iprobe_fn_t mtl_iprobe;
|
|
|
|
/* Optional MTL functions */
|
|
mca_mtl_base_module_cancel_fn_t mtl_cancel;
|
|
};
|
|
typedef struct mca_mtl_base_module_t mca_mtl_base_module_t;
|
|
|
|
/*
|
|
* Macro for use in modules that are of type mtl
|
|
*/
|
|
#define MCA_MTL_BASE_VERSION_2_0_0 \
|
|
MCA_BASE_VERSION_2_0_0, \
|
|
"mtl", 2, 0, 0
|
|
|
|
/*
|
|
* macro for doing direct call / call through struct
|
|
*/
|
|
#if MCA_mtl_DIRECT_CALL
|
|
|
|
#include MCA_mtl_DIRECT_CALL_HEADER
|
|
|
|
#define OMPI_MTL_CALL_STAMP(a, b) ompi_mtl_ ## a ## _ ## b
|
|
#define OMPI_MTL_CALL_EXPANDER(a, b) OMPI_MTL_CALL_STAMP(a,b)
|
|
#define OMPI_MTL_CALL(a) OMPI_MTL_CALL_EXPANDER(MCA_mtl_DIRECT_CALL_COMPONENT, a)
|
|
|
|
#else
|
|
#define OMPI_MTL_CALL(a) ompi_mtl->mtl_ ## a
|
|
#endif
|
|
|
|
OMPI_DECLSPEC extern mca_mtl_base_module_t *ompi_mtl;
|
|
|
|
#if defined(c_plusplus) || defined(__cplusplus)
|
|
}
|
|
#endif
|
|
#endif
|