2e83cf15ce
PSM2 enables support for GPU buffers and CUDA managed memory and it can directly recognize GPU buffers, handle copies between HFIs and GPUs. Therefore, it is not required for OMPI to handle GPU buffers for pt2pt cases. In this patch, we allow the PSM2 MTL to specify when it does not require CUDA convertor support. This allows us to skip CUDA convertor init phases and lets PSM2 handle the memory transfers. This translates to improvements in latency. The patch enables blocking collectives and workloads with GPU contiguous, GPU non-contiguous memory. Signed-off-by: Aravind Gopalakrishnan <Aravind.Gopalakrishnan@intel.com>
459 строки
18 KiB
C
459 строки
18 KiB
C
/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */
|
|
/*
|
|
* Copyright (c) 2004-2006 The Regents of the University of California.
|
|
* All rights reserved.
|
|
* Copyright (c) 2012 Sandia National Laboratories. All rights reserved.
|
|
* Copyright (c) 2015 Los Alamos National Security, LLC. All rights
|
|
* reserved.
|
|
* Copyright (c) 2017 Intel, Inc. All rights reserved
|
|
* $COPYRIGHT$
|
|
*
|
|
* Additional copyrights may follow
|
|
*
|
|
* $HEADER$
|
|
*/
|
|
|
|
/**
 * @file
 *
 * Matching Transport Layer
 *
 * The Matching Transport Layer (MTL) provides device-layer support
 * for transfer of MPI point-to-point messages over devices that
 * support hardware / library message matching.  This layer is used
 * with the MTL PML component to provide lowest latency and highest
 * bandwidth on given architectures.  Features found in other PML
 * interfaces, such as message fragmenting, multi-device support, and
 * NIC failover are not provided by the upper layers.
 *
 * In general, this interface should not be used for transport layer
 * support.  Instead, the BTL interface should be used.  The BTL
 * interface allows for multiplexing between multiple users
 * (point-to-point, one-sided, etc.) and provides many features not
 * found in this interface (RDMA from arbitrary buffers, active
 * messaging, reasonable pinned memory caching, etc.)
 */
|
|
|
|
#ifndef OMPI_MTL_H
|
|
#define OMPI_MTL_H
|
|
|
|
#include "ompi_config.h"
|
|
#include "mpi.h" /* needed for MPI_ANY_TAG */
|
|
#include "ompi/mca/mca.h"
|
|
#include "ompi/mca/pml/pml_constants.h" /* for send_mode enum */
|
|
#include "ompi/request/request.h"
|
|
|
|
BEGIN_C_DECLS
|
|
|
|
/*
 * Forward declarations for the struct types that this header only ever
 * refers to through pointers.  Declaring every tag at file scope keeps
 * the function-pointer prototypes below from introducing
 * prototype-scoped struct types when a tag has not been seen yet.
 */
struct ompi_request_t;
struct opal_convertor_t;
struct ompi_communicator_t;
struct ompi_message_t;
struct ompi_proc_t;
struct ompi_status_public_t;

struct mca_mtl_base_module_t;
|
|
|
|
/**
 * Per-request state handed to the MTL send / receive entry points.
 */
struct mca_mtl_request_t {
    /** pointer to associated ompi_request_t */
    struct ompi_request_t *ompi_req;
    /** callback that receives this request as its only argument
     *  (presumably invoked on completion -- confirm against the PML) */
    void (*completion_callback)(struct mca_mtl_request_t* mtl_request);
};
typedef struct mca_mtl_request_t mca_mtl_request_t;
|
|
|
|
|
|
/**
 * MTL module flags
 *
 * Each flag occupies its own bit.
 */
#define MCA_MTL_BASE_FLAG_REQUIRE_WORLD 0x00000001
#if OPAL_CUDA_SUPPORT
/* Lets an MTL state that it does not require CUDA convertor support,
 * so that the CUDA convertor init phases can be skipped.  Only defined
 * when OPAL_CUDA_SUPPORT is enabled. */
#define MCA_MTL_BASE_FLAG_CUDA_INIT_DISABLE 0x00000002
#endif
|
|
|
|
/**
 * Initialization routine for MTL component
 *
 * Initialization routine for MTL component.  This function should
 * allocate resources for communication and try to do all local setup.
 * It should not attempt to contact its peers, as that should be
 * done at add_procs time.  Contact information should be published
 * during this initialization function.  It will be made available
 * during add_procs().
 *
 * @param enable_progress_threads (IN) Progress threads have been
 *                  enabled by the user and the component must be
 *                  capable of making asynchronous progress (either
 *                  with its own thread, with the kernel, or with
 *                  the event library).
 * @param enable_mpi_threads (IN) MPI threads have been enabled by the
 *                  user and the component must be capable of coping
 *                  with threads.
 *
 * \note The signature has no output parameter through which a
 * component could report MPI_THREAD_MULTIPLE support; only the two
 * input flags above are passed.
 *
 * @retval NULL     component can not operate on the current machine
 * @retval non-NULL pointer to the MTL module
 */
typedef struct mca_mtl_base_module_t*
(*mca_mtl_base_component_init_fn_t)(bool enable_progress_threads,
                                    bool enable_mpi_threads);
|
|
|
|
|
|
struct mca_mtl_base_component_2_0_0_t {
|
|
mca_base_component_t mtl_version;
|
|
mca_base_component_data_t mtl_data;
|
|
mca_mtl_base_component_init_fn_t mtl_init;
|
|
};
|
|
typedef struct mca_mtl_base_component_2_0_0_t mca_mtl_base_component_2_0_0_t;
|
|
typedef struct mca_mtl_base_component_2_0_0_t mca_mtl_base_component_t;
|
|
|
|
|
|
/**
 * MCA->MTL Clean up any resources held by MTL module
 *
 * Opposite of module_init.  Called when communication will no longer
 * be necessary.  Usually this is during MPI_FINALIZE, but it can be
 * earlier if the component was not selected to run.  Assuming
 * module_init was called, finalize will always be called before the
 * component_close function is called.
 *
 * @param mtl (IN)   MTL module returned from call to initialize
 *
 * @retval OMPI_SUCCESS cleanup finished successfully
 * @retval other        failure during cleanup
 */
typedef int (*mca_mtl_base_module_finalize_fn_t)(struct mca_mtl_base_module_t* mtl);
|
|
|
|
|
|
/**
 * PML->MTL notification of change in the process list.
 *
 * The mca_mtl_base_module_add_procs_fn_t() is used by the PML to
 * notify the MTL that new processes are connected to the current
 * process.  Any addressing information exported by the peer via the
 * ompi_modex_send() function should be available during this
 * call via the corresponding ompi_modex_recv() function.  The
 * MTL may utilize this information to determine reachability of each
 * peer process.
 *
 * It is an error for a proc to not be reachable by the given MTL, and
 * an error should be returned if that case is detected.  If an MTL
 * requires per-endpoint data, it must handle storage, either using a
 * static endpoint tag (MTL is the default tag that should generally
 * be used) or a dynamic endpoint tag (although it should be noted
 * that OMPI can be built without dynamic endpoint tag support).
 *
 * @param mtl (IN)     MTL module
 * @param nprocs (IN)  Number of processes
 * @param procs (IN)   Set of processes
 *
 * @retval OMPI_SUCCESS successfully connected to processes
 * @retval other        failure during setup
 */
typedef int (*mca_mtl_base_module_add_procs_fn_t)(
                            struct mca_mtl_base_module_t* mtl,
                            size_t nprocs,
                            struct ompi_proc_t** procs);
|
|
|
|
|
|
/**
 * Notification of change to the process list.
 *
 * When the process list changes, the PML notifies the MTL of the
 * change, to provide the opportunity to cleanup or release any
 * resources associated with the peer.  The MTL is responsible for
 * releasing any memory associated with the endpoint data it may have
 * stored during add_procs().
 *
 * @param mtl (IN)     MTL module
 * @param nprocs (IN)  Number of processes
 * @param procs (IN)   Set of processes
 *
 * @return             Status indicating if cleanup was successful
 */
typedef int (*mca_mtl_base_module_del_procs_fn_t)(
                            struct mca_mtl_base_module_t* mtl,
                            size_t nprocs,
                            struct ompi_proc_t** procs);
|
|
|
|
|
|
/**
|
|
* Blocking send to peer
|
|
*
|
|
* Blocking send (Call should not return until the user buffer may be
|
|
* used again). Standard MPI semantics must be met by this call, as
|
|
* mandated in the mode argument. There is one special mode argument,
|
|
* MCA_PML_BASE_SEND_COMPLETE, which requires local completion before
|
|
* the function can return. This is an optimization for coillective
|
|
* routines that can otherwise lead to degenerate performance for
|
|
* broadcast-based collectives.
|
|
*
|
|
* @param comm (IN) Communicator used for operation
|
|
* @param dest (IN) Destination rank for send (relative to comm)
|
|
* @param tag (IN) MPI tag used for sending. See note below.
|
|
* @param convertor (IN) Datatype convertor describing send datatype.
|
|
* Already prepared for send.
|
|
* @param mode (IN) Mode for send operation
|
|
*
|
|
* @return OMPI_SUCCESS or error value
|
|
*
|
|
* \note Open MPI is built around non-blocking operations. This
|
|
* function is provided for networks where progressing events outside
|
|
* of point-to-point (for example, collectives, I/O, one-sided) can
|
|
* occur without a progress function regularily being triggered.
|
|
*
|
|
* \note While MPI does not allow users to specify negative tags, they
|
|
* are used internally in Open MPI to provide a unique channel for
|
|
* collective operations. Therefore, the MTL can *not* cause an error
|
|
* if a negative tag is used.
|
|
*/
|
|
typedef int (*mca_mtl_base_module_send_fn_t)(
|
|
struct mca_mtl_base_module_t* mtl,
|
|
struct ompi_communicator_t *comm,
|
|
int dest,
|
|
int tag,
|
|
struct opal_convertor_t *convertor,
|
|
mca_pml_base_send_mode_t mode);
|
|
|
|
|
|
/**
|
|
* Non-blocking send to peer
|
|
*
|
|
* Non-blocking send to peer. Standard MPI semantics must be met by
|
|
* this call, as mandated in the mode argument. There is one special
|
|
* mode argument, MCA_PML_BASE_SEND_COMPLETE, which requires local
|
|
* completion before the request is marked as complete.
|
|
*
|
|
* The PML will handle creation of the request, leaving the number of
|
|
* bytes requested in the module structure available for the MTL
|
|
* directly after the ompi_request_t structure. The PML will handle
|
|
* proper destruction of the request once it can safely be destructed
|
|
* (it has been completed and freeed by a call to REQUEST_FReE or
|
|
* TEST/WAIT). The MTL should remove all resources associated with
|
|
* the request when it is marked as completed.
|
|
*
|
|
* @param comm (IN) Communicator used for operation
|
|
* @param dest (IN) Destination rank for send (relative to comm)
|
|
* @param tag (IN) MPI tag used for sending. See note below.
|
|
* @param convertor (IN) Datatype convertor describing send datatype.
|
|
* Already prepared for send.
|
|
* @param mode (IN) Mode for send operation (see pml.h)
|
|
* @param blocking (IN) True if the call originated from a blocking
|
|
* call, but the PML decided to use a
|
|
* non-blocking operation, likely for
|
|
* internal performance decisions This is an
|
|
* optimization flag and is not needed for
|
|
* correctness.
|
|
* @param mtl_request (IN) Pointer to mtl_request. The ompi_req field
|
|
* will be populated with an initialized
|
|
* ompi_request_t before calling.
|
|
*
|
|
* @return OMPI_SUCCESS or error value
|
|
*
|
|
* \note While MPI does not allow users to specify negative tags, they
|
|
* are used internally in Open MPI to provide a unique channel for
|
|
* collective operations. Therefore, the MTL can *not* cause an error
|
|
* if a negative tag is used.
|
|
*/
|
|
typedef int (*mca_mtl_base_module_isend_fn_t)(
|
|
struct mca_mtl_base_module_t* mtl,
|
|
struct ompi_communicator_t *comm,
|
|
int dest,
|
|
int tag,
|
|
struct opal_convertor_t *convertor,
|
|
mca_pml_base_send_mode_t mode,
|
|
bool blocking,
|
|
mca_mtl_request_t *mtl_request);
|
|
|
|
|
|
/**
 * Non-blocking receive
 *
 * Non-blocking receive function.  Standard MPI semantics for
 * MPI_Irecv must be implemented by this call.
 *
 * The PML will handle creation of the request, leaving the number of
 * bytes requested in the module structure available for the MTL,
 * directly after the ompi_request_t structure.  The PML will handle
 * proper destruction of the request once it can safely be destroyed
 * (it has been completed and freed by a call to REQUEST_FREE or
 * TEST/WAIT).  The MTL should remove all resources associated with
 * the request when it is marked as completed.
 *
 * @param mtl (IN)         MTL module
 * @param comm (IN)        Communicator used for operation
 * @param src (IN)         Source rank to receive from (relative to comm)
 * @param tag (IN)         MPI tag to match.  See note below.
 * @param convertor (IN)   Datatype convertor describing receive datatype.
 *                         Already prepared for receive.
 * @param mtl_request (IN) Pointer to mtl_request.  The ompi_req field
 *                         will be populated with an initialized
 *                         ompi_request_t before calling.
 *
 * @return                 OMPI_SUCCESS or error value
 *
 * \note While MPI does not allow users to specify negative tags, they
 * are used internally in Open MPI to provide a unique channel for
 * collective operations.  Therefore, the MTL can *not* cause an error
 * if a negative tag is used.  Further, MPI_ANY_TAG should *not* match
 * against negative tags.
 */
typedef int (*mca_mtl_base_module_irecv_fn_t)(
                          struct mca_mtl_base_module_t* mtl,
                          struct ompi_communicator_t *comm,
                          int src,
                          int tag,
                          struct opal_convertor_t *convertor,
                          struct mca_mtl_request_t *mtl_request);
|
|
|
|
|
|
/**
 * Non-blocking probe
 *
 * Non-blocking probe function.  Standard MPI semantics for MPI_IPROBE
 * must be implemented by this call.
 *
 * @param mtl (IN)      MTL module
 * @param comm (IN)     Communicator used for operation
 * @param src (IN)      Source rank to probe (relative to comm)
 * @param tag (IN)      MPI tag to match.  See note below.
 * @param flag (OUT)    true if message available, false otherwise
 * @param status (OUT)  Status structure for information on
 *                      available message
 *
 * @return              integer status code (presumably OMPI_SUCCESS or
 *                      an error value, like the other entry points --
 *                      confirm against implementations)
 *
 * \note While MPI does not allow users to specify negative tags, they
 * are used internally in Open MPI to provide a unique channel for
 * collective operations.  Therefore, the MTL can *not* cause an error
 * if a negative tag is used.  Further, MPI_ANY_TAG should *not* match
 * against negative tags.
 */
typedef int (*mca_mtl_base_module_iprobe_fn_t)(
                          struct mca_mtl_base_module_t* mtl,
                          struct ompi_communicator_t *comm,
                          int src,
                          int tag,
                          int *flag,
                          struct ompi_status_public_t *status);
|
|
|
|
|
|
/**
 * Non-blocking receive of a previously obtained message handle.
 *
 * NOTE(review): presumably the MTL counterpart of MPI_IMRECV, paired
 * with the improbe entry point -- confirm against implementations.
 *
 * @param mtl          MTL module
 * @param convertor    Datatype convertor for the receive
 * @param message      Address of the message handle to receive
 * @param mtl_request  Pointer to the mtl_request for this operation
 *
 * @return             integer status code
 */
typedef int (*mca_mtl_base_module_imrecv_fn_t)(struct mca_mtl_base_module_t* mtl,
                                               struct opal_convertor_t *convertor,
                                               struct ompi_message_t **message,
                                               struct mca_mtl_request_t *mtl_request);
|
|
|
|
/**
 * Non-blocking probe that hands back a message handle.
 *
 * NOTE(review): presumably the MTL counterpart of MPI_IMPROBE, whose
 * message handle is later consumed by the imrecv entry point --
 * confirm against implementations.
 *
 * @param mtl      MTL module
 * @param comm     Communicator used for operation
 * @param src      Source rank to probe (relative to comm)
 * @param tag      MPI tag to match
 * @param matched  Output flag (presumably set when a message matched)
 * @param message  Address at which the message handle is stored
 * @param status   Status structure for the probed message
 *
 * @return         integer status code
 */
typedef int (*mca_mtl_base_module_improbe_fn_t)(struct mca_mtl_base_module_t *mtl,
                                                struct ompi_communicator_t *comm,
                                                int src,
                                                int tag,
                                                int *matched,
                                                struct ompi_message_t **message,
                                                struct ompi_status_public_t *status);
|
|
|
|
/**
|
|
* Cancel an existing request
|
|
*
|
|
* Attempt to cancel an existing request. The (poorly defined)
|
|
* semantics for MPI_CANCEL must be implemented by this call. This,
|
|
* of course, allows the MTL module to do nothing at all.
|
|
* Implementations of the MTL should make a good faith effort to
|
|
* cancel receive requests that have not been started, as the "post a
|
|
* receive for control messages" paradigm is a common one in loosely
|
|
* coupled MPI applications.
|
|
*
|
|
* @param request(IN) Request that should be cancelled
|
|
* @param flag Unknown exactly what this does.
|
|
*
|
|
*/
|
|
typedef int (*mca_mtl_base_module_cancel_fn_t)(
|
|
struct mca_mtl_base_module_t* mtl,
|
|
mca_mtl_request_t *mtl_request,
|
|
int flag);
|
|
|
|
|
|
/**
 * Downcall from PML layer when a new communicator is created.
 *
 * Provides the MTL the opportunity to initialize/cache a data structure
 * on the communicator.
 *
 * @param mtl   MTL module
 * @param comm  Communicator
 * @return      OMPI_SUCCESS or failure status.
 */
typedef int (*mca_mtl_base_module_add_comm_fn_t)(
                          struct mca_mtl_base_module_t* mtl,
                          struct ompi_communicator_t* comm);
|
|
|
|
|
|
/**
 * Downcall from PML layer when a communicator is destroyed.
 *
 * Provides the MTL the opportunity to cleanup any data structures
 * associated with the communicator.
 *
 * @param mtl   MTL module
 * @param comm  Communicator
 * @return      OMPI_SUCCESS or failure status.
 */
typedef int (*mca_mtl_base_module_del_comm_fn_t)(
                          struct mca_mtl_base_module_t* mtl,
                          struct ompi_communicator_t* comm);
|
|
|
|
|
|
/**
|
|
* MTL module interface functions and attributes.
|
|
*/
|
|
struct mca_mtl_base_module_t {
|
|
int mtl_max_contextid; /**< maximum allowable contextid */
|
|
int mtl_max_tag; /**< maximum tag value. note that negative tags must be allowed */
|
|
size_t mtl_request_size; /**< number of bytes to reserve with request structure */
|
|
|
|
uint32_t mtl_flags; /**< flags (put/get...) */
|
|
|
|
/* MTL function table */
|
|
mca_mtl_base_module_add_procs_fn_t mtl_add_procs;
|
|
mca_mtl_base_module_del_procs_fn_t mtl_del_procs;
|
|
mca_mtl_base_module_finalize_fn_t mtl_finalize;
|
|
|
|
mca_mtl_base_module_send_fn_t mtl_send;
|
|
mca_mtl_base_module_isend_fn_t mtl_isend;
|
|
mca_mtl_base_module_irecv_fn_t mtl_irecv;
|
|
mca_mtl_base_module_iprobe_fn_t mtl_iprobe;
|
|
mca_mtl_base_module_imrecv_fn_t mtl_imrecv;
|
|
mca_mtl_base_module_improbe_fn_t mtl_improbe;
|
|
|
|
/* Optional MTL functions */
|
|
mca_mtl_base_module_cancel_fn_t mtl_cancel;
|
|
mca_mtl_base_module_add_comm_fn_t mtl_add_comm;
|
|
mca_mtl_base_module_del_comm_fn_t mtl_del_comm;
|
|
};
|
|
typedef struct mca_mtl_base_module_t mca_mtl_base_module_t;
|
|
|
|
/*
 * Macro for use in modules that are of type mtl.
 * Expands to the OMPI base version macro for framework "mtl",
 * interface version 2.0.0.
 */
#define MCA_MTL_BASE_VERSION_2_0_0 \
    OMPI_MCA_BASE_VERSION_2_1_0("mtl", 2, 0, 0)
|
|
|
|
/* Global pointer to an MTL module; OMPI_MTL_CALL() dispatches through
 * it when direct calls are not enabled. */
OMPI_DECLSPEC extern mca_mtl_base_module_t *ompi_mtl;
|
|
|
|
/*
 * macro for doing direct call / call through struct
 *
 * With MCA_ompi_mtl_DIRECT_CALL set, OMPI_MTL_CALL(a) pastes together
 * the symbol ompi_mtl_<component>_<a>, where <component> comes from
 * MCA_ompi_mtl_DIRECT_CALL_COMPONENT (the extra EXPANDER level makes
 * sure that macro is expanded before token pasting).  Otherwise the
 * call goes through the ompi_mtl module pointer as ompi_mtl->mtl_<a>.
 */
#if MCA_ompi_mtl_DIRECT_CALL

#define OMPI_MTL_CALL_STAMP(a, b) ompi_mtl_ ## a ## _ ## b
#define OMPI_MTL_CALL_EXPANDER(a, b) OMPI_MTL_CALL_STAMP(a,b)
#define OMPI_MTL_CALL(a) OMPI_MTL_CALL_EXPANDER(MCA_ompi_mtl_DIRECT_CALL_COMPONENT, a)

#include MCA_ompi_mtl_DIRECT_CALL_HEADER

#else
#define OMPI_MTL_CALL(a) ompi_mtl->mtl_ ## a
#endif
|
|
|
|
|
|
END_C_DECLS
|
|
#endif
|