c5c5b42307
This commit adds a new btl for one-sided and two-sided communication. This btl uses the uct layer in OpenUCX. This btl makes use of multiple uct contexts and per-thread device pinning to provide good performance when using threads and osc/rdma. This btl has been tested extensively with osc/rdma and passes all MTT tests on aries and IB hardware. For now this new component disables itself but can be enabled by setting the btl_uct_memory_domains MCA variable to a comma-delimited list of supported memory domains. For example: --mca btl_uct_memory_domains ib/mlx5_0. The specific transports used can be selected using --mca btl_uct_transports. The default is to use any available transport. Signed-off-by: Nathan Hjelm <hjelmn@lanl.gov>
328 строки
12 KiB
C
328 строки
12 KiB
C
/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */
|
|
/*
|
|
* Copyright (c) 2004-2007 The Trustees of Indiana University and Indiana
|
|
* University Research and Technology
|
|
* Corporation. All rights reserved.
|
|
* Copyright (c) 2004-2009 The University of Tennessee and The University
|
|
* of Tennessee Research Foundation. All rights
|
|
* reserved.
|
|
* Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
|
|
* University of Stuttgart. All rights reserved.
|
|
* Copyright (c) 2004-2005 The Regents of the University of California.
|
|
* All rights reserved.
|
|
* Copyright (c) 2015-2018 Los Alamos National Security, LLC. All rights
|
|
* reserved.
|
|
* $COPYRIGHT$
|
|
*
|
|
* Additional copyrights may follow
|
|
*
|
|
* $HEADER$
|
|
*/
|
|
/**
|
|
* @file
|
|
*/
|
|
#ifndef MCA_BTL_UCT_H
|
|
#define MCA_BTL_UCT_H
|
|
|
|
#include "opal_config.h"
|
|
#include <sys/types.h>
|
|
#include <string.h>
|
|
|
|
/* Open MPI includes */
|
|
#include "opal/mca/event/event.h"
|
|
#include "opal/mca/btl/base/base.h"
|
|
#include "opal/mca/mpool/mpool.h"
|
|
#include "opal/mca/btl/base/btl_base_error.h"
|
|
#include "opal/mca/rcache/base/base.h"
|
|
#include "opal/class/opal_fifo.h"
|
|
#include "opal/class/opal_hash_table.h"
|
|
#include "opal/mca/pmix/pmix.h"
|
|
#include "opal/threads/tsd.h"
|
|
#include <ucp/api/ucp.h>
|
|
#include <uct/api/uct.h>
|
|
|
|
#include "btl_uct_types.h"
|
|
|
|
BEGIN_C_DECLS
|
|
|
|
/*
 * Detection for old vs new atomic flags.
 *
 * OPAL_HAVE_UCT_EP_ATOMIC64_POST is defined to 0 when the included headers
 * provide the UCT_IFACE_FLAG_ATOMIC_ADD32 macro, and to 1 when they do not.
 */
#if defined(UCT_IFACE_FLAG_ATOMIC_ADD32)
#define OPAL_HAVE_UCT_EP_ATOMIC64_POST 0
#else
#define OPAL_HAVE_UCT_EP_ATOMIC64_POST 1
#endif
|
|
|
|
/**
|
|
* @brief UCT BTL module
|
|
*/
|
|
struct mca_btl_uct_module_t {
|
|
/** base BTL interface */
|
|
mca_btl_base_module_t super;
|
|
|
|
/** whether the module has been fully initialized or not */
|
|
bool initialized;
|
|
|
|
/** lock for the hash table */
|
|
opal_mutex_t endpoint_lock;
|
|
|
|
/** endpoint hash table */
|
|
opal_hash_table_t id_to_endpoint;
|
|
|
|
/** mutex to protect the module */
|
|
opal_mutex_t lock;
|
|
|
|
/** async context */
|
|
ucs_async_context_t *ucs_async;
|
|
|
|
/** transport for active messaging */
|
|
mca_btl_uct_tl_t *am_tl;
|
|
|
|
/** transport for RDMA/AMOs */
|
|
mca_btl_uct_tl_t *rdma_tl;
|
|
|
|
/** transport for forming connections (if needed) */
|
|
mca_btl_uct_tl_t *conn_tl;
|
|
|
|
/** array containing the am_tl and rdma_tl */
|
|
mca_btl_uct_tl_t *comm_tls[2];
|
|
|
|
/** registration cache */
|
|
mca_rcache_base_module_t *rcache;
|
|
|
|
/** name of the memory domain backing this module */
|
|
char *md_name;
|
|
|
|
/** am and rdma share endpoints */
|
|
bool shared_endpoints;
|
|
|
|
/** memory domain */
|
|
mca_btl_uct_md_t *md;
|
|
|
|
/** un-registered frags that will be used with uct_ep_am_short() */
|
|
opal_free_list_t short_frags;
|
|
|
|
/** registered frags that will be used with uct_ep_am_zcopy() */
|
|
opal_free_list_t eager_frags;
|
|
|
|
/** large registered frags for packing non-contiguous data */
|
|
opal_free_list_t max_frags;
|
|
|
|
/** RDMA completions */
|
|
opal_free_list_t rdma_completions;
|
|
|
|
/** frags that were waiting on connections that are now ready to send */
|
|
opal_list_t pending_frags;
|
|
};
|
|
typedef struct mca_btl_uct_module_t mca_btl_uct_module_t;
|
|
|
|
extern mca_btl_uct_module_t mca_btl_uct_module_template;
|
|
|
|
/**
|
|
* @brief UCT BTL component
|
|
*/
|
|
struct mca_btl_uct_component_t {
|
|
/** base BTL component */
|
|
mca_btl_base_component_3_0_0_t super;
|
|
|
|
/** number of TL modules */
|
|
int module_count;
|
|
|
|
/** All BTL UCT modules (1 per memory domain) */
|
|
mca_btl_uct_module_t *modules[MCA_BTL_UCT_MAX_MODULES];
|
|
|
|
/** allowed UCT memory domains */
|
|
char *memory_domains;
|
|
|
|
/** allowed transports */
|
|
char *allowed_transports;
|
|
|
|
/** number of worker contexts to create */
|
|
int num_contexts_per_module;
|
|
|
|
#if OPAL_C_HAVE__THREAD_LOCAL
|
|
/** bind threads to contexts */
|
|
bool bind_threads_to_contexts;
|
|
#endif
|
|
|
|
/** disable UCX memory hooks */
|
|
bool disable_ucx_memory_hooks;
|
|
};
|
|
typedef struct mca_btl_uct_component_t mca_btl_uct_component_t;
|
|
|
|
OPAL_MODULE_DECLSPEC extern mca_btl_uct_component_t mca_btl_uct_component;
|
|
|
|
/**
 * @brief btl/uct definition of the BTL registration handle
 *
 * The array is declared with a single element; per the note below the real
 * size of the packed handle is defined by UCT.
 */
struct mca_btl_base_registration_handle_t {
    /** The packed memory handle. The size of this field is defined by UCT. */
    uint8_t packed_handle[1];
};
|
|
|
|
struct mca_btl_uct_reg_t {
|
|
mca_rcache_base_registration_t base;
|
|
|
|
/** UCT memory handle */
|
|
uct_mem_h uct_memh;
|
|
|
|
/** remote handle */
|
|
mca_btl_base_registration_handle_t handle;
|
|
};
|
|
typedef struct mca_btl_uct_reg_t mca_btl_uct_reg_t;
|
|
|
|
OBJ_CLASS_DECLARATION(mca_btl_uct_reg_t);
|
|
|
|
/**
 * Recover the enclosing mca_btl_uct_reg_t from a pointer to its embedded
 * handle member by stepping back offsetof (mca_btl_uct_reg_t, handle) bytes.
 *
 * @param reg  pointer to the handle member of a mca_btl_uct_reg_t
 */
#define MCA_BTL_UCT_REG_REMOTE_TO_LOCAL(reg) \
    ((mca_btl_uct_reg_t *) ((char *) (reg) - offsetof (mca_btl_uct_reg_t, handle)))
|
|
|
|
/**
|
|
* Initiate an asynchronous put.
|
|
* Completion Semantics: if this function returns a 1 then the operation
|
|
* is complete. a return of OPAL_SUCCESS indicates
|
|
* the put operation has been queued with the
|
|
* network. the local_handle can not be deregistered
|
|
* until all outstanding operations on that handle
|
|
* have been completed.
|
|
*
|
|
* @param btl (IN) BTL module
|
|
* @param endpoint (IN) BTL addressing information
|
|
* @param local_address (IN) Local address to put from (registered)
|
|
* @param remote_address (IN) Remote address to put to (registered remotely)
|
|
* @param local_handle (IN) Registration handle for region containing
|
|
* (local_address, local_address + size)
|
|
* @param remote_handle (IN) Remote registration handle for region containing
|
|
* (remote_address, remote_address + size)
|
|
* @param size (IN) Number of bytes to put
|
|
* @param flags (IN) Flags for this put operation
|
|
* @param order (IN) Ordering
|
|
* @param cbfunc (IN) Function to call on completion (if queued)
|
|
* @param cbcontext (IN) Context for the callback
|
|
* @param cbdata (IN) Data for callback
|
|
*
|
|
* @retval OPAL_SUCCESS The descriptor was successfully queued for a put
|
|
* @retval OPAL_ERROR The descriptor was NOT successfully queued for a put
|
|
* @retval OPAL_ERR_OUT_OF_RESOURCE Insufficient resources to queue the put
|
|
* operation. Try again later
|
|
* @retval OPAL_ERR_NOT_AVAILABLE Put can not be performed due to size or
|
|
* alignment restrictions.
|
|
*/
|
|
int mca_btl_uct_put (struct mca_btl_base_module_t *btl,
|
|
struct mca_btl_base_endpoint_t *endpoint, void *local_address,
|
|
uint64_t remote_address, struct mca_btl_base_registration_handle_t *local_handle,
|
|
struct mca_btl_base_registration_handle_t *remote_handle, size_t size, int flags,
|
|
int order, mca_btl_base_rdma_completion_fn_t cbfunc, void *cbcontext, void *cbdata);
|
|
|
|
/**
|
|
* Initiate an asynchronous get.
|
|
* Completion Semantics: if this function returns a 1 then the operation
|
|
* is complete. a return of OPAL_SUCCESS indicates
|
|
* the get operation has been queued with the
|
|
* network. the local_handle can not be deregistered
|
|
* until all outstanding operations on that handle
|
|
* have been completed.
|
|
*
|
|
* @param btl (IN) BTL module
|
|
* @param endpoint (IN) BTL addressing information
|
|
* @param local_address (IN) Local address to put from (registered)
|
|
* @param remote_address (IN) Remote address to put to (registered remotely)
|
|
* @param local_handle (IN) Registration handle for region containing
|
|
* (local_address, local_address + size)
|
|
* @param remote_handle (IN) Remote registration handle for region containing
|
|
* (remote_address, remote_address + size)
|
|
* @param size (IN) Number of bytes to put
|
|
* @param flags (IN) Flags for this put operation
|
|
* @param order (IN) Ordering
|
|
* @param cbfunc (IN) Function to call on completion (if queued)
|
|
* @param cbcontext (IN) Context for the callback
|
|
* @param cbdata (IN) Data for callback
|
|
*
|
|
* @retval OPAL_SUCCESS The descriptor was successfully queued for a put
|
|
* @retval OPAL_ERROR The descriptor was NOT successfully queued for a put
|
|
* @retval OPAL_ERR_OUT_OF_RESOURCE Insufficient resources to queue the put
|
|
* operation. Try again later
|
|
* @retval OPAL_ERR_NOT_AVAILABLE Put can not be performed due to size or
|
|
* alignment restrictions.
|
|
*/
|
|
int mca_btl_uct_get (struct mca_btl_base_module_t *btl,
|
|
struct mca_btl_base_endpoint_t *endpoint, void *local_address,
|
|
uint64_t remote_address, struct mca_btl_base_registration_handle_t *local_handle,
|
|
struct mca_btl_base_registration_handle_t *remote_handle, size_t size, int flags,
|
|
int order, mca_btl_base_rdma_completion_fn_t cbfunc, void *cbcontext, void *cbdata);
|
|
|
|
/**
 * Fault Tolerance Event Notification Function
 *
 * @param state Checkpoint State
 * @return OPAL_SUCCESS or failure status
 */
int mca_btl_uct_ft_event(int state);
|
|
|
|
int mca_btl_uct_aop (struct mca_btl_base_module_t *btl, struct mca_btl_base_endpoint_t *endpoint,
|
|
uint64_t remote_address, mca_btl_base_registration_handle_t *remote_handle,
|
|
mca_btl_base_atomic_op_t op, uint64_t operand, int flags, int order,
|
|
mca_btl_base_rdma_completion_fn_t cbfunc, void *cbcontext, void *cbdata);
|
|
|
|
/**
 * Atomic fetch-and-operate entry point of the UCT BTL.
 *
 * NOTE(review): presumably applies `op` with `operand` at the remote address
 * and stores the fetched value at local_address; the implementation is not
 * visible in this header -- confirm there.
 *
 * @param btl             BTL module
 * @param endpoint        BTL endpoint
 * @param local_address   local address
 * @param remote_address  remote address
 * @param local_handle    local registration handle
 * @param remote_handle   remote registration handle
 * @param op              atomic operation
 * @param operand         operand of the atomic operation
 * @param flags           flags for this operation
 * @param order           ordering
 * @param cbfunc          completion callback
 * @param cbcontext       context for the callback
 * @param cbdata          data for the callback
 */
int mca_btl_uct_afop (struct mca_btl_base_module_t *btl, struct mca_btl_base_endpoint_t *endpoint,
                      void *local_address, uint64_t remote_address, mca_btl_base_registration_handle_t *local_handle,
                      mca_btl_base_registration_handle_t *remote_handle, mca_btl_base_atomic_op_t op,
                      uint64_t operand, int flags, int order, mca_btl_base_rdma_completion_fn_t cbfunc,
                      void *cbcontext, void *cbdata);
|
|
|
|
/**
 * Atomic compare-and-swap entry point of the UCT BTL.
 *
 * NOTE(review): presumably replaces the remote value with `value` when it
 * equals `compare` and stores the previous remote value at local_address;
 * the implementation is not visible in this header -- confirm there.
 *
 * @param btl             BTL module
 * @param endpoint        BTL endpoint
 * @param local_address   local address
 * @param remote_address  remote address
 * @param local_handle    local registration handle
 * @param remote_handle   remote registration handle
 * @param compare         comparison value
 * @param value           value to swap in
 * @param flags           flags for this operation
 * @param order           ordering
 * @param cbfunc          completion callback
 * @param cbcontext       context for the callback
 * @param cbdata          data for the callback
 */
int mca_btl_uct_acswap (struct mca_btl_base_module_t *btl, struct mca_btl_base_endpoint_t *endpoint,
                        void *local_address, uint64_t remote_address, mca_btl_base_registration_handle_t *local_handle,
                        mca_btl_base_registration_handle_t *remote_handle, uint64_t compare, uint64_t value, int flags,
                        int order, mca_btl_base_rdma_completion_fn_t cbfunc, void *cbcontext, void *cbdata);
|
|
|
|
|
|
int mca_btl_uct_flush (struct mca_btl_base_module_t *btl, struct mca_btl_base_endpoint_t *endpoint);
|
|
int mca_btl_uct_flush_thread (mca_btl_base_module_t *btl);
|
|
|
|
/**
 * Finalize entry point for a UCT BTL module.
 *
 * NOTE(review): which resources are released is decided by the
 * implementation, which is not visible in this header.
 *
 * @param btl  BTL module
 */
int mca_btl_uct_finalize (mca_btl_base_module_t *btl);
|
|
|
|
/**
 * Memory registration / deregistration functions.
 *
 * NOTE(review): the signatures suggest these are the callbacks handed to the
 * registration cache, with reg_data being an opaque callback context --
 * confirm against the code that creates the rcache.
 *
 * @param reg_data  opaque context pointer
 * @param base      start of the region to register (reg_mem only)
 * @param size      size of the region in bytes (reg_mem only)
 * @param reg       rcache registration to operate on
 */
int mca_btl_uct_reg_mem (void *reg_data, void *base, size_t size, mca_rcache_base_registration_t *reg);
int mca_btl_uct_dereg_mem (void *reg_data, mca_rcache_base_registration_t *reg);
|
|
|
|
/**
 * Active-message handler of the UCT BTL.
 *
 * NOTE(review): the signature looks like a UCT active-message callback;
 * the meaning of arg and flags is set by the implementation -- confirm there.
 *
 * @param arg     opaque argument
 * @param data    message data
 * @param length  length of data in bytes
 * @param flags   flags
 */
ucs_status_t mca_btl_uct_am_handler (void *arg, void *data, size_t length, unsigned flags);
|
|
|
|
/**
 * Get the BTL endpoint associated with a process.
 *
 * NOTE(review): whether a missing endpoint is created on demand, and what is
 * returned on failure, is decided by the implementation -- confirm there.
 *
 * @param module  BTL module
 * @param proc    process to look up
 */
struct mca_btl_base_endpoint_t *mca_btl_uct_get_ep (struct mca_btl_base_module_t *module, opal_proc_t *proc);
|
|
|
|
/**
 * Query the transport layers described by tl_descs for a module.
 *
 * NOTE(review): presumably this is where the module's am/rdma/conn
 * transports are chosen -- confirm against the implementation.
 *
 * @param module    btl/uct module
 * @param md        btl/uct memory domain
 * @param tl_descs  array of UCT transport resource descriptors
 * @param tl_count  number of entries in tl_descs
 */
int mca_btl_uct_query_tls (mca_btl_uct_module_t *module, mca_btl_uct_md_t *md, uct_tl_resource_desc_t *tl_descs, unsigned tl_count);
|
|
|
|
/**
|
|
* @brief Checks if a tl is suitable for using for RDMA
|
|
*
|
|
* @param[in] tl btl/uct tl pointer
|
|
*/
|
|
static inline bool mca_btl_uct_tl_supports_rdma (mca_btl_uct_tl_t *tl)
|
|
{
|
|
return (tl->uct_iface_attr.cap.flags & (UCT_IFACE_FLAG_PUT_ZCOPY | UCT_IFACE_FLAG_GET_ZCOPY)) ==
|
|
(UCT_IFACE_FLAG_PUT_ZCOPY | UCT_IFACE_FLAG_GET_ZCOPY);
|
|
}
|
|
|
|
/**
|
|
* @brief Checks if a tl is suitable for using for active messaging
|
|
*/
|
|
static inline bool mca_btl_uct_tl_support_am (mca_btl_uct_tl_t *tl)
|
|
{
|
|
return (tl->uct_iface_attr.cap.flags & (UCT_IFACE_FLAG_AM_SHORT | UCT_IFACE_FLAG_AM_BCOPY | UCT_IFACE_FLAG_AM_ZCOPY));
|
|
}
|
|
|
|
/**
|
|
* @brief Checks if a tl can be used for passing data to connect endpoints
|
|
*
|
|
* @param[in] tl btl/uct tl pointer
|
|
*/
|
|
static inline bool mca_btl_uct_tl_supports_conn (mca_btl_uct_tl_t *tl)
|
|
{
|
|
return (tl->uct_iface_attr.cap.flags & (UCT_IFACE_FLAG_AM_SHORT | UCT_IFACE_FLAG_CONNECT_TO_IFACE)) ==
|
|
(UCT_IFACE_FLAG_AM_SHORT | UCT_IFACE_FLAG_CONNECT_TO_IFACE);
|
|
}
|
|
|
|
/**
|
|
* @brief Check if tl endpoints need to be connected via a connection tl
|
|
*
|
|
* @param[in] tl btl/uct tl pointer
|
|
*/
|
|
static inline bool mca_btl_uct_tl_requires_connection_tl (mca_btl_uct_tl_t *tl)
|
|
{
|
|
return !(tl->uct_iface_attr.cap.flags & UCT_IFACE_FLAG_CONNECT_TO_IFACE);
|
|
}
|
|
|
|
END_C_DECLS
|
|
#endif
|