1
1

Update the BTL interface to provide a cleaner API for RDMA operations.

The old BTL interface provided support for RDMA through the use of
the btl_prepare_src and btl_prepare_dst functions. These functions were
expected to prepare as much of the user buffer as possible for the RDMA
operation and return a descriptor. The descriptor contained segment
information on the prepared region. The btl user could then pass the
RDMA segment information to a remote peer. Once the peer received that
information it then packed it into a similar descriptor on the other
side that could then be passed into a single btl_put or btl_get
operation.

Changes:

 - Removed the btl_prepare_dst function. This reflects the fact that
   RDMA operations no longer depend on "prepared" descriptors.

 - Removed the btl_seg_size member. There is no need for btls to
   subclass the mca_btl_base_segment_t class anymore.

...

Add more
Этот коммит содержится в:
Nathan Hjelm 2014-10-07 09:25:56 -06:00 коммит произвёл Nathan Hjelm
родитель 0338bc80b7
Коммит 2d381f800f
21 изменённых файлов: 718 добавлений и 633 удалений

Просмотреть файл

@ -1,3 +1,4 @@
/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */
/*
* Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
* University Research and Technology
@ -74,6 +75,39 @@ int mca_btl_base_param_register(mca_base_component_t *version,
OPAL_INFO_LVL_4,
MCA_BASE_VAR_SCOPE_READONLY,
&module->btl_eager_limit);
if ((module->btl_flags & MCA_BTL_FLAGS_GET) && module->btl_get) {
if (0 == module->btl_get_limit) {
module->btl_get_limit = SIZE_MAX;
}
(void) mca_base_component_var_register(version, "get_limit", "Maximum size (in bytes) for btl get",
MCA_BASE_VAR_TYPE_SIZE_T, NULL, 0, 0, OPAL_INFO_LVL_4,
MCA_BASE_VAR_SCOPE_READONLY, &module->btl_get_limit);
/* Allow the user to set the alignment. The BTL should double-check the alignment in its open
* function. */
(void) mca_base_component_var_register(version, "get_alignment", "Alignment required for btl get",
MCA_BASE_VAR_TYPE_SIZE_T, NULL, 0, 0, OPAL_INFO_LVL_6,
MCA_BASE_VAR_SCOPE_CONSTANT, &module->btl_get_alignment);
}
if ((module->btl_flags & MCA_BTL_FLAGS_PUT) && module->btl_put) {
if (0 == module->btl_put_limit) {
module->btl_put_limit = SIZE_MAX;
}
(void) mca_base_component_var_register(version, "put_limit", "Maximum size (in bytes) for btl put",
MCA_BASE_VAR_TYPE_SIZE_T, NULL, 0, 0, OPAL_INFO_LVL_4,
MCA_BASE_VAR_SCOPE_READONLY, &module->btl_put_limit);
/* Allow the user to set the alignment. The BTL should double-check the alignment in its open
* function. */
(void) mca_base_component_var_register(version, "put_alignment", "Alignment required for btl put",
MCA_BASE_VAR_TYPE_SIZE_T, NULL, 0, 0, OPAL_INFO_LVL_6,
MCA_BASE_VAR_SCOPE_CONSTANT, &module->btl_put_alignment);
}
#if OPAL_CUDA_GDR_SUPPORT
/* If no CUDA RDMA support, zero them out */
if (!(MCA_BTL_FLAGS_CUDA_GET & module->btl_flags)) {
@ -144,5 +178,13 @@ int mca_btl_base_param_verify(mca_btl_base_module_t *module)
module->btl_flags &= ~MCA_BTL_FLAGS_GET;
}
if (0 == module->btl_get_limit) {
module->btl_get_limit = SIZE_MAX;
}
if (0 == module->btl_put_limit) {
module->btl_put_limit = SIZE_MAX;
}
return OPAL_SUCCESS;
}

Просмотреть файл

@ -134,6 +134,23 @@ struct mca_btl_base_descriptor_t;
struct mca_mpool_base_resources_t;
struct opal_proc_t;
/**
* Opaque registration handle for executing RDMA and atomic
* operations on a memory region.
*
* This data inside this handle is appropriate for passing
* to remote peers to execute RDMA and atomic operations. The
* size needed to send the registration handle can be
* obtained from the btl via the btl_registration_handle_size
* member. If this size is 0 then no registration data is
* needed to execute RDMA or atomic operations.
*/
struct mca_btl_base_registration_handle_t;
typedef struct mca_btl_base_registration_handle_t mca_btl_base_registration_handle_t;
/* Wildcard endpoint for use in the register_mem function */
#define MCA_BTL_ENDPOINT_ANY (struct mca_btl_base_endpoint_t *) -1
/* send/recv operations require tag matching */
typedef uint8_t mca_btl_base_tag_t;
@ -219,6 +236,30 @@ typedef uint8_t mca_btl_base_tag_t;
#define MCA_BTL_ERROR_FLAGS_NONFATAL 0x2
#define MCA_BTL_ERROR_FLAGS_ADD_CUDA_IPC 0x4
/** registration flags */
enum {
/** Allow local write on the registered region. If a region is registered
* with this flag the registration can be used as the local handle for a
* btl_get operation. */
MCA_BTL_REG_FLAG_LOCAL_WRITE = 0x1,
/** Allow remote read on the registered region. If a region is registered
* with this flag the registration can be used as the remote handle for a
* btl_get operation. */
MCA_BTL_REG_FLAG_REMOTE_READ = 0x2,
/** Allow remote write on the registered region. If a region is registered
* with this flag the registration can be used as the remote handle for a
* btl_put operation. */
MCA_BTL_REG_FLAG_REMOTE_WRITE = 0x4,
/** Allow remote atomic operations on the registered region. If a region is
* registered with this flag the registration can be used as the remote
* handle for a btl_atomic_op or btl_atomic_fop operation. */
MCA_BTL_REG_FLAG_REMOTE_ATOMIC = 0x8,
/** Allow any btl operation on the registered region. If a region is registered
* with this flag the registration can be used as the local or remote handle for
* any btl operation. */
MCA_BTL_REG_FLAG_ACCESS_ANY = 0xf,
};
/**
* Asynchronous callback function on completion of an operation.
* Completion Semantics: The descriptor can be reused or returned to the
@ -237,6 +278,32 @@ typedef void (*mca_btl_base_completion_fn_t)(
struct mca_btl_base_descriptor_t* descriptor,
int status);
/**
* Asynchronous callback function on completion of an rdma or atomic operation.
* Completion Semantics: The rdma or atomic memory operation has completed
* remotely (i.e.) is remotely visible and the caller is free to deregister
* the local_handle or modify the memory in local_address.
*
* @param[IN] module the BTL module
* @param[IN] endpoint the BTL endpoint
* @param[IN] local_address local address for the operation (if any)
* @param[IN] local_handle local handle associated with the local_address
* @param[IN] context callback context supplied to the rdma/atomic operation
* @param[IN] cbdata callback data supplied to the rdma/atomic operation
* @param[IN] status status of the operation
*
*/
typedef void (*mca_btl_base_rdma_completion_fn_t)(
struct mca_btl_base_module_t* module,
struct mca_btl_base_endpoint_t* endpoint,
void *local_address,
struct mca_btl_base_registration_handle_t *local_handle,
void *context,
void *cbdata,
int status);
/**
* Describes a region/segment of memory that is addressable
* by an BTL.
@ -262,6 +329,7 @@ struct mca_btl_base_segment_t {
};
typedef struct mca_btl_base_segment_t mca_btl_base_segment_t;
/**
* A descriptor that holds the parameters to a send/put/get
* operation along w/ a callback routine that is called on
@ -329,6 +397,11 @@ OPAL_DECLSPEC OBJ_CLASS_DECLARATION(mca_btl_base_descriptor_t);
*/
#define MCA_BTL_SEG_MAX_SIZE 256
/**
* Maximum size of a BTL registration handle in bytes
*/
#define MCA_BTL_REG_HANDLE_MAX_SIZE 256
/*
* BTL base header, stores the tag at a minimum
*/
@ -655,6 +728,43 @@ typedef struct mca_btl_base_descriptor_t* (*mca_btl_base_module_prepare_fn_t)(
uint32_t flags
);
/**
* @brief Register a memory region for put/get/atomic operations.
*
* @param btl (IN) BTL module
* @param endpoint(IN) BTL addressing information (or NULL for all endpoints)
* @param base (IN) Pointer to start of region
* @param size (IN) Size of region
* @param flags (IN) Flags indicating what operation will be performed. Valid
* values are MCA_BTL_DES_FLAGS_PUT, MCA_BTL_DES_FLAGS_GET,
* and MCA_BTL_DES_FLAGS_ATOMIC
*
* @returns a memory registration handle valid for both local and remote operations
* @returns NULL if the region could not be registered
*
* This function registers the specified region with the hardware for use with
* the btl_put, btl_get, btl_atomic_cas, btl_atomic_op, and btl_atomic_fop
* functions. Care should be taken to not hold an excessive number of registrations
* as they may use limited system/NIC resources.
*/
typedef struct mca_btl_base_registration_handle_t *(*mca_btl_base_module_register_mem_fn_t)(
struct mca_btl_base_module_t* btl, struct mca_btl_base_endpoint_t *endpoint, void *base,
size_t size, uint32_t flags);
/**
* @brief Deregister a memory region
*
* @param btl (IN) BTL module region was registered with
* @param handle (IN) BTL registration handle to deregister
*
* This function deregisters the memory region associated with the specified handle. Care
* should be taken to not perform any RDMA or atomic operation on this memory region
* after it is deregistered. It is erroneous to specify a memory handle associated with
* a remote node.
*/
typedef int (*mca_btl_base_module_deregister_mem_fn_t)(
struct mca_btl_base_module_t* btl, struct mca_btl_base_registration_handle_t *handle);
/**
* Initiate an asynchronous send.
* Completion Semantics: the descriptor has been queued for a send operation
@ -722,9 +832,12 @@ typedef int (*mca_btl_base_module_sendi_fn_t)(
/**
* Initiate an asynchronous put.
* Completion Semantics: if this function returns a 1 then the operation
*                       is complete. a return of OPAL_SUCCESS indicates
*                       the put operation has been queued with the
*                       network. the local_handle can not be deregistered
*                       until all outstanding operations on that handle
*                       have been completed.
*
* BTLs that do not have the MCA_BTL_FLAGS_RDMA_MATCHED flag set
* allow multiple concurrent put operations on the same descriptor.
@ -732,19 +845,32 @@ typedef int (*mca_btl_base_module_sendi_fn_t)(
* a corresponding prepare_src/dst call for each put operation and
* therefore prohibit multiple concurrent put operations.
*
* @param btl (IN) BTL module
* @param endpoint (IN) BTL addressing information
* @param descriptor (IN) Description of the data to be transferred
* @param btl (IN) BTL module
* @param endpoint (IN) BTL addressing information
* @param local_address (IN) Local address to put from (registered)
* @param remote_address (IN) Remote address to put to (registered remotely)
* @param local_handle (IN) Registration handle for region containing
* (local_address, local_address + size)
* @param remote_handle (IN) Remote registration handle for region containing
* (remote_address, remote_address + size)
* @param size (IN) Number of bytes to put
* @param flags (IN) Flags for this put operation
* @param cbfunc (IN) Function to call on completion (if queued)
* @param cbcontext (IN) Context for the callback
* @param cbdata (IN) Data for callback
*
* @retval OPAL_SUCCESS The descriptor was successfully queued for a put
* @retval OPAL_ERROR The descriptor was NOT successfully queued for a put
* @retval OPAL_ERR_OUT_OF_RESOURCE Insufficient resources to queue the put
* operation. Try again later
* @retval OPAL_ERR_NOT_AVAILABLE Put can not be performed due to size or
* alignment restrictions.
*/
typedef int (*mca_btl_base_module_put_fn_t)(
struct mca_btl_base_module_t* btl,
struct mca_btl_base_endpoint_t* endpoint,
struct mca_btl_base_descriptor_t* descriptor
);
typedef int (*mca_btl_base_module_put_fn_t) (struct mca_btl_base_module_t *btl,
struct mca_btl_base_endpoint_t *endpoint, void *local_address,
uint64_t remote_address, struct mca_btl_base_registration_handle_t *local_handle,
struct mca_btl_base_registration_handle_t *remote_handle, size_t size, int flags,
mca_btl_base_rdma_completion_fn_t cbfunc, void *cbcontext, void *cbdata);
/**
* Initiate an asynchronous get.
@ -767,13 +893,47 @@ typedef int (*mca_btl_base_module_put_fn_t)(
* @retval OPAL_ERROR The descriptor was NOT successfully queued for a get
*
*/
typedef int (*mca_btl_base_module_get_fn_t)(
struct mca_btl_base_module_t* btl,
struct mca_btl_base_endpoint_t* endpoint,
struct mca_btl_base_descriptor_t* descriptor
);
/**
* Initiate an asynchronous get.
* Completion Semantics: if this function returns a 1 then the operation
*                       is complete. a return of OPAL_SUCCESS indicates
*                       the get operation has been queued with the
*                       network. the local_handle can not be deregistered
*                       until all outstanding operations on that handle
*                       have been completed.
*
* BTLs that do not have the MCA_BTL_FLAGS_RDMA_MATCHED flag set
* allow multiple concurrent get operations on the same registered
* region. BTLs that do have the MCA_BTL_FLAGS_RDMA_MATCHED flag set
* prohibit multiple concurrent get operations.
*
* @param btl (IN)            BTL module
* @param endpoint (IN)       BTL addressing information
* @param local_address (IN)  Local address to get into (registered)
* @param remote_address (IN) Remote address to get from (registered remotely)
* @param local_handle (IN)   Registration handle for region containing
*                            (local_address, local_address + size)
* @param remote_handle (IN)  Remote registration handle for region containing
*                            (remote_address, remote_address + size)
* @param size (IN)           Number of bytes to get
* @param flags (IN)          Flags for this get operation
* @param cbfunc (IN)         Function to call on completion (if queued)
* @param cbcontext (IN)      Context for the callback
* @param cbdata (IN)         Data for callback
*
* @retval OPAL_SUCCESS    The operation was successfully queued for a get
* @retval OPAL_ERROR      The operation was NOT successfully queued for a get
* @retval OPAL_ERR_OUT_OF_RESOURCE  Insufficient resources to queue the get
*                         operation. Try again later
* @retval OPAL_ERR_NOT_AVAILABLE  Get can not be performed due to size or
*                         alignment restrictions.
*/
typedef int (*mca_btl_base_module_get_fn_t) (struct mca_btl_base_module_t *btl,
struct mca_btl_base_endpoint_t *endpoint, void *local_address,
uint64_t remote_address, struct mca_btl_base_registration_handle_t *local_handle,
struct mca_btl_base_registration_handle_t *remote_handle, size_t size, int flags,
mca_btl_base_rdma_completion_fn_t cbfunc, void *cbcontext, void *cbdata);
/**
* Diagnostic dump of btl state.
@ -813,7 +973,13 @@ struct mca_btl_base_module_t {
uint32_t btl_latency; /**< relative ranking of latency used to prioritize btls */
uint32_t btl_bandwidth; /**< bandwidth (Mbytes/sec) supported by each endpoint */
uint32_t btl_flags; /**< flags (put/get...) */
size_t btl_seg_size; /**< size of a btl segment */
size_t btl_registration_handle_size; /**< size of the BTLs registration handles */
/* One-sided limitations (0 for no alignment, SIZE_MAX for no limit ) */
size_t btl_get_limit; /**< maximum size supported by the btl_get function */
size_t btl_get_alignment; /**< minimum alignment/size needed by btl_get (power of 2) */
size_t btl_put_limit; /**< maximum size supported by the btl_put function */
size_t btl_put_alignment; /**< minimum alignment/size needed by btl_put (power of 2) */
/* BTL function table */
mca_btl_base_module_add_procs_fn_t btl_add_procs;
@ -824,13 +990,16 @@ struct mca_btl_base_module_t {
mca_btl_base_module_alloc_fn_t btl_alloc;
mca_btl_base_module_free_fn_t btl_free;
mca_btl_base_module_prepare_fn_t btl_prepare_src;
mca_btl_base_module_prepare_fn_t btl_prepare_dst;
mca_btl_base_module_send_fn_t btl_send;
mca_btl_base_module_sendi_fn_t btl_sendi;
mca_btl_base_module_put_fn_t btl_put;
mca_btl_base_module_get_fn_t btl_get;
mca_btl_base_module_dump_fn_t btl_dump;
/* new memory registration functions */
mca_btl_base_module_register_mem_fn_t btl_register_mem; /**< memory registration function (NULL if not needed) */
mca_btl_base_module_deregister_mem_fn_t btl_deregister_mem; /**< memory deregistration function (NULL if not needed) */
/** the mpool associated with this btl (optional) */
mca_mpool_base_module_t* btl_mpool;
/** register a default error handler */

Просмотреть файл

@ -38,13 +38,17 @@
#include "btl_self_frag.h"
#include "opal/util/proc.h"
static int mca_btl_self_put (struct mca_btl_base_module_t* btl,
struct mca_btl_base_endpoint_t* endpoint,
struct mca_btl_base_descriptor_t* des);
int mca_btl_self_put (struct mca_btl_base_module_t *btl,
struct mca_btl_base_endpoint_t *endpoint, void *local_address,
uint64_t remote_address, struct mca_btl_base_registration_handle_t *local_handle,
struct mca_btl_base_registration_handle_t *remote_handle, size_t size, int flags,
mca_btl_base_rdma_completion_fn_t cbfunc, void *cbcontext, void *cbdata);
static int mca_btl_self_get (struct mca_btl_base_module_t* btl,
struct mca_btl_base_endpoint_t* endpoint,
struct mca_btl_base_descriptor_t* des);
int mca_btl_self_get (struct mca_btl_base_module_t *btl,
struct mca_btl_base_endpoint_t *endpoint, void *local_address,
uint64_t remote_address, struct mca_btl_base_registration_handle_t *local_handle,
struct mca_btl_base_registration_handle_t *remote_handle, size_t size, int flags,
mca_btl_base_rdma_completion_fn_t cbfunc, void *cbcontext, void *cbdata);
mca_btl_base_module_t mca_btl_self = {
.btl_component = &mca_btl_self_component.super,
@ -54,7 +58,6 @@ mca_btl_base_module_t mca_btl_self = {
.btl_alloc = mca_btl_self_alloc,
.btl_free = mca_btl_self_free,
.btl_prepare_src = mca_btl_self_prepare_src,
.btl_prepare_dst = mca_btl_self_prepare_dst,
.btl_send = mca_btl_self_send,
.btl_put = mca_btl_self_put,
.btl_get = mca_btl_self_get,
@ -236,39 +239,6 @@ mca_btl_self_prepare_src( struct mca_btl_base_module_t* btl,
return &frag->base;
}
/**
* Prepare data for receive.
*/
struct mca_btl_base_descriptor_t*
mca_btl_self_prepare_dst( struct mca_btl_base_module_t* btl,
struct mca_btl_base_endpoint_t* endpoint,
mca_mpool_base_registration_t* registration,
struct opal_convertor_t* convertor,
uint8_t order,
size_t reserve,
size_t* size,
uint32_t flags )
{
mca_btl_self_frag_t* frag;
size_t max_data = *size;
void *ptr;
MCA_BTL_SELF_FRAG_ALLOC_RDMA(frag);
if(OPAL_UNLIKELY(NULL == frag)) {
return NULL;
}
/* setup descriptor to point directly to user buffer */
opal_convertor_get_current_pointer( convertor, &ptr );
frag->segment.seg_addr.lval = (uint64_t)(uintptr_t) ptr;
frag->segment.seg_len = reserve + max_data;
frag->base.des_local = &frag->segment;
frag->base.des_local_count = 1;
frag->base.des_flags = flags;
return &frag->base;
}
/**
* Initiate a send to the peer.
@ -305,100 +275,31 @@ int mca_btl_self_send( struct mca_btl_base_module_t* btl,
return 1;
}
/**
* Initiate a put to the peer.
*
* @param btl (IN) BTL module
* @param peer (IN) BTL peer addressing
*/
static int mca_btl_self_rdma( struct mca_btl_base_module_t* btl,
struct mca_btl_base_endpoint_t* endpoint,
struct mca_btl_base_descriptor_t* des,
mca_btl_base_segment_t* src, size_t src_cnt,
mca_btl_base_segment_t* dst, size_t dst_cnt)
int mca_btl_self_put (struct mca_btl_base_module_t *btl,
struct mca_btl_base_endpoint_t *endpoint, void *local_address,
uint64_t remote_address, struct mca_btl_base_registration_handle_t *local_handle,
struct mca_btl_base_registration_handle_t *remote_handle, size_t size, int flags,
mca_btl_base_rdma_completion_fn_t cbfunc, void *cbcontext, void *cbdata)
{
unsigned char* src_addr = (unsigned char *)(uintptr_t) src->seg_addr.lval;
size_t src_len = src->seg_len;
unsigned char* dst_addr = (unsigned char *)(uintptr_t) dst->seg_addr.lval;
size_t dst_len = dst->seg_len;
int btl_ownership = (des->des_flags & MCA_BTL_DES_FLAGS_BTL_OWNERSHIP);
memcpy ((void *)(intptr_t) remote_address, local_address, size);
while(src_len && dst_len) {
cbfunc (btl, endpoint, local_address, NULL, cbcontext, cbdata, OPAL_SUCCESS);
if(src_len == dst_len) {
memcpy(dst_addr, src_addr, src_len);
/* advance src */
if(--src_cnt != 0) {
src++;
src_addr = (unsigned char*)src->seg_addr.pval;
src_len = src->seg_len;
} else {
src_len = 0;
}
/* advance dst */
if(--dst_cnt != 0) {
dst++;
dst_addr = (unsigned char*)dst->seg_addr.pval;
dst_len = dst->seg_len;
} else {
dst_len = 0;
}
} else {
size_t bytes = src_len < dst_len ? src_len : dst_len;
memcpy(dst_addr, src_addr, bytes);
/* advance src */
src_len -= bytes;
if(src_len == 0) {
if(--src_cnt != 0) {
src++;
src_addr = (unsigned char*)src->seg_addr.pval;
src_len = src->seg_len;
}
} else {
src_addr += bytes;
}
/* advance dst */
dst_len -= bytes;
if(dst_len == 0) {
if(--dst_cnt != 0) {
dst++;
dst_addr = (unsigned char*)src->seg_addr.pval;
dst_len = src->seg_len;
}
} else {
dst_addr += bytes;
}
}
}
/* rdma completion */
des->des_cbfunc( btl, endpoint, des, OPAL_SUCCESS );
if( btl_ownership ) {
mca_btl_self_free( btl, des );
}
return OPAL_SUCCESS;
}
static int mca_btl_self_put (struct mca_btl_base_module_t* btl,
struct mca_btl_base_endpoint_t* endpoint,
struct mca_btl_base_descriptor_t* des)
int mca_btl_self_get (struct mca_btl_base_module_t *btl,
struct mca_btl_base_endpoint_t *endpoint, void *local_address,
uint64_t remote_address, struct mca_btl_base_registration_handle_t *local_handle,
struct mca_btl_base_registration_handle_t *remote_handle, size_t size, int flags,
mca_btl_base_rdma_completion_fn_t cbfunc, void *cbcontext, void *cbdata)
{
return mca_btl_self_rdma (btl, endpoint, des, des->des_local, des->des_local_count,
des->des_remote, des->des_remote_count);
}
memcpy (local_address, (void *)(intptr_t) remote_address, size);
static int mca_btl_self_get (struct mca_btl_base_module_t *btl,
struct mca_btl_base_endpoint_t *endpoint,
struct mca_btl_base_descriptor_t *des)
{
return mca_btl_self_rdma (btl, endpoint, des, des->des_remote, des->des_remote_count,
des->des_local, des->des_local_count);
cbfunc (btl, endpoint, local_address, NULL, cbcontext, cbdata, OPAL_SUCCESS);
return OPAL_SUCCESS;
}
int mca_btl_self_ft_event(int state) {

Просмотреть файл

@ -99,7 +99,6 @@ static int mca_btl_self_component_register(void)
mca_btl_self.btl_rdma_pipeline_frag_size = INT_MAX;
mca_btl_self.btl_min_rdma_pipeline_size = 0;
mca_btl_self.btl_flags = MCA_BTL_FLAGS_PUT | MCA_BTL_FLAGS_SEND_INPLACE;
mca_btl_self.btl_seg_size = sizeof (mca_btl_base_segment_t);
mca_btl_self.btl_bandwidth = 100;
mca_btl_self.btl_latency = 0;
mca_btl_base_param_register(&mca_btl_self_component.super.btl_version,

Просмотреть файл

@ -251,7 +251,6 @@ static int sm_register(void)
mca_btl_sm.super.btl_rdma_pipeline_frag_size = 64*1024;
mca_btl_sm.super.btl_min_rdma_pipeline_size = 64*1024;
mca_btl_sm.super.btl_flags = MCA_BTL_FLAGS_SEND;
mca_btl_sm.super.btl_seg_size = sizeof (mca_btl_sm_segment_t);
mca_btl_sm.super.btl_bandwidth = 9000; /* Mbs */
mca_btl_sm.super.btl_latency = 1; /* Microsecs */

Просмотреть файл

@ -33,6 +33,7 @@
#include "opal/mca/btl/base/btl_base_error.h"
#include "opal/class/opal_hash_table.h"
#include "opal/class/ompi_free_list.h"
#include "opal/class/opal_free_list.h"
#include "opal/mca/common/ugni/common_ugni.h"
#include <errno.h>
@ -80,6 +81,11 @@ typedef struct mca_btl_ugni_module_t {
opal_mutex_t eager_get_pending_lock;
opal_list_t eager_get_pending;
opal_mutex_t pending_descriptors_lock;
opal_list_t pending_descriptors;
ompi_free_list_t post_descriptors;
mca_mpool_base_module_t *smsg_mpool;
ompi_free_list_t smsg_mboxes;
@ -143,8 +149,6 @@ typedef struct mca_btl_ugni_component_t {
/* After this message size switch to BTE protocols */
size_t ugni_fma_limit;
/* Switch to put when trying to GET at or above this size */
size_t ugni_get_limit;
/* Switch to get when sending above this size */
size_t ugni_smsg_limit;
@ -269,10 +273,13 @@ mca_btl_ugni_sendi (struct mca_btl_base_module_t *btl,
* @param endpoint (IN) BTL addressing information
* @param descriptor (IN) Description of the data to be transferred
*/
int
mca_btl_ugni_get (struct mca_btl_base_module_t *btl,
struct mca_btl_base_endpoint_t *endpoint,
struct mca_btl_base_descriptor_t *des);
int mca_btl_ugni_get (struct mca_btl_base_module_t *btl,
struct mca_btl_base_endpoint_t *endpoint,
void *local_address, uint64_t remote_address,
struct mca_btl_base_registration_handle_t *local_handle,
struct mca_btl_base_registration_handle_t *remote_handle,
size_t size, int flags, mca_btl_base_rdma_completion_fn_t cbfunc,
void *cbcontext, void *cbdata);
/**
* Initiate a put operation.
@ -283,10 +290,13 @@ mca_btl_ugni_get (struct mca_btl_base_module_t *btl,
* @param endpoint (IN) BTL addressing information
* @param descriptor (IN) Description of the data to be transferred
*/
int
mca_btl_ugni_put (struct mca_btl_base_module_t *btl,
struct mca_btl_base_endpoint_t *endpoint,
struct mca_btl_base_descriptor_t *des);
int mca_btl_ugni_put (struct mca_btl_base_module_t *btl,
struct mca_btl_base_endpoint_t *endpoint,
void *local_address, uint64_t remote_address,
struct mca_btl_base_registration_handle_t *local_handle,
struct mca_btl_base_registration_handle_t *remote_handle,
size_t size, int flags, mca_btl_base_rdma_completion_fn_t cbfunc,
void *cbcontext, void *cbdata);
int mca_btl_ugni_progress_send_wait_list (struct mca_btl_base_endpoint_t *endpoint);
@ -295,9 +305,14 @@ mca_btl_ugni_alloc(struct mca_btl_base_module_t *btl,
struct mca_btl_base_endpoint_t *endpoint,
uint8_t order, size_t size, uint32_t flags);
struct mca_btl_base_registration_handle_t {
/** uGNI memory handle */
gni_mem_handle_t gni_handle;
};
typedef struct mca_btl_ugni_reg_t {
mca_mpool_base_registration_t base;
gni_mem_handle_t memory_hdl;
mca_btl_base_registration_handle_t handle;
} mca_btl_ugni_reg_t;
/* Global structures */

Просмотреть файл

@ -188,7 +188,7 @@ static int ugni_reg_rdma_mem (void *reg_data, void *base, size_t size,
OPAL_THREAD_LOCK(&ugni_module->device->dev_lock);
rc = GNI_MemRegister (ugni_module->device->dev_handle, (uint64_t) base,
size, NULL, GNI_MEM_READWRITE | GNI_MEM_RELAXED_PI_ORDERING,
-1, &(ugni_reg->memory_hdl));
-1, &(ugni_reg->handle.gni_handle));
OPAL_THREAD_UNLOCK(&ugni_module->device->dev_lock);
if (OPAL_UNLIKELY(GNI_RC_SUCCESS != rc)) {
@ -211,7 +211,7 @@ static int ugni_reg_smsg_mem (void *reg_data, void *base, size_t size,
OPAL_THREAD_LOCK(&ugni_module->device->dev_lock);
rc = GNI_MemRegister (ugni_module->device->dev_handle, (uint64_t) base,
size, ugni_module->smsg_remote_cq, GNI_MEM_READWRITE, -1,
&(ugni_reg->memory_hdl));
&(ugni_reg->handle.gni_handle));
OPAL_THREAD_UNLOCK(&ugni_module->device->dev_lock);
return opal_common_rc_ugni_to_opal (rc);
}
@ -224,7 +224,7 @@ ugni_dereg_mem (void *reg_data, mca_mpool_base_registration_t *reg)
gni_return_t rc;
OPAL_THREAD_LOCK(&ugni_module->device->dev_lock);
rc = GNI_MemDeregister (ugni_module->device->dev_handle, &ugni_reg->memory_hdl);
rc = GNI_MemDeregister (ugni_module->device->dev_handle, &ugni_reg->handle.gni_handle);
OPAL_THREAD_UNLOCK(&ugni_module->device->dev_lock);
if (GNI_RC_SUCCESS != rc) {
return OPAL_ERROR;
@ -401,6 +401,15 @@ mca_btl_ugni_setup_mpools (mca_btl_ugni_module_t *ugni_module)
return rc;
}
rc = ompi_free_list_init_new (&ugni_module->post_descriptors,
sizeof (mca_btl_ugni_post_descriptor_t),
8, OBJ_CLASS(mca_btl_ugni_post_descriptor_t),
0, 0, 0, -1, 256, NULL);
if (OPAL_UNLIKELY(OPAL_SUCCESS != rc)) {
BTL_ERROR(("error creating post descriptor free list"));
return rc;
}
return OPAL_SUCCESS;
}

Просмотреть файл

@ -52,6 +52,7 @@ static int
btl_ugni_component_register(void)
{
mca_base_var_enum_t *new_enum;
gni_nic_device_t device_type;
int rc;
(void) mca_base_var_group_component_register(&mca_btl_ugni_component.super.btl_version,
@ -139,15 +140,6 @@ btl_ugni_component_register(void)
OPAL_INFO_LVL_5, MCA_BASE_VAR_SCOPE_LOCAL,
&mca_btl_ugni_component.ugni_fma_limit);
mca_btl_ugni_component.ugni_get_limit = 1 * 1024 * 1024;
(void) mca_base_component_var_register(&mca_btl_ugni_component.super.btl_version,
"get_limit", "Maximum size message that "
"will be sent using a get protocol "
"(default 1M)", MCA_BASE_VAR_TYPE_INT,
NULL, 0, MCA_BASE_VAR_FLAG_SETTABLE,
OPAL_INFO_LVL_5, MCA_BASE_VAR_SCOPE_LOCAL,
&mca_btl_ugni_component.ugni_get_limit);
mca_btl_ugni_component.rdma_max_retries = 16;
(void) mca_base_component_var_register(&mca_btl_ugni_component.super.btl_version,
"rdma_max_retries", NULL, MCA_BASE_VAR_TYPE_INT,
@ -212,13 +204,24 @@ btl_ugni_component_register(void)
mca_btl_ugni_module.super.btl_max_send_size = 8 * 1024;
mca_btl_ugni_module.super.btl_rdma_pipeline_send_length = 8 * 1024;
mca_btl_ugni_module.super.btl_get_limit = 1 * 1024 * 1024;
/* determine if there are get alignment restrictions */
GNI_GetDeviceType (&device_type);
if (GNI_DEVICE_GEMINI == device_type) {
mca_btl_ugni_module.super.btl_get_alignment = 4;
} else {
mca_btl_ugni_module.super.btl_get_alignment = 0;
}
/* threshold for put */
mca_btl_ugni_module.super.btl_min_rdma_pipeline_size = 8 * 1024;
mca_btl_ugni_module.super.btl_flags = MCA_BTL_FLAGS_SEND |
MCA_BTL_FLAGS_RDMA | MCA_BTL_FLAGS_SEND_INPLACE;
mca_btl_ugni_module.super.btl_seg_size = sizeof (mca_btl_ugni_segment_t);
mca_btl_ugni_module.super.btl_registration_handle_size = sizeof (mca_btl_base_registration_handle_t);
mca_btl_ugni_module.super.btl_bandwidth = 40000; /* Mbs */
mca_btl_ugni_module.super.btl_latency = 2; /* Microsecs */
@ -425,89 +428,107 @@ mca_btl_ugni_progress_datagram (mca_btl_ugni_module_t *ugni_module)
return count;
}
static inline int
mca_btl_ugni_progress_rdma (mca_btl_ugni_module_t *ugni_module, int which_cq)
#if OPAL_ENABLE_DEBUG
static inline void btl_ugni_dump_post_desc (mca_btl_ugni_post_descriptor_t *desc)
{
opal_common_ugni_post_desc_t *desc;
mca_btl_ugni_base_frag_t *frag;
gni_cq_entry_t event_data = 0;
uint32_t recoverable = 1;
gni_return_t rc;
gni_cq_handle_t the_cq;
the_cq = (which_cq == 0) ? ugni_module->rdma_local_cq : ugni_module->rdma_local_irq_cq;
fprintf (stderr, "desc->desc.base.post_id = %" PRIx64 "\n", desc->desc.base.post_id);
fprintf (stderr, "desc->desc.base.status = %" PRIx64 "\n", desc->desc.base.status);
fprintf (stderr, "desc->desc.base.cq_mode_complete = %hu\n", desc->desc.base.cq_mode_complete);
fprintf (stderr, "desc->desc.base.type = %d\n", desc->desc.base.type);
fprintf (stderr, "desc->desc.base.cq_mode = %hu\n", desc->desc.base.cq_mode);
fprintf (stderr, "desc->desc.base.dlvr_mode = %hu\n", desc->desc.base.dlvr_mode);
fprintf (stderr, "desc->desc.base.local_addr = %" PRIx64 "\n", desc->desc.base.local_addr);
fprintf (stderr, "desc->desc.base.local_mem_hndl = {%" PRIx64 ", %" PRIx64 "}\n", desc->desc.base.local_mem_hndl.qword1,
desc->desc.base.local_mem_hndl.qword2);
fprintf (stderr, "desc->desc.base.remote_addr = %" PRIx64 "\n", desc->desc.base.remote_addr);
fprintf (stderr, "desc->desc.base.remote_mem_hndl = {%" PRIx64 ", %" PRIx64 "}\n", desc->desc.base.remote_mem_hndl.qword1,
desc->desc.base.remote_mem_hndl.qword2);
fprintf (stderr, "desc->desc.base.length = %" PRIu64 "\n", desc->desc.base.length);
fprintf (stderr, "desc->desc.base.rdma_mode = %hu\n", desc->desc.base.rdma_mode);
fprintf (stderr, "desc->desc.base.amo_cmd = %d\n", desc->desc.base.amo_cmd);
}
#endif
static inline int mca_btl_ugni_progress_rdma (mca_btl_ugni_module_t *ugni_module, int which_cq)
{
mca_btl_ugni_post_descriptor_t *post_desc = NULL;
gni_cq_entry_t event_data = 0;
gni_post_descriptor_t *desc;
uint32_t recoverable = 1;
gni_return_t grc;
OPAL_THREAD_LOCK(&ugni_module->device->dev_lock);
rc = GNI_CqGetEvent (the_cq, &event_data);
if (GNI_RC_NOT_DONE == rc) {
grc = GNI_CqGetEvent (ugni_module->rdma_local_cq, &event_data);
if (GNI_RC_NOT_DONE == grc) {
OPAL_THREAD_UNLOCK(&ugni_module->device->dev_lock);
return 0;
}
if (OPAL_UNLIKELY((GNI_RC_SUCCESS != rc && !event_data) || GNI_CQ_OVERRUN(event_data))) {
if (OPAL_UNLIKELY((GNI_RC_SUCCESS != grc && !event_data) || GNI_CQ_OVERRUN(event_data))) {
/* TODO -- need to handle overrun -- how do we do this without an event?
will the event eventually come back? Ask Cray */
BTL_ERROR(("unhandled post error! ugni rc = %d %s", rc,gni_err_str[rc]));
BTL_ERROR(("unhandled post error! ugni rc = %d %s", grc, gni_err_str[grc]));
OPAL_THREAD_UNLOCK(&ugni_module->device->dev_lock);
return opal_common_rc_ugni_to_opal (rc);
return opal_common_rc_ugni_to_opal (grc);
}
rc = GNI_GetCompleted (the_cq, event_data, (gni_post_descriptor_t **) &desc);
grc = GNI_GetCompleted (ugni_module->rdma_local_cq, event_data, &desc);
OPAL_THREAD_UNLOCK(&ugni_module->device->dev_lock);
if (OPAL_UNLIKELY(GNI_RC_SUCCESS != rc && GNI_RC_TRANSACTION_ERROR != rc)) {
BTL_ERROR(("Error in GNI_GetComplete %s", gni_err_str[rc]));
return opal_common_rc_ugni_to_opal (rc);
if (OPAL_UNLIKELY(GNI_RC_SUCCESS != grc && GNI_RC_TRANSACTION_ERROR != grc)) {
BTL_ERROR(("Error in GNI_GetComplete %s", gni_err_str[grc]));
return opal_common_rc_ugni_to_opal (grc);
}
frag = MCA_BTL_UGNI_DESC_TO_FRAG(desc);
if (OPAL_UNLIKELY(GNI_RC_SUCCESS != rc || !GNI_CQ_STATUS_OK(event_data))) {
char buffer[1024];
post_desc = MCA_BTL_UGNI_DESC_TO_PDESC(desc);
if (OPAL_UNLIKELY(GNI_RC_SUCCESS != grc || !GNI_CQ_STATUS_OK(event_data))) {
(void) GNI_CqErrorRecoverable (event_data, &recoverable);
GNI_CqErrorStr(event_data,buffer,sizeof(buffer));
if (OPAL_UNLIKELY(++frag->post_desc.tries >= mca_btl_ugni_component.rdma_max_retries ||
if (OPAL_UNLIKELY(++post_desc->desc.tries >= mca_btl_ugni_component.rdma_max_retries ||
!recoverable)) {
char char_buffer[1024];
GNI_CqErrorStr (event_data, char_buffer, 1024);
/* give up */
BTL_ERROR(("giving up on frag %p type %d CQE error %s", (void *) frag, frag->post_desc.base.type, buffer));
mca_btl_ugni_frag_complete (frag, OPAL_ERROR);
BTL_ERROR(("giving up on desciptor %p, recoverable %d: %s", (void *) post_desc,
recoverable, char_buffer));
#if OPAL_ENABLE_DEBUG
btl_ugni_dump_post_desc (post_desc);
#endif
mca_btl_ugni_post_desc_complete (ugni_module, post_desc, OPAL_ERROR);
return OPAL_ERROR;
}
/* repost transaction */
mca_btl_ugni_repost (frag);
mca_btl_ugni_repost (ugni_module, post_desc);
return 0;
}
BTL_VERBOSE(("RDMA/FMA complete for frag %p", (void *) frag));
mca_btl_ugni_frag_complete (frag, opal_common_rc_ugni_to_opal (rc));
mca_btl_ugni_post_desc_complete (ugni_module, post_desc, opal_common_rc_ugni_to_opal (grc));
return 1;
}
static inline int
mca_btl_ugni_retry_failed (mca_btl_ugni_module_t *ugni_module)
mca_btl_ugni_post_pending (mca_btl_ugni_module_t *ugni_module)
{
int count = opal_list_get_size (&ugni_module->failed_frags);
int count = opal_list_get_size (&ugni_module->pending_descriptors);
int i;
for (i = 0 ; i < count ; ++i) {
OPAL_THREAD_LOCK(&ugni_module->failed_frags_lock);
mca_btl_ugni_base_frag_t *frag =
(mca_btl_ugni_base_frag_t *) opal_list_remove_first (&ugni_module->failed_frags);
OPAL_THREAD_UNLOCK(&ugni_module->failed_frags_lock);
if (NULL == frag) {
OPAL_THREAD_LOCK(&ugni_module->pending_descriptors_lock);
mca_btl_ugni_post_descriptor_t *post_desc =
(mca_btl_ugni_post_descriptor_t *) opal_list_remove_first (&ugni_module->pending_descriptors);
OPAL_THREAD_UNLOCK(&ugni_module->pending_descriptors_lock);
if (OPAL_SUCCESS != mca_btl_ugni_repost (ugni_module, post_desc)) {
break;
}
mca_btl_ugni_repost (frag);
}
return count;
return i;
}
static inline int
@ -557,7 +578,6 @@ static int mca_btl_ugni_component_progress (void)
for (i = 0 ; i < mca_btl_ugni_component.ugni_num_btls ; ++i) {
ugni_module = mca_btl_ugni_component.modules + i;
mca_btl_ugni_retry_failed (ugni_module);
mca_btl_ugni_progress_wait_list (ugni_module);
count += mca_btl_ugni_progress_datagram (ugni_module);
@ -565,6 +585,8 @@ static int mca_btl_ugni_component_progress (void)
count += mca_btl_ugni_progress_remote_smsg (ugni_module);
count += mca_btl_ugni_progress_rdma (ugni_module, 0);
/* post pending after progressing rdma */
mca_btl_ugni_post_pending (ugni_module);
}
return count;

Просмотреть файл

@ -16,7 +16,7 @@
static inline void mca_btl_ugni_base_frag_constructor (mca_btl_ugni_base_frag_t *frag)
{
memset ((char *) frag + sizeof (frag->base), 0, sizeof (*frag) - sizeof (frag->base));
frag->segments[0].base.seg_addr.pval = frag->base.super.ptr;
frag->segments[0].seg_addr.pval = frag->base.super.ptr;
}
static inline void mca_btl_ugni_eager_frag_constructor (mca_btl_ugni_base_frag_t *frag)
@ -26,7 +26,7 @@ static inline void mca_btl_ugni_eager_frag_constructor (mca_btl_ugni_base_frag_t
mca_btl_ugni_base_frag_constructor (frag);
frag->segments[0].memory_handle = reg->memory_hdl;
frag->memory_handle = reg->handle;
}
OBJ_CLASS_INSTANCE(mca_btl_ugni_smsg_frag_t, mca_btl_base_descriptor_t,
@ -38,6 +38,9 @@ OBJ_CLASS_INSTANCE(mca_btl_ugni_rdma_frag_t, mca_btl_base_descriptor_t,
OBJ_CLASS_INSTANCE(mca_btl_ugni_eager_frag_t, mca_btl_base_descriptor_t,
mca_btl_ugni_eager_frag_constructor, NULL);
OBJ_CLASS_INSTANCE(mca_btl_ugni_post_descriptor_t, ompi_free_list_item_t,
NULL, NULL);
void mca_btl_ugni_frag_init (mca_btl_ugni_base_frag_t *frag, mca_btl_ugni_module_t *ugni_module)
{
frag->msg_id = opal_pointer_array_add (&ugni_module->pending_smsg_frags_bb, (void *) frag);

Просмотреть файл

@ -1,6 +1,6 @@
/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */
/*
* Copyright (c) 2011-2013 Los Alamos National Security, LLC. All rights
* Copyright (c) 2011-2014 Los Alamos National Security, LLC. All rights
* reserved.
* Copyright (c) 2011 UT-Battelle, LLC. All rights reserved.
* Copyright (c) 2013 The University of Tennessee and The University
@ -19,13 +19,6 @@
#include "btl_ugni.h"
#include "btl_ugni_endpoint.h"
typedef struct mca_btl_ugni_segment_t {
mca_btl_base_segment_t base;
gni_mem_handle_t memory_handle;
uint8_t extra_bytes[3];
uint8_t extra_byte_count;
} mca_btl_ugni_segment_t;
typedef struct mca_btl_ugni_send_frag_hdr_t {
uint32_t lag;
} mca_btl_ugni_send_frag_hdr_t;
@ -41,7 +34,9 @@ typedef struct mca_btl_ugni_rdma_frag_hdr_t {
typedef struct mca_btl_ugni_eager_frag_hdr_t {
mca_btl_ugni_send_frag_hdr_t send;
mca_btl_ugni_segment_t src_seg;
uint32_t size;
uint64_t address;
mca_btl_base_registration_handle_t memory_handle;
void *ctx;
} mca_btl_ugni_eager_frag_hdr_t;
@ -59,28 +54,28 @@ typedef union mca_btl_ugni_frag_hdr_t {
} mca_btl_ugni_frag_hdr_t;
enum {
MCA_BTL_UGNI_FRAG_BUFFERED = 1, /* frag data is buffered */
MCA_BTL_UGNI_FRAG_COMPLETE = 2, /* smsg complete for frag */
MCA_BTL_UGNI_FRAG_EAGER = 4, /* eager get frag */
MCA_BTL_UGNI_FRAG_IGNORE = 8, /* ignore local smsg completion */
MCA_BTL_UGNI_FRAG_SMSG_COMPLETE = 16 /* SMSG has completed for this message */
MCA_BTL_UGNI_FRAG_BUFFERED = 1, /* frag data is buffered */
MCA_BTL_UGNI_FRAG_COMPLETE = 2, /* smsg complete for frag */
MCA_BTL_UGNI_FRAG_EAGER = 4, /* eager get frag */
MCA_BTL_UGNI_FRAG_IGNORE = 8, /* ignore local smsg completion */
MCA_BTL_UGNI_FRAG_SMSG_COMPLETE = 16, /* SMSG has completed for this message */
MCA_BTL_UGNI_FRAG_RESPONSE = 32,
};
struct mca_btl_ugni_base_frag_t;
typedef void (*frag_cb_t) (struct mca_btl_ugni_base_frag_t *, int);
typedef struct mca_btl_ugni_base_frag_t {
mca_btl_base_descriptor_t base;
uint32_t msg_id;
uint16_t hdr_size;
uint16_t flags;
mca_btl_ugni_frag_hdr_t hdr;
mca_btl_ugni_segment_t segments[2];
mca_btl_base_segment_t segments[2];
opal_common_ugni_post_desc_t post_desc;
mca_btl_base_endpoint_t *endpoint;
mca_btl_ugni_reg_t *registration;
ompi_free_list_t *my_list;
mca_btl_base_registration_handle_t memory_handle;
} mca_btl_ugni_base_frag_t;
typedef struct mca_btl_ugni_base_frag_t mca_btl_ugni_smsg_frag_t;
@ -90,6 +85,56 @@ typedef struct mca_btl_ugni_base_frag_t mca_btl_ugni_eager_frag_t;
#define MCA_BTL_UGNI_DESC_TO_FRAG(desc) \
((mca_btl_ugni_base_frag_t *)((uintptr_t) (desc) - offsetof (mca_btl_ugni_base_frag_t, post_desc)))
typedef struct mca_btl_ugni_post_descriptor_t {
ompi_free_list_item_t super;
opal_common_ugni_post_desc_t desc;
mca_btl_base_endpoint_t *endpoint;
mca_btl_base_registration_handle_t *local_handle;
mca_btl_base_rdma_completion_fn_t cbfunc;
void *cbdata;
void *ctx;
} mca_btl_ugni_post_descriptor_t;
OBJ_CLASS_DECLARATION(mca_btl_ugni_post_descriptor_t);
#define MCA_BTL_UGNI_DESC_TO_PDESC(desc) \
((mca_btl_ugni_post_descriptor_t *)((uintptr_t) (desc) - offsetof (mca_btl_ugni_post_descriptor_t, desc)))
static inline void mca_btl_ugni_alloc_post_descriptor (mca_btl_base_endpoint_t *endpoint, mca_btl_base_registration_handle_t *local_handle,
mca_btl_base_rdma_completion_fn_t cbfunc, void *cbcontext, void *cbdata,
mca_btl_ugni_post_descriptor_t **desc)
{
ompi_free_list_item_t *item = NULL;
OMPI_FREE_LIST_GET_MT(&endpoint->btl->post_descriptors, item);
*desc = (mca_btl_ugni_post_descriptor_t *) item;
if (NULL != item) {
(*desc)->cbfunc = cbfunc;
(*desc)->ctx = cbcontext;
(*desc)->cbdata = cbdata;
(*desc)->local_handle = local_handle;
(*desc)->endpoint = endpoint;
}
}
static inline void mca_btl_ugni_return_post_descriptor (mca_btl_ugni_module_t *module,
mca_btl_ugni_post_descriptor_t *desc)
{
OMPI_FREE_LIST_RETURN_MT(&module->post_descriptors, &desc->super);
}
static inline void mca_btl_ugni_post_desc_complete (mca_btl_ugni_module_t *module, mca_btl_ugni_post_descriptor_t *desc, int rc)
{
BTL_VERBOSE(("RDMA/FMA/ATOMIC operation complete for post descriptor %p. rc = %d", (void *) desc, rc));
/* call the user's callback function */
desc->cbfunc (&module->super, desc->endpoint, (void *)(intptr_t) desc->desc.base.local_addr,
desc->local_handle, desc->ctx, desc->cbdata, rc);
/* the descriptor is no longer needed */
mca_btl_ugni_return_post_descriptor (module, desc);
}
OBJ_CLASS_DECLARATION(mca_btl_ugni_smsg_frag_t);
OBJ_CLASS_DECLARATION(mca_btl_ugni_rdma_frag_t);
OBJ_CLASS_DECLARATION(mca_btl_ugni_eager_frag_t);

Просмотреть файл

@ -13,44 +13,34 @@
#include "btl_ugni_rdma.h"
#include "btl_ugni_smsg.h"
/**
* Initiate a get operation.
*
* @param btl (IN) BTL module
* @param endpoint (IN) BTL addressing information
* @param descriptor (IN) Description of the data to be transferred
*/
int mca_btl_ugni_get (struct mca_btl_base_module_t *btl,
struct mca_btl_base_endpoint_t *endpoint,
struct mca_btl_base_descriptor_t *des) {
mca_btl_ugni_base_frag_t *frag = (mca_btl_ugni_base_frag_t *) des;
mca_btl_ugni_segment_t *src_seg = (mca_btl_ugni_segment_t *) des->des_remote;
mca_btl_ugni_segment_t *dst_seg = (mca_btl_ugni_segment_t *) des->des_local;
size_t size = src_seg->base.seg_len - src_seg->extra_byte_count;
void *local_address, uint64_t remote_address,
struct mca_btl_base_registration_handle_t *local_handle,
struct mca_btl_base_registration_handle_t *remote_handle,
size_t size, int flags, mca_btl_base_rdma_completion_fn_t cbfunc,
void *cbcontext, void *cbdata)
{
bool check;
BTL_VERBOSE(("Using RDMA/FMA Get"));
/* Check if the get is aligned/sized on a multiple of 4 */
check = !!((remote_address | (uint64_t)(intptr_t) local_address | size) & (mca_btl_ugni_module.super.btl_get_alignment - 1));
if (OPAL_UNLIKELY(check || size > mca_btl_ugni_module.super.btl_get_limit)) {
BTL_VERBOSE(("RDMA/FMA Get not available due to size or alignment restrictions"));
/* notify the caller that get is not available */
return OPAL_ERR_NOT_AVAILABLE;
}
BTL_VERBOSE(("Using RDMA/FMA Get from local address %p to remote address %" PRIx64,
local_address, remote_address));
/* cause endpoint to bind if it isn't already (bind is sufficient for rdma) */
(void) mca_btl_ugni_check_endpoint_state(endpoint);
/* Check if the get is aligned/sized on a multiple of 4 */
check = !!((des->des_remote->seg_addr.lval | des->des_local->seg_addr.lval | size) & 3);
if (OPAL_UNLIKELY(check || size > mca_btl_ugni_component.ugni_get_limit)) {
/* switch to put */
return OPAL_ERR_NOT_AVAILABLE;
}
if (src_seg->extra_byte_count) {
memmove ((char *) dst_seg->base.seg_addr.pval + size, src_seg->extra_bytes, src_seg->extra_byte_count);
src_seg->base.seg_len = size;
dst_seg->base.seg_len = size;
}
des->des_flags |= MCA_BTL_DES_SEND_ALWAYS_CALLBACK;
return mca_btl_ugni_post (frag, true, dst_seg, src_seg);
return mca_btl_ugni_post (endpoint, true, size, local_address, remote_address, local_handle,
remote_handle, cbfunc, cbcontext, cbdata);
}
/* eager get */
@ -60,6 +50,8 @@ static void mca_btl_ugni_callback_eager_get_progress_pending (struct mca_btl_bas
mca_btl_ugni_module_t *ugni_module = (mca_btl_ugni_module_t *) btl;
mca_btl_ugni_base_frag_t *pending_frag, *frag = (mca_btl_ugni_base_frag_t *) desc;
memset (&frag->hdr, 0, sizeof (frag->hdr));
OPAL_THREAD_LOCK(&ugni_module->eager_get_pending_lock);
pending_frag = (mca_btl_ugni_base_frag_t *) opal_list_remove_first (&ugni_module->eager_get_pending);
OPAL_THREAD_UNLOCK(&ugni_module->eager_get_pending_lock);
@ -68,6 +60,8 @@ static void mca_btl_ugni_callback_eager_get_progress_pending (struct mca_btl_bas
/* copy the relevant data out of the pending fragment */
frag->endpoint = pending_frag->endpoint;
assert (frag != pending_frag);
/* start the next eager get using this fragment */
(void) mca_btl_ugni_start_eager_get (frag->endpoint, pending_frag->hdr.eager_ex, frag);
@ -80,19 +74,21 @@ static void mca_btl_ugni_callback_eager_get_progress_pending (struct mca_btl_bas
}
static void mca_btl_ugni_callback_eager_get (struct mca_btl_base_module_t *btl, struct mca_btl_base_endpoint_t *endpoint,
struct mca_btl_base_descriptor_t *desc, int rc)
void *local_address, mca_btl_base_registration_handle_t *local_handle,
void *context, void *cbdata, int status)
{
mca_btl_ugni_module_t *ugni_module = (mca_btl_ugni_module_t *) btl;
mca_btl_ugni_base_frag_t *frag = (mca_btl_ugni_base_frag_t *) desc;
mca_btl_ugni_base_frag_t *frag = (mca_btl_ugni_base_frag_t *) context;
uint32_t len = frag->hdr.eager.send.lag & 0x00ffffff;
uint8_t tag = frag->hdr.eager.send.lag >> 24;
size_t payload_len = frag->hdr.eager.src_seg.base.seg_len;
size_t payload_len = frag->hdr.eager.size;
size_t hdr_len = len - payload_len;
mca_btl_active_message_callback_t *reg;
mca_btl_base_segment_t segs[2];
mca_btl_ugni_base_frag_t tmp;
int rc;
BTL_VERBOSE(("eager get for rem_ctx %p complete", frag->hdr.eager.ctx));
BTL_VERBOSE(("eager get for rem_ctx %p complete", frag->hdr.eager.ctx))
tmp.base.des_local = segs;
if (hdr_len) {
@ -100,19 +96,21 @@ static void mca_btl_ugni_callback_eager_get (struct mca_btl_base_module_t *btl,
segs[0].seg_addr.pval = frag->hdr.eager_ex.pml_header;
segs[0].seg_len = hdr_len;
segs[1].seg_addr.pval = frag->segments[0].base.seg_addr.pval;
segs[1].seg_addr.pval = local_address;
segs[1].seg_len = payload_len;
} else {
tmp.base.des_local_count = 1;
segs[0].seg_addr.pval = frag->segments[0].base.seg_addr.pval;
segs[0].seg_addr.pval = local_address;
segs[0].seg_len = payload_len;
}
reg = mca_btl_base_active_message_trigger + tag;
reg->cbfunc(&frag->endpoint->btl->super, tag, &(tmp.base), reg->cbdata);
/* fill in the response header */
frag->hdr.rdma.ctx = frag->hdr.eager.ctx;
frag->flags = MCA_BTL_UGNI_FRAG_RESPONSE;
/* once complete use this fragment for a pending eager get if any exist */
frag->base.des_cbfunc = mca_btl_ugni_callback_eager_get_progress_pending;
@ -122,6 +120,7 @@ static void mca_btl_ugni_callback_eager_get (struct mca_btl_base_module_t *btl,
NULL, 0, MCA_BTL_UGNI_TAG_RDMA_COMPLETE);
if (OPAL_UNLIKELY(0 > rc)) {
/* queue fragment */
OPAL_THREAD_LOCK(&endpoint->lock);
if (false == endpoint->wait_listed) {
OPAL_THREAD_LOCK(&ugni_module->ep_wait_list_lock);
opal_list_append (&ugni_module->ep_wait_list, &endpoint->super);
@ -129,50 +128,50 @@ static void mca_btl_ugni_callback_eager_get (struct mca_btl_base_module_t *btl,
endpoint->wait_listed = true;
}
OPAL_THREAD_LOCK(&endpoint->lock);
opal_list_append (&endpoint->frag_wait_list, (opal_list_item_t *) frag);
OPAL_THREAD_UNLOCK(&endpoint->lock);
}
}
int mca_btl_ugni_start_eager_get (mca_btl_base_endpoint_t *ep,
int mca_btl_ugni_start_eager_get (mca_btl_base_endpoint_t *endpoint,
mca_btl_ugni_eager_ex_frag_hdr_t hdr,
mca_btl_ugni_base_frag_t *frag)
{
mca_btl_ugni_module_t *ugni_module = ep->btl;
mca_btl_ugni_module_t *ugni_module = endpoint->btl;
size_t size;
int rc;
BTL_VERBOSE(("starting eager get for remote ctx: %p", hdr.eager.ctx));
do {
if (NULL == frag) {
rc = MCA_BTL_UGNI_FRAG_ALLOC_EAGER_RECV(ep, frag);
/* try to allocate a registered buffer */
rc = MCA_BTL_UGNI_FRAG_ALLOC_EAGER_RECV(endpoint, frag);
if (OPAL_UNLIKELY(NULL == frag)) {
(void) MCA_BTL_UGNI_FRAG_ALLOC_RDMA_INT(ep, frag);
/* no registered buffers available. try again later */
(void) MCA_BTL_UGNI_FRAG_ALLOC_RDMA_INT(endpoint, frag);
/* not much can be done if a small fragment can not be allocated. abort! */
assert (NULL != frag);
frag->hdr.eager_ex = hdr;
break;
}
}
frag->hdr.eager_ex = hdr;
frag->flags = 0;
frag->base.des_flags = 0;
frag->hdr.eager_ex = hdr;
frag->segments[1] = hdr.eager.src_seg;
/* increase size to a multiple of 4 bytes (required for get) */
frag->segments[0].base.seg_len = frag->segments[1].base.seg_len =
(hdr.eager.src_seg.base.seg_len + 3) & ~3;
frag->base.des_local = &frag->segments[1].base;
/* increase size to a multiple of 4 bytes (required for get on Gemini) */
size = (hdr.eager.size + 3) & ~3;
/* set up callback for get completion */
frag->base.des_flags = MCA_BTL_DES_SEND_ALWAYS_CALLBACK;
frag->base.des_cbfunc = mca_btl_ugni_callback_eager_get;
rc = mca_btl_ugni_post (frag, GNI_POST_RDMA_GET, frag->segments, frag->segments + 1);
/* start the get */
rc = mca_btl_ugni_post (endpoint, true, size, frag->base.super.ptr, hdr.eager.address,
&frag->memory_handle, &hdr.eager.memory_handle,
mca_btl_ugni_callback_eager_get, frag, NULL);
if (OPAL_UNLIKELY(OPAL_SUCCESS == rc)) {
return OPAL_SUCCESS;
}

Просмотреть файл

@ -27,13 +27,6 @@ mca_btl_ugni_free (struct mca_btl_base_module_t *btl,
static int
mca_btl_ugni_module_finalize (struct mca_btl_base_module_t* btl);
static mca_btl_base_descriptor_t *
mca_btl_ugni_prepare_dst (mca_btl_base_module_t *btl,
mca_btl_base_endpoint_t *endpoint,
mca_mpool_base_registration_t *registration,
opal_convertor_t *convertor, uint8_t order,
size_t reserve, size_t *size, uint32_t flags);
static struct mca_btl_base_descriptor_t *
mca_btl_ugni_prepare_src (struct mca_btl_base_module_t *btl,
struct mca_btl_base_endpoint_t *endpoint,
@ -42,20 +35,27 @@ mca_btl_ugni_prepare_src (struct mca_btl_base_module_t *btl,
uint8_t order, size_t reserve, size_t *size,
uint32_t flags);
static mca_btl_base_registration_handle_t *
mca_btl_ugni_register_mem (mca_btl_base_module_t *btl, mca_btl_base_endpoint_t *endpoint, void *base,
size_t size, uint32_t flags);
static int mca_btl_ugni_deregister_mem (mca_btl_base_module_t *btl, mca_btl_base_registration_handle_t *handle);
mca_btl_ugni_module_t mca_btl_ugni_module = {
.super = {
.btl_component = &mca_btl_ugni_component.super,
.btl_add_procs = mca_btl_ugni_add_procs,
.btl_del_procs = mca_btl_ugni_del_procs,
.btl_finalize = mca_btl_ugni_module_finalize,
.btl_alloc = mca_btl_ugni_alloc,
.btl_free = mca_btl_ugni_free,
.btl_prepare_src = mca_btl_ugni_prepare_src,
.btl_prepare_dst = mca_btl_ugni_prepare_dst,
.btl_send = mca_btl_ugni_send,
.btl_sendi = mca_btl_ugni_sendi,
.btl_put = mca_btl_ugni_put,
.btl_get = mca_btl_ugni_get,
.btl_component = &mca_btl_ugni_component.super,
.btl_add_procs = mca_btl_ugni_add_procs,
.btl_del_procs = mca_btl_ugni_del_procs,
.btl_finalize = mca_btl_ugni_module_finalize,
.btl_alloc = mca_btl_ugni_alloc,
.btl_free = mca_btl_ugni_free,
.btl_prepare_src = mca_btl_ugni_prepare_src,
.btl_send = mca_btl_ugni_send,
.btl_sendi = mca_btl_ugni_sendi,
.btl_put = mca_btl_ugni_put,
.btl_get = mca_btl_ugni_get,
.btl_register_mem = mca_btl_ugni_register_mem,
.btl_deregister_mem = mca_btl_ugni_deregister_mem,
}
};
@ -92,6 +92,9 @@ mca_btl_ugni_module_init (mca_btl_ugni_module_t *ugni_module,
OBJ_CONSTRUCT(&ugni_module->endpoints, opal_pointer_array_t);
OBJ_CONSTRUCT(&ugni_module->id_to_endpoint, opal_hash_table_t);
OBJ_CONSTRUCT(&ugni_module->smsg_mboxes, ompi_free_list_t);
OBJ_CONSTRUCT(&ugni_module->pending_descriptors, opal_list_t);
OBJ_CONSTRUCT(&ugni_module->eager_get_pending, opal_list_t);
OBJ_CONSTRUCT(&ugni_module->post_descriptors, ompi_free_list_t);
ugni_module->device = dev;
dev->btl_ctx = (void *) ugni_module;
@ -188,7 +191,6 @@ mca_btl_ugni_module_finalize (struct mca_btl_base_module_t *btl)
OBJ_DESTRUCT(&ugni_module->pending_smsg_frags_bb);
OBJ_DESTRUCT(&ugni_module->id_to_endpoint);
OBJ_DESTRUCT(&ugni_module->endpoints);
OBJ_DESTRUCT(&ugni_module->failed_frags);
OBJ_DESTRUCT(&ugni_module->eager_get_pending);
OBJ_DESTRUCT(&ugni_module->eager_get_pending_lock);
@ -234,13 +236,13 @@ mca_btl_ugni_alloc(struct mca_btl_base_module_t *btl,
frag->base.des_flags = flags;
frag->base.order = order;
frag->base.des_local = &frag->segments[1].base;
frag->base.des_local = &frag->segments[1];
frag->base.des_local_count = 1;
frag->segments[0].base.seg_addr.pval = NULL;
frag->segments[0].base.seg_len = 0;
frag->segments[1].base.seg_addr.pval = frag->base.super.ptr;
frag->segments[1].base.seg_len = size;
frag->segments[0].seg_addr.pval = NULL;
frag->segments[0].seg_len = 0;
frag->segments[1].seg_addr.pval = frag->base.super.ptr;
frag->segments[1].seg_len = size;
frag->flags = MCA_BTL_UGNI_FRAG_BUFFERED;
if (size > mca_btl_ugni_component.smsg_max_data) {
@ -251,7 +253,7 @@ mca_btl_ugni_alloc(struct mca_btl_base_module_t *btl,
registration = (mca_btl_ugni_reg_t *) frag->base.super.registration;
frag->segments[1].memory_handle = registration->memory_hdl;
frag->hdr.eager.memory_handle = registration->handle;
} else {
frag->hdr_size = sizeof (frag->hdr.send);
}
@ -274,54 +276,32 @@ mca_btl_ugni_prepare_src (struct mca_btl_base_module_t *btl,
uint8_t order, size_t reserve, size_t *size,
uint32_t flags)
{
if (OPAL_LIKELY(reserve)) {
return mca_btl_ugni_prepare_src_send (btl, endpoint, convertor,
order, reserve, size, flags);
} else {
return mca_btl_ugni_prepare_src_rdma (btl, endpoint, registration,
convertor, order, size, flags);
}
return mca_btl_ugni_prepare_src_send (btl, endpoint, convertor,
order, reserve, size, flags);
}
static mca_btl_base_descriptor_t *
mca_btl_ugni_prepare_dst (mca_btl_base_module_t *btl,
mca_btl_base_endpoint_t *endpoint,
mca_mpool_base_registration_t *registration,
opal_convertor_t *convertor, uint8_t order,
size_t reserve, size_t *size, uint32_t flags)
static mca_btl_base_registration_handle_t *
mca_btl_ugni_register_mem (mca_btl_base_module_t *btl, mca_btl_base_endpoint_t *endpoint, void *base,
size_t size, uint32_t flags)
{
mca_btl_ugni_base_frag_t *frag;
void *data_ptr;
mca_btl_ugni_reg_t *reg;
int rc;
opal_convertor_get_current_pointer (convertor, &data_ptr);
(void) MCA_BTL_UGNI_FRAG_ALLOC_RDMA(endpoint, frag);
if (OPAL_UNLIKELY(NULL == frag)) {
rc = btl->btl_mpool->mpool_register(btl->btl_mpool, base, size, 0,
(mca_mpool_base_registration_t **) &reg);
if (OPAL_UNLIKELY(OPAL_SUCCESS != rc)) {
return NULL;
}
/* always need to register the buffer for put/get (even for fma) */
if (NULL == registration) {
rc = btl->btl_mpool->mpool_register(btl->btl_mpool,
data_ptr, *size, 0,
&registration);
if (OPAL_UNLIKELY(OPAL_SUCCESS != rc)) {
mca_btl_ugni_frag_return (frag);
return NULL;
}
frag->registration = (mca_btl_ugni_reg_t*) registration;
}
frag->segments[0].memory_handle = ((mca_btl_ugni_reg_t *)registration)->memory_hdl;
frag->segments[0].base.seg_len = *size;
frag->segments[0].base.seg_addr.lval = (uint64_t)(uintptr_t) data_ptr;
frag->base.des_local = &frag->segments->base;
frag->base.des_local_count = 1;
frag->base.order = order;
frag->base.des_flags = flags;
return (struct mca_btl_base_descriptor_t *) frag;
return &reg->handle;
}
static int mca_btl_ugni_deregister_mem (mca_btl_base_module_t *btl, mca_btl_base_registration_handle_t *handle)
{
mca_btl_ugni_reg_t *reg =
(mca_btl_ugni_reg_t *)((intptr_t) handle - offsetof (mca_btl_ugni_reg_t, handle));
(void) btl->btl_mpool->mpool_deregister (btl->btl_mpool, &reg->base);
return OPAL_SUCCESS;
}

Просмотреть файл

@ -35,13 +35,13 @@ mca_btl_ugni_prepare_src_send_nodata (struct mca_btl_base_module_t *btl,
frag->hdr_size = reserve + sizeof (frag->hdr.send);
frag->segments[0].base.seg_addr.pval = frag->hdr.send_ex.pml_header;
frag->segments[0].base.seg_len = reserve;
frag->segments[0].seg_addr.pval = frag->hdr.send_ex.pml_header;
frag->segments[0].seg_len = reserve;
frag->segments[1].base.seg_addr.pval = NULL;
frag->segments[1].base.seg_len = 0;
frag->segments[1].seg_addr.pval = NULL;
frag->segments[1].seg_len = 0;
frag->base.des_local = &frag->segments->base;
frag->base.des_local = &frag->segments;
frag->base.des_local_count = 1;
frag->base.order = order;
frag->base.des_flags = flags;
@ -84,21 +84,21 @@ mca_btl_ugni_prepare_src_send_inplace (struct mca_btl_base_module_t *btl,
frag->flags = MCA_BTL_UGNI_FRAG_EAGER | MCA_BTL_UGNI_FRAG_IGNORE;
frag->registration = registration;
frag->segments[1].memory_handle = registration->memory_hdl;
frag->hdr.eager.memory_handle = registration->handle;;
frag->hdr_size = reserve + sizeof (frag->hdr.eager);
frag->segments[0].base.seg_addr.pval = frag->hdr.eager_ex.pml_header;
frag->segments[0].seg_addr.pval = frag->hdr.eager_ex.pml_header;
} else {
frag->hdr_size = reserve + sizeof (frag->hdr.send);
frag->segments[0].base.seg_addr.pval = frag->hdr.send_ex.pml_header;
frag->segments[0].seg_addr.pval = frag->hdr.send_ex.pml_header;
}
frag->segments[0].base.seg_len = reserve;
frag->segments[0].seg_len = reserve;
frag->segments[1].base.seg_addr.pval = data_ptr;
frag->segments[1].base.seg_len = *size;
frag->segments[1].seg_addr.pval = data_ptr;
frag->segments[1].seg_len = *size;
frag->base.des_local = &frag->segments->base;
frag->base.des_local = &frag->segments;
frag->base.des_local_count = 2;
frag->base.order = order;
frag->base.des_flags = flags;
@ -130,10 +130,9 @@ mca_btl_ugni_prepare_src_send_buffered (struct mca_btl_base_module_t *btl,
registration = (mca_btl_ugni_reg_t *) frag->base.super.registration;
frag->segments[1].memory_handle = registration->memory_hdl;
frag->hdr.eager.memory_handle = registration->handle;
frag->hdr_size = reserve + sizeof (frag->hdr.eager);
frag->segments[0].base.seg_addr.pval = frag->hdr.eager_ex.pml_header;
frag->segments[0].seg_addr.pval = frag->hdr.eager_ex.pml_header;
} else {
(void) MCA_BTL_UGNI_FRAG_ALLOC_SMSG(endpoint, frag);
if (OPAL_UNLIKELY(NULL == frag)) {
@ -141,7 +140,7 @@ mca_btl_ugni_prepare_src_send_buffered (struct mca_btl_base_module_t *btl,
}
frag->hdr_size = reserve + sizeof (frag->hdr.send);
frag->segments[0].base.seg_addr.pval = frag->hdr.send_ex.pml_header;
frag->segments[0].seg_addr.pval = frag->hdr.send_ex.pml_header;
}
frag->flags |= MCA_BTL_UGNI_FRAG_BUFFERED;
@ -155,12 +154,12 @@ mca_btl_ugni_prepare_src_send_buffered (struct mca_btl_base_module_t *btl,
return NULL;
}
frag->segments[0].base.seg_len = reserve;
frag->segments[0].seg_len = reserve;
frag->segments[1].base.seg_addr.pval = frag->base.super.ptr;
frag->segments[1].base.seg_len = *size;
frag->segments[1].seg_addr.pval = frag->base.super.ptr;
frag->segments[1].seg_len = *size;
frag->base.des_local = &frag->segments->base;
frag->base.des_local = &frag->segments;
frag->base.des_local_count = 2;
frag->base.order = order;
frag->base.des_flags = flags;
@ -197,66 +196,4 @@ mca_btl_ugni_prepare_src_send (struct mca_btl_base_module_t *btl,
}
}
static inline struct mca_btl_base_descriptor_t *
mca_btl_ugni_prepare_src_rdma (struct mca_btl_base_module_t *btl,
mca_btl_base_endpoint_t *endpoint,
mca_mpool_base_registration_t *registration,
struct opal_convertor_t *convertor,
uint8_t order, size_t *size,
uint32_t flags)
{
mca_btl_ugni_base_frag_t *frag;
void *data_ptr;
int rc;
opal_convertor_get_current_pointer (convertor, &data_ptr);
(void) MCA_BTL_UGNI_FRAG_ALLOC_RDMA(endpoint, frag);
if (OPAL_UNLIKELY(NULL == frag)) {
return NULL;
}
/*
* For medium message use FMA protocols and for large message
* use BTE protocols
*/
/* No need to register while using FMA Put (registration is
* non-null in get-- is this always true?) */
if (*size >= mca_btl_ugni_component.ugni_fma_limit || (flags & MCA_BTL_DES_FLAGS_GET)) {
if (NULL == registration) {
rc = btl->btl_mpool->mpool_register(btl->btl_mpool, data_ptr, *size, 0,
(mca_mpool_base_registration_t **) &registration);
if (OPAL_UNLIKELY(OPAL_SUCCESS != rc)) {
mca_btl_ugni_frag_return (frag);
return NULL;
}
frag->registration = (mca_btl_ugni_reg_t *) registration;
}
frag->segments[0].memory_handle = ((mca_btl_ugni_reg_t *)registration)->memory_hdl;
} else {
memset ((void *) &frag->segments[0].memory_handle, 0,
sizeof (frag->segments[0].memory_handle));
}
if ((flags & MCA_BTL_DES_FLAGS_GET) && (*size & 0x3)) {
memmove (frag->segments[0].extra_bytes, (char *) data_ptr + (*size & ~0x3),
*size & 0x3);
frag->segments[0].extra_byte_count = *size & 0x3;
} else {
frag->segments[0].extra_byte_count = 0;
}
frag->segments[0].base.seg_addr.lval = (uint64_t)(uintptr_t) data_ptr;
frag->segments[0].base.seg_len = *size;
frag->base.des_local = &frag->segments->base;
frag->base.des_local_count = 1;
frag->base.order = order;
frag->base.des_flags = flags;
return &frag->base;
}
#endif

Просмотреть файл

@ -1,6 +1,6 @@
/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */
/*
* Copyright (c) 2011-2012 Los Alamos National Security, LLC. All rights
* Copyright (c) 2011-2014 Los Alamos National Security, LLC. All rights
* reserved.
* Copyright (c) 2011 UT-Battelle, LLC. All rights reserved.
* $COPYRIGHT$
@ -14,25 +14,20 @@
#include "btl_ugni_rdma.h"
/**
* Initiate a put operation.
*
* @param btl (IN) BTL module
* @param endpoint (IN) BTL addressing information
* @param descriptor (IN) Description of the data to be transferred
*/
int mca_btl_ugni_put (struct mca_btl_base_module_t *btl,
struct mca_btl_base_endpoint_t *endpoint,
struct mca_btl_base_descriptor_t *des) {
mca_btl_ugni_base_frag_t *frag = (mca_btl_ugni_base_frag_t *) des;
BTL_VERBOSE(("Using RDMA/FMA Put for frag %p", (void *) des));
void *local_address, uint64_t remote_address,
struct mca_btl_base_registration_handle_t *local_handle,
struct mca_btl_base_registration_handle_t *remote_handle,
size_t size, int flags, mca_btl_base_rdma_completion_fn_t cbfunc,
void *cbcontext, void *cbdata)
{
BTL_VERBOSE(("Using RDMA/FMA Put from local address %p to remote address %" PRIx64,
local_address, remote_address));
/* cause endpoint to bind if it isn't already (bind is sufficient for rdma) */
(void) mca_btl_ugni_check_endpoint_state(endpoint);
des->des_flags |= MCA_BTL_DES_SEND_ALWAYS_CALLBACK;
return mca_btl_ugni_post (frag, false, (mca_btl_ugni_segment_t *) des->des_local,
(mca_btl_ugni_segment_t *) des->des_remote);
return mca_btl_ugni_post (endpoint, false, size, local_address, remote_address, local_handle,
remote_handle, cbfunc, cbcontext, cbdata);
}

Просмотреть файл

@ -20,103 +20,144 @@ int mca_btl_ugni_start_eager_get (mca_btl_base_endpoint_t *ep,
mca_btl_ugni_eager_ex_frag_hdr_t hdr,
mca_btl_ugni_base_frag_t *frag);
static inline void init_gni_post_desc (mca_btl_ugni_base_frag_t *frag,
gni_post_type_t op_type,
uint64_t lcl_addr,
gni_mem_handle_t lcl_mdh,
uint64_t rem_addr,
gni_mem_handle_t rem_mdh,
uint64_t bufsize,
gni_cq_handle_t cq_hndl) {
frag->post_desc.base.type = op_type;
frag->post_desc.base.cq_mode = GNI_CQMODE_GLOBAL_EVENT;
frag->post_desc.base.dlvr_mode = GNI_DLVMODE_PERFORMANCE;
frag->post_desc.base.local_addr = (uint64_t) lcl_addr;
frag->post_desc.base.local_mem_hndl = lcl_mdh;
frag->post_desc.base.remote_addr = (uint64_t) rem_addr;
frag->post_desc.base.remote_mem_hndl = rem_mdh;
frag->post_desc.base.length = bufsize;
#if 0
frag->post_desc.base.rdma_mode = GNI_RDMAMODE_FENCE;
#endif
frag->post_desc.base.rdma_mode = 0;
frag->post_desc.base.src_cq_hndl = cq_hndl;
frag->post_desc.tries = 0;
static inline void init_gni_post_desc (opal_common_ugni_post_desc_t *post_desc,
gni_post_type_t op_type,
uint64_t lcl_addr,
gni_mem_handle_t lcl_mdh,
uint64_t rem_addr,
gni_mem_handle_t rem_mdh,
uint64_t bufsize,
gni_cq_handle_t cq_hndl) {
post_desc->base.type = op_type;
post_desc->base.cq_mode = GNI_CQMODE_GLOBAL_EVENT;
post_desc->base.dlvr_mode = GNI_DLVMODE_PERFORMANCE;
post_desc->base.local_addr = (uint64_t) lcl_addr;
post_desc->base.local_mem_hndl = lcl_mdh;
post_desc->base.remote_addr = (uint64_t) rem_addr;
post_desc->base.remote_mem_hndl = rem_mdh;
post_desc->base.length = bufsize;
post_desc->base.rdma_mode = 0;
post_desc->base.src_cq_hndl = cq_hndl;
post_desc->tries = 0;
}
static inline int mca_btl_ugni_post_fma (mca_btl_ugni_base_frag_t *frag, gni_post_type_t op_type,
mca_btl_ugni_segment_t *lcl_seg, mca_btl_ugni_segment_t *rem_seg)
static inline int mca_btl_ugni_post_fma (struct mca_btl_base_endpoint_t *endpoint, gni_post_type_t op_type,
size_t size, void *local_address, uint64_t remote_address,
mca_btl_base_registration_handle_t *local_handle,
mca_btl_base_registration_handle_t *remote_handle,
mca_btl_base_rdma_completion_fn_t cbfunc,
void *cbcontext, void *cbdata)
{
gni_return_t rc;
mca_btl_ugni_post_descriptor_t *post_desc;
gni_return_t grc;
/* Post descriptor (CQ is ignored for FMA transactions) */
init_gni_post_desc (frag, op_type, lcl_seg->base.seg_addr.lval, lcl_seg->memory_handle,
rem_seg->base.seg_addr.lval, rem_seg->memory_handle, lcl_seg->base.seg_len, 0);
mca_btl_ugni_alloc_post_descriptor (endpoint, local_handle, cbfunc, cbcontext, cbdata, &post_desc);
if (OPAL_UNLIKELY(NULL == post_desc)) {
return OPAL_ERR_OUT_OF_RESOURCE;
}
OPAL_THREAD_LOCK(&frag->endpoint->common->dev->dev_lock);
rc = GNI_PostFma (frag->endpoint->rdma_ep_handle, &frag->post_desc.base);
OPAL_THREAD_UNLOCK(&frag->endpoint->common->dev->dev_lock);
if (GNI_RC_SUCCESS != rc) {
BTL_VERBOSE(("GNI_PostFma failed with gni rc: %d", rc));
/* Post descriptor (CQ is ignored for FMA transactions) -- The CQ associated with the endpoint
* is used. */
init_gni_post_desc (&post_desc->desc, op_type, (intptr_t) local_address, local_handle->gni_handle,
remote_address, remote_handle->gni_handle, size, 0);
OPAL_THREAD_LOCK(&endpoint->btl->device->dev_lock);
grc = GNI_PostFma (endpoint->rdma_ep_handle, &post_desc->desc.base);
OPAL_THREAD_UNLOCK(&endpoint->btl->device->dev_lock);
if (OPAL_UNLIKELY(GNI_RC_SUCCESS != grc)) {
mca_btl_ugni_return_post_descriptor (endpoint->btl, post_desc);
if (GNI_RC_ALIGNMENT_ERROR == grc) {
BTL_VERBOSE(("GNI_PostFma failed with an alignment error"));
return OPAL_ERR_NOT_AVAILABLE;
}
BTL_VERBOSE(("GNI_PostFma failed with gni rc: %d", grc));
return OPAL_ERR_OUT_OF_RESOURCE;
}
return OPAL_SUCCESS;
}
static inline int mca_btl_ugni_post_bte (mca_btl_ugni_base_frag_t *frag, gni_post_type_t op_type,
mca_btl_ugni_segment_t *lcl_seg, mca_btl_ugni_segment_t *rem_seg)
static inline int mca_btl_ugni_post_bte (mca_btl_base_endpoint_t *endpoint, gni_post_type_t op_type,
size_t size, void *local_address, uint64_t remote_address,
mca_btl_base_registration_handle_t *local_handle,
mca_btl_base_registration_handle_t *remote_handle,
mca_btl_base_rdma_completion_fn_t cbfunc,
void *cbcontext, void *cbdata)
{
gni_return_t status;
mca_btl_ugni_post_descriptor_t *post_desc;
gni_return_t grc;
mca_btl_ugni_alloc_post_descriptor (endpoint, local_handle, cbfunc, cbcontext, cbdata, &post_desc);
if (OPAL_UNLIKELY(NULL == post_desc)) {
return OPAL_ERR_OUT_OF_RESOURCE;
}
/* Post descriptor */
init_gni_post_desc (frag, op_type, lcl_seg->base.seg_addr.lval, lcl_seg->memory_handle,
rem_seg->base.seg_addr.lval, rem_seg->memory_handle, lcl_seg->base.seg_len,
frag->endpoint->btl->rdma_local_cq);
init_gni_post_desc (&post_desc->desc, op_type, (intptr_t) local_address, local_handle->gni_handle,
remote_address, remote_handle->gni_handle, size, endpoint->btl->rdma_local_cq);
OPAL_THREAD_LOCK(&frag->endpoint->common->dev->dev_lock);
status = GNI_PostRdma (frag->endpoint->rdma_ep_handle, &frag->post_desc.base);
OPAL_THREAD_UNLOCK(&frag->endpoint->common->dev->dev_lock);
if (GNI_RC_SUCCESS != status) {
BTL_VERBOSE(("GNI_PostRdma failed with gni rc: %d", status));
return opal_common_rc_ugni_to_opal(status);
OPAL_THREAD_LOCK(&endpoint->btl->device->dev_lock);
grc = GNI_PostRdma (endpoint->rdma_ep_handle, &post_desc->desc.base);
OPAL_THREAD_UNLOCK(&endpoint->btl->device->dev_lock);
if (OPAL_UNLIKELY(GNI_RC_SUCCESS != grc)) {
mca_btl_ugni_return_post_descriptor (endpoint->btl, post_desc);
if (GNI_RC_ALIGNMENT_ERROR == grc) {
BTL_VERBOSE(("GNI_PostRdma failed with an alignment error"));
return OPAL_ERR_NOT_AVAILABLE;
}
BTL_VERBOSE(("GNI_PostRdma failed with gni rc: %d", grc));
return OPAL_ERR_OUT_OF_RESOURCE;
}
return OPAL_SUCCESS;
}
static inline int mca_btl_ugni_post (mca_btl_ugni_base_frag_t *frag, bool get, mca_btl_ugni_segment_t *lcl_seg,
mca_btl_ugni_segment_t *rem_seg) {
static inline int mca_btl_ugni_post (mca_btl_base_endpoint_t *endpoint, int get, size_t size,
void *local_address, uint64_t remote_address,
mca_btl_base_registration_handle_t *local_handle,
mca_btl_base_registration_handle_t *remote_handle,
mca_btl_base_rdma_completion_fn_t cbfunc,
void *cbcontext, void *cbdata)
{
const gni_post_type_t fma_ops[2] = {GNI_POST_FMA_PUT, GNI_POST_FMA_GET};
const gni_post_type_t rdma_ops[2] = {GNI_POST_RDMA_PUT, GNI_POST_RDMA_GET};
if (frag->base.des_local->seg_len <= mca_btl_ugni_component.ugni_fma_limit) {
return mca_btl_ugni_post_fma (frag, fma_ops[get], lcl_seg, rem_seg);
if (size <= mca_btl_ugni_component.ugni_fma_limit) {
return mca_btl_ugni_post_fma (endpoint, fma_ops[get], size, local_address, remote_address,
local_handle, remote_handle, cbfunc, cbcontext, cbdata);
}
return mca_btl_ugni_post_bte (frag, rdma_ops[get], lcl_seg, rem_seg);
return mca_btl_ugni_post_bte (endpoint, rdma_ops[get], size, local_address, remote_address,
local_handle, remote_handle, cbfunc, cbcontext, cbdata);
}
static inline void mca_btl_ugni_repost (mca_btl_ugni_base_frag_t *frag) {
static inline int mca_btl_ugni_repost (mca_btl_ugni_module_t *ugni_module, mca_btl_ugni_post_descriptor_t *post_desc)
{
gni_return_t grc;
OPAL_THREAD_LOCK(&frag->endpoint->common->dev->dev_lock);
if (GNI_POST_RDMA_PUT == frag->post_desc.base.type ||
GNI_POST_RDMA_GET == frag->post_desc.base.type) {
grc = GNI_PostRdma (frag->endpoint->rdma_ep_handle, &frag->post_desc.base);
OPAL_THREAD_LOCK(&ugni_module->device->dev_lock);
if (GNI_POST_RDMA_PUT == post_desc->desc.base.type ||
GNI_POST_RDMA_GET == post_desc->desc.base.type) {
grc = GNI_PostRdma (post_desc->endpoint->rdma_ep_handle, &post_desc->desc.base);
} else {
grc = GNI_PostFma (frag->endpoint->rdma_ep_handle, &frag->post_desc.base);
grc = GNI_PostFma (post_desc->endpoint->rdma_ep_handle, &post_desc->desc.base);
}
OPAL_THREAD_UNLOCK(&frag->endpoint->common->dev->dev_lock);
OPAL_THREAD_UNLOCK(&ugni_module->device->dev_lock);
if (OPAL_UNLIKELY(GNI_RC_SUCCESS != grc)) {
/* NTH: Should we even retry these? When this code was written there was no indication
* whether an error in post is recoverable. Clobber this code and the associated data
* structures if post errors are not recoverable. */
OPAL_THREAD_LOCK(&frag->endpoint->btl->failed_frags_lock);
opal_list_append (&frag->endpoint->btl->failed_frags, (opal_list_item_t *) frag);
OPAL_THREAD_UNLOCK(&frag->endpoint->btl->failed_frags_lock);
OPAL_THREAD_LOCK(&ugni_module->pending_descriptors_lock);
opal_list_append (&ugni_module->pending_descriptors, (opal_list_item_t *) post_desc);
OPAL_THREAD_UNLOCK(&ugni_module->pending_descriptors_lock);
}
return opal_common_rc_ugni_to_opal (grc);
}
#endif /* MCA_BTL_UGNI_RDMA_H */

View file

@ -23,7 +23,7 @@ int mca_btl_ugni_send (struct mca_btl_base_module_t *btl,
mca_btl_base_tag_t tag)
{
mca_btl_ugni_base_frag_t *frag = (mca_btl_ugni_base_frag_t *) descriptor;
size_t size = frag->segments[0].base.seg_len + frag->segments[1].base.seg_len;
size_t size = frag->segments[0].seg_len + frag->segments[1].seg_len;
mca_btl_ugni_module_t *ugni_module = (mca_btl_ugni_module_t *) btl;
int flags_save = frag->base.des_flags;
int rc;
@ -41,7 +41,7 @@ int mca_btl_ugni_send (struct mca_btl_base_module_t *btl,
}
BTL_VERBOSE(("btl/ugni sending descriptor %p from %d -> %d. length = %" PRIu64, (void *)descriptor,
OPAL_PROC_MY_NAME.vpid, endpoint->common->ep_rem_id, frag->segments[0].base.seg_len));
opal_process_name_vpid(OPAL_PROC_MY_NAME), endpoint->common->ep_rem_id, size));
/* temporarily disable ownership and callback flags so we can reliably check the complete flag */
frag->base.des_flags &= ~(MCA_BTL_DES_FLAGS_BTL_OWNERSHIP | MCA_BTL_DES_SEND_ALWAYS_CALLBACK);
@ -90,15 +90,15 @@ int mca_btl_ugni_send (struct mca_btl_base_module_t *btl,
return rc;
}
int
mca_btl_ugni_sendi (struct mca_btl_base_module_t *btl,
struct mca_btl_base_endpoint_t *endpoint,
struct opal_convertor_t *convertor,
void *header, size_t header_size,
size_t payload_size, uint8_t order,
uint32_t flags, mca_btl_base_tag_t tag,
mca_btl_base_descriptor_t **descriptor)
int mca_btl_ugni_sendi (struct mca_btl_base_module_t *btl,
struct mca_btl_base_endpoint_t *endpoint,
struct opal_convertor_t *convertor,
void *header, size_t header_size,
size_t payload_size, uint8_t order,
uint32_t flags, mca_btl_base_tag_t tag,
mca_btl_base_descriptor_t **descriptor)
{
mca_btl_ugni_module_t *ugni_module = (mca_btl_ugni_module_t *) btl;
size_t total_size = header_size + payload_size;
mca_btl_ugni_base_frag_t *frag = NULL;
size_t packed_size = payload_size;
@ -118,13 +118,14 @@ mca_btl_ugni_sendi (struct mca_btl_base_module_t *btl,
frag = (mca_btl_ugni_base_frag_t *) mca_btl_ugni_prepare_src_send_buffered (btl, endpoint, convertor, order,
header_size, &packed_size, flags);
}
assert (packed_size == payload_size);
if (OPAL_UNLIKELY(NULL == frag)) {
break;
}
frag->hdr.send.lag = (tag << 24) | total_size;
memcpy (frag->segments[0].base.seg_addr.pval, header, header_size);
memcpy (frag->segments[0].seg_addr.pval, header, header_size);
rc = mca_btl_ugni_send_frag (endpoint, frag);
if (OPAL_UNLIKELY(OPAL_SUCCESS != rc)) {
@ -151,7 +152,13 @@ int mca_btl_ugni_progress_send_wait_list (mca_btl_base_endpoint_t *endpoint)
if (NULL == frag) {
break;
}
rc = mca_btl_ugni_send_frag (endpoint, frag);
if (OPAL_LIKELY(!(frag->flags & MCA_BTL_UGNI_FRAG_RESPONSE))) {
rc = mca_btl_ugni_send_frag (endpoint, frag);
} else {
rc = opal_mca_btl_ugni_smsg_send (frag, &frag->hdr.rdma, sizeof (frag->hdr.rdma),
NULL, 0, MCA_BTL_UGNI_TAG_RDMA_COMPLETE);
}
if (OPAL_UNLIKELY(OPAL_SUCCESS > rc)) {
if (OPAL_LIKELY(OPAL_ERR_OUT_OF_RESOURCE == rc)) {
OPAL_THREAD_LOCK(&endpoint->lock);

View file

@ -1,6 +1,6 @@
/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */
/*
* Copyright (c) 2011-2013 Los Alamos National Security, LLC. All rights
* Copyright (c) 2011-2014 Los Alamos National Security, LLC. All rights
* reserved.
* Copyright (c) 2011 UT-Battelle, LLC. All rights reserved.
* $COPYRIGHT$
@ -26,7 +26,7 @@ static void mca_btl_ugni_smsg_mbox_construct (mca_btl_ugni_smsg_mbox_t *mbox) {
mbox->attr.smsg_attr.mbox_offset = (uintptr_t) mbox->super.ptr - (uintptr_t) base_reg->base;
mbox->attr.smsg_attr.msg_buffer = base_reg->base;
mbox->attr.smsg_attr.buff_size = mca_btl_ugni_component.smsg_mbox_size;
mbox->attr.smsg_attr.mem_hndl = ugni_reg->memory_hdl;
mbox->attr.smsg_attr.mem_hndl = ugni_reg->handle.gni_handle;
#if 0
fprintf(stderr,"ugni_reg->memory_hdl 0x%lx 0x%lx\n",
ugni_reg->memory_hdl.qword1,ugni_reg->memory_hdl.qword2);

View file

@ -1,6 +1,6 @@
/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */
/*
* Copyright (c) 2011-2013 Los Alamos National Security, LLC. All rights
* Copyright (c) 2011-2014 Los Alamos National Security, LLC. All rights
* reserved.
* Copyright (c) 2011 UT-Battelle, LLC. All rights reserved.
* $COPYRIGHT$
@ -118,12 +118,13 @@ static inline int mca_btl_ugni_send_frag (struct mca_btl_base_endpoint_t *btl_pe
mca_btl_ugni_base_frag_t *frag) {
if (OPAL_LIKELY(!(frag->flags & MCA_BTL_UGNI_FRAG_EAGER))) {
return opal_mca_btl_ugni_smsg_send (frag, &frag->hdr.send, frag->hdr_size,
frag->segments[1].base.seg_addr.pval,
frag->segments[1].base.seg_len,
frag->segments[1].seg_addr.pval,
frag->segments[1].seg_len,
MCA_BTL_UGNI_TAG_SEND);
}
frag->hdr.eager.src_seg = frag->segments[1];
frag->hdr.eager.size = frag->segments[1].seg_len;
frag->hdr.eager.address = frag->segments[1].seg_addr.lval;
frag->hdr.eager.ctx = (void *) frag;
return opal_mca_btl_ugni_smsg_send (frag, &frag->hdr.eager, frag->hdr_size,

View file

@ -33,31 +33,30 @@
* @param descriptor (IN) Description of the data to be transferred
*/
#if OPAL_BTL_VADER_HAVE_XPMEM
/**
 * Perform an RDMA get using the XPMEM single-copy mechanism.
 *
 * Maps the remote peer's memory into this process via xpmem and copies it
 * directly into the local buffer. The operation completes synchronously, so
 * the completion callback is invoked before returning.
 *
 * @param local_address  (IN) local destination buffer
 * @param remote_address (IN) address in the remote process to read from
 * @param size           (IN) number of bytes to transfer
 * @param cbfunc         (IN) completion callback (always invoked on success)
 *
 * @returns OPAL_SUCCESS on success, OPAL_ERROR if the remote region could
 *          not be mapped.
 *
 * NOTE(merge fix): the pre-commit descriptor-based mca_btl_vader_get_xpmem
 * was interleaved with this version; only the new interface is kept.
 */
int mca_btl_vader_get (struct mca_btl_base_module_t *btl, struct mca_btl_base_endpoint_t *endpoint,
                       void *local_address, uint64_t remote_address,
                       struct mca_btl_base_registration_handle_t *local_handle,
                       struct mca_btl_base_registration_handle_t *remote_handle, size_t size, int flags,
                       mca_btl_base_rdma_completion_fn_t cbfunc, void *cbcontext, void *cbdata)
{
    mca_mpool_base_registration_t *reg;
    void *rem_ptr;

    /* silence warning about unused arguments */
    (void) local_handle;
    (void) remote_handle;

    /* attach to the remote address range; rem_ptr is its local alias */
    reg = vader_get_registation (endpoint, (void *)(intptr_t) remote_address, size, 0, &rem_ptr);
    if (OPAL_UNLIKELY(NULL == rem_ptr)) {
        return OPAL_ERROR;
    }

    vader_memmove (local_address, rem_ptr, size);

    vader_return_registration (reg, endpoint);

    /* the transfer finished synchronously; signal completion now */
    cbfunc (btl, endpoint, local_address, local_handle, cbcontext, cbdata, OPAL_SUCCESS);

    return OPAL_SUCCESS;
}
@ -68,12 +67,8 @@ int mca_btl_vader_get_cma (struct mca_btl_base_module_t *btl,
struct mca_btl_base_endpoint_t *endpoint,
struct mca_btl_base_descriptor_t *des)
{
mca_btl_vader_frag_t *frag = (mca_btl_vader_frag_t *) des;
mca_btl_base_segment_t *src = des->des_remote;
mca_btl_base_segment_t *dst = des->des_local;
const size_t size = min(dst->seg_len, src->seg_len);
struct iovec src_iov = {.iov_base = src->seg_addr.pval, .iov_len = size};
struct iovec dst_iov = {.iov_base = dst->seg_addr.pval, .iov_len = size};
struct iovec src_iov = {.iov_base = (void *)(intptr_t) remote_address, .iov_len = size};
struct iovec dst_iov = {.iov_base = local_address, .iov_len = size};
ssize_t ret;
ret = process_vm_readv (endpoint->segment_data.other.seg_ds->seg_cpid, &dst_iov, 1, &src_iov, 1, 0);

View file

@ -56,16 +56,6 @@ static struct mca_btl_base_descriptor_t *vader_prepare_src (
uint32_t flags
);
static struct mca_btl_base_descriptor_t *vader_prepare_dst (
struct mca_btl_base_module_t *btl,
struct mca_btl_base_endpoint_t *endpoint,
struct mca_mpool_base_registration_t *registration,
struct opal_convertor_t *convertor,
uint8_t order,
size_t reserve,
size_t *size,
uint32_t flags);
static int vader_add_procs(struct mca_btl_base_module_t* btl,
size_t nprocs, struct opal_proc_t **procs,
struct mca_btl_base_endpoint_t** peers,
@ -82,7 +72,6 @@ mca_btl_vader_t mca_btl_vader = {
.btl_alloc = mca_btl_vader_alloc,
.btl_free = vader_free,
.btl_prepare_src = vader_prepare_src,
.btl_prepare_dst = vader_prepare_dst,
.btl_send = mca_btl_vader_send,
.btl_sendi = mca_btl_vader_sendi,
.btl_dump = mca_btl_base_dump,
@ -440,60 +429,6 @@ static int vader_free (struct mca_btl_base_module_t *btl, mca_btl_base_descripto
return OPAL_SUCCESS;
}
/* Prepare a user buffer as the destination of an RDMA operation.
 *
 * Allocates a fragment describing the receive region and, when the knem
 * single-copy mechanism is active, creates a knem cookie covering it so the
 * peer can write into it.
 *
 * NOTE(review): this function is removed by the surrounding commit (the BTL
 * 3.0 interface drops btl_prepare_dst); kept here verbatim as the pre-commit
 * implementation.
 *
 * Returns the fragment's base descriptor, or NULL on allocation/ioctl
 * failure. */
struct mca_btl_base_descriptor_t *vader_prepare_dst(struct mca_btl_base_module_t *btl,
struct mca_btl_base_endpoint_t *endpoint,
struct mca_mpool_base_registration_t *registration,
struct opal_convertor_t *convertor,
uint8_t order, size_t reserve, size_t *size,
uint32_t flags)
{
mca_btl_vader_frag_t *frag;
void *data_ptr;
/* RDMA-capable fragments are only needed when a single-copy mechanism is in use */
if (MCA_BTL_VADER_NONE != mca_btl_vader_component.single_copy_mechanism) {
(void) MCA_BTL_VADER_FRAG_ALLOC_RDMA(frag, endpoint);
} else {
(void) MCA_BTL_VADER_FRAG_ALLOC_USER(frag, endpoint);
}
if (OPAL_UNLIKELY(NULL == frag)) {
return NULL;
}
/* describe the user's receive buffer in the fragment's first segment */
opal_convertor_get_current_pointer (convertor, &data_ptr);
frag->segments[0].base.seg_addr.lval = (uint64_t)(uintptr_t) data_ptr;
frag->segments[0].base.seg_len = *size;
#if OPAL_BTL_VADER_HAVE_KNEM
if (MCA_BTL_VADER_KNEM == mca_btl_vader_component.single_copy_mechanism) {
struct knem_cmd_create_region knem_cr;
struct knem_cmd_param_iovec knem_iov;
/* register the buffer with knem so the peer can write into it */
knem_iov.base = (uintptr_t) data_ptr;
knem_iov.len = *size;
knem_cr.iovec_array = (uintptr_t) &knem_iov;
knem_cr.iovec_nr = 1;
knem_cr.protection = PROT_WRITE;
/* Vader will explicitly destroy this cookie */
knem_cr.flags = 0;
if (OPAL_UNLIKELY(ioctl(mca_btl_vader.knem_fd, KNEM_CMD_CREATE_REGION, &knem_cr) < 0)) {
MCA_BTL_VADER_FRAG_RETURN(frag);
return NULL;
}
frag->segments[0].cookie = knem_cr.cookie;
frag->segments[0].registered_base = (intptr_t) data_ptr;
frag->cookie = knem_cr.cookie;
}
#endif /* OPAL_BTL_SM_HAVE_KNEM */
frag->base.order = order;
frag->base.des_flags = flags;
return &frag->base;
}
/**
* Pack data
*

View file

@ -35,31 +35,26 @@
* @param descriptor (IN) Description of the data to be transferred
*/
#if OPAL_BTL_VADER_HAVE_XPMEM
/**
 * Perform an RDMA put using the XPMEM single-copy mechanism.
 *
 * Maps the remote peer's memory into this process via xpmem and copies the
 * local buffer directly into it. The operation completes synchronously, so
 * the completion callback is invoked before returning.
 *
 * @param local_address  (IN) local source buffer
 * @param remote_address (IN) address in the remote process to write to
 * @param size           (IN) number of bytes to transfer
 * @param cbfunc         (IN) completion callback (always invoked on success)
 *
 * @returns OPAL_SUCCESS on success, OPAL_ERROR if the remote region could
 *          not be mapped.
 *
 * NOTE(merge fix): the pre-commit descriptor-based mca_btl_vader_put_xpmem
 * was interleaved with this version; only the new interface is kept.
 */
int mca_btl_vader_put (struct mca_btl_base_module_t *btl, struct mca_btl_base_endpoint_t *endpoint,
                       void *local_address, uint64_t remote_address,
                       struct mca_btl_base_registration_handle_t *local_handle,
                       struct mca_btl_base_registration_handle_t *remote_handle, size_t size, int flags,
                       mca_btl_base_rdma_completion_fn_t cbfunc, void *cbcontext, void *cbdata)
{
    mca_mpool_base_registration_t *reg;
    void *rem_ptr;

    /* attach to the remote address range; rem_ptr is its local alias */
    reg = vader_get_registation (endpoint, (void *)(intptr_t) remote_address, size, 0, &rem_ptr);
    if (OPAL_UNLIKELY(NULL == reg)) {
        return OPAL_ERROR;
    }

    vader_memmove (rem_ptr, local_address, size);

    vader_return_registration (reg, endpoint);

    /* the transfer finished synchronously; signal completion now */
    cbfunc (btl, endpoint, local_address, local_handle, cbcontext, cbdata, OPAL_SUCCESS);

    return OPAL_SUCCESS;
}
@ -70,12 +65,8 @@ int mca_btl_vader_put_cma (struct mca_btl_base_module_t *btl,
struct mca_btl_base_endpoint_t *endpoint,
struct mca_btl_base_descriptor_t *des)
{
mca_btl_vader_frag_t *frag = (mca_btl_vader_frag_t *) des;
mca_btl_base_segment_t *src = des->des_local;
mca_btl_base_segment_t *dst = des->des_remote;
const size_t size = min(dst->seg_len, src->seg_len);
struct iovec src_iov = {.iov_base = src->seg_addr.pval, .iov_len = size};
struct iovec dst_iov = {.iov_base = dst->seg_addr.pval, .iov_len = size};
struct iovec src_iov = {.iov_base = local_address, .iov_len = size};
struct iovec dst_iov = {.iov_base = (void *)(intptr_t) remote_address, .iov_len = size};
ssize_t ret;
ret = process_vm_writev (endpoint->segment_data.other.seg_ds->seg_cpid, &src_iov, 1, &dst_iov, 1, 0);