Revert "Merge pull request #275 from hjelmn/btlmod"
This reverts commit ccaecf0fd6c862877e6a1e2643f95fa956c87769, reversing changes made to 6a19bf85dde5306f559f09952cf3919d97f52502.
Этот коммит содержится в:
родитель
0d413fb73f
Коммит
1b564f62bd
@ -152,7 +152,7 @@ AC_DEFUN([OPAL_CHECK_OPENFABRICS],[
|
||||
|
||||
# If we have the openib stuff available, find out what we've got
|
||||
AS_IF([test "$ompi_check_openib_happy" = "yes"],
|
||||
[AC_CHECK_DECLS([IBV_EVENT_CLIENT_REREGISTER, IBV_ACCESS_SO, IBV_TRANSPORT_USNIC, IBV_TRANSPORT_USNIC_UDP, IBV_NODE_USNIC, IBV_ATOMIC_HCA], [], [],
|
||||
[AC_CHECK_DECLS([IBV_EVENT_CLIENT_REREGISTER, IBV_ACCESS_SO, IBV_TRANSPORT_USNIC, IBV_TRANSPORT_USNIC_UDP, IBV_NODE_USNIC], [], [],
|
||||
[#include <infiniband/verbs.h>])
|
||||
AC_CHECK_FUNCS([ibv_get_device_list ibv_resize_cq])
|
||||
|
||||
|
@ -47,10 +47,6 @@ static ompi_errcode_intern_t ompi_err_request;
|
||||
static ompi_errcode_intern_t ompi_err_buffer;
|
||||
static ompi_errcode_intern_t ompi_err_rma_sync;
|
||||
static ompi_errcode_intern_t ompi_err_rma_shared;
|
||||
static ompi_errcode_intern_t ompi_err_rma_attach;
|
||||
static ompi_errcode_intern_t ompi_err_rma_range;
|
||||
static ompi_errcode_intern_t ompi_err_rma_conflict;
|
||||
static ompi_errcode_intern_t ompi_err_win;
|
||||
|
||||
static void ompi_errcode_intern_construct(ompi_errcode_intern_t* errcode);
|
||||
static void ompi_errcode_intern_destruct(ompi_errcode_intern_t* errcode);
|
||||
@ -214,38 +210,6 @@ int ompi_errcode_intern_init (void)
|
||||
opal_pointer_array_set_item(&ompi_errcodes_intern, ompi_err_rma_shared.index,
|
||||
&ompi_err_rma_shared);
|
||||
|
||||
OBJ_CONSTRUCT(&ompi_err_rma_attach, ompi_errcode_intern_t);
|
||||
ompi_err_rma_attach.code = OMPI_ERR_RMA_ATTACH;
|
||||
ompi_err_rma_attach.mpi_code = MPI_ERR_RMA_ATTACH;
|
||||
ompi_err_rma_attach.index = pos++;
|
||||
strncpy(ompi_err_rma_attach.errstring, "OMPI_ERR_RMA_ATTACH", OMPI_MAX_ERROR_STRING);
|
||||
opal_pointer_array_set_item(&ompi_errcodes_intern, ompi_err_rma_attach.index,
|
||||
&ompi_err_rma_attach);
|
||||
|
||||
OBJ_CONSTRUCT(&ompi_err_rma_range, ompi_errcode_intern_t);
|
||||
ompi_err_rma_range.code = OMPI_ERR_RMA_RANGE;
|
||||
ompi_err_rma_range.mpi_code = MPI_ERR_RMA_RANGE;
|
||||
ompi_err_rma_range.index = pos++;
|
||||
strncpy(ompi_err_rma_range.errstring, "OMPI_ERR_RMA_RANGE", OMPI_MAX_ERROR_STRING);
|
||||
opal_pointer_array_set_item(&ompi_errcodes_intern, ompi_err_rma_range.index,
|
||||
&ompi_err_rma_range);
|
||||
|
||||
OBJ_CONSTRUCT(&ompi_err_rma_conflict, ompi_errcode_intern_t);
|
||||
ompi_err_rma_conflict.code = OMPI_ERR_RMA_CONFLICT;
|
||||
ompi_err_rma_conflict.mpi_code = MPI_ERR_RMA_CONFLICT;
|
||||
ompi_err_rma_conflict.index = pos++;
|
||||
strncpy(ompi_err_rma_conflict.errstring, "OMPI_ERR_RMA_CONFLICT", OMPI_MAX_ERROR_STRING);
|
||||
opal_pointer_array_set_item(&ompi_errcodes_intern, ompi_err_rma_conflict.index,
|
||||
&ompi_err_rma_conflict);
|
||||
|
||||
OBJ_CONSTRUCT(&ompi_err_win, ompi_errcode_intern_t);
|
||||
ompi_err_win.code = OMPI_ERR_WIN;
|
||||
ompi_err_win.mpi_code = MPI_ERR_WIN;
|
||||
ompi_err_win.index = pos++;
|
||||
strncpy(ompi_err_win.errstring, "OMPI_ERR_WIN", OMPI_MAX_ERROR_STRING);
|
||||
opal_pointer_array_set_item(&ompi_errcodes_intern, ompi_err_win.index,
|
||||
&ompi_err_win);
|
||||
|
||||
ompi_errcode_intern_lastused=pos;
|
||||
return OMPI_SUCCESS;
|
||||
}
|
||||
@ -271,10 +235,6 @@ int ompi_errcode_intern_finalize(void)
|
||||
OBJ_DESTRUCT(&ompi_err_request);
|
||||
OBJ_DESTRUCT(&ompi_err_rma_sync);
|
||||
OBJ_DESTRUCT(&ompi_err_rma_shared);
|
||||
OBJ_DESTRUCT(&ompi_err_rma_attach);
|
||||
OBJ_DESTRUCT(&ompi_err_rma_range);
|
||||
OBJ_DESTRUCT(&ompi_err_rma_conflict);
|
||||
OBJ_DESTRUCT(&ompi_err_win);
|
||||
|
||||
OBJ_DESTRUCT(&ompi_errcodes_intern);
|
||||
return OMPI_SUCCESS;
|
||||
|
@ -66,11 +66,7 @@ enum {
|
||||
|
||||
OMPI_ERR_REQUEST = OMPI_ERR_BASE - 1,
|
||||
OMPI_ERR_RMA_SYNC = OMPI_ERR_BASE - 2,
|
||||
OMPI_ERR_RMA_SHARED = OMPI_ERR_BASE - 3,
|
||||
OMPI_ERR_RMA_ATTACH = OMPI_ERR_BASE - 4,
|
||||
OMPI_ERR_RMA_RANGE = OMPI_ERR_BASE - 5,
|
||||
OMPI_ERR_RMA_CONFLICT = OMPI_ERR_BASE - 6,
|
||||
OMPI_ERR_WIN = OMPI_ERR_BASE - 7,
|
||||
OMPI_ERR_RMA_SHARED = OMPI_ERR_BASE - 3
|
||||
};
|
||||
|
||||
#define OMPI_ERR_MAX (OMPI_ERR_BASE - 100)
|
||||
|
@ -91,7 +91,7 @@ static void mca_bml_base_completion(
|
||||
{
|
||||
mca_bml_base_context_t* ctx = (mca_bml_base_context_t*) des->des_cbdata;
|
||||
/* restore original state */
|
||||
((unsigned char*)des->des_segments[0].seg_addr.pval)[ctx->index] ^= ~0;
|
||||
((unsigned char*)des->des_local[0].seg_addr.pval)[ctx->index] ^= ~0;
|
||||
des->des_cbdata = ctx->cbdata;
|
||||
des->des_cbfunc = ctx->cbfunc;
|
||||
free(ctx);
|
||||
@ -121,11 +121,11 @@ int mca_bml_base_send( mca_bml_base_btl_t* bml_btl,
|
||||
malloc(sizeof(mca_bml_base_context_t));
|
||||
if(NULL != ctx) {
|
||||
opal_output(0, "%s:%d: corrupting data\n", __FILE__, __LINE__);
|
||||
ctx->index = (size_t) ((des->des_segments[0].seg_len *
|
||||
ctx->index = (size_t) ((des->des_local[0].seg_len *
|
||||
opal_rand(&mca_bml_base_rand_buff) * 1.0) / (UINT32_MAX + 1.0));
|
||||
ctx->cbfunc = des->des_cbfunc;
|
||||
ctx->cbdata = des->des_cbdata;
|
||||
((unsigned char*)des->des_segments[0].seg_addr.pval)[ctx->index] ^= ~0;
|
||||
((unsigned char*)des->des_local[0].seg_addr.pval)[ctx->index] ^= ~0;
|
||||
des->des_cbdata = ctx;
|
||||
des->des_cbfunc = mca_bml_base_completion;
|
||||
}
|
||||
|
@ -1,4 +1,3 @@
|
||||
/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */
|
||||
/*
|
||||
* Copyright (c) 2004-2007 The Trustees of Indiana University and Indiana
|
||||
* University Research and Technology
|
||||
@ -11,7 +10,7 @@
|
||||
* Copyright (c) 2004-2006 The Regents of the University of California.
|
||||
* All rights reserved.
|
||||
* Copyright (c) 2008 Cisco Systems, Inc. All rights reserved.
|
||||
* Copyright (c) 2012-2014 Los Alamos National Security, LLC. All rights
|
||||
* Copyright (c) 2012 Los Alamos National Security, LLC. All rights
|
||||
* reserved.
|
||||
* $COPYRIGHT$
|
||||
*
|
||||
@ -308,30 +307,27 @@ static inline int mca_bml_base_sendi( mca_bml_base_btl_t* bml_btl,
|
||||
payload_size, order, flags, tag, descriptor);
|
||||
}
|
||||
|
||||
static inline int mca_bml_base_put( mca_bml_base_btl_t* bml_btl, void *local_address, uint64_t remote_address,
|
||||
struct mca_btl_base_registration_handle_t *local_handle,
|
||||
struct mca_btl_base_registration_handle_t *remote_handle, size_t size,
|
||||
int flags, int order, mca_btl_base_rdma_completion_fn_t cbfunc, void *cbdata)
|
||||
static inline int mca_bml_base_put( mca_bml_base_btl_t* bml_btl,
|
||||
mca_btl_base_descriptor_t* des)
|
||||
{
|
||||
mca_btl_base_module_t* btl = bml_btl->btl;
|
||||
|
||||
return btl->btl_put( btl, bml_btl->btl_endpoint, local_address, remote_address, local_handle,
|
||||
remote_handle, size, flags, order, cbfunc, (void *) bml_btl, cbdata);
|
||||
des->des_context = (void*) bml_btl;
|
||||
return btl->btl_put( btl, bml_btl->btl_endpoint, des );
|
||||
}
|
||||
|
||||
static inline int mca_bml_base_get( mca_bml_base_btl_t* bml_btl, void *local_address, uint64_t remote_address,
|
||||
struct mca_btl_base_registration_handle_t *local_handle,
|
||||
struct mca_btl_base_registration_handle_t *remote_handle, size_t size,
|
||||
int flags, int order, mca_btl_base_rdma_completion_fn_t cbfunc, void *cbdata)
|
||||
static inline int mca_bml_base_get( mca_bml_base_btl_t* bml_btl,
|
||||
mca_btl_base_descriptor_t* des)
|
||||
{
|
||||
mca_btl_base_module_t* btl = bml_btl->btl;
|
||||
|
||||
return btl->btl_get( btl, bml_btl->btl_endpoint, local_address, remote_address, local_handle,
|
||||
remote_handle, size, flags, order, cbfunc, (void *) bml_btl, cbdata);
|
||||
des->des_context = (void*) bml_btl;
|
||||
return btl->btl_get( btl, bml_btl->btl_endpoint, des );
|
||||
}
|
||||
|
||||
|
||||
static inline void mca_bml_base_prepare_src(mca_bml_base_btl_t* bml_btl,
|
||||
mca_mpool_base_registration_t* reg,
|
||||
struct opal_convertor_t* conv,
|
||||
uint8_t order,
|
||||
size_t reserve,
|
||||
@ -341,27 +337,29 @@ static inline void mca_bml_base_prepare_src(mca_bml_base_btl_t* bml_btl,
|
||||
{
|
||||
mca_btl_base_module_t* btl = bml_btl->btl;
|
||||
|
||||
*des = btl->btl_prepare_src( btl, bml_btl->btl_endpoint, conv,
|
||||
*des = btl->btl_prepare_src( btl, bml_btl->btl_endpoint, reg, conv,
|
||||
order, reserve, size, flags );
|
||||
if( OPAL_LIKELY((*des) != NULL) ) {
|
||||
(*des)->des_context = (void*) bml_btl;
|
||||
}
|
||||
}
|
||||
|
||||
static inline void mca_bml_base_register_mem (mca_bml_base_btl_t* bml_btl, void *base,
|
||||
size_t size, uint32_t flags,
|
||||
mca_btl_base_registration_handle_t **handle)
|
||||
{
|
||||
static inline void mca_bml_base_prepare_dst(mca_bml_base_btl_t* bml_btl,
|
||||
mca_mpool_base_registration_t* reg,
|
||||
struct opal_convertor_t* conv,
|
||||
uint8_t order,
|
||||
size_t reserve,
|
||||
size_t *size,
|
||||
uint32_t flags,
|
||||
mca_btl_base_descriptor_t** des)
|
||||
{
|
||||
mca_btl_base_module_t* btl = bml_btl->btl;
|
||||
|
||||
*handle = btl->btl_register_mem (btl, bml_btl->btl_endpoint, base, size, flags);
|
||||
}
|
||||
|
||||
static inline void mca_bml_base_deregister_mem (mca_bml_base_btl_t* bml_btl, mca_btl_base_registration_handle_t *handle)
|
||||
{
|
||||
mca_btl_base_module_t* btl = bml_btl->btl;
|
||||
|
||||
btl->btl_deregister_mem (btl, handle);
|
||||
*des = btl->btl_prepare_dst( btl, bml_btl->btl_endpoint, reg, conv,
|
||||
order, reserve, size, flags );
|
||||
if( OPAL_LIKELY((*des) != NULL) ) {
|
||||
(*des)->des_context = (void*) bml_btl;
|
||||
}
|
||||
}
|
||||
|
||||
/*
|
||||
|
@ -86,7 +86,9 @@ static int mca_bml_r2_add_btls( void )
|
||||
return OMPI_ERR_OUT_OF_RESOURCE;
|
||||
}
|
||||
|
||||
OPAL_LIST_FOREACH(selected_btl, btls, mca_btl_base_selected_module_t) {
|
||||
for(selected_btl = (mca_btl_base_selected_module_t*)opal_list_get_first(btls);
|
||||
selected_btl != (mca_btl_base_selected_module_t*)opal_list_get_end(btls);
|
||||
selected_btl = (mca_btl_base_selected_module_t*)opal_list_get_next(selected_btl)) {
|
||||
mca_btl_base_module_t *btl = selected_btl->btl_module;
|
||||
mca_bml_r2.btl_modules[mca_bml_r2.num_btl_modules++] = btl;
|
||||
for (i = 0; NULL != btl_names_argv && NULL != btl_names_argv[i]; ++i) {
|
||||
@ -125,23 +127,6 @@ static int btl_bandwidth_compare(const void *v1, const void *v2)
|
||||
return b2->btl->btl_bandwidth - b1->btl->btl_bandwidth;
|
||||
}
|
||||
|
||||
static void mca_bml_r2_calculate_bandwidth_latency (mca_bml_base_btl_array_t *btl_array, double *total_bandwidth, uint32_t *latency)
|
||||
{
|
||||
const size_t array_length = mca_bml_base_btl_array_get_size (btl_array);
|
||||
|
||||
*latency = UINT_MAX;
|
||||
*total_bandwidth = 0.;
|
||||
|
||||
for (size_t i = 0 ; i < array_length ; ++i) {
|
||||
mca_bml_base_btl_t *bml_btl = mca_bml_base_btl_array_get_index (btl_array, i);
|
||||
mca_btl_base_module_t *btl = bml_btl->btl;
|
||||
*total_bandwidth += btl->btl_bandwidth;
|
||||
if (btl->btl_latency < *latency) {
|
||||
*latency = btl->btl_latency;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/*
|
||||
* For each proc setup a datastructure that indicates the BTLs
|
||||
* that can be used to reach the destination.
|
||||
@ -204,7 +189,6 @@ static int mca_bml_r2_add_procs( size_t nprocs,
|
||||
for(p_index = 0; p_index < mca_bml_r2.num_btl_modules; p_index++) {
|
||||
mca_btl_base_module_t* btl = mca_bml_r2.btl_modules[p_index];
|
||||
int btl_inuse = 0;
|
||||
int btl_flags;
|
||||
|
||||
/* if the r2 can reach the destination proc it sets the
|
||||
* corresponding bit (proc index) in the reachable bitmap
|
||||
@ -228,7 +212,7 @@ static int mca_bml_r2_add_procs( size_t nprocs,
|
||||
ompi_proc_t *proc = new_procs[p];
|
||||
mca_bml_base_endpoint_t * bml_endpoint =
|
||||
(mca_bml_base_endpoint_t*) proc->proc_endpoints[OMPI_PROC_ENDPOINT_TAG_BML];
|
||||
mca_bml_base_btl_t* bml_btl = NULL;
|
||||
mca_bml_base_btl_t* bml_btl;
|
||||
size_t size;
|
||||
|
||||
if(NULL == bml_endpoint) {
|
||||
@ -252,35 +236,12 @@ static int mca_bml_r2_add_procs( size_t nprocs,
|
||||
bml_endpoint->btl_flags_or = 0;
|
||||
}
|
||||
|
||||
btl_flags = btl->btl_flags;
|
||||
if( (btl_flags & MCA_BTL_FLAGS_PUT) && (NULL == btl->btl_put) ) {
|
||||
opal_output(0, "mca_bml_r2_add_procs: The PUT flag is specified for"
|
||||
" the %s BTL without any PUT function attached. Discard the flag !",
|
||||
bml_btl->btl->btl_component->btl_version.mca_component_name);
|
||||
btl_flags ^= MCA_BTL_FLAGS_PUT;
|
||||
}
|
||||
if( (btl_flags & MCA_BTL_FLAGS_GET) && (NULL == btl->btl_get) ) {
|
||||
opal_output(0, "mca_bml_r2_add_procs: The GET flag is specified for"
|
||||
" the %s BTL without any GET function attached. Discard the flag !",
|
||||
bml_btl->btl->btl_component->btl_version.mca_component_name);
|
||||
btl_flags ^= MCA_BTL_FLAGS_GET;
|
||||
}
|
||||
|
||||
if( (btl_flags & (MCA_BTL_FLAGS_PUT | MCA_BTL_FLAGS_GET | MCA_BTL_FLAGS_SEND)) == 0 ) {
|
||||
/**
|
||||
* If no protocol specified, we have 2 choices: we ignore the BTL
|
||||
* as we don't know which protocol to use, or we suppose that all
|
||||
* BTLs support the send protocol.
|
||||
*/
|
||||
btl_flags |= MCA_BTL_FLAGS_SEND;
|
||||
}
|
||||
|
||||
/* dont allow an additional BTL with a lower exclusivity ranking */
|
||||
size = mca_bml_base_btl_array_get_size(&bml_endpoint->btl_send);
|
||||
if(size > 0) {
|
||||
bml_btl = mca_bml_base_btl_array_get_index(&bml_endpoint->btl_send, size-1);
|
||||
/* skip this btl if the exclusivity is less than the previous only if the btl does not provide full rdma (for one-sided) */
|
||||
if(bml_btl->btl->btl_exclusivity > btl->btl_exclusivity && ((btl_flags & MCA_BTL_FLAGS_RDMA) != MCA_BTL_FLAGS_RDMA)) {
|
||||
/* skip this btl if the exclusivity is less than the previous */
|
||||
if(bml_btl->btl->btl_exclusivity > btl->btl_exclusivity) {
|
||||
btl->btl_del_procs(btl, 1, (opal_proc_t**)&proc, &btl_endpoints[p]);
|
||||
opal_output_verbose(20, opal_btl_base_framework.framework_output,
|
||||
"mca: bml: Not using %s btl to %s on node %s "
|
||||
@ -300,44 +261,39 @@ static int mca_bml_r2_add_procs( size_t nprocs,
|
||||
proc->super.proc_hostname);
|
||||
|
||||
/* cache the endpoint on the proc */
|
||||
if (NULL == bml_btl || (bml_btl->btl->btl_exclusivity <= btl->btl_exclusivity)) {
|
||||
bml_btl = mca_bml_base_btl_array_insert(&bml_endpoint->btl_send);
|
||||
bml_btl->btl = btl;
|
||||
bml_btl->btl_endpoint = btl_endpoints[p];
|
||||
bml_btl->btl_weight = 0;
|
||||
bml_btl->btl_flags = btl_flags;
|
||||
|
||||
bml_btl = mca_bml_base_btl_array_insert(&bml_endpoint->btl_send);
|
||||
bml_btl->btl = btl;
|
||||
bml_btl->btl_endpoint = btl_endpoints[p];
|
||||
bml_btl->btl_weight = 0;
|
||||
bml_btl->btl_flags = btl->btl_flags;
|
||||
if( (bml_btl->btl_flags & MCA_BTL_FLAGS_PUT) && (NULL == btl->btl_put) ) {
|
||||
opal_output(0, "mca_bml_r2_add_procs: The PUT flag is specified for"
|
||||
" the %s BTL without any PUT function attached. Discard the flag !",
|
||||
bml_btl->btl->btl_component->btl_version.mca_component_name);
|
||||
bml_btl->btl_flags ^= MCA_BTL_FLAGS_PUT;
|
||||
}
|
||||
if( (bml_btl->btl_flags & MCA_BTL_FLAGS_GET) && (NULL == btl->btl_get) ) {
|
||||
opal_output(0, "mca_bml_r2_add_procs: The GET flag is specified for"
|
||||
" the %s BTL without any GET function attached. Discard the flag !",
|
||||
bml_btl->btl->btl_component->btl_version.mca_component_name);
|
||||
bml_btl->btl_flags ^= MCA_BTL_FLAGS_GET;
|
||||
}
|
||||
if( (bml_btl->btl_flags & (MCA_BTL_FLAGS_PUT | MCA_BTL_FLAGS_GET | MCA_BTL_FLAGS_SEND)) == 0 ) {
|
||||
/**
|
||||
* calculate the bitwise OR of the btl flags
|
||||
* If no protocol specified, we have 2 choices: we ignore the BTL
|
||||
* as we don't know which protocol to use, or we suppose that all
|
||||
* BTLs support the send protocol.
|
||||
*/
|
||||
bml_endpoint->btl_flags_or |= bml_btl->btl_flags;
|
||||
bml_btl->btl_flags |= MCA_BTL_FLAGS_SEND;
|
||||
}
|
||||
|
||||
/* always add rdma endpoints */
|
||||
if ((btl_flags & MCA_BTL_FLAGS_RDMA) &&
|
||||
!((proc->super.proc_arch != ompi_proc_local_proc->super.proc_arch) &&
|
||||
(0 == (btl->btl_flags & MCA_BTL_FLAGS_HETEROGENEOUS_RDMA)))) {
|
||||
mca_bml_base_btl_t *bml_btl_rdma = mca_bml_base_btl_array_insert(&bml_endpoint->btl_rdma);
|
||||
|
||||
bml_btl_rdma->btl = btl;
|
||||
bml_btl_rdma->btl_endpoint = btl_endpoints[p];
|
||||
bml_btl_rdma->btl_weight = 0;
|
||||
bml_btl_rdma->btl_flags = btl_flags;
|
||||
|
||||
if (bml_endpoint->btl_pipeline_send_length < btl->btl_rdma_pipeline_send_length) {
|
||||
bml_endpoint->btl_pipeline_send_length = btl->btl_rdma_pipeline_send_length;
|
||||
}
|
||||
|
||||
if (bml_endpoint->btl_send_limit < btl->btl_min_rdma_pipeline_size) {
|
||||
bml_endpoint->btl_send_limit = btl->btl_min_rdma_pipeline_size;
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* calculate the bitwise OR of the btl flags
|
||||
*/
|
||||
bml_endpoint->btl_flags_or |= bml_btl->btl_flags;
|
||||
/* This BTL is in use, allow the progress registration */
|
||||
btl_inuse++;
|
||||
}
|
||||
}
|
||||
|
||||
if(btl_inuse > 0 && NULL != btl->btl_component->btl_progress) {
|
||||
size_t p;
|
||||
bool found = false;
|
||||
@ -363,8 +319,9 @@ static int mca_bml_r2_add_procs( size_t nprocs,
|
||||
mca_bml_base_endpoint_t* bml_endpoint =
|
||||
(mca_bml_base_endpoint_t*) proc->proc_endpoints[OMPI_PROC_ENDPOINT_TAG_BML];
|
||||
double total_bandwidth = 0;
|
||||
uint32_t latency;
|
||||
size_t n_send, n_rdma;
|
||||
uint32_t latency = 0xffffffff;
|
||||
size_t n_index;
|
||||
size_t n_size;
|
||||
|
||||
/* skip over procs w/ no btl's registered */
|
||||
if(NULL == bml_endpoint) {
|
||||
@ -378,22 +335,28 @@ static int mca_bml_r2_add_procs( size_t nprocs,
|
||||
* weighting. Once the left over is smaller than this number we will
|
||||
* start using the weight to compute the correct amount.
|
||||
*/
|
||||
n_send = mca_bml_base_btl_array_get_size(&bml_endpoint->btl_send);
|
||||
n_rdma = mca_bml_base_btl_array_get_size(&bml_endpoint->btl_rdma);
|
||||
|
||||
n_size = mca_bml_base_btl_array_get_size(&bml_endpoint->btl_send);
|
||||
|
||||
/* sort BTLs in descending order according to bandwidth value */
|
||||
qsort(bml_endpoint->btl_send.bml_btls, n_send,
|
||||
qsort(bml_endpoint->btl_send.bml_btls, n_size,
|
||||
sizeof(mca_bml_base_btl_t), btl_bandwidth_compare);
|
||||
|
||||
bml_endpoint->btl_rdma_index = 0;
|
||||
|
||||
mca_bml_r2_calculate_bandwidth_latency (&bml_endpoint->btl_send, &total_bandwidth, &latency);
|
||||
|
||||
for(n_index = 0; n_index < n_size; n_index++) {
|
||||
mca_bml_base_btl_t* bml_btl =
|
||||
mca_bml_base_btl_array_get_index(&bml_endpoint->btl_send, n_index);
|
||||
mca_btl_base_module_t* btl = bml_btl->btl;
|
||||
total_bandwidth += bml_btl->btl->btl_bandwidth;
|
||||
if(btl->btl_latency < latency) {
|
||||
latency = btl->btl_latency;
|
||||
}
|
||||
}
|
||||
|
||||
/* (1) set the weight of each btl as a percentage of overall bandwidth
|
||||
* (2) copy all btl instances at the highest priority ranking into the
|
||||
* list of btls used for first fragments
|
||||
*/
|
||||
for (size_t n_index = 0 ; n_index < n_send ; ++n_index) {
|
||||
for(n_index = 0; n_index < n_size; n_index++) {
|
||||
mca_bml_base_btl_t* bml_btl =
|
||||
mca_bml_base_btl_array_get_index(&bml_endpoint->btl_send, n_index);
|
||||
mca_btl_base_module_t *btl = bml_btl->btl;
|
||||
@ -402,7 +365,7 @@ static int mca_bml_r2_add_procs( size_t nprocs,
|
||||
if(btl->btl_bandwidth > 0) {
|
||||
bml_btl->btl_weight = (float)(btl->btl_bandwidth / total_bandwidth);
|
||||
} else {
|
||||
bml_btl->btl_weight = (float)(1.0 / n_send);
|
||||
bml_btl->btl_weight = (float)(1.0 / n_size);
|
||||
}
|
||||
|
||||
/* check to see if this r2 is already in the array of r2s
|
||||
@ -417,24 +380,21 @@ static int mca_bml_r2_add_procs( size_t nprocs,
|
||||
/* set endpoint max send size as min of available btls */
|
||||
if(bml_endpoint->btl_max_send_size > btl->btl_max_send_size)
|
||||
bml_endpoint->btl_max_send_size = btl->btl_max_send_size;
|
||||
}
|
||||
|
||||
/* sort BTLs in descending order according to bandwidth value */
|
||||
qsort(bml_endpoint->btl_rdma.bml_btls, n_rdma,
|
||||
sizeof(mca_bml_base_btl_t), btl_bandwidth_compare);
|
||||
/* check flags - is rdma preferred */
|
||||
if ((btl->btl_flags & (MCA_BTL_FLAGS_PUT|MCA_BTL_FLAGS_GET)) &&
|
||||
!((proc->super.proc_arch != ompi_proc_local_proc->super.proc_arch) &&
|
||||
(0 == (btl->btl_flags & MCA_BTL_FLAGS_HETEROGENEOUS_RDMA)))) {
|
||||
mca_bml_base_btl_t* bml_btl_rdma = mca_bml_base_btl_array_insert(&bml_endpoint->btl_rdma);
|
||||
mca_btl_base_module_t* btl_rdma = bml_btl->btl;
|
||||
|
||||
mca_bml_r2_calculate_bandwidth_latency (&bml_endpoint->btl_rdma, &total_bandwidth, &latency);
|
||||
|
||||
/* set rdma btl weights */
|
||||
for (size_t n_index = 0 ; n_index < n_rdma ; ++n_index) {
|
||||
mca_bml_base_btl_t *bml_btl =
|
||||
mca_bml_base_btl_array_get_index(&bml_endpoint->btl_rdma, n_index);
|
||||
|
||||
/* compute weighting factor for this r2 */
|
||||
if (bml_btl->btl->btl_bandwidth > 0.0) {
|
||||
bml_btl->btl_weight = (float)(bml_btl->btl->btl_bandwidth / total_bandwidth);
|
||||
} else {
|
||||
bml_btl->btl_weight = (float)(1.0 / n_rdma);
|
||||
*bml_btl_rdma = *bml_btl;
|
||||
if(bml_endpoint->btl_pipeline_send_length < btl_rdma->btl_rdma_pipeline_send_length) {
|
||||
bml_endpoint->btl_pipeline_send_length = btl_rdma->btl_rdma_pipeline_send_length;
|
||||
}
|
||||
if(bml_endpoint->btl_send_limit < btl_rdma->btl_min_rdma_pipeline_size) {
|
||||
bml_endpoint->btl_send_limit = btl_rdma->btl_min_rdma_pipeline_size;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
@ -1,20 +0,0 @@
|
||||
# -*- shell-script -*-
|
||||
#
|
||||
# Copyright (c) 2013 Sandia National Laboratories. All rights reserved.
|
||||
# Copyright (c) 2014 Los Alamos National Security, LLC. All rights
|
||||
# reserved.
|
||||
# $COPYRIGHT$
|
||||
#
|
||||
# Additional copyrights may follow
|
||||
#
|
||||
# $HEADER$
|
||||
#
|
||||
|
||||
# MCA_ompi_osc_pt2pt_CONFIG([action-if-can-compile],
|
||||
# [action-if-cant-compile])
|
||||
# ------------------------------------------------
|
||||
# We can always build, unless we were explicitly disabled.
|
||||
AC_DEFUN([MCA_ompi_osc_pt2pt_CONFIG],[
|
||||
AC_CONFIG_FILES([ompi/mca/osc/pt2pt/Makefile])
|
||||
[$1]
|
||||
])dnl
|
@ -1,203 +0,0 @@
|
||||
/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */
|
||||
/*
|
||||
* Copyright (c) 2012-2013 Sandia National Laboratories. All rights reserved.
|
||||
* Copyright (c) 2014 Los Alamos National Security, LLC. All rights
|
||||
* reserved.
|
||||
* $COPYRIGHT$
|
||||
*
|
||||
* Additional copyrights may follow
|
||||
*
|
||||
* $HEADER$
|
||||
*/
|
||||
|
||||
#include "ompi_config.h"
|
||||
|
||||
#include "opal/class/opal_list.h"
|
||||
#include "ompi/mca/osc/base/base.h"
|
||||
#include "ompi/mca/pml/pml.h"
|
||||
|
||||
#include "osc_pt2pt.h"
|
||||
#include "osc_pt2pt_frag.h"
|
||||
#include "osc_pt2pt_data_move.h"
|
||||
|
||||
/**
 * Object constructor for ompi_osc_pt2pt_frag_t.
 *
 * Points the fragment's packing buffer at the pointer stored in its
 * free-list item base (frag->super.ptr).
 * NOTE(review): presumably that pointer is the payload memory the free list
 * allocated for this item -- confirm against ompi_free_list_item_t.
 *
 * @param frag  fragment being constructed
 */
static void ompi_osc_pt2pt_frag_constructor (ompi_osc_pt2pt_frag_t *frag)
{
    frag->buffer = frag->super.ptr;
}

/* class definition: derives from ompi_free_list_item_t, constructor only,
 * no destructor */
OBJ_CLASS_INSTANCE(ompi_osc_pt2pt_frag_t,
                   ompi_free_list_item_t,
                   ompi_osc_pt2pt_frag_constructor,
                   NULL);
|
||||
|
||||
/**
 * Completion callback attached to the send started by frag_send().
 *
 * Marks one outgoing completion on the fragment's module, hands the fragment
 * back to the component free list, and queues the request for garbage
 * collection.
 *
 * @param request  completed request; its req_complete_cb_data field is the
 *                 fragment that was sent
 *
 * @return OMPI_SUCCESS (always)
 */
static int frag_send_cb (ompi_request_t *request)
{
    ompi_osc_pt2pt_frag_t *sent_frag =
        (ompi_osc_pt2pt_frag_t *) request->req_complete_cb_data;
    /* read the module before the fragment goes back to the free list */
    ompi_osc_pt2pt_module_t *owner = sent_frag->module;

    OPAL_OUTPUT_VERBOSE((50, ompi_osc_base_framework.framework_output,
                         "osc pt2pt: frag_send complete to %d, frag = %p, request = %p",
                         sent_frag->target, (void *) sent_frag, (void *) request));

    mark_outgoing_completion(owner);
    OMPI_FREE_LIST_RETURN_MT(&mca_osc_pt2pt_component.frags, &sent_frag->super);

    /* put this request on the garbage collection list */
    osc_pt2pt_gc_add_request (owner, request);

    return OMPI_SUCCESS;
}
|
||||
|
||||
static int
|
||||
frag_send(ompi_osc_pt2pt_module_t *module,
|
||||
ompi_osc_pt2pt_frag_t *frag)
|
||||
{
|
||||
int count;
|
||||
|
||||
count = (int)((uintptr_t) frag->top - (uintptr_t) frag->buffer);
|
||||
|
||||
OPAL_OUTPUT_VERBOSE((50, ompi_osc_base_framework.framework_output,
|
||||
"osc pt2pt: frag_send called to %d, frag = %p, count = %d",
|
||||
frag->target, (void *) frag, count));
|
||||
|
||||
/* we need to signal now that a frag is outgoing to ensure the count sent
|
||||
* with the unlock message is correct */
|
||||
ompi_osc_signal_outgoing (module, frag->target, 1);
|
||||
|
||||
return ompi_osc_pt2pt_isend_w_cb (frag->buffer, count, MPI_BYTE, frag->target, OSC_PT2PT_FRAG_TAG,
|
||||
module->comm, frag_send_cb, frag);
|
||||
}
|
||||
|
||||
|
||||
int
|
||||
ompi_osc_pt2pt_frag_start(ompi_osc_pt2pt_module_t *module,
|
||||
ompi_osc_pt2pt_frag_t *frag)
|
||||
{
|
||||
ompi_osc_pt2pt_peer_t *peer = module->peers + frag->target;
|
||||
int ret;
|
||||
|
||||
assert(0 == frag->pending && peer->active_frag != frag);
|
||||
|
||||
/* we need to signal now that a frag is outgoing to ensure the count sent
|
||||
* with the unlock message is correct */
|
||||
ompi_osc_signal_outgoing (module, frag->target, 1);
|
||||
|
||||
/* if eager sends are not active, can't send yet, so buffer and
|
||||
get out... */
|
||||
if (!(peer->eager_send_active || module->all_access_epoch)) {
|
||||
OPAL_THREAD_SCOPED_LOCK(&module->queued_frags_lock,
|
||||
opal_list_append(&module->queued_frags, (opal_list_item_t *) frag));
|
||||
return OMPI_SUCCESS;
|
||||
}
|
||||
|
||||
ret = frag_send(module, frag);
|
||||
|
||||
opal_condition_broadcast(&module->cond);
|
||||
|
||||
return ret;
|
||||
}
|
||||
|
||||
|
||||
int
|
||||
ompi_osc_pt2pt_frag_flush_target(ompi_osc_pt2pt_module_t *module, int target)
|
||||
{
|
||||
ompi_osc_pt2pt_frag_t *next, *frag = module->peers[target].active_frag;
|
||||
int ret = OMPI_SUCCESS;
|
||||
|
||||
OPAL_OUTPUT_VERBOSE((50, ompi_osc_base_framework.framework_output,
|
||||
"osc pt2pt: frag flush target begin"));
|
||||
|
||||
/* flush the active frag */
|
||||
if (NULL != frag) {
|
||||
if (1 != frag->pending) {
|
||||
/* communication going on while synchronizing; this is an rma usage bug */
|
||||
return OMPI_ERR_RMA_SYNC;
|
||||
}
|
||||
|
||||
if (opal_atomic_cmpset (&module->peers[target].active_frag, frag, NULL)) {
|
||||
OPAL_THREAD_ADD32(&frag->pending, -1);
|
||||
ret = ompi_osc_pt2pt_frag_start(module, frag);
|
||||
if (OMPI_SUCCESS != ret) {
|
||||
return ret;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
OPAL_OUTPUT_VERBOSE((50, ompi_osc_base_framework.framework_output,
|
||||
"osc pt2pt: frag flush target finished active frag"));
|
||||
|
||||
/* walk through the pending list and send */
|
||||
OPAL_THREAD_LOCK(&module->queued_frags_lock);
|
||||
if (opal_list_get_size (&module->queued_frags)) {
|
||||
OPAL_LIST_FOREACH_SAFE(frag, next, &module->queued_frags, ompi_osc_pt2pt_frag_t) {
|
||||
if (frag->target == target) {
|
||||
opal_list_remove_item(&module->queued_frags, (opal_list_item_t *) frag);
|
||||
ret = frag_send(module, frag);
|
||||
if (OPAL_UNLIKELY(OMPI_SUCCESS != frag)) {
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
OPAL_THREAD_UNLOCK(&module->queued_frags_lock);
|
||||
|
||||
OPAL_OUTPUT_VERBOSE((50, ompi_osc_base_framework.framework_output,
|
||||
"osc pt2pt: frag flush target finished"));
|
||||
|
||||
return ret;
|
||||
}
|
||||
|
||||
|
||||
int
|
||||
ompi_osc_pt2pt_frag_flush_all(ompi_osc_pt2pt_module_t *module)
|
||||
{
|
||||
int ret = OMPI_SUCCESS;
|
||||
int i;
|
||||
ompi_osc_pt2pt_frag_t *frag, *next;
|
||||
|
||||
OPAL_OUTPUT_VERBOSE((50, ompi_osc_base_framework.framework_output,
|
||||
"osc pt2pt: frag flush all begin"));
|
||||
|
||||
/* flush the active frag */
|
||||
for (i = 0 ; i < ompi_comm_size(module->comm) ; ++i) {
|
||||
ompi_osc_pt2pt_frag_t *frag = module->peers[i].active_frag;
|
||||
|
||||
if (NULL != frag) {
|
||||
if (1 != frag->pending) {
|
||||
OPAL_THREAD_UNLOCK(&module->lock);
|
||||
/* communication going on while synchronizing; this is a bug */
|
||||
return OMPI_ERR_RMA_SYNC;
|
||||
}
|
||||
|
||||
if (!opal_atomic_cmpset_ptr (&module->peers[i].active_frag, frag, NULL)) {
|
||||
continue;
|
||||
}
|
||||
|
||||
OPAL_THREAD_ADD32(&frag->pending, -1);
|
||||
ret = ompi_osc_pt2pt_frag_start(module, frag);
|
||||
if (OMPI_SUCCESS != ret) {
|
||||
return ret;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
OPAL_OUTPUT_VERBOSE((50, ompi_osc_base_framework.framework_output,
|
||||
"osc pt2pt: frag flush all finished active frag"));
|
||||
|
||||
/* try to start all the queued frags */
|
||||
OPAL_THREAD_LOCK(&module->queued_frags_lock);
|
||||
if (opal_list_get_size (&module->queued_frags)) {
|
||||
OPAL_LIST_FOREACH_SAFE(frag, next, &module->queued_frags, ompi_osc_pt2pt_frag_t) {
|
||||
opal_list_remove_item(&module->queued_frags, (opal_list_item_t *) frag);
|
||||
ret = frag_send(module, frag);
|
||||
if (OPAL_UNLIKELY(OMPI_SUCCESS != ret)) {
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
OPAL_THREAD_UNLOCK(&module->queued_frags_lock);
|
||||
|
||||
OPAL_OUTPUT_VERBOSE((50, ompi_osc_base_framework.framework_output,
|
||||
"osc pt2pt: frag flush all done"));
|
||||
|
||||
return ret;
|
||||
}
|
@ -1,143 +0,0 @@
|
||||
/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */
|
||||
/*
|
||||
* Copyright (c) 2012 Sandia National Laboratories. All rights reserved.
|
||||
* Copyright (c) 2014 Los Alamos National Security, LLC. All rights
|
||||
* reserved.
|
||||
* $COPYRIGHT$
|
||||
*
|
||||
* Additional copyrights may follow
|
||||
*
|
||||
* $HEADER$
|
||||
*/
|
||||
|
||||
#ifndef OSC_PT2PT_FRAG_H
|
||||
#define OSC_PT2PT_FRAG_H
|
||||
|
||||
#include "ompi/communicator/communicator.h"
|
||||
|
||||
#include "osc_pt2pt_header.h"
|
||||
#include "osc_pt2pt_request.h"
|
||||
#include "opal/align.h"
|
||||
|
||||
/** Communication buffer for packing messages */
|
||||
struct ompi_osc_pt2pt_frag_t {
|
||||
ompi_free_list_item_t super;
|
||||
/* target rank of buffer */
|
||||
int target;
|
||||
unsigned char *buffer;
|
||||
|
||||
/* space remaining in buffer */
|
||||
size_t remain_len;
|
||||
|
||||
/* start of unused space */
|
||||
char *top;
|
||||
|
||||
/* Number of operations which have started writing into the frag, but not yet completed doing so */
|
||||
int32_t pending;
|
||||
ompi_osc_pt2pt_frag_header_t *header;
|
||||
ompi_osc_pt2pt_module_t *module;
|
||||
};
|
||||
typedef struct ompi_osc_pt2pt_frag_t ompi_osc_pt2pt_frag_t;
|
||||
OBJ_CLASS_DECLARATION(ompi_osc_pt2pt_frag_t);
|
||||
|
||||
extern int ompi_osc_pt2pt_frag_start(ompi_osc_pt2pt_module_t *module, ompi_osc_pt2pt_frag_t *buffer);
|
||||
extern int ompi_osc_pt2pt_frag_flush_target(ompi_osc_pt2pt_module_t *module, int target);
|
||||
extern int ompi_osc_pt2pt_frag_flush_all(ompi_osc_pt2pt_module_t *module);
|
||||
|
||||
|
||||
/*
|
||||
* Note: module lock must be held during this operation
|
||||
*/
|
||||
static inline int ompi_osc_pt2pt_frag_alloc(ompi_osc_pt2pt_module_t *module, int target,
|
||||
size_t request_len, ompi_osc_pt2pt_frag_t **buffer,
|
||||
char **ptr)
|
||||
{
|
||||
ompi_osc_pt2pt_frag_t *curr = module->peers[target].active_frag;
|
||||
int ret;
|
||||
|
||||
/* osc pt2pt headers can have 64-bit values. these will need to be aligned
|
||||
* on an 8-byte boundary on some architectures so we up align the allocation
|
||||
* size here. */
|
||||
request_len = OPAL_ALIGN(request_len, 8, size_t);
|
||||
|
||||
if (request_len > mca_osc_pt2pt_component.buffer_size) {
|
||||
return OMPI_ERR_OUT_OF_RESOURCE;
|
||||
}
|
||||
|
||||
OPAL_THREAD_LOCK(&module->lock);
|
||||
if (NULL == curr || curr->remain_len < request_len) {
|
||||
ompi_free_list_item_t *item = NULL;
|
||||
|
||||
if (NULL != curr) {
|
||||
curr->remain_len = 0;
|
||||
module->peers[target].active_frag = NULL;
|
||||
opal_atomic_mb ();
|
||||
|
||||
/* If there's something pending, the pending finish will
|
||||
start the buffer. Otherwise, we need to start it now. */
|
||||
if (0 == OPAL_THREAD_ADD32(&curr->pending, -1)) {
|
||||
ret = ompi_osc_pt2pt_frag_start(module, curr);
|
||||
if (OPAL_UNLIKELY(OMPI_SUCCESS != ret)) {
|
||||
return ret;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
OMPI_FREE_LIST_GET_MT(&mca_osc_pt2pt_component.frags, item);
|
||||
if (OPAL_UNLIKELY(NULL == item)) {
|
||||
return OMPI_ERR_OUT_OF_RESOURCE;
|
||||
}
|
||||
curr = module->peers[target].active_frag =
|
||||
(ompi_osc_pt2pt_frag_t*) item;
|
||||
|
||||
curr->target = target;
|
||||
|
||||
curr->header = (ompi_osc_pt2pt_frag_header_t*) curr->buffer;
|
||||
curr->top = (char*) (curr->header + 1);
|
||||
curr->remain_len = mca_osc_pt2pt_component.buffer_size;
|
||||
curr->module = module;
|
||||
curr->pending = 1;
|
||||
|
||||
curr->header->base.type = OMPI_OSC_PT2PT_HDR_TYPE_FRAG;
|
||||
curr->header->base.flags = OMPI_OSC_PT2PT_HDR_FLAG_VALID;
|
||||
if (module->passive_target_access_epoch) {
|
||||
curr->header->base.flags |= OMPI_OSC_PT2PT_HDR_FLAG_PASSIVE_TARGET;
|
||||
}
|
||||
curr->header->source = ompi_comm_rank(module->comm);
|
||||
curr->header->num_ops = 0;
|
||||
curr->header->windx = ompi_comm_get_cid(module->comm);
|
||||
|
||||
if (curr->remain_len < request_len) {
|
||||
OPAL_THREAD_UNLOCK(&module->lock);
|
||||
return OMPI_ERR_TEMP_OUT_OF_RESOURCE;
|
||||
}
|
||||
}
|
||||
|
||||
*ptr = curr->top;
|
||||
*buffer = curr;
|
||||
|
||||
curr->top += request_len;
|
||||
curr->remain_len -= request_len;
|
||||
OPAL_THREAD_UNLOCK(&module->lock);
|
||||
|
||||
OPAL_THREAD_ADD32(&curr->pending, 1);
|
||||
OPAL_THREAD_ADD32(&curr->header->num_ops, 1);
|
||||
|
||||
return OMPI_SUCCESS;
|
||||
}
|
||||
|
||||
|
||||
/*
|
||||
* Note: module lock must be held for this operation
|
||||
*/
|
||||
static inline int ompi_osc_pt2pt_frag_finish(ompi_osc_pt2pt_module_t *module,
|
||||
ompi_osc_pt2pt_frag_t* buffer)
|
||||
{
|
||||
if (0 == OPAL_THREAD_ADD32(&buffer->pending, -1)) {
|
||||
return ompi_osc_pt2pt_frag_start(module, buffer);
|
||||
}
|
||||
|
||||
return OMPI_SUCCESS;
|
||||
}
|
||||
|
||||
#endif
|
@ -1,189 +0,0 @@
|
||||
/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */
|
||||
/*
|
||||
* Copyright (c) 2004-2005 The Trustees of Indiana University.
|
||||
* All rights reserved.
|
||||
* Copyright (c) 2004-2005 The Trustees of the University of Tennessee.
|
||||
* All rights reserved.
|
||||
* Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
|
||||
* University of Stuttgart. All rights reserved.
|
||||
* Copyright (c) 2004-2005 The Regents of the University of California.
|
||||
* All rights reserved.
|
||||
* Copyright (c) 2007-2014 Los Alamos National Security, LLC. All rights
|
||||
* reserved.
|
||||
* Copyright (c) 2010 Cisco Systems, Inc. All rights reserved.
|
||||
* Copyright (c) 2010 Oracle and/or its affiliates. All rights reserved.
|
||||
* Copyright (c) 2012-2013 Sandia National Laboratories. All rights reserved.
|
||||
* $COPYRIGHT$
|
||||
*
|
||||
* Additional copyrights may follow
|
||||
*
|
||||
* $HEADER$
|
||||
*/
|
||||
|
||||
#ifndef OMPI_MCA_OSC_PT2PT_HDR_H
|
||||
#define OMPI_MCA_OSC_PT2PT_HDR_H
|
||||
|
||||
#ifdef HAVE_NETINET_IN_H
|
||||
#include <netinet/in.h>
|
||||
#endif
|
||||
|
||||
#include "opal/types.h"
|
||||
|
||||
/** Message type stored in the type field of every header. */
typedef enum ompi_osc_pt2pt_hdr_type_t {
    /* 0x01 - 0x09: data movement */
    OMPI_OSC_PT2PT_HDR_TYPE_PUT          = 0x01,
    OMPI_OSC_PT2PT_HDR_TYPE_PUT_LONG     = 0x02,
    OMPI_OSC_PT2PT_HDR_TYPE_ACC          = 0x03,
    OMPI_OSC_PT2PT_HDR_TYPE_ACC_LONG     = 0x04,
    OMPI_OSC_PT2PT_HDR_TYPE_GET          = 0x05,
    OMPI_OSC_PT2PT_HDR_TYPE_CSWAP        = 0x06,
    OMPI_OSC_PT2PT_HDR_TYPE_CSWAP_LONG   = 0x07,
    OMPI_OSC_PT2PT_HDR_TYPE_GET_ACC      = 0x08,
    OMPI_OSC_PT2PT_HDR_TYPE_GET_ACC_LONG = 0x09,

    /* 0x10 - 0x17: synchronization control messages */
    OMPI_OSC_PT2PT_HDR_TYPE_COMPLETE     = 0x10,
    OMPI_OSC_PT2PT_HDR_TYPE_POST         = 0x11,
    OMPI_OSC_PT2PT_HDR_TYPE_LOCK_REQ     = 0x12,
    OMPI_OSC_PT2PT_HDR_TYPE_LOCK_ACK     = 0x13,
    OMPI_OSC_PT2PT_HDR_TYPE_UNLOCK_REQ   = 0x14,
    OMPI_OSC_PT2PT_HDR_TYPE_UNLOCK_ACK   = 0x15,
    OMPI_OSC_PT2PT_HDR_TYPE_FLUSH_REQ    = 0x16,
    OMPI_OSC_PT2PT_HDR_TYPE_FLUSH_ACK    = 0x17,

    /* 0x20: header placed at the start of a packed fragment */
    OMPI_OSC_PT2PT_HDR_TYPE_FRAG         = 0x20,
} ompi_osc_pt2pt_hdr_type_t;
|
||||
|
||||
/* Bit flags combined into the 8-bit flags field of the common header base. */
#define OMPI_OSC_PT2PT_HDR_FLAG_NBO            (1 << 0)  /* presumably "network byte order" -- TODO confirm */
#define OMPI_OSC_PT2PT_HDR_FLAG_VALID          (1 << 1)
#define OMPI_OSC_PT2PT_HDR_FLAG_PASSIVE_TARGET (1 << 2)
#define OMPI_OSC_PT2PT_HDR_FLAG_LARGE_DATATYPE (1 << 3)
|
||||
|
||||
/** Leading two bytes shared by every header layout in this file. */
typedef struct ompi_osc_pt2pt_header_base_t {
    /** fragment type. 8 bits */
    uint8_t type;
    /** fragment flags. 8 bits */
    uint8_t flags;
} ompi_osc_pt2pt_header_base_t;
|
||||
|
||||
struct ompi_osc_pt2pt_header_put_t {
|
||||
ompi_osc_pt2pt_header_base_t base;
|
||||
|
||||
uint16_t tag;
|
||||
uint32_t count;
|
||||
uint64_t len;
|
||||
uint64_t displacement;
|
||||
};
|
||||
typedef struct ompi_osc_pt2pt_header_put_t ompi_osc_pt2pt_header_put_t;
|
||||
|
||||
struct ompi_osc_pt2pt_header_acc_t {
|
||||
ompi_osc_pt2pt_header_base_t base;
|
||||
|
||||
uint16_t tag;
|
||||
uint32_t count;
|
||||
uint32_t op;
|
||||
uint64_t len;
|
||||
uint64_t displacement;
|
||||
};
|
||||
typedef struct ompi_osc_pt2pt_header_acc_t ompi_osc_pt2pt_header_acc_t;
|
||||
|
||||
struct ompi_osc_pt2pt_header_get_t {
|
||||
ompi_osc_pt2pt_header_base_t base;
|
||||
|
||||
uint16_t tag;
|
||||
uint32_t count;
|
||||
uint64_t len;
|
||||
uint64_t displacement;
|
||||
};
|
||||
typedef struct ompi_osc_pt2pt_header_get_t ompi_osc_pt2pt_header_get_t;
|
||||
|
||||
struct ompi_osc_pt2pt_header_complete_t {
|
||||
ompi_osc_pt2pt_header_base_t base;
|
||||
int frag_count;
|
||||
};
|
||||
typedef struct ompi_osc_pt2pt_header_complete_t ompi_osc_pt2pt_header_complete_t;
|
||||
|
||||
struct ompi_osc_pt2pt_header_cswap_t {
|
||||
ompi_osc_pt2pt_header_base_t base;
|
||||
|
||||
uint16_t tag;
|
||||
|
||||
uint32_t len;
|
||||
uint64_t displacement;
|
||||
};
|
||||
typedef struct ompi_osc_pt2pt_header_cswap_t ompi_osc_pt2pt_header_cswap_t;
|
||||
|
||||
struct ompi_osc_pt2pt_header_post_t {
|
||||
ompi_osc_pt2pt_header_base_t base;
|
||||
uint16_t windx;
|
||||
};
|
||||
typedef struct ompi_osc_pt2pt_header_post_t ompi_osc_pt2pt_header_post_t;
|
||||
|
||||
struct ompi_osc_pt2pt_header_lock_t {
|
||||
ompi_osc_pt2pt_header_base_t base;
|
||||
int32_t lock_type;
|
||||
uint64_t lock_ptr;
|
||||
};
|
||||
typedef struct ompi_osc_pt2pt_header_lock_t ompi_osc_pt2pt_header_lock_t;
|
||||
|
||||
struct ompi_osc_pt2pt_header_lock_ack_t {
|
||||
ompi_osc_pt2pt_header_base_t base;
|
||||
uint16_t windx;
|
||||
uint32_t source;
|
||||
uint64_t lock_ptr;
|
||||
};
|
||||
typedef struct ompi_osc_pt2pt_header_lock_ack_t ompi_osc_pt2pt_header_lock_ack_t;
|
||||
|
||||
struct ompi_osc_pt2pt_header_unlock_t {
|
||||
ompi_osc_pt2pt_header_base_t base;
|
||||
int32_t lock_type;
|
||||
uint32_t frag_count;
|
||||
uint64_t lock_ptr;
|
||||
};
|
||||
typedef struct ompi_osc_pt2pt_header_unlock_t ompi_osc_pt2pt_header_unlock_t;
|
||||
|
||||
struct ompi_osc_pt2pt_header_unlock_ack_t {
|
||||
ompi_osc_pt2pt_header_base_t base;
|
||||
uint64_t lock_ptr;
|
||||
};
|
||||
typedef struct ompi_osc_pt2pt_header_unlock_ack_t ompi_osc_pt2pt_header_unlock_ack_t;
|
||||
|
||||
struct ompi_osc_pt2pt_header_flush_t {
|
||||
ompi_osc_pt2pt_header_base_t base;
|
||||
uint32_t frag_count;
|
||||
uint64_t serial_number;
|
||||
};
|
||||
typedef struct ompi_osc_pt2pt_header_flush_t ompi_osc_pt2pt_header_flush_t;
|
||||
|
||||
struct ompi_osc_pt2pt_header_flush_ack_t {
|
||||
ompi_osc_pt2pt_header_base_t base;
|
||||
uint64_t serial_number;
|
||||
};
|
||||
typedef struct ompi_osc_pt2pt_header_flush_ack_t ompi_osc_pt2pt_header_flush_ack_t;
|
||||
|
||||
struct ompi_osc_pt2pt_frag_header_t {
|
||||
ompi_osc_pt2pt_header_base_t base;
|
||||
uint16_t windx; /* cid of communicator backing window (our window id) */
|
||||
uint32_t source; /* rank in window of source process */
|
||||
int32_t num_ops; /* number of operations in this buffer */
|
||||
uint32_t pad; /* ensure the fragment header is a multiple of 8 bytes */
|
||||
};
|
||||
typedef struct ompi_osc_pt2pt_frag_header_t ompi_osc_pt2pt_frag_header_t;
|
||||
|
||||
union ompi_osc_pt2pt_header_t {
|
||||
ompi_osc_pt2pt_header_base_t base;
|
||||
ompi_osc_pt2pt_header_put_t put;
|
||||
ompi_osc_pt2pt_header_acc_t acc;
|
||||
ompi_osc_pt2pt_header_get_t get;
|
||||
ompi_osc_pt2pt_header_complete_t complete;
|
||||
ompi_osc_pt2pt_header_cswap_t cswap;
|
||||
ompi_osc_pt2pt_header_post_t post;
|
||||
ompi_osc_pt2pt_header_lock_t lock;
|
||||
ompi_osc_pt2pt_header_lock_ack_t lock_ack;
|
||||
ompi_osc_pt2pt_header_unlock_t unlock;
|
||||
ompi_osc_pt2pt_header_unlock_ack_t unlock_ack;
|
||||
ompi_osc_pt2pt_header_flush_t flush;
|
||||
ompi_osc_pt2pt_header_flush_ack_t flush_ack;
|
||||
ompi_osc_pt2pt_frag_header_t frag;
|
||||
};
|
||||
typedef union ompi_osc_pt2pt_header_t ompi_osc_pt2pt_header_t;
|
||||
|
||||
#endif /* OMPI_MCA_OSC_PT2PT_HDR_H */
|
@ -1,925 +0,0 @@
|
||||
/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */
|
||||
/*
|
||||
* Copyright (c) 2004-2005 The Trustees of Indiana University.
|
||||
* All rights reserved.
|
||||
* Copyright (c) 2004-2005 The Trustees of the University of Tennessee.
|
||||
* All rights reserved.
|
||||
* Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
|
||||
* University of Stuttgart. All rights reserved.
|
||||
* Copyright (c) 2004-2005 The Regents of the University of California.
|
||||
* All rights reserved.
|
||||
* Copyright (c) 2007-2014 Los Alamos National Security, LLC. All rights
|
||||
* reserved.
|
||||
* Copyright (c) 2010 IBM Corporation. All rights reserved.
|
||||
* Copyright (c) 2012-2013 Sandia National Laboratories. All rights reserved.
|
||||
* $COPYRIGHT$
|
||||
*
|
||||
* Additional copyrights may follow
|
||||
*
|
||||
* $HEADER$
|
||||
*/
|
||||
|
||||
#include "ompi_config.h"
|
||||
|
||||
#include "osc_pt2pt.h"
|
||||
#include "osc_pt2pt_header.h"
|
||||
#include "osc_pt2pt_data_move.h"
|
||||
#include "osc_pt2pt_frag.h"
|
||||
|
||||
#include "mpi.h"
|
||||
#include "opal/runtime/opal_progress.h"
|
||||
#include "opal/threads/mutex.h"
|
||||
#include "ompi/communicator/communicator.h"
|
||||
#include "ompi/mca/osc/base/base.h"
|
||||
#include "opal/include/opal_stdint.h"
|
||||
|
||||
/* forward declaration: used by ompi_osc_pt2pt_lock_self() before its definition */
static bool ompi_osc_pt2pt_lock_try_acquire (ompi_osc_pt2pt_module_t* module,
                                             int source,
                                             int lock_type,
                                             uint64_t serial_number);
|
||||
|
||||
/* target-side tracking of a lock request */
|
||||
struct ompi_osc_pt2pt_pending_lock_t {
|
||||
opal_list_item_t super;
|
||||
int peer;
|
||||
int lock_type;
|
||||
uint64_t lock_ptr;
|
||||
};
|
||||
typedef struct ompi_osc_pt2pt_pending_lock_t ompi_osc_pt2pt_pending_lock_t;
|
||||
OBJ_CLASS_INSTANCE(ompi_osc_pt2pt_pending_lock_t, opal_list_item_t,
|
||||
NULL, NULL);
|
||||
|
||||
|
||||
/* origin-side tracking of a lock request */
|
||||
struct ompi_osc_pt2pt_outstanding_lock_t {
|
||||
opal_list_item_t super;
|
||||
int target;
|
||||
int assert;
|
||||
bool flushing;
|
||||
int32_t lock_acks_received;
|
||||
int32_t unlock_acks_received;
|
||||
int32_t flush_acks_received;
|
||||
uint64_t serial_number;
|
||||
int32_t type;
|
||||
};
|
||||
typedef struct ompi_osc_pt2pt_outstanding_lock_t ompi_osc_pt2pt_outstanding_lock_t;
|
||||
OBJ_CLASS_INSTANCE(ompi_osc_pt2pt_outstanding_lock_t, opal_list_item_t,
|
||||
NULL, NULL);
|
||||
|
||||
static int ompi_osc_activate_next_lock (ompi_osc_pt2pt_module_t *module);
|
||||
static inline int queue_lock (ompi_osc_pt2pt_module_t *module, int requestor, int lock_type, uint64_t lock_ptr);
|
||||
static int ompi_osc_pt2pt_flush_lock (ompi_osc_pt2pt_module_t *module, ompi_osc_pt2pt_outstanding_lock_t *lock,
|
||||
int target);
|
||||
|
||||
|
||||
/**
|
||||
* Find the first outstanding lock to a target.
|
||||
*
|
||||
* @param[in] module - OSC PT2PT module
|
||||
* @param[in] target - Target rank
|
||||
*
|
||||
* @returns an outstanding lock on success
|
||||
*
|
||||
* This function traverses the outstanding_locks list in the module
|
||||
* looking for a lock that matches target. The caller must hold the
|
||||
* module lock.
|
||||
*/
|
||||
static inline ompi_osc_pt2pt_outstanding_lock_t *find_outstanding_lock_st (ompi_osc_pt2pt_module_t *module, int target)
|
||||
{
|
||||
ompi_osc_pt2pt_outstanding_lock_t *outstanding_lock, *lock = NULL;
|
||||
|
||||
OPAL_LIST_FOREACH(outstanding_lock, &module->outstanding_locks, ompi_osc_pt2pt_outstanding_lock_t) {
|
||||
if (outstanding_lock->target == target) {
|
||||
lock = outstanding_lock;
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
return lock;
|
||||
}
|
||||
|
||||
/**
 * Same lookup as find_outstanding_lock_st(), but takes and releases the
 * module lock around the list traversal.
 */
static inline ompi_osc_pt2pt_outstanding_lock_t *find_outstanding_lock (ompi_osc_pt2pt_module_t *module, int target)
{
    ompi_osc_pt2pt_outstanding_lock_t *found;

    OPAL_THREAD_LOCK(&module->lock);
    found = find_outstanding_lock_st (module, target);
    OPAL_THREAD_UNLOCK(&module->lock);

    return found;
}
|
||||
|
||||
/**
 * Look up an outstanding lock by its serial number.
 *
 * Takes the module lock for the duration of the search and returns the
 * first matching entry of module->outstanding_locks, or NULL.
 */
static inline ompi_osc_pt2pt_outstanding_lock_t *find_outstanding_lock_by_serial (ompi_osc_pt2pt_module_t *module, uint64_t serial_number)
{
    ompi_osc_pt2pt_outstanding_lock_t *match = NULL;
    ompi_osc_pt2pt_outstanding_lock_t *entry;

    OPAL_THREAD_LOCK(&module->lock);
    OPAL_LIST_FOREACH(entry, &module->outstanding_locks, ompi_osc_pt2pt_outstanding_lock_t) {
        if (serial_number != entry->serial_number) {
            continue;
        }

        match = entry;
        break;
    }
    OPAL_THREAD_UNLOCK(&module->lock);

    return match;
}
|
||||
|
||||
/**
 * Acquire a lock on the local process.
 *
 * @param[in] module - OSC PT2PT module
 * @param[in] lock   - origin-side lock object; its address is used as the
 *                     identifier handed to the acquire/queue helpers
 *
 * @returns OMPI_SUCCESS once the lock is held, or the error returned by
 *          queue_lock() when the request could not be queued.
 *
 * If the lock cannot be taken immediately the request is queued and the
 * call blocks on the module condition variable until a lock ack has been
 * recorded in lock->lock_acks_received.
 */
static inline int ompi_osc_pt2pt_lock_self (ompi_osc_pt2pt_module_t *module, ompi_osc_pt2pt_outstanding_lock_t *lock)
{
    const int my_rank = ompi_comm_rank (module->comm);
    const uint64_t lock_ptr = (uint64_t) (uintptr_t) lock;

    if (!ompi_osc_pt2pt_lock_try_acquire (module, my_rank, lock->type, lock_ptr)) {
        /* queue the lock */
        int ret = queue_lock (module, my_rank, lock->type, lock_ptr);
        if (OPAL_UNLIKELY(OMPI_SUCCESS != ret)) {
            /* the request was presumably not queued, so no ack would ever
               arrive; report the failure instead of blocking below */
            return ret;
        }

        /* If locking local, can't be non-blocking according to the
           standard.  We need to wait for the ack here. */
        OPAL_THREAD_LOCK(&module->lock);
        while (0 == lock->lock_acks_received) {
            opal_condition_wait(&module->cond, &module->lock);
        }
        OPAL_THREAD_UNLOCK(&module->lock);
    }

    OPAL_OUTPUT_VERBOSE((25, ompi_osc_base_framework.framework_output,
                         "local lock aquired"));

    return OMPI_SUCCESS;
}
|
||||
|
||||
/**
 * Release a lock held on the local process.
 *
 * Adjusts module->lock_status (+1 for an exclusive lock, -1 otherwise),
 * activates the next queued lock when appropriate, drives progress once
 * and finally records the unlock ack on the lock object.
 */
static inline void ompi_osc_pt2pt_unlock_self (ompi_osc_pt2pt_module_t *module, ompi_osc_pt2pt_outstanding_lock_t *lock)
{
    OPAL_OUTPUT_VERBOSE((25, ompi_osc_base_framework.framework_output,
                         "ompi_osc_pt2pt_unlock_self: unlocking myself. lock state = %d", module->lock_status));

    if (MPI_LOCK_EXCLUSIVE != lock->type) {
        /* only the holder that brings the status back to zero hands the lock on */
        if (0 == OPAL_THREAD_ADD32(&module->lock_status, -1)) {
            ompi_osc_activate_next_lock (module);
        }
    } else {
        OPAL_THREAD_ADD32(&module->lock_status, 1);
        ompi_osc_activate_next_lock (module);
    }

    /* need to ensure we make progress */
    opal_progress();

    OPAL_THREAD_ADD32(&lock->unlock_acks_received, 1);
}
|
||||
|
||||
/**
 * Send a lock request for @p lock to a remote target and flush the
 * fragment queue of that target so the request actually leaves.
 *
 * @returns the result of the control send, or of the flush if the send
 *          succeeded.
 */
static inline int ompi_osc_pt2pt_lock_remote (ompi_osc_pt2pt_module_t *module, int target, ompi_osc_pt2pt_outstanding_lock_t *lock)
{
    ompi_osc_pt2pt_header_lock_t request;
    int rc;

    /* generate a lock request */
    request.base.type  = OMPI_OSC_PT2PT_HDR_TYPE_LOCK_REQ;
    request.base.flags = OMPI_OSC_PT2PT_HDR_FLAG_VALID | OMPI_OSC_PT2PT_HDR_FLAG_PASSIVE_TARGET;
    request.lock_type  = lock->type;
    request.lock_ptr   = (uint64_t) (uintptr_t) lock;

    rc = ompi_osc_pt2pt_control_send (module, target, &request, sizeof (request));
    if (OPAL_UNLIKELY(OMPI_SUCCESS != rc)) {
        return rc;
    }

    /* make sure the request gets sent, so we can start eager sending... */
    return ompi_osc_pt2pt_frag_flush_target (module, target);
}
|
||||
|
||||
/**
 * Send an unlock request for @p lock to a remote target.
 *
 * The per-target epoch fragment counter is atomically exchanged with -1
 * and the previous value is carried in the request.
 */
static inline int ompi_osc_pt2pt_unlock_remote (ompi_osc_pt2pt_module_t *module, int target, ompi_osc_pt2pt_outstanding_lock_t *lock)
{
    int32_t *counter = (int32_t *) module->epoch_outgoing_frag_count + target;
    int32_t sent_frags = opal_atomic_swap_32 (counter, -1);
    ompi_osc_pt2pt_header_unlock_t request;

    request.base.type  = OMPI_OSC_PT2PT_HDR_TYPE_UNLOCK_REQ;
    request.base.flags = OMPI_OSC_PT2PT_HDR_FLAG_VALID | OMPI_OSC_PT2PT_HDR_FLAG_PASSIVE_TARGET;
    request.frag_count = sent_frags;
    request.lock_type  = lock->type;
    request.lock_ptr   = (uint64_t) (uintptr_t) lock;

    /* send control message with unlock request and count */
    return ompi_osc_pt2pt_control_send (module, target, &request, sizeof (request));
}
|
||||
|
||||
/**
 * Issue the lock requests described by an outstanding lock object.
 *
 * With MPI_MODE_NOCHECK no request is sent; the expected number of lock
 * acks is simply recorded as already received. Otherwise the local lock is
 * taken (target is this rank or -1) or a remote request is sent, and for
 * target -1 a request is additionally sent to every other rank.
 *
 * @returns OMPI_SUCCESS or the first error reported by a helper.
 */
static int ompi_osc_pt2pt_lock_internal_execute (ompi_osc_pt2pt_module_t *module, ompi_osc_pt2pt_outstanding_lock_t *lock)
{
    const int self = ompi_comm_rank (module->comm);
    const int target = lock->target;
    const bool lock_everyone = (-1 == target);
    int rc;

    if (lock->assert & MPI_MODE_NOCHECK) {
        if (lock_everyone) {
            lock->lock_acks_received = ompi_comm_size(module->comm);
        } else {
            lock->lock_acks_received = 1;
        }

        return OMPI_SUCCESS;
    }

    if (lock_everyone || self == target) {
        rc = ompi_osc_pt2pt_lock_self (module, lock);
    } else {
        rc = ompi_osc_pt2pt_lock_remote (module, target, lock);
    }

    if (OPAL_UNLIKELY(OMPI_SUCCESS != rc)) {
        /* return */
        return rc;
    }

    if (!lock_everyone) {
        return OMPI_SUCCESS;
    }

    /* lock on all ranks: the local lock is held, now ask everybody else */
    for (int rank = 0 ; rank < ompi_comm_size(module->comm) ; ++rank) {
        if (rank == self) {
            continue;
        }

        rc = ompi_osc_pt2pt_lock_remote (module, rank, lock);
        if (OPAL_UNLIKELY(OMPI_SUCCESS != rc)) {
            return rc;
        }
    }

    return OMPI_SUCCESS;
}
|
||||
|
||||
/**
 * Common implementation of lock and lock_all.
 *
 * @param[in] lock_type - lock type stored in the lock object
 * @param[in] target    - rank to lock, or -1 to lock all ranks
 * @param[in] assert    - assertion flags (MPI_MODE_NOCHECK is honoured)
 * @param[in] win       - window
 *
 * @returns OMPI_SUCCESS, OMPI_ERR_RMA_SYNC when module->sc_group is set,
 *          OMPI_ERR_OUT_OF_RESOURCE when the lock object cannot be
 *          allocated, OMPI_ERR_RMA_CONFLICT when a lock on the same target
 *          is already outstanding, or the error reported while issuing the
 *          lock requests.
 *
 * Every field of the new lock object is initialized explicitly: the class
 * is registered without a constructor, and the flushing /
 * flush_acks_received fields are read later by the flush code.
 * On failure the lock is taken off the outstanding list again and the
 * access epoch state set up for it is rolled back.
 */
static int ompi_osc_pt2pt_lock_internal (int lock_type, int target, int assert, ompi_win_t *win)
{
    ompi_osc_pt2pt_module_t *module = GET_MODULE(win);
    ompi_osc_pt2pt_outstanding_lock_t *lock;
    ompi_osc_pt2pt_peer_t *peer = NULL;
    int ret = OMPI_SUCCESS;

    if (-1 != target) {
        peer = module->peers + target;
    }

    /* Check if no_locks is set. TODO: we also need to track whether we are in an
     * active target epoch. Fence can make this tricky to track. */
    if (module->sc_group) {
        return OMPI_ERR_RMA_SYNC;
    }

    OPAL_OUTPUT_VERBOSE((25, ompi_osc_base_framework.framework_output,
                         "osc pt2pt: lock %d %d", target, lock_type));

    /* create lock item */
    lock = OBJ_NEW(ompi_osc_pt2pt_outstanding_lock_t);
    if (OPAL_UNLIKELY(NULL == lock)) {
        return OMPI_ERR_OUT_OF_RESOURCE;
    }

    lock->target = target;
    lock->assert = assert;
    lock->type = lock_type;
    lock->flushing = false;
    lock->lock_acks_received = 0;
    lock->unlock_acks_received = 0;
    lock->flush_acks_received = 0;
    lock->serial_number = OPAL_THREAD_ADD64((int64_t *) &module->lock_serial_number, 1);

    /* delay all eager sends until we've heard back.. */
    OPAL_THREAD_LOCK(&module->lock);

    /* check for conflicting lock */
    if (find_outstanding_lock_st (module, target)) {
        OBJ_RELEASE(lock);
        OPAL_THREAD_UNLOCK(&module->lock);
        return OMPI_ERR_RMA_CONFLICT;
    }

    /* when the lock ack returns we will be in an access epoch with this peer/all peers (target = -1) */
    if (-1 == target) {
        module->all_access_epoch = true;
    } else {
        peer->access_epoch = true;
    }

    module->passive_target_access_epoch = true;

    opal_list_append(&module->outstanding_locks, &lock->super);
    OPAL_THREAD_UNLOCK(&module->lock);

    ret = ompi_osc_pt2pt_lock_internal_execute (module, lock);
    if (OPAL_UNLIKELY(OMPI_SUCCESS != ret)) {
        OPAL_THREAD_LOCK(&module->lock);
        opal_list_remove_item(&module->outstanding_locks, &lock->super);

        /* undo the epoch state announced above: no lock is held on the target */
        if (-1 == target) {
            module->all_access_epoch = false;
        } else {
            peer->access_epoch = false;
        }

        /* other locks may still keep the passive target epoch open */
        if (0 == opal_list_get_size (&module->outstanding_locks)) {
            module->passive_target_access_epoch = false;
        }
        OPAL_THREAD_UNLOCK(&module->lock);

        OBJ_RELEASE(lock);
    }

    return ret;
}
|
||||
|
||||
/**
 * Common implementation of unlock and unlock_all.
 *
 * @param[in] target - rank to unlock, or -1 to unlock all ranks
 * @param[in] win    - window
 *
 * @returns OMPI_SUCCESS, OMPI_ERR_RMA_SYNC when no lock on @p target is
 *          outstanding, or the first error reported while flushing or
 *          sending the unlock requests.
 *
 * The lock is taken off the outstanding list first, then the call waits
 * for all lock acks, sends the unlock requests (or a flush when the lock
 * was taken with MPI_MODE_NOCHECK), waits for the unlock acks and finally
 * closes the access epoch.
 *
 * The access epoch state is cleared on the error paths as well, because
 * the lock has already been removed from the outstanding list by then.
 * When sending the unlock requests fails the lock object is intentionally
 * not released: requests that did go out carry its address, so it is kept
 * alive in case an ack for it is still delivered.
 */
static int ompi_osc_pt2pt_unlock_internal (int target, ompi_win_t *win)
{
    ompi_osc_pt2pt_module_t *module = GET_MODULE(win);
    ompi_osc_pt2pt_outstanding_lock_t *lock = NULL;
    int my_rank = ompi_comm_rank (module->comm);
    ompi_osc_pt2pt_peer_t *peer = NULL;
    bool release_lock = true;
    int lock_acks_expected;
    int ret = OMPI_SUCCESS;

    if (-1 != target) {
        lock_acks_expected = 1;
        peer = module->peers + target;
    } else {
        lock_acks_expected = ompi_comm_size (module->comm);
    }

    OPAL_OUTPUT_VERBOSE((25, ompi_osc_base_framework.framework_output,
                         "ompi_osc_pt2pt_unlock_internal: unlocking target %d", target));

    OPAL_THREAD_LOCK(&module->lock);
    lock = find_outstanding_lock_st (module, target);
    if (OPAL_UNLIKELY(NULL == lock)) {
        OPAL_OUTPUT_VERBOSE((25, ompi_osc_base_framework.framework_output,
                             "ompi_osc_pt2pt_unlock: target %d is not locked in window %s",
                             target, win->w_name));
        OPAL_THREAD_UNLOCK(&module->lock);
        return OMPI_ERR_RMA_SYNC;
    }

    opal_list_remove_item (&module->outstanding_locks, &lock->super);

    /* wait until ack has arrived from target */
    while (lock->lock_acks_received != lock_acks_expected) {
        opal_condition_wait(&module->cond, &module->lock);
    }
    OPAL_THREAD_UNLOCK(&module->lock);

    if (lock->assert & MPI_MODE_NOCHECK) {
        /* flush instead; report a failed flush to the caller */
        ret = ompi_osc_pt2pt_flush_lock (module, lock, target);
    } else if (my_rank != target) {
        OPAL_OUTPUT_VERBOSE((25, ompi_osc_base_framework.framework_output,
                             "osc pt2pt: unlock %d, lock_acks_received = %d", target,
                             lock->lock_acks_received));

        if (-1 == target) {
            /* send unlock messages to all of my peers */
            for (int i = 0 ; i < ompi_comm_size(module->comm) ; ++i) {
                if (my_rank == i) {
                    continue;
                }

                ret = ompi_osc_pt2pt_unlock_remote (module, i, lock);
                if (OPAL_UNLIKELY(OMPI_SUCCESS != ret)) {
                    break;
                }
            }
        } else {
            ret = ompi_osc_pt2pt_unlock_remote (module, target, lock);
        }

        if (OPAL_LIKELY(OMPI_SUCCESS == ret)) {
            /* start all sendreqs to target */
            if (-1 == target) {
                ret = ompi_osc_pt2pt_frag_flush_all (module);
            } else {
                ret = ompi_osc_pt2pt_frag_flush_target(module, target);
            }
        }

        if (OPAL_UNLIKELY(OMPI_SUCCESS != ret)) {
            /* some unlock requests may already be on their way */
            release_lock = false;
            goto close_epoch;
        }

        /* wait for unlock acks. this signals remote completion of fragments */
        OPAL_THREAD_LOCK(&module->lock);
        while (lock->unlock_acks_received != lock_acks_expected) {
            opal_condition_wait(&module->cond, &module->lock);
        }
        OPAL_THREAD_UNLOCK(&module->lock);

        OPAL_OUTPUT_VERBOSE((25, ompi_osc_base_framework.framework_output,
                             "ompi_osc_pt2pt_unlock: unlock of %d complete", target));
    }

    if ((target == my_rank || target == -1) && !(lock->assert & MPI_MODE_NOCHECK)) {
        ompi_osc_pt2pt_unlock_self (module, lock);
    }

close_epoch:
    OPAL_THREAD_LOCK(&module->lock);
    module->passive_target_access_epoch = false;
    if (-1 != target) {
        peer->access_epoch = false;
    } else {
        module->all_access_epoch = false;
    }
    OPAL_THREAD_UNLOCK(&module->lock);

    if (release_lock) {
        OBJ_RELEASE(lock);
    }

    return ret;
}
|
||||
|
||||
/**
 * Lock a single target rank of the window.
 *
 * The parameter is called "assert" like the macro: "assert(...)" with an
 * argument list is the macro invocation, the bare identifier is the
 * parameter.
 */
int ompi_osc_pt2pt_lock(int lock_type, int target, int assert, ompi_win_t *win)
{
    int rc;

    /* -1 is reserved for the lock-all case */
    assert(target >= 0);

    rc = ompi_osc_pt2pt_lock_internal (lock_type, target, assert, win);

    return rc;
}
|
||||
|
||||
/** Unlock a single target rank of the window. */
int ompi_osc_pt2pt_unlock (int target, struct ompi_win_t *win)
{
    int rc = ompi_osc_pt2pt_unlock_internal (target, win);

    return rc;
}
|
||||
|
||||
int ompi_osc_pt2pt_lock_all(int assert, struct ompi_win_t *win)
|
||||
{
|
||||
return ompi_osc_pt2pt_lock_internal (MPI_LOCK_SHARED, -1, assert, win);
|
||||
}
|
||||
|
||||
|
||||
/** Release the lock on every rank of the window (target -1). */
int ompi_osc_pt2pt_unlock_all (struct ompi_win_t *win)
{
    const int all_ranks = -1;

    return ompi_osc_pt2pt_unlock_internal (all_ranks, win);
}
|
||||
|
||||
|
||||
int ompi_osc_pt2pt_sync (struct ompi_win_t *win)
|
||||
{
|
||||
opal_progress();
|
||||
return OMPI_SUCCESS;
|
||||
}
|
||||
|
||||
static int ompi_osc_pt2pt_flush_lock (ompi_osc_pt2pt_module_t *module, ompi_osc_pt2pt_outstanding_lock_t *lock,
|
||||
int target)
|
||||
{
|
||||
ompi_osc_pt2pt_header_flush_t flush_req;
|
||||
int peer_count, ret, flush_count;
|
||||
int my_rank = ompi_comm_rank (module->comm);
|
||||
|
||||
if (-1 == lock->target) {
|
||||
peer_count = ompi_comm_size(module->comm);
|
||||
} else {
|
||||
peer_count = 1;
|
||||
}
|
||||
|
||||
/* wait until ack has arrived from target, since we need to be
|
||||
able to eager send before we can transfer all the data... */
|
||||
OPAL_THREAD_LOCK(&module->lock);
|
||||
while (peer_count > lock->lock_acks_received && lock->flushing) {
|
||||
opal_condition_wait(&module->cond, &module->lock);
|
||||
}
|
||||
|
||||
lock->flush_acks_received = 0;
|
||||
lock->flushing = true;
|
||||
OPAL_THREAD_UNLOCK(&module->lock);
|
||||
|
||||
flush_req.base.type = OMPI_OSC_PT2PT_HDR_TYPE_FLUSH_REQ;
|
||||
flush_req.base.flags = OMPI_OSC_PT2PT_HDR_FLAG_VALID | OMPI_OSC_PT2PT_HDR_FLAG_PASSIVE_TARGET;
|
||||
flush_req.serial_number = lock->serial_number;
|
||||
|
||||
if (-1 == target) {
|
||||
/* NTH: no local flush */
|
||||
flush_count = ompi_comm_size(module->comm) - 1;
|
||||
for (int i = 0 ; i < ompi_comm_size(module->comm) ; ++i) {
|
||||
if (i == my_rank) {
|
||||
continue;
|
||||
}
|
||||
|
||||
flush_req.frag_count = opal_atomic_swap_32 ((int32_t *) module->epoch_outgoing_frag_count + i, -1);
|
||||
|
||||
/* send control message with flush request and count */
|
||||
ret = ompi_osc_pt2pt_control_send (module, i, &flush_req, sizeof (flush_req));
|
||||
if (OPAL_UNLIKELY(OMPI_SUCCESS != ret)) {
|
||||
return ret;
|
||||
}
|
||||
|
||||
/* start all sendreqs to target */
|
||||
ret = ompi_osc_pt2pt_frag_flush_target (module, i);
|
||||
if (OPAL_UNLIKELY(OMPI_SUCCESS != ret)) {
|
||||
return ret;
|
||||
}
|
||||
}
|
||||
} else {
|
||||
flush_req.frag_count = opal_atomic_swap_32 ((int32_t *) module->epoch_outgoing_frag_count + target, -1);
|
||||
flush_count = 1;
|
||||
/* send control message with flush request and count */
|
||||
ret = ompi_osc_pt2pt_control_send (module, target, &flush_req, sizeof (flush_req));
|
||||
if (OPAL_UNLIKELY(OMPI_SUCCESS != ret)) {
|
||||
return ret;
|
||||
}
|
||||
|
||||
/* start all sendreqs to target */
|
||||
ret = ompi_osc_pt2pt_frag_flush_target (module, target);
|
||||
if (OPAL_UNLIKELY(OMPI_SUCCESS != ret)) {
|
||||
return ret;
|
||||
}
|
||||
}
|
||||
|
||||
/* wait for all the requests and the flush ack (meaning remote completion) */
|
||||
OPAL_THREAD_LOCK(&module->lock);
|
||||
while (flush_count != lock->flush_acks_received) {
|
||||
opal_condition_wait(&module->cond, &module->lock);
|
||||
}
|
||||
|
||||
lock->flushing = false;
|
||||
opal_condition_broadcast(&module->cond);
|
||||
|
||||
OPAL_THREAD_UNLOCK(&module->lock);
|
||||
|
||||
return OMPI_SUCCESS;
|
||||
}
|
||||
|
||||
int ompi_osc_pt2pt_flush (int target, struct ompi_win_t *win)
|
||||
{
|
||||
ompi_osc_pt2pt_module_t *module = GET_MODULE(win);
|
||||
ompi_osc_pt2pt_outstanding_lock_t *lock;
|
||||
int ret;
|
||||
|
||||
assert (0 <= target);
|
||||
|
||||
/* flush is only allowed from within a passive target epoch */
|
||||
if (!module->passive_target_access_epoch) {
|
||||
return OMPI_ERR_RMA_SYNC;
|
||||
}
|
||||
|
||||
OPAL_OUTPUT_VERBOSE((50, ompi_osc_base_framework.framework_output,
|
||||
"ompi_osc_pt2pt_flush starting..."));
|
||||
|
||||
if (ompi_comm_rank (module->comm) == target) {
|
||||
/* nothing to flush */
|
||||
opal_progress ();
|
||||
return OMPI_SUCCESS;
|
||||
}
|
||||
|
||||
lock = find_outstanding_lock (module, target);
|
||||
if (NULL == lock) {
|
||||
lock = find_outstanding_lock (module, -1);
|
||||
}
|
||||
if (OPAL_UNLIKELY(NULL == lock)) {
|
||||
OPAL_OUTPUT_VERBOSE((25, ompi_osc_base_framework.framework_output,
|
||||
"ompi_osc_pt2pt_flush: target %d is not locked in window %s",
|
||||
target, win->w_name));
|
||||
ret = OMPI_ERR_RMA_SYNC;
|
||||
} else {
|
||||
ret = ompi_osc_pt2pt_flush_lock (module, lock, target);
|
||||
}
|
||||
|
||||
return ret;
|
||||
}
|
||||
|
||||
|
||||
int ompi_osc_pt2pt_flush_all (struct ompi_win_t *win)
|
||||
{
|
||||
ompi_osc_pt2pt_module_t *module = GET_MODULE(win);
|
||||
ompi_osc_pt2pt_outstanding_lock_t *lock;
|
||||
int ret = OMPI_SUCCESS;
|
||||
|
||||
/* flush is only allowed from within a passive target epoch */
|
||||
if (OPAL_UNLIKELY(!module->passive_target_access_epoch ||
|
||||
0 == opal_list_get_size (&module->outstanding_locks))) {
|
||||
OPAL_OUTPUT_VERBOSE((25, ompi_osc_base_framework.framework_output,
|
||||
"ompi_osc_pt2pt_flush_all: no targets are locked in window %s",
|
||||
win->w_name));
|
||||
return OMPI_ERR_RMA_SYNC;
|
||||
}
|
||||
|
||||
OPAL_OUTPUT_VERBOSE((50, ompi_osc_base_framework.framework_output,
|
||||
"ompi_osc_pt2pt_flush_all entering..."));
|
||||
|
||||
/* flush all locks */
|
||||
OPAL_LIST_FOREACH(lock, &module->outstanding_locks, ompi_osc_pt2pt_outstanding_lock_t) {
|
||||
ret = ompi_osc_pt2pt_flush_lock (module, lock, lock->target);
|
||||
if (OPAL_UNLIKELY(OMPI_SUCCESS != ret)) {
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
OPAL_OUTPUT_VERBOSE((50, ompi_osc_base_framework.framework_output,
|
||||
"ompi_osc_pt2pt_flush_all complete"));
|
||||
|
||||
return ret;
|
||||
}
|
||||
|
||||
|
||||
int ompi_osc_pt2pt_flush_local (int target, struct ompi_win_t *win)
|
||||
{
|
||||
ompi_osc_pt2pt_module_t *module = GET_MODULE(win);
|
||||
int ret;
|
||||
|
||||
/* flush is only allowed from within a passive target epoch */
|
||||
if (!module->passive_target_access_epoch) {
|
||||
return OMPI_ERR_RMA_SYNC;
|
||||
}
|
||||
|
||||
ret = ompi_osc_pt2pt_frag_flush_target(module, target);
|
||||
if (OMPI_SUCCESS != ret) {
|
||||
return ret;
|
||||
}
|
||||
|
||||
/* wait for all the requests */
|
||||
OPAL_THREAD_LOCK(&module->lock);
|
||||
while (module->outgoing_frag_count != module->outgoing_frag_signal_count) {
|
||||
opal_condition_wait(&module->cond, &module->lock);
|
||||
}
|
||||
OPAL_THREAD_UNLOCK(&module->lock);
|
||||
|
||||
return OMPI_SUCCESS;
|
||||
}
|
||||
|
||||
|
||||
int ompi_osc_pt2pt_flush_local_all (struct ompi_win_t *win)
|
||||
{
|
||||
ompi_osc_pt2pt_module_t *module = GET_MODULE(win);
|
||||
int ret = OMPI_SUCCESS;
|
||||
|
||||
/* flush is only allowed from within a passive target epoch */
|
||||
if (!module->passive_target_access_epoch) {
|
||||
return OMPI_ERR_RMA_SYNC;
|
||||
}
|
||||
|
||||
ret = ompi_osc_pt2pt_frag_flush_all(module);
|
||||
if (OMPI_SUCCESS != ret) {
|
||||
return ret;
|
||||
}
|
||||
|
||||
/* wait for all the requests */
|
||||
OPAL_THREAD_LOCK(&module->lock);
|
||||
while (module->outgoing_frag_count != module->outgoing_frag_signal_count) {
|
||||
opal_condition_wait(&module->cond, &module->lock);
|
||||
}
|
||||
OPAL_THREAD_UNLOCK(&module->lock);
|
||||
|
||||
return OMPI_SUCCESS;
|
||||
}
|
||||
|
||||
/* target side operation to acknowledge to initiator side that the
|
||||
lock is now held by the initiator */
|
||||
static inline int activate_lock (ompi_osc_pt2pt_module_t *module, int requestor,
|
||||
uint64_t lock_ptr)
|
||||
{
|
||||
ompi_osc_pt2pt_outstanding_lock_t *lock;
|
||||
|
||||
if (ompi_comm_rank (module->comm) != requestor) {
|
||||
ompi_osc_pt2pt_header_lock_ack_t lock_ack;
|
||||
|
||||
lock_ack.base.type = OMPI_OSC_PT2PT_HDR_TYPE_LOCK_ACK;
|
||||
lock_ack.base.flags = OMPI_OSC_PT2PT_HDR_FLAG_VALID;
|
||||
lock_ack.source = ompi_comm_rank(module->comm);
|
||||
lock_ack.windx = ompi_comm_get_cid(module->comm);
|
||||
lock_ack.lock_ptr = lock_ptr;
|
||||
|
||||
OPAL_OUTPUT_VERBOSE((25, ompi_osc_base_framework.framework_output,
|
||||
"osc pt2pt: sending lock to %d", requestor));
|
||||
|
||||
/* we don't want to send any data, since we're the exposure
|
||||
epoch only, so use an unbuffered send */
|
||||
return ompi_osc_pt2pt_control_send_unbuffered (module, requestor, &lock_ack, sizeof (lock_ack));
|
||||
}
|
||||
|
||||
|
||||
OPAL_OUTPUT_VERBOSE((25, ompi_osc_base_framework.framework_output,
|
||||
"osc pt2pt: releasing local lock"));
|
||||
|
||||
lock = (ompi_osc_pt2pt_outstanding_lock_t *) (uintptr_t) lock_ptr;
|
||||
if (OPAL_UNLIKELY(NULL == lock)) {
|
||||
OPAL_OUTPUT_VERBOSE((5, ompi_osc_base_framework.framework_output,
|
||||
"lock could not be located"));
|
||||
}
|
||||
|
||||
OPAL_THREAD_ADD32(&lock->lock_acks_received, 1);
|
||||
opal_condition_broadcast (&module->cond);
|
||||
|
||||
return OMPI_SUCCESS;
|
||||
}
|
||||
|
||||
|
||||
/* target side operation to create a pending lock request for a lock
|
||||
request that could not be satisfied */
|
||||
static inline int queue_lock (ompi_osc_pt2pt_module_t *module, int requestor,
|
||||
int lock_type, uint64_t lock_ptr)
|
||||
{
|
||||
ompi_osc_pt2pt_pending_lock_t *pending =
|
||||
OBJ_NEW(ompi_osc_pt2pt_pending_lock_t);
|
||||
if (NULL == pending) {
|
||||
return OMPI_ERR_OUT_OF_RESOURCE;
|
||||
}
|
||||
|
||||
pending->peer = requestor;
|
||||
pending->lock_type = lock_type;
|
||||
pending->lock_ptr = lock_ptr;
|
||||
|
||||
OPAL_OUTPUT_VERBOSE((25, ompi_osc_base_framework.framework_output,
|
||||
"osc pt2pt: queueing lock request from %d", requestor));
|
||||
|
||||
OPAL_THREAD_SCOPED_LOCK(&module->locks_pending_lock, opal_list_append(&module->locks_pending, &pending->super));
|
||||
|
||||
return OMPI_SUCCESS;
|
||||
}
|
||||
|
||||
/**
 * Try to take the window lock on behalf of a requestor.
 *
 * module->lock_status is updated with compare-and-set: a shared request
 * increments it as long as the observed value is not negative, any other
 * request moves it from 0 to -1. On success activate_lock() is called for the
 * requestor; its return value is not inspected.
 *
 * @returns true when the lock was taken and activated, false when the request
 *          could not be satisfied (the caller decides whether to queue it).
 */
static bool ompi_osc_pt2pt_lock_try_acquire (ompi_osc_pt2pt_module_t* module, int source, int lock_type, uint64_t lock_ptr)
{
    if (MPI_LOCK_SHARED == lock_type) {
        for (;;) {
            int32_t observed = module->lock_status;

            /* a negative status cannot be shared with */
            if (observed < 0) {
                return false;
            }

            /* retry with a fresh snapshot if the status changed under us */
            if (opal_atomic_cmpset_32 (&module->lock_status, observed, observed + 1)) {
                break;
            }
        }
    } else if (!opal_atomic_cmpset_32 (&module->lock_status, 0, -1)) {
        return false;
    }

    (void) activate_lock(module, source, lock_ptr);

    /* activated the lock */
    return true;
}
|
||||
|
||||
/**
 * Grant as many queued lock requests as possible.
 *
 * Walks module->locks_pending in order while holding
 * module->locks_pending_lock. Each request that can be acquired is removed
 * from the list and released; the walk stops at the first request that
 * cannot be acquired.
 *
 * @returns OMPI_SUCCESS (no failure path exists in this function).
 */
static int ompi_osc_activate_next_lock (ompi_osc_pt2pt_module_t *module) {
    ompi_osc_pt2pt_pending_lock_t *request, *following;

    OPAL_THREAD_LOCK(&module->locks_pending_lock);
    OPAL_LIST_FOREACH_SAFE(request, following, &module->locks_pending,
                           ompi_osc_pt2pt_pending_lock_t) {
        if (!ompi_osc_pt2pt_lock_try_acquire (module, request->peer, request->lock_type,
                                              request->lock_ptr)) {
            break;
        }

        /* the request has been granted: drop it from the queue */
        opal_list_remove_item (&module->locks_pending, &request->super);
        OBJ_RELEASE(request);
    }
    OPAL_THREAD_UNLOCK(&module->locks_pending_lock);

    return OMPI_SUCCESS;
}
|
||||
|
||||
|
||||
/* target side function called when the initiator sends a lock
|
||||
request. Lock will either be activated and acknowledged or
|
||||
queued. */
|
||||
int ompi_osc_pt2pt_process_lock (ompi_osc_pt2pt_module_t* module, int source,
|
||||
ompi_osc_pt2pt_header_lock_t* lock_header)
|
||||
{
|
||||
bool acquired;
|
||||
|
||||
OPAL_OUTPUT_VERBOSE((25, ompi_osc_base_framework.framework_output,
|
||||
"ompi_osc_pt2pt_process_lock: processing lock request from %d. current lock state = %d",
|
||||
source, module->lock_status));
|
||||
|
||||
acquired = ompi_osc_pt2pt_lock_try_acquire (module, source, lock_header->lock_type, lock_header->lock_ptr);
|
||||
|
||||
if (!acquired) {
|
||||
queue_lock(module, source, lock_header->lock_type, lock_header->lock_ptr);
|
||||
}
|
||||
|
||||
return OMPI_SUCCESS;
|
||||
}
|
||||
|
||||
|
||||
/* initiator-side function called when the target acks the lock
|
||||
request. */
|
||||
void ompi_osc_pt2pt_process_lock_ack (ompi_osc_pt2pt_module_t *module,
|
||||
ompi_osc_pt2pt_header_lock_ack_t *lock_ack_header)
|
||||
{
|
||||
ompi_osc_pt2pt_peer_t *peer = module->peers + lock_ack_header->source;
|
||||
ompi_osc_pt2pt_outstanding_lock_t *lock;
|
||||
|
||||
OPAL_OUTPUT_VERBOSE((50, ompi_osc_base_framework.framework_output,
|
||||
"ompi_osc_pt2pt_process_unlock_ack: processing lock ack from %d for lock %" PRIu64,
|
||||
lock_ack_header->source, lock_ack_header->lock_ptr));
|
||||
|
||||
lock = (ompi_osc_pt2pt_outstanding_lock_t *) (uintptr_t) lock_ack_header->lock_ptr;
|
||||
assert (NULL != lock);
|
||||
|
||||
/* no need to hold the lock to set this */
|
||||
peer->eager_send_active = true;
|
||||
OPAL_THREAD_ADD32(&lock->lock_acks_received, 1);
|
||||
|
||||
opal_condition_broadcast(&module->cond);
|
||||
}
|
||||
|
||||
void ompi_osc_pt2pt_process_flush_ack (ompi_osc_pt2pt_module_t *module, int source,
|
||||
ompi_osc_pt2pt_header_flush_ack_t *flush_ack_header) {
|
||||
ompi_osc_pt2pt_outstanding_lock_t *lock;
|
||||
|
||||
OPAL_OUTPUT_VERBOSE((50, ompi_osc_base_framework.framework_output,
|
||||
"ompi_osc_pt2pt_process_unlock_ack: processing flush ack from %d for lock %" PRIu64,
|
||||
source, flush_ack_header->serial_number));
|
||||
|
||||
/* NTH: need to verify that this will work as expected */
|
||||
lock = find_outstanding_lock_by_serial (module, flush_ack_header->serial_number);
|
||||
assert (NULL != lock);
|
||||
|
||||
OPAL_THREAD_ADD32(&lock->flush_acks_received, 1);
|
||||
|
||||
opal_condition_broadcast(&module->cond);
|
||||
}
|
||||
|
||||
void ompi_osc_pt2pt_process_unlock_ack (ompi_osc_pt2pt_module_t *module, int source,
|
||||
ompi_osc_pt2pt_header_unlock_ack_t *unlock_ack_header)
|
||||
{
|
||||
ompi_osc_pt2pt_peer_t *peer = module->peers + source;
|
||||
ompi_osc_pt2pt_outstanding_lock_t *lock;
|
||||
|
||||
OPAL_OUTPUT_VERBOSE((50, ompi_osc_base_framework.framework_output,
|
||||
"ompi_osc_pt2pt_process_unlock_ack: processing unlock ack from %d",
|
||||
source));
|
||||
|
||||
/* NTH: need to verify that this will work as expected */
|
||||
lock = (ompi_osc_pt2pt_outstanding_lock_t *) (intptr_t) unlock_ack_header->lock_ptr;
|
||||
assert (NULL != lock);
|
||||
|
||||
peer->eager_send_active = false;
|
||||
|
||||
if (0 == OPAL_THREAD_ADD32(&lock->unlock_acks_received, 1)) {
|
||||
opal_condition_broadcast(&module->cond);
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Process an unlock request.
|
||||
*
|
||||
* @param[in] module - OSC PT2PT module
|
||||
* @param[in] source - Source rank
|
||||
* @param[in] unlock_header - Incoming unlock header
|
||||
*
|
||||
* This functions is the target-side function for handling an unlock
|
||||
* request. Once all pending operations from the target are complete
|
||||
* this functions sends an unlock acknowledgement then attempts to
|
||||
* active a pending lock if the lock becomes free.
|
||||
*/
|
||||
int ompi_osc_pt2pt_process_unlock (ompi_osc_pt2pt_module_t *module, int source,
|
||||
ompi_osc_pt2pt_header_unlock_t *unlock_header)
|
||||
{
|
||||
ompi_osc_pt2pt_header_unlock_ack_t unlock_ack;
|
||||
ompi_osc_pt2pt_peer_t *peer = module->peers + source;
|
||||
int ret;
|
||||
|
||||
OPAL_OUTPUT_VERBOSE((50, ompi_osc_base_framework.framework_output,
|
||||
"ompi_osc_pt2pt_process_unlock entering (passive_incoming_frag_count: %d)...",
|
||||
peer->passive_incoming_frag_count));
|
||||
|
||||
/* we cannot block when processing an incoming request */
|
||||
if (0 != peer->passive_incoming_frag_count) {
|
||||
return OMPI_ERR_WOULD_BLOCK;
|
||||
}
|
||||
|
||||
unlock_ack.base.type = OMPI_OSC_PT2PT_HDR_TYPE_UNLOCK_ACK;
|
||||
unlock_ack.base.flags = OMPI_OSC_PT2PT_HDR_FLAG_VALID;
|
||||
unlock_ack.lock_ptr = unlock_header->lock_ptr;
|
||||
|
||||
ret = ompi_osc_pt2pt_control_send_unbuffered (module, source, &unlock_ack, sizeof (unlock_ack));
|
||||
if (OPAL_UNLIKELY(OMPI_SUCCESS != ret)) {
|
||||
return ret;
|
||||
}
|
||||
|
||||
if (-1 == module->lock_status) {
|
||||
OPAL_THREAD_ADD32(&module->lock_status, 1);
|
||||
ompi_osc_activate_next_lock (module);
|
||||
} else if (0 == OPAL_THREAD_ADD32(&module->lock_status, -1)) {
|
||||
ompi_osc_activate_next_lock (module);
|
||||
}
|
||||
|
||||
OPAL_OUTPUT_VERBOSE((50, ompi_osc_base_framework.framework_output,
|
||||
"osc pt2pt: finished processing unlock fragment"));
|
||||
|
||||
return ret;
|
||||
}
|
||||
|
||||
int ompi_osc_pt2pt_process_flush (ompi_osc_pt2pt_module_t *module, int source,
|
||||
ompi_osc_pt2pt_header_flush_t *flush_header)
|
||||
{
|
||||
ompi_osc_pt2pt_peer_t *peer = module->peers + source;
|
||||
ompi_osc_pt2pt_header_flush_ack_t flush_ack;
|
||||
|
||||
OPAL_OUTPUT_VERBOSE((50, ompi_osc_base_framework.framework_output,
|
||||
"ompi_osc_pt2pt_process_flush entering (passive_incoming_frag_count: %d)...",
|
||||
peer->passive_incoming_frag_count));
|
||||
|
||||
/* we cannot block when processing an incoming request */
|
||||
if (0 != peer->passive_incoming_frag_count) {
|
||||
return OMPI_ERR_WOULD_BLOCK;
|
||||
}
|
||||
|
||||
flush_ack.base.type = OMPI_OSC_PT2PT_HDR_TYPE_FLUSH_ACK;
|
||||
flush_ack.base.flags = OMPI_OSC_PT2PT_HDR_FLAG_VALID;
|
||||
flush_ack.serial_number = flush_header->serial_number;
|
||||
|
||||
return ompi_osc_pt2pt_control_send_unbuffered (module, source, &flush_ack, sizeof (flush_ack));
|
||||
}
|
@ -8,8 +8,6 @@
|
||||
# Copyright (c) 2004-2005 The Regents of the University of California.
|
||||
# All rights reserved.
|
||||
# Copyright (c) 2010 Cisco Systems, Inc. All rights reserved.
|
||||
# Copyright (c) 2014 Los Alamos National Security, LLC. All rights
|
||||
# reserved.
|
||||
# $COPYRIGHT$
|
||||
#
|
||||
# Additional copyrights may follow
|
||||
@ -17,39 +15,39 @@
|
||||
# $HEADER$
|
||||
#
|
||||
|
||||
pt2pt_sources = \
|
||||
osc_pt2pt.h \
|
||||
osc_pt2pt_module.c \
|
||||
osc_pt2pt_comm.c \
|
||||
osc_pt2pt_component.c \
|
||||
osc_pt2pt_data_move.h \
|
||||
osc_pt2pt_data_move.c \
|
||||
osc_pt2pt_frag.h \
|
||||
osc_pt2pt_frag.c \
|
||||
osc_pt2pt_header.h \
|
||||
osc_pt2pt_obj_convert.h \
|
||||
osc_pt2pt_request.h \
|
||||
osc_pt2pt_request.c \
|
||||
osc_pt2pt_active_target.c \
|
||||
osc_pt2pt_passive_target.c
|
||||
rdma_sources = \
|
||||
osc_rdma.h \
|
||||
osc_rdma.c \
|
||||
osc_rdma_comm.c \
|
||||
osc_rdma_component.c \
|
||||
osc_rdma_data_move.h \
|
||||
osc_rdma_data_move.c \
|
||||
osc_rdma_frag.h \
|
||||
osc_rdma_frag.c \
|
||||
osc_rdma_header.h \
|
||||
osc_rdma_obj_convert.h \
|
||||
osc_rdma_request.h \
|
||||
osc_rdma_request.c \
|
||||
osc_rdma_active_target.c \
|
||||
osc_rdma_passive_target.c
|
||||
|
||||
# Make the output library in this directory, and name it either
|
||||
# mca_<type>_<name>.la (for DSO builds) or libmca_<type>_<name>.la
|
||||
# (for static builds).
|
||||
|
||||
if MCA_BUILD_ompi_osc_pt2pt_DSO
|
||||
if MCA_BUILD_ompi_osc_rdma_DSO
|
||||
component_noinst =
|
||||
component_install = mca_osc_pt2pt.la
|
||||
component_install = mca_osc_rdma.la
|
||||
else
|
||||
component_noinst = libmca_osc_pt2pt.la
|
||||
component_noinst = libmca_osc_rdma.la
|
||||
component_install =
|
||||
endif
|
||||
|
||||
mcacomponentdir = $(ompilibdir)
|
||||
mcacomponent_LTLIBRARIES = $(component_install)
|
||||
mca_osc_pt2pt_la_SOURCES = $(pt2pt_sources)
|
||||
mca_osc_pt2pt_la_LDFLAGS = -module -avoid-version
|
||||
mca_osc_rdma_la_SOURCES = $(rdma_sources)
|
||||
mca_osc_rdma_la_LDFLAGS = -module -avoid-version
|
||||
|
||||
noinst_LTLIBRARIES = $(component_noinst)
|
||||
libmca_osc_pt2pt_la_SOURCES = $(pt2pt_sources)
|
||||
libmca_osc_pt2pt_la_LDFLAGS = -module -avoid-version
|
||||
libmca_osc_rdma_la_SOURCES = $(rdma_sources)
|
||||
libmca_osc_rdma_la_LDFLAGS = -module -avoid-version
|
26
ompi/mca/osc/rdma/configure.m4
Обычный файл
26
ompi/mca/osc/rdma/configure.m4
Обычный файл
@ -0,0 +1,26 @@
|
||||
# -*- shell-script -*-
|
||||
#
|
||||
# Copyright (c) 2013 Sandia National Laboratories. All rights reserved.
|
||||
# $COPYRIGHT$
|
||||
#
|
||||
# Additional copyrights may follow
|
||||
#
|
||||
# $HEADER$
|
||||
#
|
||||
|
||||
# MCA_ompi_osc_rdma_POST_CONFIG(will_build)
|
||||
# ----------------------------------------
|
||||
# Only require the tag if we're actually going to be built, since bml
|
||||
# is one of the ones frequently disabled for large installs.
# The first argument (will_build) is compared against the string "1";
# OMPI_REQUIRE_ENDPOINT_TAG([BML]) is expanded only when it matches.
|
||||
AC_DEFUN([MCA_ompi_osc_rdma_POST_CONFIG], [
|
||||
AS_IF([test "$1" = "1"], [OMPI_REQUIRE_ENDPOINT_TAG([BML])])
|
||||
])dnl
|
||||
|
||||
# MCA_ompi_osc_rdma_CONFIG(action-if-can-compile,
|
||||
# [action-if-cant-compile])
|
||||
# ------------------------------------------------
|
||||
# We can always build, unless we were explicitly disabled.
# Registers ompi/mca/osc/rdma/Makefile with AC_CONFIG_FILES and then expands
# the first argument (action-if-can-compile) unconditionally; the second
# argument is never expanded by this macro.
|
||||
AC_DEFUN([MCA_ompi_osc_rdma_CONFIG],[
|
||||
AC_CONFIG_FILES([ompi/mca/osc/rdma/Makefile])
|
||||
[$1]
|
||||
])dnl
|
@ -20,7 +20,7 @@
|
||||
|
||||
#include "ompi_config.h"
|
||||
|
||||
#include "osc_pt2pt.h"
|
||||
#include "osc_rdma.h"
|
||||
|
||||
#include "opal/threads/mutex.h"
|
||||
#include "opal/mca/btl/btl.h"
|
||||
@ -31,24 +31,25 @@
|
||||
|
||||
|
||||
int
|
||||
ompi_osc_pt2pt_attach(struct ompi_win_t *win, void *base, size_t len)
|
||||
ompi_osc_rdma_attach(struct ompi_win_t *win, void *base, size_t len)
|
||||
{
|
||||
return OMPI_SUCCESS;
|
||||
}
|
||||
|
||||
|
||||
int
|
||||
ompi_osc_pt2pt_detach(struct ompi_win_t *win, void *base)
|
||||
ompi_osc_rdma_detach(struct ompi_win_t *win, void *base)
|
||||
{
|
||||
return OMPI_SUCCESS;
|
||||
}
|
||||
|
||||
|
||||
int
|
||||
ompi_osc_pt2pt_free(ompi_win_t *win)
|
||||
ompi_osc_rdma_free(ompi_win_t *win)
|
||||
{
|
||||
int ret = OMPI_SUCCESS;
|
||||
ompi_osc_pt2pt_module_t *module = GET_MODULE(win);
|
||||
ompi_osc_rdma_module_t *module = GET_MODULE(win);
|
||||
opal_list_item_t *item;
|
||||
|
||||
if (NULL == module) {
|
||||
return OMPI_SUCCESS;
|
||||
@ -56,7 +57,7 @@ ompi_osc_pt2pt_free(ompi_win_t *win)
|
||||
|
||||
if (NULL != module->comm) {
|
||||
opal_output_verbose(1, ompi_osc_base_framework.framework_output,
|
||||
"pt2pt component destroying window with id %d",
|
||||
"rdma component destroying window with id %d",
|
||||
ompi_comm_get_cid(module->comm));
|
||||
|
||||
/* finish with a barrier */
|
||||
@ -66,38 +67,43 @@ ompi_osc_pt2pt_free(ompi_win_t *win)
|
||||
}
|
||||
|
||||
/* remove from component information */
|
||||
OPAL_THREAD_SCOPED_LOCK(&mca_osc_pt2pt_component.lock,
|
||||
opal_hash_table_remove_value_uint32(&mca_osc_pt2pt_component.modules,
|
||||
ompi_comm_get_cid(module->comm)));
|
||||
OPAL_THREAD_LOCK(&mca_osc_rdma_component.lock);
|
||||
opal_hash_table_remove_value_uint32(&mca_osc_rdma_component.modules,
|
||||
ompi_comm_get_cid(module->comm));
|
||||
OPAL_THREAD_UNLOCK(&mca_osc_rdma_component.lock);
|
||||
}
|
||||
|
||||
win->w_osc_module = NULL;
|
||||
|
||||
OBJ_DESTRUCT(&module->outstanding_locks);
|
||||
OBJ_DESTRUCT(&module->locks_pending);
|
||||
OBJ_DESTRUCT(&module->locks_pending_lock);
|
||||
OBJ_DESTRUCT(&module->acc_lock);
|
||||
OBJ_DESTRUCT(&module->cond);
|
||||
OBJ_DESTRUCT(&module->lock);
|
||||
|
||||
/* it is erroneous to close a window with active operations on it so we should
|
||||
* probably produce an error here instead of cleaning up */
|
||||
OPAL_LIST_DESTRUCT(&module->pending_acc);
|
||||
OPAL_LIST_DESTRUCT(&module->pending_posts);
|
||||
OPAL_LIST_DESTRUCT(&module->queued_frags);
|
||||
OBJ_DESTRUCT(&module->queued_frags_lock);
|
||||
while (NULL != (item = opal_list_remove_first (&module->pending_acc))) {
|
||||
OBJ_RELEASE(item);
|
||||
}
|
||||
|
||||
osc_pt2pt_gc_clean (module);
|
||||
OPAL_LIST_DESTRUCT(&module->request_gc);
|
||||
OPAL_LIST_DESTRUCT(&module->buffer_gc);
|
||||
OBJ_DESTRUCT(&module->gc_lock);
|
||||
OBJ_DESTRUCT(&module->pending_acc);
|
||||
|
||||
while (NULL != (item = opal_list_remove_first (&module->pending_posts))) {
|
||||
OBJ_RELEASE(item);
|
||||
}
|
||||
|
||||
OBJ_DESTRUCT(&module->pending_posts);
|
||||
|
||||
osc_rdma_gc_clean ();
|
||||
|
||||
if (NULL != module->peers) {
|
||||
free(module->peers);
|
||||
}
|
||||
|
||||
if (NULL != module->passive_eager_send_active) free(module->passive_eager_send_active);
|
||||
if (NULL != module->passive_incoming_frag_count) free(module->passive_incoming_frag_count);
|
||||
if (NULL != module->passive_incoming_frag_signal_count) free(module->passive_incoming_frag_signal_count);
|
||||
if (NULL != module->epoch_outgoing_frag_count) free(module->epoch_outgoing_frag_count);
|
||||
|
||||
if (NULL != module->frag_request) {
|
||||
module->frag_request->req_complete_cb = NULL;
|
||||
ompi_request_cancel (module->frag_request);
|
@ -19,8 +19,8 @@
|
||||
* $HEADER$
|
||||
*/
|
||||
|
||||
#ifndef OMPI_OSC_PT2PT_H
|
||||
#define OMPI_OSC_PT2PT_H
|
||||
#ifndef OMPI_OSC_RDMA_H
|
||||
#define OMPI_OSC_RDMA_H
|
||||
|
||||
#include "ompi_config.h"
|
||||
#include "opal/class/opal_list.h"
|
||||
@ -39,13 +39,13 @@
|
||||
#include "ompi/mca/bml/bml.h"
|
||||
#include "ompi/memchecker.h"
|
||||
|
||||
#include "osc_pt2pt_header.h"
|
||||
#include "osc_rdma_header.h"
|
||||
|
||||
BEGIN_C_DECLS
|
||||
|
||||
struct ompi_osc_pt2pt_frag_t;
|
||||
struct ompi_osc_rdma_frag_t;
|
||||
|
||||
struct ompi_osc_pt2pt_component_t {
|
||||
struct ompi_osc_rdma_component_t {
|
||||
/** Extend the basic osc component interface */
|
||||
ompi_osc_base_component_t super;
|
||||
|
||||
@ -58,45 +58,46 @@ struct ompi_osc_pt2pt_component_t {
|
||||
/** module count */
|
||||
int module_count;
|
||||
|
||||
/** free list of ompi_osc_pt2pt_frag_t structures */
|
||||
ompi_free_list_t frags;
|
||||
/** free list of ompi_osc_rdma_frag_t structures */
|
||||
opal_free_list_t frags;
|
||||
|
||||
/** Free list of requests */
|
||||
ompi_free_list_t requests;
|
||||
|
||||
/** PT2PT component buffer size */
|
||||
/** RDMA component buffer size */
|
||||
unsigned int buffer_size;
|
||||
|
||||
/** Lock for pending_operations */
|
||||
opal_mutex_t pending_operations_lock;
|
||||
|
||||
/** List of operations that need to be processed */
|
||||
opal_list_t pending_operations;
|
||||
|
||||
/** Is the progress function enabled? */
|
||||
bool progress_enable;
|
||||
|
||||
/** List of requests that need to be freed */
|
||||
opal_list_t request_gc;
|
||||
|
||||
/** List of buffers that need to be freed */
|
||||
opal_list_t buffer_gc;
|
||||
};
|
||||
typedef struct ompi_osc_pt2pt_component_t ompi_osc_pt2pt_component_t;
|
||||
typedef struct ompi_osc_rdma_component_t ompi_osc_rdma_component_t;
|
||||
|
||||
|
||||
struct ompi_osc_pt2pt_peer_t {
|
||||
struct ompi_osc_rdma_peer_t {
|
||||
/** Pointer to the current send fragment for each outgoing target */
|
||||
struct ompi_osc_pt2pt_frag_t *active_frag;
|
||||
struct ompi_osc_rdma_frag_t *active_frag;
|
||||
|
||||
/** Number of acks pending. New requests can not be sent out if there are
|
||||
* acks pending (to fulfill the ordering constraints of accumulate) */
|
||||
uint32_t num_acks_pending;
|
||||
int32_t passive_incoming_frag_count;
|
||||
bool access_epoch;
|
||||
bool eager_send_active;
|
||||
};
|
||||
typedef struct ompi_osc_pt2pt_peer_t ompi_osc_pt2pt_peer_t;
|
||||
typedef struct ompi_osc_rdma_peer_t ompi_osc_rdma_peer_t;
|
||||
|
||||
#define SEQ_INVALID 0xFFFFFFFFFFFFFFFFULL
|
||||
|
||||
/** Module structure. Exactly one of these is associated with each
|
||||
PT2PT window */
|
||||
struct ompi_osc_pt2pt_module_t {
|
||||
RDMA window */
|
||||
struct ompi_osc_rdma_module_t {
|
||||
/** Extend the basic osc module interface */
|
||||
ompi_osc_base_module_t super;
|
||||
|
||||
@ -126,15 +127,12 @@ struct ompi_osc_pt2pt_module_t {
|
||||
opal_mutex_t acc_lock;
|
||||
|
||||
/** peer data */
|
||||
ompi_osc_pt2pt_peer_t *peers;
|
||||
ompi_osc_rdma_peer_t *peers;
|
||||
|
||||
/** Number of communication fragments started for this epoch, by
|
||||
peer. Not in peer data to make fence more manageable. */
|
||||
uint32_t *epoch_outgoing_frag_count;
|
||||
|
||||
/** Lock for queued_frags */
|
||||
opal_mutex_t queued_frags_lock;
|
||||
|
||||
/** List of full communication buffers queued to be sent. Should
|
||||
be maintained in order (at least in per-target order). */
|
||||
opal_list_t queued_frags;
|
||||
@ -154,6 +152,9 @@ struct ompi_osc_pt2pt_module_t {
|
||||
/* Next incoming buffer count at which we want a signal on cond */
|
||||
uint32_t active_incoming_frag_signal_count;
|
||||
|
||||
uint32_t *passive_incoming_frag_count;
|
||||
uint32_t *passive_incoming_frag_signal_count;
|
||||
|
||||
/* Number of flush ack requests sent since beginning of time */
|
||||
uint64_t flush_ack_requested_count;
|
||||
/* Number of flush ack replies received since beginning of
|
||||
@ -170,6 +171,8 @@ struct ompi_osc_pt2pt_module_t {
|
||||
/** Indicates the window is in an all access epoch (fence, lock_all) */
|
||||
bool all_access_epoch;
|
||||
|
||||
bool *passive_eager_send_active;
|
||||
|
||||
/* ********************* PWSC data ************************ */
|
||||
struct ompi_group_t *pw_group;
|
||||
struct ompi_group_t *sc_group;
|
||||
@ -186,11 +189,9 @@ struct ompi_osc_pt2pt_module_t {
|
||||
|
||||
/** Status of the local window lock. One of 0 (unlocked),
|
||||
MPI_LOCK_EXCLUSIVE, or MPI_LOCK_SHARED. */
|
||||
int32_t lock_status;
|
||||
|
||||
/** lock for locks_pending list */
|
||||
opal_mutex_t locks_pending_lock;
|
||||
|
||||
int lock_status;
|
||||
/** number of peers who hold a shared lock on the local window */
|
||||
int32_t shared_count;
|
||||
/** target side list of lock requests we couldn't satisfy yet */
|
||||
opal_list_t locks_pending;
|
||||
|
||||
@ -209,38 +210,29 @@ struct ompi_osc_pt2pt_module_t {
|
||||
/* enforce pscw matching */
|
||||
/** list of unmatched post messages */
|
||||
opal_list_t pending_posts;
|
||||
|
||||
/** Lock for garbage collection lists */
|
||||
opal_mutex_t gc_lock;
|
||||
|
||||
/** List of requests that need to be freed */
|
||||
opal_list_t request_gc;
|
||||
|
||||
/** List of buffers that need to be freed */
|
||||
opal_list_t buffer_gc;
|
||||
};
|
||||
typedef struct ompi_osc_pt2pt_module_t ompi_osc_pt2pt_module_t;
|
||||
OMPI_MODULE_DECLSPEC extern ompi_osc_pt2pt_component_t mca_osc_pt2pt_component;
|
||||
typedef struct ompi_osc_rdma_module_t ompi_osc_rdma_module_t;
|
||||
OMPI_MODULE_DECLSPEC extern ompi_osc_rdma_component_t mca_osc_rdma_component;
|
||||
|
||||
struct ompi_osc_pt2pt_pending_t {
|
||||
struct ompi_osc_rdma_pending_t {
|
||||
opal_list_item_t super;
|
||||
ompi_osc_pt2pt_module_t *module;
|
||||
ompi_osc_rdma_module_t *module;
|
||||
int source;
|
||||
ompi_osc_pt2pt_header_t header;
|
||||
ompi_osc_rdma_header_t header;
|
||||
};
|
||||
typedef struct ompi_osc_pt2pt_pending_t ompi_osc_pt2pt_pending_t;
|
||||
OBJ_CLASS_DECLARATION(ompi_osc_pt2pt_pending_t);
|
||||
typedef struct ompi_osc_rdma_pending_t ompi_osc_rdma_pending_t;
|
||||
OBJ_CLASS_DECLARATION(ompi_osc_rdma_pending_t);
|
||||
|
||||
#define GET_MODULE(win) ((ompi_osc_pt2pt_module_t*) win->w_osc_module)
|
||||
#define GET_MODULE(win) ((ompi_osc_rdma_module_t*) win->w_osc_module)
|
||||
|
||||
extern bool ompi_osc_pt2pt_no_locks;
|
||||
extern bool ompi_osc_rdma_no_locks;
|
||||
|
||||
int ompi_osc_pt2pt_attach(struct ompi_win_t *win, void *base, size_t len);
|
||||
int ompi_osc_pt2pt_detach(struct ompi_win_t *win, void *base);
|
||||
int ompi_osc_rdma_attach(struct ompi_win_t *win, void *base, size_t len);
|
||||
int ompi_osc_rdma_detach(struct ompi_win_t *win, void *base);
|
||||
|
||||
int ompi_osc_pt2pt_free(struct ompi_win_t *win);
|
||||
int ompi_osc_rdma_free(struct ompi_win_t *win);
|
||||
|
||||
int ompi_osc_pt2pt_put(void *origin_addr,
|
||||
int ompi_osc_rdma_put(void *origin_addr,
|
||||
int origin_count,
|
||||
struct ompi_datatype_t *origin_dt,
|
||||
int target,
|
||||
@ -249,7 +241,7 @@ int ompi_osc_pt2pt_put(void *origin_addr,
|
||||
struct ompi_datatype_t *target_dt,
|
||||
struct ompi_win_t *win);
|
||||
|
||||
int ompi_osc_pt2pt_accumulate(void *origin_addr,
|
||||
int ompi_osc_rdma_accumulate(void *origin_addr,
|
||||
int origin_count,
|
||||
struct ompi_datatype_t *origin_dt,
|
||||
int target,
|
||||
@ -259,7 +251,7 @@ int ompi_osc_pt2pt_accumulate(void *origin_addr,
|
||||
struct ompi_op_t *op,
|
||||
struct ompi_win_t *win);
|
||||
|
||||
int ompi_osc_pt2pt_get(void *origin_addr,
|
||||
int ompi_osc_rdma_get(void *origin_addr,
|
||||
int origin_count,
|
||||
struct ompi_datatype_t *origin_dt,
|
||||
int target,
|
||||
@ -268,7 +260,7 @@ int ompi_osc_pt2pt_get(void *origin_addr,
|
||||
struct ompi_datatype_t *target_dt,
|
||||
struct ompi_win_t *win);
|
||||
|
||||
int ompi_osc_pt2pt_compare_and_swap(void *origin_addr,
|
||||
int ompi_osc_rdma_compare_and_swap(void *origin_addr,
|
||||
void *compare_addr,
|
||||
void *result_addr,
|
||||
struct ompi_datatype_t *dt,
|
||||
@ -276,7 +268,7 @@ int ompi_osc_pt2pt_compare_and_swap(void *origin_addr,
|
||||
OPAL_PTRDIFF_TYPE target_disp,
|
||||
struct ompi_win_t *win);
|
||||
|
||||
int ompi_osc_pt2pt_fetch_and_op(void *origin_addr,
|
||||
int ompi_osc_rdma_fetch_and_op(void *origin_addr,
|
||||
void *result_addr,
|
||||
struct ompi_datatype_t *dt,
|
||||
int target,
|
||||
@ -284,7 +276,7 @@ int ompi_osc_pt2pt_fetch_and_op(void *origin_addr,
|
||||
struct ompi_op_t *op,
|
||||
struct ompi_win_t *win);
|
||||
|
||||
int ompi_osc_pt2pt_get_accumulate(void *origin_addr,
|
||||
int ompi_osc_rdma_get_accumulate(void *origin_addr,
|
||||
int origin_count,
|
||||
struct ompi_datatype_t *origin_datatype,
|
||||
void *result_addr,
|
||||
@ -297,7 +289,7 @@ int ompi_osc_pt2pt_get_accumulate(void *origin_addr,
|
||||
struct ompi_op_t *op,
|
||||
struct ompi_win_t *win);
|
||||
|
||||
int ompi_osc_pt2pt_rput(void *origin_addr,
|
||||
int ompi_osc_rdma_rput(void *origin_addr,
|
||||
int origin_count,
|
||||
struct ompi_datatype_t *origin_dt,
|
||||
int target,
|
||||
@ -307,7 +299,7 @@ int ompi_osc_pt2pt_rput(void *origin_addr,
|
||||
struct ompi_win_t *win,
|
||||
struct ompi_request_t **request);
|
||||
|
||||
int ompi_osc_pt2pt_rget(void *origin_addr,
|
||||
int ompi_osc_rdma_rget(void *origin_addr,
|
||||
int origin_count,
|
||||
struct ompi_datatype_t *origin_dt,
|
||||
int target,
|
||||
@ -317,7 +309,7 @@ int ompi_osc_pt2pt_rget(void *origin_addr,
|
||||
struct ompi_win_t *win,
|
||||
struct ompi_request_t **request);
|
||||
|
||||
int ompi_osc_pt2pt_raccumulate(void *origin_addr,
|
||||
int ompi_osc_rdma_raccumulate(void *origin_addr,
|
||||
int origin_count,
|
||||
struct ompi_datatype_t *origin_dt,
|
||||
int target,
|
||||
@ -328,7 +320,7 @@ int ompi_osc_pt2pt_raccumulate(void *origin_addr,
|
||||
struct ompi_win_t *win,
|
||||
struct ompi_request_t **request);
|
||||
|
||||
int ompi_osc_pt2pt_rget_accumulate(void *origin_addr,
|
||||
int ompi_osc_rdma_rget_accumulate(void *origin_addr,
|
||||
int origin_count,
|
||||
struct ompi_datatype_t *origin_datatype,
|
||||
void *result_addr,
|
||||
@ -342,51 +334,51 @@ int ompi_osc_pt2pt_rget_accumulate(void *origin_addr,
|
||||
struct ompi_win_t *win,
|
||||
struct ompi_request_t **request);
|
||||
|
||||
int ompi_osc_pt2pt_fence(int assert, struct ompi_win_t *win);
|
||||
int ompi_osc_rdma_fence(int assert, struct ompi_win_t *win);
|
||||
|
||||
/* received a post message */
|
||||
int osc_pt2pt_incoming_post (ompi_osc_pt2pt_module_t *module, int source);
|
||||
int osc_rdma_incoming_post (ompi_osc_rdma_module_t *module, int source);
|
||||
|
||||
int ompi_osc_pt2pt_start(struct ompi_group_t *group,
|
||||
int ompi_osc_rdma_start(struct ompi_group_t *group,
|
||||
int assert,
|
||||
struct ompi_win_t *win);
|
||||
int ompi_osc_pt2pt_complete(struct ompi_win_t *win);
|
||||
int ompi_osc_rdma_complete(struct ompi_win_t *win);
|
||||
|
||||
int ompi_osc_pt2pt_post(struct ompi_group_t *group,
|
||||
int ompi_osc_rdma_post(struct ompi_group_t *group,
|
||||
int assert,
|
||||
struct ompi_win_t *win);
|
||||
|
||||
int ompi_osc_pt2pt_wait(struct ompi_win_t *win);
|
||||
int ompi_osc_rdma_wait(struct ompi_win_t *win);
|
||||
|
||||
int ompi_osc_pt2pt_test(struct ompi_win_t *win,
|
||||
int ompi_osc_rdma_test(struct ompi_win_t *win,
|
||||
int *flag);
|
||||
|
||||
int ompi_osc_pt2pt_lock(int lock_type,
|
||||
int ompi_osc_rdma_lock(int lock_type,
|
||||
int target,
|
||||
int assert,
|
||||
struct ompi_win_t *win);
|
||||
|
||||
int ompi_osc_pt2pt_unlock(int target,
|
||||
int ompi_osc_rdma_unlock(int target,
|
||||
struct ompi_win_t *win);
|
||||
|
||||
int ompi_osc_pt2pt_lock_all(int assert,
|
||||
int ompi_osc_rdma_lock_all(int assert,
|
||||
struct ompi_win_t *win);
|
||||
|
||||
int ompi_osc_pt2pt_unlock_all(struct ompi_win_t *win);
|
||||
int ompi_osc_rdma_unlock_all(struct ompi_win_t *win);
|
||||
|
||||
int ompi_osc_pt2pt_sync(struct ompi_win_t *win);
|
||||
int ompi_osc_rdma_sync(struct ompi_win_t *win);
|
||||
|
||||
int ompi_osc_pt2pt_flush(int target,
|
||||
int ompi_osc_rdma_flush(int target,
|
||||
struct ompi_win_t *win);
|
||||
int ompi_osc_pt2pt_flush_all(struct ompi_win_t *win);
|
||||
int ompi_osc_pt2pt_flush_local(int target,
|
||||
int ompi_osc_rdma_flush_all(struct ompi_win_t *win);
|
||||
int ompi_osc_rdma_flush_local(int target,
|
||||
struct ompi_win_t *win);
|
||||
int ompi_osc_pt2pt_flush_local_all(struct ompi_win_t *win);
|
||||
int ompi_osc_rdma_flush_local_all(struct ompi_win_t *win);
|
||||
|
||||
int ompi_osc_pt2pt_set_info(struct ompi_win_t *win, struct ompi_info_t *info);
|
||||
int ompi_osc_pt2pt_get_info(struct ompi_win_t *win, struct ompi_info_t **info_used);
|
||||
int ompi_osc_rdma_set_info(struct ompi_win_t *win, struct ompi_info_t *info);
|
||||
int ompi_osc_rdma_get_info(struct ompi_win_t *win, struct ompi_info_t **info_used);
|
||||
|
||||
int ompi_osc_pt2pt_component_irecv(ompi_osc_pt2pt_module_t *module,
|
||||
int ompi_osc_rdma_component_irecv(ompi_osc_rdma_module_t *module,
|
||||
void *buf,
|
||||
size_t count,
|
||||
struct ompi_datatype_t *datatype,
|
||||
@ -394,7 +386,7 @@ int ompi_osc_pt2pt_component_irecv(ompi_osc_pt2pt_module_t *module,
|
||||
int tag,
|
||||
struct ompi_communicator_t *comm);
|
||||
|
||||
int ompi_osc_pt2pt_component_isend(ompi_osc_pt2pt_module_t *module,
|
||||
int ompi_osc_rdma_component_isend(ompi_osc_rdma_module_t *module,
|
||||
void *buf,
|
||||
size_t count,
|
||||
struct ompi_datatype_t *datatype,
|
||||
@ -403,16 +395,16 @@ int ompi_osc_pt2pt_component_isend(ompi_osc_pt2pt_module_t *module,
|
||||
struct ompi_communicator_t *comm);
|
||||
|
||||
/**
|
||||
* ompi_osc_pt2pt_progress_pending_acc:
|
||||
* ompi_osc_rdma_progress_pending_acc:
|
||||
*
|
||||
* @short Progress one pending accumulation or compare and swap operation.
|
||||
*
|
||||
* @param[in] module - OSC PT2PT module
|
||||
* @param[in] module - OSC RDMA module
|
||||
*
|
||||
* @long If the accumulation lock can be acquired, progress one pending
|
||||
* accumulate or compare and swap operation.
|
||||
*/
|
||||
int ompi_osc_pt2pt_progress_pending_acc (ompi_osc_pt2pt_module_t *module);
|
||||
int ompi_osc_rdma_progress_pending_acc (ompi_osc_rdma_module_t *module);
|
||||
|
||||
|
||||
/**
|
||||
@ -420,7 +412,7 @@ int ompi_osc_pt2pt_progress_pending_acc (ompi_osc_pt2pt_module_t *module);
|
||||
*
|
||||
* @short Increment incoming completion count.
|
||||
*
|
||||
* @param[in] module - OSC PT2PT module
|
||||
* @param[in] module - OSC RDMA module
|
||||
* @param[in] source - Passive target source or MPI_PROC_NULL (active target)
|
||||
*
|
||||
* @long This function increments either the passive or active incoming counts.
|
||||
@ -428,7 +420,7 @@ int ompi_osc_pt2pt_progress_pending_acc (ompi_osc_pt2pt_module_t *module);
|
||||
* This function uses atomics if necessary so it is not necessary to hold
|
||||
* the module lock before calling this function.
|
||||
*/
|
||||
static inline void mark_incoming_completion (ompi_osc_pt2pt_module_t *module, int source)
|
||||
static inline void mark_incoming_completion (ompi_osc_rdma_module_t *module, int source)
|
||||
{
|
||||
if (MPI_PROC_NULL == source) {
|
||||
OPAL_OUTPUT_VERBOSE((50, ompi_osc_base_framework.framework_output,
|
||||
@ -439,12 +431,11 @@ static inline void mark_incoming_completion (ompi_osc_pt2pt_module_t *module, in
|
||||
opal_condition_broadcast(&module->cond);
|
||||
}
|
||||
} else {
|
||||
ompi_osc_pt2pt_peer_t *peer = module->peers + source;
|
||||
OPAL_OUTPUT_VERBOSE((50, ompi_osc_base_framework.framework_output,
|
||||
"mark_incoming_completion marking passive incoming complete. source = %d, count = %d",
|
||||
source, (int) peer->passive_incoming_frag_count + 1));
|
||||
OPAL_THREAD_ADD32((int32_t *) &peer->passive_incoming_frag_count, 1);
|
||||
if (0 == peer->passive_incoming_frag_count) {
|
||||
source, (int) module->passive_incoming_frag_count[source] + 1));
|
||||
OPAL_THREAD_ADD32((int32_t *) (module->passive_incoming_frag_count + source), 1);
|
||||
if (module->passive_incoming_frag_count[source] >= module->passive_incoming_frag_signal_count[source]) {
|
||||
opal_condition_broadcast(&module->cond);
|
||||
}
|
||||
}
|
||||
@ -455,7 +446,7 @@ static inline void mark_incoming_completion (ompi_osc_pt2pt_module_t *module, in
|
||||
*
|
||||
* @short Increment outgoing count.
|
||||
*
|
||||
* @param[in] module - OSC PT2PT module
|
||||
* @param[in] module - OSC RDMA module
|
||||
*
|
||||
* @long This function is used to signal that an outgoing send is complete. It
|
||||
* increments only the outgoing fragment count and signals the module
|
||||
@ -463,7 +454,7 @@ static inline void mark_incoming_completion (ompi_osc_pt2pt_module_t *module, in
|
||||
* uses atomics if necessary so it is not necessary to hold the module
|
||||
* lock before calling this function.
|
||||
*/
|
||||
static inline void mark_outgoing_completion (ompi_osc_pt2pt_module_t *module)
|
||||
static inline void mark_outgoing_completion (ompi_osc_rdma_module_t *module)
|
||||
{
|
||||
OPAL_THREAD_ADD32((int32_t *) &module->outgoing_frag_count, 1);
|
||||
if (module->outgoing_frag_count >= module->outgoing_frag_signal_count) {
|
||||
@ -476,14 +467,14 @@ static inline void mark_outgoing_completion (ompi_osc_pt2pt_module_t *module)
|
||||
*
|
||||
* @short Increment outgoing signal counters.
|
||||
*
|
||||
* @param[in] module - OSC PT2PT module
|
||||
* @param[in] module - OSC RDMA module
|
||||
* @param[in] target - Passive target rank or MPI_PROC_NULL (active target)
|
||||
* @param[in] count - Number of outgoing messages to signal.
|
||||
*
|
||||
* @long This function uses atomics if necessary so it is not necessary to hold
|
||||
* the module lock before calling this function.
|
||||
*/
|
||||
static inline void ompi_osc_signal_outgoing (ompi_osc_pt2pt_module_t *module, int target, int count)
|
||||
static inline void ompi_osc_signal_outgoing (ompi_osc_rdma_module_t *module, int target, int count)
|
||||
{
|
||||
OPAL_THREAD_ADD32((int32_t *) &module->outgoing_frag_signal_count, count);
|
||||
if (MPI_PROC_NULL != target) {
|
||||
@ -495,7 +486,7 @@ static inline void ompi_osc_signal_outgoing (ompi_osc_pt2pt_module_t *module, in
|
||||
}
|
||||
|
||||
/**
|
||||
* osc_pt2pt_copy_on_recv:
|
||||
* osc_rdma_copy_on_recv:
|
||||
*
|
||||
* @short Helper function. Copies data from source to target through the
|
||||
* convertor.
|
||||
@ -511,7 +502,7 @@ static inline void ompi_osc_signal_outgoing (ompi_osc_pt2pt_module_t *module, in
|
||||
* buffer. The copy is done with a convertor generated from proc,
|
||||
* datatype, and count.
|
||||
*/
|
||||
static inline void osc_pt2pt_copy_on_recv (void *target, void *source, size_t source_len, ompi_proc_t *proc,
|
||||
static inline void osc_rdma_copy_on_recv (void *target, void *source, size_t source_len, ompi_proc_t *proc,
|
||||
int count, ompi_datatype_t *datatype)
|
||||
{
|
||||
opal_convertor_t convertor;
|
||||
@ -539,7 +530,7 @@ static inline void osc_pt2pt_copy_on_recv (void *target, void *source, size_t so
|
||||
}
|
||||
|
||||
/**
|
||||
* osc_pt2pt_copy_for_send:
|
||||
* osc_rdma_copy_for_send:
|
||||
*
|
||||
* @short: Helper function. Copies data from source to target through the
|
||||
* convertor.
|
||||
@ -555,7 +546,7 @@ static inline void osc_pt2pt_copy_on_recv (void *target, void *source, size_t so
|
||||
* buffer. The copy is done with a convertor generated from proc,
|
||||
* datatype, and count.
|
||||
*/
|
||||
static inline void osc_pt2pt_copy_for_send (void *target, size_t target_len, void *source, ompi_proc_t *proc,
|
||||
static inline void osc_rdma_copy_for_send (void *target, size_t target_len, void *source, ompi_proc_t *proc,
|
||||
int count, ompi_datatype_t *datatype)
|
||||
{
|
||||
opal_convertor_t convertor;
|
||||
@ -576,7 +567,7 @@ static inline void osc_pt2pt_copy_for_send (void *target, size_t target_len, voi
|
||||
}
|
||||
|
||||
/**
|
||||
* osc_pt2pt_request_gc_clean:
|
||||
* osc_rdma_request_gc_clean:
|
||||
*
|
||||
* @short Release finished PML requests and accumulate buffers.
|
||||
*
|
||||
@ -585,77 +576,71 @@ static inline void osc_pt2pt_copy_for_send (void *target, size_t target_len, voi
|
||||
* and buffers on the module's garbage collection lists and release them
|
||||
* at a later time.
|
||||
*/
|
||||
static inline void osc_pt2pt_gc_clean (ompi_osc_pt2pt_module_t *module)
|
||||
static inline void osc_rdma_gc_clean (void)
|
||||
{
|
||||
ompi_request_t *request;
|
||||
opal_list_item_t *item;
|
||||
|
||||
OPAL_THREAD_LOCK(&module->gc_lock);
|
||||
OPAL_THREAD_LOCK(&mca_osc_rdma_component.lock);
|
||||
|
||||
while (NULL != (request = (ompi_request_t *) opal_list_remove_first (&module->request_gc))) {
|
||||
OPAL_THREAD_UNLOCK(&module->gc_lock);
|
||||
while (NULL != (request = (ompi_request_t *) opal_list_remove_first (&mca_osc_rdma_component.request_gc))) {
|
||||
ompi_request_free (&request);
|
||||
OPAL_THREAD_LOCK(&module->gc_lock);
|
||||
}
|
||||
|
||||
while (NULL != (item = opal_list_remove_first (&module->buffer_gc))) {
|
||||
while (NULL != (item = opal_list_remove_first (&mca_osc_rdma_component.buffer_gc))) {
|
||||
OBJ_RELEASE(item);
|
||||
}
|
||||
|
||||
OPAL_THREAD_UNLOCK(&module->gc_lock);
|
||||
OPAL_THREAD_UNLOCK(&mca_osc_rdma_component.lock);
|
||||
}
|
||||
|
||||
static inline void osc_pt2pt_gc_add_request (ompi_osc_pt2pt_module_t *module, ompi_request_t *request)
|
||||
static inline void osc_rdma_gc_add_request (ompi_request_t *request)
|
||||
{
|
||||
OPAL_THREAD_SCOPED_LOCK(&module->gc_lock,
|
||||
opal_list_append (&module->request_gc, (opal_list_item_t *) request));
|
||||
OPAL_THREAD_LOCK(&mca_osc_rdma_component.lock);
|
||||
opal_list_append (&mca_osc_rdma_component.request_gc, (opal_list_item_t *) request);
|
||||
OPAL_THREAD_UNLOCK(&mca_osc_rdma_component.lock);
|
||||
}
|
||||
|
||||
static inline void osc_pt2pt_gc_add_buffer (ompi_osc_pt2pt_module_t *module, opal_list_item_t *buffer)
|
||||
static inline void osc_rdma_gc_add_buffer (opal_list_item_t *buffer)
|
||||
{
|
||||
OPAL_THREAD_SCOPED_LOCK(&module->gc_lock,
|
||||
opal_list_append (&module->buffer_gc, buffer));
|
||||
OPAL_THREAD_LOCK(&mca_osc_rdma_component.lock);
|
||||
opal_list_append (&mca_osc_rdma_component.buffer_gc, buffer);
|
||||
OPAL_THREAD_UNLOCK(&mca_osc_rdma_component.lock);
|
||||
}
|
||||
|
||||
static inline void osc_pt2pt_add_pending (ompi_osc_pt2pt_pending_t *pending)
|
||||
{
|
||||
OPAL_THREAD_SCOPED_LOCK(&mca_osc_pt2pt_component.pending_operations_lock,
|
||||
opal_list_append (&mca_osc_pt2pt_component.pending_operations, &pending->super));
|
||||
}
|
||||
|
||||
#define OSC_PT2PT_FRAG_TAG 0x10000
|
||||
#define OSC_PT2PT_FRAG_MASK 0x0ffff
|
||||
#define OSC_RDMA_FRAG_TAG 0x10000
|
||||
#define OSC_RDMA_FRAG_MASK 0x0ffff
|
||||
|
||||
/**
|
||||
* get_tag:
|
||||
*
|
||||
* @short Get a send/recv tag for large memory operations.
|
||||
*
|
||||
* @param[in] module - OSC PT2PT module
|
||||
* @param[in] module - OSC RDMA module
|
||||
*
|
||||
* @long This function acquires a 16-bit tag for use with large memory operations. The
|
||||
* tag will be odd or even depending on if this is in a passive target access
|
||||
* or not.
|
||||
*/
|
||||
static inline int get_tag(ompi_osc_pt2pt_module_t *module)
|
||||
static inline int get_tag(ompi_osc_rdma_module_t *module)
|
||||
{
|
||||
/* the LSB of the tag is used by the receiver to determine if the
|
||||
message is a passive or active target (ie, where to mark
|
||||
completion). */
|
||||
int tmp = module->tag_counter + !!(module->passive_target_access_epoch);
|
||||
|
||||
module->tag_counter = (module->tag_counter + 2) & OSC_PT2PT_FRAG_MASK;
|
||||
module->tag_counter = (module->tag_counter + 2) & OSC_RDMA_FRAG_MASK;
|
||||
|
||||
return tmp;
|
||||
}
|
||||
|
||||
/**
|
||||
* ompi_osc_pt2pt_accumulate_lock:
|
||||
* ompi_osc_rdma_accumulate_lock:
|
||||
*
|
||||
* @short Internal function that spins until the accumulation lock has
|
||||
* been acquired.
|
||||
*
|
||||
* @param[in] module - OSC PT2PT module
|
||||
* @param[in] module - OSC RDMA module
|
||||
*
|
||||
* @returns 0
|
||||
*
|
||||
@ -663,9 +648,9 @@ static inline int get_tag(ompi_osc_pt2pt_module_t *module)
|
||||
* behavior is only acceptable from a user-level call as blocking in a
|
||||
* callback may cause deadlock. If a callback needs the accumulate lock and
|
||||
* it is not available it should be placed on the pending_acc list of the
|
||||
* module. It will be released by ompi_osc_pt2pt_accumulate_unlock().
|
||||
* module. It will be released by ompi_osc_rdma_accumulate_unlock().
|
||||
*/
|
||||
static inline int ompi_osc_pt2pt_accumulate_lock (ompi_osc_pt2pt_module_t *module)
|
||||
static inline int ompi_osc_rdma_accumulate_lock (ompi_osc_rdma_module_t *module)
|
||||
{
|
||||
while (opal_atomic_trylock (&module->accumulate_lock)) {
|
||||
opal_progress ();
|
||||
@ -675,11 +660,11 @@ static inline int ompi_osc_pt2pt_accumulate_lock (ompi_osc_pt2pt_module_t *modul
|
||||
}
|
||||
|
||||
/**
|
||||
* ompi_osc_pt2pt_accumulate_trylock:
|
||||
* ompi_osc_rdma_accumulate_trylock:
|
||||
*
|
||||
* @short Try to acquire the accumulation lock.
|
||||
*
|
||||
* @param[in] module - OSC PT2PT module
|
||||
* @param[in] module - OSC RDMA module
|
||||
*
|
||||
* @returns 0 if the accumulation lock was acquired
|
||||
* @returns 1 if the lock was not available
|
||||
@ -687,34 +672,34 @@ static inline int ompi_osc_pt2pt_accumulate_lock (ompi_osc_pt2pt_module_t *modul
|
||||
* @long This function will try to acquire the accumulation lock. This function
|
||||
* is safe to call from a callback.
|
||||
*/
|
||||
static inline int ompi_osc_pt2pt_accumulate_trylock (ompi_osc_pt2pt_module_t *module)
|
||||
static inline int ompi_osc_rdma_accumulate_trylock (ompi_osc_rdma_module_t *module)
|
||||
{
|
||||
return opal_atomic_trylock (&module->accumulate_lock);
|
||||
}
|
||||
|
||||
/**
|
||||
* ompi_osc_pt2pt_accumulate_unlock:
|
||||
* ompi_osc_rdma_accumulate_unlock:
|
||||
*
|
||||
* @short Unlock the accumulation lock and release a pending accumulation operation.
|
||||
*
|
||||
* @param[in] module - OSC PT2PT module
|
||||
* @param[in] module - OSC RDMA module
|
||||
*
|
||||
* @long This function unlocks the accumulation lock and releases a single pending
|
||||
* accumulation operation if one exists. This function may be called recursively.
|
||||
*/
|
||||
static inline void ompi_osc_pt2pt_accumulate_unlock (ompi_osc_pt2pt_module_t *module)
|
||||
static inline void ompi_osc_rdma_accumulate_unlock (ompi_osc_rdma_module_t *module)
|
||||
{
|
||||
opal_atomic_unlock (&module->accumulate_lock);
|
||||
if (0 != opal_list_get_size (&module->pending_acc)) {
|
||||
ompi_osc_pt2pt_progress_pending_acc (module);
|
||||
ompi_osc_rdma_progress_pending_acc (module);
|
||||
}
|
||||
}
|
||||
|
||||
static inline bool ompi_osc_pt2pt_check_access_epoch (ompi_osc_pt2pt_module_t *module, int rank)
|
||||
static inline bool ompi_osc_rdma_check_access_epoch (ompi_osc_rdma_module_t *module, int rank)
|
||||
{
|
||||
return module->all_access_epoch || module->peers[rank].access_epoch;
|
||||
}
|
||||
|
||||
END_C_DECLS
|
||||
|
||||
#endif /* OMPI_OSC_PT2PT_H */
|
||||
#endif /* OMPI_OSC_RDMA_H */
|
@ -21,10 +21,10 @@
|
||||
|
||||
#include "ompi_config.h"
|
||||
|
||||
#include "osc_pt2pt.h"
|
||||
#include "osc_pt2pt_header.h"
|
||||
#include "osc_pt2pt_data_move.h"
|
||||
#include "osc_pt2pt_frag.h"
|
||||
#include "osc_rdma.h"
|
||||
#include "osc_rdma_header.h"
|
||||
#include "osc_rdma_data_move.h"
|
||||
#include "osc_rdma_frag.h"
|
||||
|
||||
#include "mpi.h"
|
||||
#include "opal/runtime/opal_progress.h"
|
||||
@ -33,19 +33,19 @@
|
||||
#include "ompi/mca/osc/base/base.h"
|
||||
|
||||
/**
|
||||
* ompi_osc_pt2pt_pending_post_t:
|
||||
* ompi_osc_rdma_pending_post_t:
|
||||
*
|
||||
* Describes a post operation that was encountered outside its
|
||||
* matching start operation.
|
||||
*/
|
||||
struct ompi_osc_pt2pt_pending_post_t {
|
||||
struct ompi_osc_rdma_pending_post_t {
|
||||
opal_list_item_t super;
|
||||
int rank;
|
||||
};
|
||||
typedef struct ompi_osc_pt2pt_pending_post_t ompi_osc_pt2pt_pending_post_t;
|
||||
OBJ_CLASS_DECLARATION(ompi_osc_pt2pt_pending_post_t);
|
||||
typedef struct ompi_osc_rdma_pending_post_t ompi_osc_rdma_pending_post_t;
|
||||
OBJ_CLASS_DECLARATION(ompi_osc_rdma_pending_post_t);
|
||||
|
||||
OBJ_CLASS_INSTANCE(ompi_osc_pt2pt_pending_post_t, opal_list_item_t, NULL, NULL);
|
||||
OBJ_CLASS_INSTANCE(ompi_osc_rdma_pending_post_t, opal_list_item_t, NULL, NULL);
|
||||
|
||||
static bool group_contains_proc (ompi_group_t *group, ompi_proc_t *proc)
|
||||
{
|
||||
@ -64,7 +64,7 @@ static bool group_contains_proc (ompi_group_t *group, ompi_proc_t *proc)
|
||||
}
|
||||
|
||||
static int*
|
||||
get_comm_ranks(ompi_osc_pt2pt_module_t *module,
|
||||
get_comm_ranks(ompi_osc_rdma_module_t *module,
|
||||
ompi_group_t *sub_group)
|
||||
{
|
||||
int *ranks1 = NULL, *ranks2 = NULL;
|
||||
@ -100,14 +100,14 @@ get_comm_ranks(ompi_osc_pt2pt_module_t *module,
|
||||
}
|
||||
|
||||
int
|
||||
ompi_osc_pt2pt_fence(int assert, ompi_win_t *win)
|
||||
ompi_osc_rdma_fence(int assert, ompi_win_t *win)
|
||||
{
|
||||
ompi_osc_pt2pt_module_t *module = GET_MODULE(win);
|
||||
ompi_osc_rdma_module_t *module = GET_MODULE(win);
|
||||
uint32_t incoming_reqs;
|
||||
int ret = OMPI_SUCCESS;
|
||||
|
||||
OPAL_OUTPUT_VERBOSE((25, ompi_osc_base_framework.framework_output,
|
||||
"osc pt2pt: fence start"));
|
||||
"osc rdma: fence start"));
|
||||
|
||||
/* can't enter an active target epoch when in a passive target epoch */
|
||||
if (module->passive_target_access_epoch) {
|
||||
@ -122,36 +122,34 @@ ompi_osc_pt2pt_fence(int assert, ompi_win_t *win)
|
||||
|
||||
/* short-circuit the noprecede case */
|
||||
if (0 != (assert & MPI_MODE_NOPRECEDE)) {
|
||||
ret = module->comm->c_coll.coll_barrier(module->comm,
|
||||
module->comm->c_coll.coll_barrier_module);
|
||||
OPAL_OUTPUT_VERBOSE((50, ompi_osc_base_framework.framework_output,
|
||||
"osc pt2pt: fence end (short circuit)"));
|
||||
"osc rdma: fence end (short circuit)"));
|
||||
return ret;
|
||||
}
|
||||
|
||||
/* try to start all requests. */
|
||||
ret = ompi_osc_pt2pt_frag_flush_all(module);
|
||||
if (OMPI_SUCCESS != ret) {
|
||||
return ret;
|
||||
}
|
||||
/* try to start all the requests. */
|
||||
ret = ompi_osc_rdma_frag_flush_all(module);
|
||||
if (OMPI_SUCCESS != ret) goto cleanup;
|
||||
|
||||
OPAL_OUTPUT_VERBOSE((50, ompi_osc_base_framework.framework_output,
|
||||
"osc pt2pt: fence done sending"));
|
||||
"osc rdma: fence done sending"));
|
||||
|
||||
/* find out how much data everyone is going to send us. */
|
||||
ret = module->comm->c_coll.coll_reduce_scatter_block (module->epoch_outgoing_frag_count,
|
||||
&incoming_reqs, 1, MPI_UINT32_T,
|
||||
MPI_SUM, module->comm,
|
||||
module->comm->c_coll.coll_reduce_scatter_block_module);
|
||||
if (OMPI_SUCCESS != ret) {
|
||||
OPAL_THREAD_UNLOCK(&module->lock);
|
||||
return ret;
|
||||
}
|
||||
if (OMPI_SUCCESS != ret) goto cleanup;
|
||||
|
||||
OPAL_THREAD_LOCK(&module->lock);
|
||||
|
||||
bzero(module->epoch_outgoing_frag_count,
|
||||
sizeof(uint32_t) * ompi_comm_size(module->comm));
|
||||
|
||||
OPAL_OUTPUT_VERBOSE((50, ompi_osc_base_framework.framework_output,
|
||||
"osc pt2pt: fence expects %d requests",
|
||||
"osc rdma: fence expects %d requests",
|
||||
incoming_reqs));
|
||||
|
||||
/* set our complete condition for incoming requests */
|
||||
@ -163,31 +161,32 @@ ompi_osc_pt2pt_fence(int assert, ompi_win_t *win)
|
||||
opal_condition_wait(&module->cond, &module->lock);
|
||||
}
|
||||
|
||||
module->active_incoming_frag_signal_count = 0;
|
||||
ret = OMPI_SUCCESS;
|
||||
|
||||
if (assert & MPI_MODE_NOSUCCEED) {
|
||||
/* as specified in MPI-3 p 438 3-5 the fence can end an epoch. it isn't explicitly
|
||||
* stated that MPI_MODE_NOSUCCEED ends the epoch but it is a safe assumption. */
|
||||
module->active_eager_send_active = false;
|
||||
module->all_access_epoch = false;
|
||||
}
|
||||
opal_condition_broadcast (&module->cond);
|
||||
}
|
||||
|
||||
cleanup:
|
||||
OPAL_OUTPUT_VERBOSE((25, ompi_osc_base_framework.framework_output,
|
||||
"osc rdma: fence end: %d", ret));
|
||||
|
||||
OPAL_THREAD_UNLOCK(&module->lock);
|
||||
|
||||
OPAL_OUTPUT_VERBOSE((25, ompi_osc_base_framework.framework_output,
|
||||
"osc pt2pt: fence end: %d", ret));
|
||||
|
||||
return OMPI_SUCCESS;
|
||||
return ret;
|
||||
}
|
||||
|
||||
|
||||
int
|
||||
ompi_osc_pt2pt_start(ompi_group_t *group,
|
||||
ompi_osc_rdma_start(ompi_group_t *group,
|
||||
int assert,
|
||||
ompi_win_t *win)
|
||||
{
|
||||
ompi_osc_pt2pt_module_t *module = GET_MODULE(win);
|
||||
ompi_osc_pt2pt_pending_post_t *pending_post, *next;
|
||||
ompi_osc_rdma_module_t *module = GET_MODULE(win);
|
||||
ompi_osc_rdma_pending_post_t *pending_post, *next;
|
||||
int group_size;
|
||||
int *ranks;
|
||||
|
||||
@ -210,7 +209,7 @@ ompi_osc_pt2pt_start(ompi_group_t *group,
|
||||
group_size = ompi_group_size (module->sc_group);
|
||||
|
||||
OPAL_OUTPUT_VERBOSE((50, ompi_osc_base_framework.framework_output,
|
||||
"ompi_osc_pt2pt_start entering with group size %d...",
|
||||
"ompi_osc_rdma_start entering with group size %d...",
|
||||
group_size));
|
||||
|
||||
ranks = get_comm_ranks(module, module->sc_group);
|
||||
@ -223,17 +222,13 @@ ompi_osc_pt2pt_start(ompi_group_t *group,
|
||||
|
||||
free (ranks);
|
||||
|
||||
OPAL_LIST_FOREACH_SAFE(pending_post, next, &module->pending_posts, ompi_osc_pt2pt_pending_post_t) {
|
||||
OPAL_LIST_FOREACH_SAFE(pending_post, next, &module->pending_posts, ompi_osc_rdma_pending_post_t) {
|
||||
ompi_proc_t *pending_proc = ompi_comm_peer_lookup (module->comm, pending_post->rank);
|
||||
|
||||
if (group_contains_proc (module->sc_group, pending_proc)) {
|
||||
ompi_osc_pt2pt_peer_t *peer = module->peers + pending_post->rank;
|
||||
|
||||
OPAL_OUTPUT_VERBOSE((50, ompi_osc_base_framework.framework_output, "Consumed unexpected post message from %d",
|
||||
pending_post->rank));
|
||||
++module->num_post_msgs;
|
||||
peer->eager_send_active = true;
|
||||
|
||||
opal_list_remove_item (&module->pending_posts, &pending_post->super);
|
||||
OBJ_RELEASE(pending_post);
|
||||
}
|
||||
@ -259,7 +254,7 @@ ompi_osc_pt2pt_start(ompi_group_t *group,
|
||||
}
|
||||
|
||||
OPAL_OUTPUT_VERBOSE((50, ompi_osc_base_framework.framework_output,
|
||||
"ompi_osc_pt2pt_start complete"));
|
||||
"ompi_osc_rdma_start complete"));
|
||||
|
||||
OPAL_THREAD_UNLOCK(&module->lock);
|
||||
return OMPI_SUCCESS;
|
||||
@ -267,11 +262,11 @@ ompi_osc_pt2pt_start(ompi_group_t *group,
|
||||
|
||||
|
||||
int
|
||||
ompi_osc_pt2pt_complete(ompi_win_t *win)
|
||||
ompi_osc_rdma_complete(ompi_win_t *win)
|
||||
{
|
||||
ompi_osc_pt2pt_module_t *module = GET_MODULE(win);
|
||||
ompi_osc_pt2pt_header_complete_t complete_req;
|
||||
ompi_osc_pt2pt_peer_t *peer;
|
||||
ompi_osc_rdma_module_t *module = GET_MODULE(win);
|
||||
ompi_osc_rdma_header_complete_t complete_req;
|
||||
ompi_osc_rdma_peer_t *peer;
|
||||
int ret = OMPI_SUCCESS;
|
||||
int i;
|
||||
int *ranks = NULL;
|
||||
@ -279,7 +274,7 @@ ompi_osc_pt2pt_complete(ompi_win_t *win)
|
||||
int my_rank = ompi_comm_rank (module->comm);
|
||||
|
||||
OPAL_OUTPUT_VERBOSE((50, ompi_osc_base_framework.framework_output,
|
||||
"ompi_osc_pt2pt_complete entering..."));
|
||||
"ompi_osc_rdma_complete entering..."));
|
||||
|
||||
if (NULL == module->sc_group) {
|
||||
return OMPI_ERR_RMA_SYNC;
|
||||
@ -296,10 +291,9 @@ ompi_osc_pt2pt_complete(ompi_win_t *win)
|
||||
"waiting for post messages. num_post_msgs = %d", module->num_post_msgs));
|
||||
opal_condition_wait(&module->cond, &module->lock);
|
||||
}
|
||||
OPAL_THREAD_UNLOCK(&module->lock);
|
||||
|
||||
OPAL_OUTPUT_VERBOSE((50, ompi_osc_base_framework.framework_output,
|
||||
"ompi_osc_pt2pt_complete sending complete message"));
|
||||
"ompi_osc_rdma_complete sending complete message"));
|
||||
|
||||
/* for each process in group, send a control message with number
|
||||
of updates coming, then start all the requests. Note that the
|
||||
@ -308,39 +302,38 @@ ompi_osc_pt2pt_complete(ompi_win_t *win)
|
||||
|
||||
At the same time, clean out the outgoing count for the next
|
||||
round. */
|
||||
OPAL_THREAD_UNLOCK(&module->lock);
|
||||
for (i = 0 ; i < ompi_group_size(module->sc_group) ; ++i) {
|
||||
if (my_rank == ranks[i]) {
|
||||
/* shortcut for self */
|
||||
OPAL_OUTPUT_VERBOSE((50, ompi_osc_base_framework.framework_output, "ompi_osc_pt2pt_complete self complete"));
|
||||
OPAL_OUTPUT_VERBOSE((50, ompi_osc_base_framework.framework_output, "ompi_osc_rdma_complete self complete"));
|
||||
module->num_complete_msgs++;
|
||||
continue;
|
||||
}
|
||||
|
||||
complete_req.base.type = OMPI_OSC_PT2PT_HDR_TYPE_COMPLETE;
|
||||
complete_req.base.flags = OMPI_OSC_PT2PT_HDR_FLAG_VALID;
|
||||
complete_req.base.type = OMPI_OSC_RDMA_HDR_TYPE_COMPLETE;
|
||||
complete_req.base.flags = OMPI_OSC_RDMA_HDR_FLAG_VALID;
|
||||
complete_req.frag_count = module->epoch_outgoing_frag_count[ranks[i]];
|
||||
|
||||
peer = module->peers + ranks[i];
|
||||
|
||||
peer->access_epoch = false;
|
||||
|
||||
ret = ompi_osc_pt2pt_control_send(module,
|
||||
ret = ompi_osc_rdma_control_send(module,
|
||||
ranks[i],
|
||||
&complete_req,
|
||||
sizeof(ompi_osc_pt2pt_header_complete_t));
|
||||
sizeof(ompi_osc_rdma_header_complete_t));
|
||||
if (OMPI_SUCCESS != ret) goto cleanup;
|
||||
}
|
||||
OPAL_THREAD_LOCK(&module->lock);
|
||||
|
||||
/* start all requests */
|
||||
ret = ompi_osc_pt2pt_frag_flush_all(module);
|
||||
ret = ompi_osc_rdma_frag_flush_all(module);
|
||||
if (OMPI_SUCCESS != ret) goto cleanup;
|
||||
|
||||
OPAL_THREAD_LOCK(&module->lock);
|
||||
/* zero the fragment counts here to ensure they are zeroed */
|
||||
for (i = 0 ; i < ompi_group_size(module->sc_group) ; ++i) {
|
||||
peer = module->peers + ranks[i];
|
||||
module->epoch_outgoing_frag_count[ranks[i]] = 0;
|
||||
peer->eager_send_active = false;
|
||||
}
|
||||
|
||||
/* wait for outgoing requests to complete. Don't wait for incoming, as
|
||||
@ -354,14 +347,14 @@ ompi_osc_pt2pt_complete(ompi_win_t *win)
|
||||
module->sc_group = NULL;
|
||||
|
||||
/* unlock here, as group cleanup can take a while... */
|
||||
OPAL_THREAD_UNLOCK(&module->lock);
|
||||
OPAL_THREAD_UNLOCK(&(module->lock));
|
||||
|
||||
/* phase 2 cleanup group */
|
||||
ompi_group_decrement_proc_count(group);
|
||||
OBJ_RELEASE(group);
|
||||
|
||||
OPAL_OUTPUT_VERBOSE((50, ompi_osc_base_framework.framework_output,
|
||||
"ompi_osc_pt2pt_complete complete"));
|
||||
"ompi_osc_rdma_complete complete"));
|
||||
free (ranks);
|
||||
|
||||
return OMPI_SUCCESS;
|
||||
@ -369,19 +362,21 @@ ompi_osc_pt2pt_complete(ompi_win_t *win)
|
||||
cleanup:
|
||||
if (NULL != ranks) free(ranks);
|
||||
|
||||
OPAL_THREAD_UNLOCK(&(module->lock));
|
||||
|
||||
return ret;
|
||||
}
|
||||
|
||||
|
||||
int
|
||||
ompi_osc_pt2pt_post(ompi_group_t *group,
|
||||
ompi_osc_rdma_post(ompi_group_t *group,
|
||||
int assert,
|
||||
ompi_win_t *win)
|
||||
{
|
||||
int *ranks;
|
||||
int ret = OMPI_SUCCESS;
|
||||
ompi_osc_pt2pt_module_t *module = GET_MODULE(win);
|
||||
ompi_osc_pt2pt_header_post_t post_req;
|
||||
ompi_osc_rdma_module_t *module = GET_MODULE(win);
|
||||
ompi_osc_rdma_header_post_t post_req;
|
||||
int my_rank = ompi_comm_rank(module->comm);
|
||||
|
||||
/* can't check for all access epoch here due to fence */
|
||||
@ -390,7 +385,7 @@ ompi_osc_pt2pt_post(ompi_group_t *group,
|
||||
}
|
||||
|
||||
OPAL_OUTPUT_VERBOSE((50, ompi_osc_base_framework.framework_output,
|
||||
"ompi_osc_pt2pt_post entering with group size %d...",
|
||||
"ompi_osc_rdma_post entering with group size %d...",
|
||||
ompi_group_size (group)));
|
||||
|
||||
/* save the group */
|
||||
@ -427,19 +422,19 @@ ompi_osc_pt2pt_post(ompi_group_t *group,
|
||||
|
||||
/* shortcut for self */
|
||||
if (my_rank == ranks[i]) {
|
||||
OPAL_OUTPUT_VERBOSE((50, ompi_osc_base_framework.framework_output, "ompi_osc_pt2pt_complete self post"));
|
||||
osc_pt2pt_incoming_post (module, my_rank);
|
||||
OPAL_OUTPUT_VERBOSE((50, ompi_osc_base_framework.framework_output, "ompi_osc_rdma_complete self post"));
|
||||
osc_rdma_incoming_post (module, my_rank);
|
||||
continue;
|
||||
}
|
||||
|
||||
post_req.base.type = OMPI_OSC_PT2PT_HDR_TYPE_POST;
|
||||
post_req.base.flags = OMPI_OSC_PT2PT_HDR_FLAG_VALID;
|
||||
post_req.base.type = OMPI_OSC_RDMA_HDR_TYPE_POST;
|
||||
post_req.base.flags = OMPI_OSC_RDMA_HDR_FLAG_VALID;
|
||||
post_req.windx = ompi_comm_get_cid(module->comm);
|
||||
|
||||
/* we don't want to send any data, since we're the exposure
|
||||
epoch only, so use an unbuffered send */
|
||||
ret = ompi_osc_pt2pt_control_send_unbuffered(module, ranks[i], &post_req,
|
||||
sizeof(ompi_osc_pt2pt_header_post_t));
|
||||
ret = ompi_osc_rdma_control_send_unbuffered(module, ranks[i], &post_req,
|
||||
sizeof(ompi_osc_rdma_header_post_t));
|
||||
if (OMPI_SUCCESS != ret) {
|
||||
break;
|
||||
}
|
||||
@ -452,9 +447,9 @@ ompi_osc_pt2pt_post(ompi_group_t *group,
|
||||
|
||||
|
||||
int
|
||||
ompi_osc_pt2pt_wait(ompi_win_t *win)
|
||||
ompi_osc_rdma_wait(ompi_win_t *win)
|
||||
{
|
||||
ompi_osc_pt2pt_module_t *module = GET_MODULE(win);
|
||||
ompi_osc_rdma_module_t *module = GET_MODULE(win);
|
||||
ompi_group_t *group;
|
||||
|
||||
if (NULL == module->pw_group) {
|
||||
@ -462,7 +457,7 @@ ompi_osc_pt2pt_wait(ompi_win_t *win)
|
||||
}
|
||||
|
||||
OPAL_OUTPUT_VERBOSE((25, ompi_osc_base_framework.framework_output,
|
||||
"ompi_osc_pt2pt_wait entering..."));
|
||||
"ompi_osc_rdma_wait entering..."));
|
||||
|
||||
OPAL_THREAD_LOCK(&module->lock);
|
||||
while (0 != module->num_complete_msgs ||
|
||||
@ -481,17 +476,17 @@ ompi_osc_pt2pt_wait(ompi_win_t *win)
|
||||
OBJ_RELEASE(group);
|
||||
|
||||
OPAL_OUTPUT_VERBOSE((25, ompi_osc_base_framework.framework_output,
|
||||
"ompi_osc_pt2pt_wait complete"));
|
||||
"ompi_osc_rdma_wait complete"));
|
||||
|
||||
return OMPI_SUCCESS;
|
||||
}
|
||||
|
||||
|
||||
int
|
||||
ompi_osc_pt2pt_test(ompi_win_t *win,
|
||||
ompi_osc_rdma_test(ompi_win_t *win,
|
||||
int *flag)
|
||||
{
|
||||
ompi_osc_pt2pt_module_t *module = GET_MODULE(win);
|
||||
ompi_osc_rdma_module_t *module = GET_MODULE(win);
|
||||
ompi_group_t *group;
|
||||
int ret = OMPI_SUCCESS;
|
||||
|
||||
@ -509,6 +504,7 @@ ompi_osc_pt2pt_test(ompi_win_t *win,
|
||||
module->active_incoming_frag_count != module->active_incoming_frag_signal_count) {
|
||||
*flag = 0;
|
||||
ret = OMPI_SUCCESS;
|
||||
goto cleanup;
|
||||
} else {
|
||||
*flag = 1;
|
||||
|
||||
@ -523,21 +519,21 @@ ompi_osc_pt2pt_test(ompi_win_t *win,
|
||||
return OMPI_SUCCESS;
|
||||
}
|
||||
|
||||
cleanup:
|
||||
OPAL_THREAD_UNLOCK(&(module->lock));
|
||||
|
||||
return ret;
|
||||
}
|
||||
|
||||
int osc_pt2pt_incoming_post (ompi_osc_pt2pt_module_t *module, int source)
|
||||
int osc_rdma_incoming_post (ompi_osc_rdma_module_t *module, int source)
|
||||
{
|
||||
ompi_proc_t *source_proc = ompi_comm_peer_lookup (module->comm, source);
|
||||
ompi_osc_pt2pt_peer_t *peer = module->peers + source;
|
||||
|
||||
OPAL_THREAD_LOCK(&module->lock);
|
||||
|
||||
/* verify that this proc is part of the current start group */
|
||||
if (!module->sc_group || !group_contains_proc (module->sc_group, source_proc)) {
|
||||
ompi_osc_pt2pt_pending_post_t *pending_post = OBJ_NEW(ompi_osc_pt2pt_pending_post_t);
|
||||
ompi_osc_rdma_pending_post_t *pending_post = OBJ_NEW(ompi_osc_rdma_pending_post_t);
|
||||
|
||||
OPAL_OUTPUT_VERBOSE((50, ompi_osc_base_framework.framework_output,
|
||||
"received unexpected post message from %d. module->sc_group = %p, size = %d",
|
||||
@ -551,9 +547,6 @@ int osc_pt2pt_incoming_post (ompi_osc_pt2pt_module_t *module, int source)
|
||||
return OMPI_SUCCESS;
|
||||
}
|
||||
|
||||
assert (!peer->eager_send_active);
|
||||
peer->eager_send_active = true;
|
||||
|
||||
module->num_post_msgs++;
|
||||
OPAL_OUTPUT_VERBOSE((50, ompi_osc_base_framework.framework_output,
|
||||
"received post message. num_post_msgs = %d", module->num_post_msgs));
|
Разница между файлами не показана из-за своего большого размера
Загрузить разницу
@ -25,10 +25,10 @@
|
||||
|
||||
#include <string.h>
|
||||
|
||||
#include "osc_pt2pt.h"
|
||||
#include "osc_pt2pt_data_move.h"
|
||||
#include "osc_pt2pt_frag.h"
|
||||
#include "osc_pt2pt_request.h"
|
||||
#include "osc_rdma.h"
|
||||
#include "osc_rdma_data_move.h"
|
||||
#include "osc_rdma_frag.h"
|
||||
#include "osc_rdma_request.h"
|
||||
|
||||
#include "opal/threads/condition.h"
|
||||
#include "opal/threads/mutex.h"
|
||||
@ -55,11 +55,11 @@ static int component_select(struct ompi_win_t *win, void **base, size_t size, in
|
||||
struct ompi_communicator_t *comm, struct ompi_info_t *info,
|
||||
int flavor, int *model);
|
||||
|
||||
ompi_osc_pt2pt_component_t mca_osc_pt2pt_component = {
|
||||
ompi_osc_rdma_component_t mca_osc_rdma_component = {
|
||||
{ /* ompi_osc_base_component_t */
|
||||
{ /* ompi_base_component_t */
|
||||
OMPI_OSC_BASE_VERSION_3_0_0,
|
||||
"pt2pt",
|
||||
"rdma",
|
||||
OMPI_MAJOR_VERSION, /* MCA component major version */
|
||||
OMPI_MINOR_VERSION, /* MCA component minor version */
|
||||
OMPI_RELEASE_VERSION, /* MCA component release version */
|
||||
@ -80,51 +80,51 @@ ompi_osc_pt2pt_component_t mca_osc_pt2pt_component = {
|
||||
};
|
||||
|
||||
|
||||
ompi_osc_pt2pt_module_t ompi_osc_pt2pt_module_template = {
|
||||
ompi_osc_rdma_module_t ompi_osc_rdma_module_template = {
|
||||
{
|
||||
NULL, /* shared_query */
|
||||
|
||||
ompi_osc_pt2pt_attach,
|
||||
ompi_osc_pt2pt_detach,
|
||||
ompi_osc_pt2pt_free,
|
||||
ompi_osc_rdma_attach,
|
||||
ompi_osc_rdma_detach,
|
||||
ompi_osc_rdma_free,
|
||||
|
||||
ompi_osc_pt2pt_put,
|
||||
ompi_osc_pt2pt_get,
|
||||
ompi_osc_pt2pt_accumulate,
|
||||
ompi_osc_pt2pt_compare_and_swap,
|
||||
ompi_osc_pt2pt_fetch_and_op,
|
||||
ompi_osc_pt2pt_get_accumulate,
|
||||
ompi_osc_rdma_put,
|
||||
ompi_osc_rdma_get,
|
||||
ompi_osc_rdma_accumulate,
|
||||
ompi_osc_rdma_compare_and_swap,
|
||||
ompi_osc_rdma_fetch_and_op,
|
||||
ompi_osc_rdma_get_accumulate,
|
||||
|
||||
ompi_osc_pt2pt_rput,
|
||||
ompi_osc_pt2pt_rget,
|
||||
ompi_osc_pt2pt_raccumulate,
|
||||
ompi_osc_pt2pt_rget_accumulate,
|
||||
ompi_osc_rdma_rput,
|
||||
ompi_osc_rdma_rget,
|
||||
ompi_osc_rdma_raccumulate,
|
||||
ompi_osc_rdma_rget_accumulate,
|
||||
|
||||
ompi_osc_pt2pt_fence,
|
||||
ompi_osc_rdma_fence,
|
||||
|
||||
ompi_osc_pt2pt_start,
|
||||
ompi_osc_pt2pt_complete,
|
||||
ompi_osc_pt2pt_post,
|
||||
ompi_osc_pt2pt_wait,
|
||||
ompi_osc_pt2pt_test,
|
||||
ompi_osc_rdma_start,
|
||||
ompi_osc_rdma_complete,
|
||||
ompi_osc_rdma_post,
|
||||
ompi_osc_rdma_wait,
|
||||
ompi_osc_rdma_test,
|
||||
|
||||
ompi_osc_pt2pt_lock,
|
||||
ompi_osc_pt2pt_unlock,
|
||||
ompi_osc_pt2pt_lock_all,
|
||||
ompi_osc_pt2pt_unlock_all,
|
||||
ompi_osc_rdma_lock,
|
||||
ompi_osc_rdma_unlock,
|
||||
ompi_osc_rdma_lock_all,
|
||||
ompi_osc_rdma_unlock_all,
|
||||
|
||||
ompi_osc_pt2pt_sync,
|
||||
ompi_osc_pt2pt_flush,
|
||||
ompi_osc_pt2pt_flush_all,
|
||||
ompi_osc_pt2pt_flush_local,
|
||||
ompi_osc_pt2pt_flush_local_all,
|
||||
ompi_osc_rdma_sync,
|
||||
ompi_osc_rdma_flush,
|
||||
ompi_osc_rdma_flush_all,
|
||||
ompi_osc_rdma_flush_local,
|
||||
ompi_osc_rdma_flush_local_all,
|
||||
|
||||
ompi_osc_pt2pt_set_info,
|
||||
ompi_osc_pt2pt_get_info
|
||||
ompi_osc_rdma_set_info,
|
||||
ompi_osc_rdma_get_info
|
||||
}
|
||||
};
|
||||
|
||||
bool ompi_osc_pt2pt_no_locks;
|
||||
bool ompi_osc_rdma_no_locks;
|
||||
|
||||
/* look up parameters for configuring this window. The code first
|
||||
looks in the info structure passed by the user, then through mca
|
||||
@ -157,7 +157,7 @@ check_config_value_bool(char *key, ompi_info_t *info)
|
||||
return result;
|
||||
|
||||
info_not_found:
|
||||
param = mca_base_var_find("ompi", "osc", "pt2pt", key);
|
||||
param = mca_base_var_find("ompi", "osc", "rdma", key);
|
||||
if (0 > param) return false;
|
||||
|
||||
ret = mca_base_var_get_value(param, &flag_value, NULL, NULL);
|
||||
@ -177,8 +177,8 @@ component_open(void)
|
||||
static int
|
||||
component_register(void)
|
||||
{
|
||||
ompi_osc_pt2pt_no_locks = false;
|
||||
(void) mca_base_component_var_register(&mca_osc_pt2pt_component.super.osc_version,
|
||||
ompi_osc_rdma_no_locks = false;
|
||||
(void) mca_base_component_var_register(&mca_osc_rdma_component.super.osc_version,
|
||||
"no_locks",
|
||||
"Enable optimizations available only if MPI_LOCK is "
|
||||
"not used. "
|
||||
@ -186,39 +186,38 @@ component_register(void)
|
||||
MCA_BASE_VAR_TYPE_BOOL, NULL, 0, 0,
|
||||
OPAL_INFO_LVL_9,
|
||||
MCA_BASE_VAR_SCOPE_READONLY,
|
||||
&ompi_osc_pt2pt_no_locks);
|
||||
&ompi_osc_rdma_no_locks);
|
||||
|
||||
mca_osc_pt2pt_component.buffer_size = 8192;
|
||||
(void) mca_base_component_var_register (&mca_osc_pt2pt_component.super.osc_version, "buffer_size",
|
||||
mca_osc_rdma_component.buffer_size = 8192;
|
||||
(void) mca_base_component_var_register (&mca_osc_rdma_component.super.osc_version, "buffer_size",
|
||||
"Data transfers smaller than this limit may be coalesced before "
|
||||
"being transferred (default: 8k)", MCA_BASE_VAR_TYPE_UNSIGNED_INT,
|
||||
NULL, 0, 0, OPAL_INFO_LVL_9, MCA_BASE_VAR_SCOPE_READONLY,
|
||||
&mca_osc_pt2pt_component.buffer_size);
|
||||
&mca_osc_rdma_component.buffer_size);
|
||||
|
||||
return OMPI_SUCCESS;
|
||||
}
|
||||
|
||||
static int component_progress (void)
|
||||
{
|
||||
int count = opal_list_get_size (&mca_osc_pt2pt_component.pending_operations);
|
||||
ompi_osc_pt2pt_pending_t *pending, *next;
|
||||
ompi_osc_rdma_pending_t *pending, *next;
|
||||
|
||||
if (0 == count) {
|
||||
if (0 == opal_list_get_size (&mca_osc_rdma_component.pending_operations)) {
|
||||
return 0;
|
||||
}
|
||||
|
||||
/* process one incoming request */
|
||||
OPAL_THREAD_LOCK(&mca_osc_pt2pt_component.pending_operations_lock);
|
||||
OPAL_LIST_FOREACH_SAFE(pending, next, &mca_osc_pt2pt_component.pending_operations, ompi_osc_pt2pt_pending_t) {
|
||||
OPAL_THREAD_LOCK(&mca_osc_rdma_component.lock);
|
||||
OPAL_LIST_FOREACH_SAFE(pending, next, &mca_osc_rdma_component.pending_operations, ompi_osc_rdma_pending_t) {
|
||||
int ret;
|
||||
|
||||
switch (pending->header.base.type) {
|
||||
case OMPI_OSC_PT2PT_HDR_TYPE_FLUSH_REQ:
|
||||
ret = ompi_osc_pt2pt_process_flush (pending->module, pending->source,
|
||||
case OMPI_OSC_RDMA_HDR_TYPE_FLUSH_REQ:
|
||||
ret = ompi_osc_rdma_process_flush (pending->module, pending->source,
|
||||
&pending->header.flush);
|
||||
break;
|
||||
case OMPI_OSC_PT2PT_HDR_TYPE_UNLOCK_REQ:
|
||||
ret = ompi_osc_pt2pt_process_unlock (pending->module, pending->source,
|
||||
case OMPI_OSC_RDMA_HDR_TYPE_UNLOCK_REQ:
|
||||
ret = ompi_osc_rdma_process_unlock (pending->module, pending->source,
|
||||
&pending->header.unlock);
|
||||
break;
|
||||
default:
|
||||
@ -228,11 +227,11 @@ static int component_progress (void)
|
||||
}
|
||||
|
||||
if (OMPI_SUCCESS == ret) {
|
||||
opal_list_remove_item (&mca_osc_pt2pt_component.pending_operations, &pending->super);
|
||||
opal_list_remove_item (&mca_osc_rdma_component.pending_operations, &pending->super);
|
||||
OBJ_RELEASE(pending);
|
||||
}
|
||||
}
|
||||
OPAL_THREAD_UNLOCK(&mca_osc_pt2pt_component.pending_operations_lock);
|
||||
OPAL_THREAD_UNLOCK(&mca_osc_rdma_component.lock);
|
||||
|
||||
return 1;
|
||||
}
|
||||
@ -243,24 +242,23 @@ component_init(bool enable_progress_threads,
|
||||
{
|
||||
int ret;
|
||||
|
||||
OBJ_CONSTRUCT(&mca_osc_pt2pt_component.lock, opal_mutex_t);
|
||||
OBJ_CONSTRUCT(&mca_osc_pt2pt_component.pending_operations, opal_list_t);
|
||||
OBJ_CONSTRUCT(&mca_osc_pt2pt_component.pending_operations_lock, opal_mutex_t);
|
||||
OBJ_CONSTRUCT(&mca_osc_rdma_component.lock, opal_mutex_t);
|
||||
OBJ_CONSTRUCT(&mca_osc_rdma_component.pending_operations, opal_list_t);
|
||||
OBJ_CONSTRUCT(&mca_osc_rdma_component.request_gc, opal_list_t);
|
||||
OBJ_CONSTRUCT(&mca_osc_rdma_component.buffer_gc, opal_list_t);
|
||||
|
||||
OBJ_CONSTRUCT(&mca_osc_pt2pt_component.modules,
|
||||
OBJ_CONSTRUCT(&mca_osc_rdma_component.modules,
|
||||
opal_hash_table_t);
|
||||
opal_hash_table_init(&mca_osc_pt2pt_component.modules, 2);
|
||||
opal_hash_table_init(&mca_osc_rdma_component.modules, 2);
|
||||
|
||||
mca_osc_pt2pt_component.progress_enable = false;
|
||||
mca_osc_pt2pt_component.module_count = 0;
|
||||
mca_osc_rdma_component.progress_enable = false;
|
||||
mca_osc_rdma_component.module_count = 0;
|
||||
|
||||
OBJ_CONSTRUCT(&mca_osc_pt2pt_component.frags, ompi_free_list_t);
|
||||
ret = ompi_free_list_init_new (&mca_osc_pt2pt_component.frags,
|
||||
sizeof(ompi_osc_pt2pt_frag_t), 8,
|
||||
OBJ_CLASS(ompi_osc_pt2pt_frag_t),
|
||||
mca_osc_pt2pt_component.buffer_size +
|
||||
sizeof (ompi_osc_pt2pt_frag_header_t),
|
||||
8, 1, -1, 1, 0);
|
||||
OBJ_CONSTRUCT(&mca_osc_rdma_component.frags, opal_free_list_t);
|
||||
ret = opal_free_list_init(&mca_osc_rdma_component.frags,
|
||||
sizeof(ompi_osc_rdma_frag_t),
|
||||
OBJ_CLASS(ompi_osc_rdma_frag_t),
|
||||
1, -1, 1);
|
||||
if (OMPI_SUCCESS != ret) {
|
||||
opal_output_verbose(1, ompi_osc_base_framework.framework_output,
|
||||
"%s:%d: ompi_free_list_init failed: %d",
|
||||
@ -268,10 +266,10 @@ component_init(bool enable_progress_threads,
|
||||
return ret;
|
||||
}
|
||||
|
||||
OBJ_CONSTRUCT(&mca_osc_pt2pt_component.requests, ompi_free_list_t);
|
||||
ret = ompi_free_list_init(&mca_osc_pt2pt_component.requests,
|
||||
sizeof(ompi_osc_pt2pt_request_t),
|
||||
OBJ_CLASS(ompi_osc_pt2pt_request_t),
|
||||
OBJ_CONSTRUCT(&mca_osc_rdma_component.requests, ompi_free_list_t);
|
||||
ret = ompi_free_list_init(&mca_osc_rdma_component.requests,
|
||||
sizeof(ompi_osc_rdma_request_t),
|
||||
OBJ_CLASS(ompi_osc_rdma_request_t),
|
||||
0, -1, 32, NULL);
|
||||
if (OMPI_SUCCESS != ret) {
|
||||
opal_output_verbose(1, ompi_osc_base_framework.framework_output,
|
||||
@ -289,23 +287,24 @@ component_finalize(void)
|
||||
{
|
||||
size_t num_modules;
|
||||
|
||||
if (mca_osc_pt2pt_component.progress_enable) {
|
||||
if (mca_osc_rdma_component.progress_enable) {
|
||||
opal_progress_unregister (component_progress);
|
||||
}
|
||||
|
||||
if (0 !=
|
||||
(num_modules = opal_hash_table_get_size(&mca_osc_pt2pt_component.modules))) {
|
||||
(num_modules = opal_hash_table_get_size(&mca_osc_rdma_component.modules))) {
|
||||
opal_output(ompi_osc_base_framework.framework_output,
|
||||
"WARNING: There were %d Windows created but not freed.",
|
||||
(int) num_modules);
|
||||
}
|
||||
|
||||
OBJ_DESTRUCT(&mca_osc_pt2pt_component.frags);
|
||||
OBJ_DESTRUCT(&mca_osc_pt2pt_component.modules);
|
||||
OBJ_DESTRUCT(&mca_osc_pt2pt_component.lock);
|
||||
OBJ_DESTRUCT(&mca_osc_pt2pt_component.requests);
|
||||
OBJ_DESTRUCT(&mca_osc_pt2pt_component.pending_operations);
|
||||
OBJ_DESTRUCT(&mca_osc_pt2pt_component.pending_operations_lock);
|
||||
OBJ_DESTRUCT(&mca_osc_rdma_component.frags);
|
||||
OBJ_DESTRUCT(&mca_osc_rdma_component.modules);
|
||||
OBJ_DESTRUCT(&mca_osc_rdma_component.lock);
|
||||
OBJ_DESTRUCT(&mca_osc_rdma_component.requests);
|
||||
OBJ_DESTRUCT(&mca_osc_rdma_component.pending_operations);
|
||||
OBJ_DESTRUCT(&mca_osc_rdma_component.request_gc);
|
||||
OBJ_DESTRUCT(&mca_osc_rdma_component.buffer_gc);
|
||||
|
||||
return OMPI_SUCCESS;
|
||||
}
|
||||
@ -327,21 +326,27 @@ component_select(struct ompi_win_t *win, void **base, size_t size, int disp_unit
|
||||
struct ompi_communicator_t *comm, struct ompi_info_t *info,
|
||||
int flavor, int *model)
|
||||
{
|
||||
ompi_osc_pt2pt_module_t *module = NULL;
|
||||
ompi_osc_rdma_module_t *module = NULL;
|
||||
int ret;
|
||||
char *name;
|
||||
bool no_locks = false;
|
||||
|
||||
/* We don't support shared windows; that's for the sm onesided
|
||||
component */
|
||||
if (MPI_WIN_FLAVOR_SHARED == flavor) return OMPI_ERR_NOT_SUPPORTED;
|
||||
|
||||
if (check_config_value_bool("no_locks", info)) {
|
||||
no_locks = true;
|
||||
ompi_osc_rdma_no_locks = true;
|
||||
}
|
||||
|
||||
/* create module structure with all fields initialized to zero */
|
||||
module = (ompi_osc_pt2pt_module_t*)
|
||||
calloc(1, sizeof(ompi_osc_pt2pt_module_t));
|
||||
module = (ompi_osc_rdma_module_t*)
|
||||
calloc(1, sizeof(ompi_osc_rdma_module_t));
|
||||
if (NULL == module) return OMPI_ERR_TEMP_OUT_OF_RESOURCE;
|
||||
|
||||
/* fill in the function pointer part */
|
||||
memcpy(module, &ompi_osc_pt2pt_module_template,
|
||||
memcpy(module, &ompi_osc_rdma_module_template,
|
||||
sizeof(ompi_osc_base_module_t));
|
||||
|
||||
/* initialize the objects, so that always free in cleanup */
|
||||
@ -349,15 +354,10 @@ component_select(struct ompi_win_t *win, void **base, size_t size, int disp_unit
|
||||
OBJ_CONSTRUCT(&module->cond, opal_condition_t);
|
||||
OBJ_CONSTRUCT(&module->acc_lock, opal_mutex_t);
|
||||
OBJ_CONSTRUCT(&module->queued_frags, opal_list_t);
|
||||
OBJ_CONSTRUCT(&module->queued_frags_lock, opal_mutex_t);
|
||||
OBJ_CONSTRUCT(&module->locks_pending, opal_list_t);
|
||||
OBJ_CONSTRUCT(&module->locks_pending_lock, opal_mutex_t);
|
||||
OBJ_CONSTRUCT(&module->outstanding_locks, opal_list_t);
|
||||
OBJ_CONSTRUCT(&module->pending_acc, opal_list_t);
|
||||
OBJ_CONSTRUCT(&module->pending_posts, opal_list_t);
|
||||
OBJ_CONSTRUCT(&module->request_gc, opal_list_t);
|
||||
OBJ_CONSTRUCT(&module->buffer_gc, opal_list_t);
|
||||
OBJ_CONSTRUCT(&module->gc_lock, opal_mutex_t);
|
||||
|
||||
/* options */
|
||||
/* FIX ME: should actually check this value... */
|
||||
@ -385,14 +385,14 @@ component_select(struct ompi_win_t *win, void **base, size_t size, int disp_unit
|
||||
if (OMPI_SUCCESS != ret) goto cleanup;
|
||||
|
||||
OPAL_OUTPUT_VERBOSE((10, ompi_osc_base_framework.framework_output,
|
||||
"pt2pt component creating window with id %d",
|
||||
"rdma component creating window with id %d",
|
||||
ompi_comm_get_cid(module->comm)));
|
||||
|
||||
/* record my displacement unit. Always resolved at target */
|
||||
module->disp_unit = disp_unit;
|
||||
|
||||
/* peer data */
|
||||
module->peers = calloc(ompi_comm_size(comm), sizeof(ompi_osc_pt2pt_peer_t));
|
||||
module->peers = calloc(ompi_comm_size(comm), sizeof(ompi_osc_rdma_peer_t));
|
||||
if (NULL == module->peers) {
|
||||
ret = OMPI_ERR_TEMP_OUT_OF_RESOURCE;
|
||||
goto cleanup;
|
||||
@ -405,6 +405,20 @@ component_select(struct ompi_win_t *win, void **base, size_t size, int disp_unit
|
||||
goto cleanup;
|
||||
}
|
||||
|
||||
if (!no_locks) {
|
||||
module->passive_incoming_frag_count = calloc(ompi_comm_size(comm), sizeof(uint32_t));
|
||||
if (NULL == module->passive_incoming_frag_count) {
|
||||
ret = OMPI_ERR_TEMP_OUT_OF_RESOURCE;
|
||||
goto cleanup;
|
||||
}
|
||||
|
||||
module->passive_incoming_frag_signal_count = calloc(ompi_comm_size(comm), sizeof(uint32_t));
|
||||
if (NULL == module->passive_incoming_frag_signal_count) {
|
||||
ret = OMPI_ERR_TEMP_OUT_OF_RESOURCE;
|
||||
goto cleanup;
|
||||
}
|
||||
}
|
||||
|
||||
/* the statement below (from Brian) does not seem correct so disable active target on the
|
||||
* window. If this ends up being incorrect, please revert this one change. */
|
||||
module->active_eager_send_active = false;
|
||||
@ -415,35 +429,43 @@ component_select(struct ompi_win_t *win, void **base, size_t size, int disp_unit
|
||||
module->active_eager_send_active = true;
|
||||
#endif
|
||||
|
||||
if (!no_locks) {
|
||||
module->passive_eager_send_active = malloc(sizeof(bool) * ompi_comm_size(comm));
|
||||
if (NULL == module->passive_eager_send_active) {
|
||||
ret = OMPI_ERR_TEMP_OUT_OF_RESOURCE;
|
||||
goto cleanup;
|
||||
}
|
||||
}
|
||||
|
||||
/* lock data */
|
||||
if (check_config_value_bool("no_locks", info)) {
|
||||
win->w_flags |= OMPI_WIN_NO_LOCKS;
|
||||
}
|
||||
|
||||
/* update component data */
|
||||
OPAL_THREAD_LOCK(&mca_osc_pt2pt_component.lock);
|
||||
ret = opal_hash_table_set_value_uint32(&mca_osc_pt2pt_component.modules,
|
||||
OPAL_THREAD_LOCK(&mca_osc_rdma_component.lock);
|
||||
ret = opal_hash_table_set_value_uint32(&mca_osc_rdma_component.modules,
|
||||
ompi_comm_get_cid(module->comm),
|
||||
module);
|
||||
OPAL_THREAD_UNLOCK(&mca_osc_pt2pt_component.lock);
|
||||
OPAL_THREAD_UNLOCK(&mca_osc_rdma_component.lock);
|
||||
if (OMPI_SUCCESS != ret) goto cleanup;
|
||||
|
||||
/* fill in window information */
|
||||
*model = MPI_WIN_UNIFIED;
|
||||
win->w_osc_module = (ompi_osc_base_module_t*) module;
|
||||
asprintf(&name, "pt2pt window %d", ompi_comm_get_cid(module->comm));
|
||||
asprintf(&name, "rdma window %d", ompi_comm_get_cid(module->comm));
|
||||
ompi_win_set_name(win, name);
|
||||
free(name);
|
||||
|
||||
/* sync memory - make sure all initialization completed */
|
||||
opal_atomic_mb();
|
||||
|
||||
module->incoming_buffer = malloc (mca_osc_pt2pt_component.buffer_size + sizeof (ompi_osc_pt2pt_frag_header_t));
|
||||
module->incoming_buffer = malloc (mca_osc_rdma_component.buffer_size + sizeof (ompi_osc_rdma_frag_header_t));
|
||||
if (OPAL_UNLIKELY(NULL == module->incoming_buffer)) {
|
||||
goto cleanup;
|
||||
}
|
||||
|
||||
ret = ompi_osc_pt2pt_frag_start_receive (module);
|
||||
ret = ompi_osc_rdma_frag_start_receive (module);
|
||||
if (OPAL_UNLIKELY(OMPI_SUCCESS != ret)) {
|
||||
goto cleanup;
|
||||
}
|
||||
@ -454,30 +476,30 @@ component_select(struct ompi_win_t *win, void **base, size_t size, int disp_unit
|
||||
module->comm->c_coll.coll_barrier_module);
|
||||
if (OMPI_SUCCESS != ret) goto cleanup;
|
||||
|
||||
if (!mca_osc_pt2pt_component.progress_enable) {
|
||||
if (!mca_osc_rdma_component.progress_enable) {
|
||||
opal_progress_register (component_progress);
|
||||
mca_osc_pt2pt_component.progress_enable = true;
|
||||
mca_osc_rdma_component.progress_enable = true;
|
||||
}
|
||||
|
||||
OPAL_OUTPUT_VERBOSE((10, ompi_osc_base_framework.framework_output,
|
||||
"done creating pt2pt window %d", ompi_comm_get_cid(module->comm)));
|
||||
"done creating rdma window %d", ompi_comm_get_cid(module->comm)));
|
||||
|
||||
return OMPI_SUCCESS;
|
||||
|
||||
cleanup:
|
||||
/* set the module so we properly cleanup */
|
||||
win->w_osc_module = (ompi_osc_base_module_t*) module;
|
||||
ompi_osc_pt2pt_free (win);
|
||||
ompi_osc_rdma_free (win);
|
||||
|
||||
return ret;
|
||||
}
|
||||
|
||||
|
||||
int
|
||||
ompi_osc_pt2pt_set_info(struct ompi_win_t *win, struct ompi_info_t *info)
|
||||
ompi_osc_rdma_set_info(struct ompi_win_t *win, struct ompi_info_t *info)
|
||||
{
|
||||
ompi_osc_pt2pt_module_t *module =
|
||||
(ompi_osc_pt2pt_module_t*) win->w_osc_module;
|
||||
ompi_osc_rdma_module_t *module =
|
||||
(ompi_osc_rdma_module_t*) win->w_osc_module;
|
||||
|
||||
/* enforce collectiveness... */
|
||||
return module->comm->c_coll.coll_barrier(module->comm,
|
||||
@ -486,7 +508,7 @@ ompi_osc_pt2pt_set_info(struct ompi_win_t *win, struct ompi_info_t *info)
|
||||
|
||||
|
||||
int
|
||||
ompi_osc_pt2pt_get_info(struct ompi_win_t *win, struct ompi_info_t **info_used)
|
||||
ompi_osc_rdma_get_info(struct ompi_win_t *win, struct ompi_info_t **info_used)
|
||||
{
|
||||
ompi_info_t *info = OBJ_NEW(ompi_info_t);
|
||||
if (NULL == info) return OMPI_ERR_TEMP_OUT_OF_RESOURCE;
|
||||
@ -496,4 +518,4 @@ ompi_osc_pt2pt_get_info(struct ompi_win_t *win, struct ompi_info_t **info_used)
|
||||
return OMPI_SUCCESS;
|
||||
}
|
||||
|
||||
OBJ_CLASS_INSTANCE(ompi_osc_pt2pt_pending_t, opal_list_item_t, NULL, NULL);
|
||||
OBJ_CLASS_INSTANCE(ompi_osc_rdma_pending_t, opal_list_item_t, NULL, NULL);
|
Разница между файлами не показана из-за своего большого размера
Загрузить разницу
@ -18,22 +18,22 @@
|
||||
* $HEADER$
|
||||
*/
|
||||
|
||||
#ifndef OMPI_MCA_OSC_PT2PT_DATA_MOVE_H
|
||||
#define OMPI_MCA_OSC_PT2PT_DATA_MOVE_H
|
||||
#ifndef OMPI_MCA_OSC_RDMA_DATA_MOVE_H
|
||||
#define OMPI_MCA_OSC_RDMA_DATA_MOVE_H
|
||||
|
||||
#include "osc_pt2pt_header.h"
|
||||
#include "osc_rdma_header.h"
|
||||
|
||||
int ompi_osc_pt2pt_control_send(ompi_osc_pt2pt_module_t *module,
|
||||
int ompi_osc_rdma_control_send(ompi_osc_rdma_module_t *module,
|
||||
int target,
|
||||
void *data,
|
||||
size_t len);
|
||||
|
||||
/**
|
||||
* ompi_osc_pt2pt_control_send_unbuffered:
|
||||
* ompi_osc_rdma_control_send_unbuffered:
|
||||
*
|
||||
* @short Send an unbuffered control message to a peer.
|
||||
*
|
||||
* @param[in] module - OSC PT2PT module
|
||||
* @param[in] module - OSC RDMA module
|
||||
* @param[in] target - Target rank
|
||||
* @param[in] data - Data to send
|
||||
* @param[in] len - Length of data
|
||||
@ -45,11 +45,11 @@ int ompi_osc_pt2pt_control_send(ompi_osc_pt2pt_module_t *module,
|
||||
* from its peer). The buffer specified by data will be available
|
||||
* when this call returns.
|
||||
*/
|
||||
int ompi_osc_pt2pt_control_send_unbuffered (ompi_osc_pt2pt_module_t *module,
|
||||
int ompi_osc_rdma_control_send_unbuffered (ompi_osc_rdma_module_t *module,
|
||||
int target, void *data, size_t len);
|
||||
|
||||
/**
|
||||
* ompi_osc_pt2pt_isend_w_cb:
|
||||
* ompi_osc_rdma_isend_w_cb:
|
||||
*
|
||||
* @short Post a non-blocking send with a specified callback.
|
||||
*
|
||||
@ -66,11 +66,11 @@ int ompi_osc_pt2pt_control_send_unbuffered (ompi_osc_pt2pt_module_t *module,
|
||||
* be called with the associated request. The context specified in ctx will be stored in
|
||||
* the req_completion_cb_data member of the ompi_request_t for use by the callback.
|
||||
*/
|
||||
int ompi_osc_pt2pt_isend_w_cb (void *ptr, int count, ompi_datatype_t *datatype, int target, int tag,
|
||||
int ompi_osc_rdma_isend_w_cb (void *ptr, int count, ompi_datatype_t *datatype, int target, int tag,
|
||||
ompi_communicator_t *comm, ompi_request_complete_fn_t cb, void *ctx);
|
||||
|
||||
/**
|
||||
* ompi_osc_pt2pt_irecv_w_cb:
|
||||
* ompi_osc_rdma_irecv_w_cb:
|
||||
*
|
||||
* @short Post a non-blocking receive with a specified callback.
|
||||
*
|
||||
@ -89,49 +89,49 @@ int ompi_osc_pt2pt_isend_w_cb (void *ptr, int count, ompi_datatype_t *datatype,
|
||||
* request. The context specified in ctx will be stored in the req_completion_cb_data
|
||||
* member of the ompi_request_t for use by the callback.
|
||||
*/
|
||||
int ompi_osc_pt2pt_irecv_w_cb (void *ptr, int count, ompi_datatype_t *datatype, int source, int tag,
|
||||
int ompi_osc_rdma_irecv_w_cb (void *ptr, int count, ompi_datatype_t *datatype, int source, int tag,
|
||||
ompi_communicator_t *comm, ompi_request_t **request_out,
|
||||
ompi_request_complete_fn_t cb, void *ctx);
|
||||
|
||||
int ompi_osc_pt2pt_process_lock(ompi_osc_pt2pt_module_t* module,
|
||||
int ompi_osc_rdma_process_lock(ompi_osc_rdma_module_t* module,
|
||||
int source,
|
||||
struct ompi_osc_pt2pt_header_lock_t* lock_header);
|
||||
struct ompi_osc_rdma_header_lock_t* lock_header);
|
||||
|
||||
void ompi_osc_pt2pt_process_lock_ack(ompi_osc_pt2pt_module_t* module,
|
||||
struct ompi_osc_pt2pt_header_lock_ack_t* lock_header);
|
||||
void ompi_osc_rdma_process_lock_ack(ompi_osc_rdma_module_t* module,
|
||||
struct ompi_osc_rdma_header_lock_ack_t* lock_header);
|
||||
|
||||
int ompi_osc_pt2pt_process_unlock(ompi_osc_pt2pt_module_t* module,
|
||||
int ompi_osc_rdma_process_unlock(ompi_osc_rdma_module_t* module,
|
||||
int source,
|
||||
struct ompi_osc_pt2pt_header_unlock_t* lock_header);
|
||||
int ompi_osc_pt2pt_process_flush (ompi_osc_pt2pt_module_t *module, int source,
|
||||
ompi_osc_pt2pt_header_flush_t *flush_header);
|
||||
struct ompi_osc_rdma_header_unlock_t* lock_header);
|
||||
int ompi_osc_rdma_process_flush (ompi_osc_rdma_module_t *module, int source,
|
||||
ompi_osc_rdma_header_flush_t *flush_header);
|
||||
|
||||
/**
|
||||
* ompi_osc_pt2pt_process_unlock_ack:
|
||||
* ompi_osc_rdma_process_unlock_ack:
|
||||
*
|
||||
* @short Process an incoming unlock acknowledgement.
|
||||
*
|
||||
* @param[in] module - OSC PT2PT module
|
||||
* @param[in] module - OSC RDMA module
|
||||
* @param[in] source - Source rank
|
||||
* @param[in] unlock_ack_header - Incoming unlock ack header
|
||||
*/
|
||||
void ompi_osc_pt2pt_process_unlock_ack (ompi_osc_pt2pt_module_t *module, int source,
|
||||
ompi_osc_pt2pt_header_unlock_ack_t *unlock_ack_header);
|
||||
void ompi_osc_rdma_process_unlock_ack (ompi_osc_rdma_module_t *module, int source,
|
||||
ompi_osc_rdma_header_unlock_ack_t *unlock_ack_header);
|
||||
|
||||
/**
|
||||
* ompi_osc_pt2pt_process_flush_ack:
|
||||
* ompi_osc_rdma_process_flush_ack:
|
||||
*
|
||||
* @short Process an incoming flush acknowledgement.
|
||||
*
|
||||
* @param[in] module - OSC PT2PT module
|
||||
* @param[in] module - OSC RDMA module
|
||||
* @param[in] source - Source rank
|
||||
* @param[in] flush_ack_header - Incoming flush ack header
|
||||
*/
|
||||
void ompi_osc_pt2pt_process_flush_ack (ompi_osc_pt2pt_module_t *module, int source,
|
||||
ompi_osc_pt2pt_header_flush_ack_t *flush_ack_header);
|
||||
void ompi_osc_rdma_process_flush_ack (ompi_osc_rdma_module_t *module, int source,
|
||||
ompi_osc_rdma_header_flush_ack_t *flush_ack_header);
|
||||
|
||||
/**
|
||||
* ompi_osc_pt2pt_frag_start_receive:
|
||||
* ompi_osc_rdma_frag_start_receive:
|
||||
*
|
||||
* @short Start receiving fragments on the OSC module.
|
||||
*
|
||||
@ -140,6 +140,6 @@ void ompi_osc_pt2pt_process_flush_ack (ompi_osc_pt2pt_module_t *module, int sour
|
||||
* @long This function starts receiving eager fragments on the module. The current
|
||||
* implementation uses the pml to transfer eager fragments.
|
||||
*/
|
||||
int ompi_osc_pt2pt_frag_start_receive (ompi_osc_pt2pt_module_t *module);
|
||||
int ompi_osc_rdma_frag_start_receive (ompi_osc_rdma_module_t *module);
|
||||
|
||||
#endif
|
197
ompi/mca/osc/rdma/osc_rdma_frag.c
Обычный файл
197
ompi/mca/osc/rdma/osc_rdma_frag.c
Обычный файл
@ -0,0 +1,197 @@
|
||||
/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */
|
||||
/*
|
||||
* Copyright (c) 2012-2013 Sandia National Laboratories. All rights reserved.
|
||||
* Copyright (c) 2014 Los Alamos National Security, LLC. All rights
|
||||
* reserved.
|
||||
* $COPYRIGHT$
|
||||
*
|
||||
* Additional copyrights may follow
|
||||
*
|
||||
* $HEADER$
|
||||
*/
|
||||
|
||||
#include "ompi_config.h"
|
||||
|
||||
#include "opal/class/opal_list.h"
|
||||
#include "ompi/mca/osc/base/base.h"
|
||||
#include "ompi/mca/pml/pml.h"
|
||||
|
||||
#include "osc_rdma.h"
|
||||
#include "osc_rdma_frag.h"
|
||||
#include "osc_rdma_data_move.h"
|
||||
|
||||
/**
 * Constructor for ompi_osc_rdma_frag_t objects.
 *
 * Allocates the frag's packing buffer: the component's configured
 * buffer_size plus room for one ompi_osc_rdma_frag_header_t.
 *
 * NOTE(review): an allocation failure is only caught by assert(), which is
 * compiled out when NDEBUG is defined.
 */
static void ompi_osc_rdma_frag_constructor (ompi_osc_rdma_frag_t *frag)
{
    const size_t buffer_bytes = mca_osc_rdma_component.buffer_size +
        sizeof (ompi_osc_rdma_frag_header_t);

    frag->buffer = malloc (buffer_bytes);
    assert (frag->buffer);
}
|
||||
|
||||
/**
 * Destructor for ompi_osc_rdma_frag_t objects: releases the packing buffer
 * allocated by the constructor.
 */
static void ompi_osc_rdma_frag_destructor (ompi_osc_rdma_frag_t *frag)
{
    /* free(NULL) is defined to do nothing, so no explicit NULL guard is needed */
    free (frag->buffer);
}
|
||||
|
||||
/* Class instance for ompi_osc_rdma_frag_t: parent class is opal_list_item_t,
 * with the constructor/destructor above managing the frag's buffer. */
OBJ_CLASS_INSTANCE(ompi_osc_rdma_frag_t, opal_list_item_t,
|
||||
ompi_osc_rdma_frag_constructor, ompi_osc_rdma_frag_destructor);
|
||||
|
||||
/**
 * frag_send_cb:
 *
 * @short Completion callback attached to fragment sends by frag_send().
 *
 * @param[in] request - Completed request; its req_complete_cb_data holds the frag
 *
 * @return OMPI_SUCCESS
 *
 * @long Marks the outgoing completion on the frag's module, returns the frag
 *       to the component's frag free list, and puts the request on the
 *       garbage collection list.
 */
static int frag_send_cb (ompi_request_t *request)
{
    ompi_osc_rdma_frag_t *sent_frag = (ompi_osc_rdma_frag_t *) request->req_complete_cb_data;
    ompi_osc_rdma_module_t *owner = sent_frag->module;

    OPAL_OUTPUT_VERBOSE((50, ompi_osc_base_framework.framework_output,
                         "osc rdma: frag_send complete to %d, frag = %p, request = %p",
                         sent_frag->target, (void *) sent_frag, (void *) request));

    mark_outgoing_completion(owner);
    OPAL_FREE_LIST_RETURN(&mca_osc_rdma_component.frags, &sent_frag->super);

    /* put this request on the garbage collection list */
    osc_rdma_gc_add_request (request);

    return OMPI_SUCCESS;
}
|
||||
|
||||
static int
|
||||
frag_send(ompi_osc_rdma_module_t *module,
|
||||
ompi_osc_rdma_frag_t *frag)
|
||||
{
|
||||
int count;
|
||||
|
||||
count = (int)((uintptr_t) frag->top - (uintptr_t) frag->buffer);
|
||||
|
||||
OPAL_OUTPUT_VERBOSE((50, ompi_osc_base_framework.framework_output,
|
||||
"osc rdma: frag_send called to %d, frag = %p, count = %d",
|
||||
frag->target, (void *) frag, count));
|
||||
|
||||
return ompi_osc_rdma_isend_w_cb (frag->buffer, count, MPI_BYTE, frag->target, OSC_RDMA_FRAG_TAG,
|
||||
module->comm, frag_send_cb, frag);
|
||||
}
|
||||
|
||||
|
||||
int
|
||||
ompi_osc_rdma_frag_start(ompi_osc_rdma_module_t *module,
|
||||
ompi_osc_rdma_frag_t *frag)
|
||||
{
|
||||
int ret;
|
||||
|
||||
assert(0 == frag->pending);
|
||||
assert(module->peers[frag->target].active_frag != frag);
|
||||
|
||||
/* we need to signal now that a frag is outgoing to ensure the count sent
|
||||
* with the unlock message is correct */
|
||||
ompi_osc_signal_outgoing (module, frag->target, 1);
|
||||
|
||||
/* if eager sends are not active, can't send yet, so buffer and
|
||||
get out... */
|
||||
if (module->passive_target_access_epoch) {
|
||||
if (!module->passive_eager_send_active[frag->target]) {
|
||||
opal_list_append(&module->queued_frags, &frag->super);
|
||||
return OMPI_SUCCESS;
|
||||
}
|
||||
} else {
|
||||
if (!module->active_eager_send_active) {
|
||||
opal_list_append(&module->queued_frags, &frag->super);
|
||||
return OMPI_SUCCESS;
|
||||
}
|
||||
}
|
||||
|
||||
ret = frag_send(module, frag);
|
||||
|
||||
opal_condition_broadcast(&module->cond);
|
||||
|
||||
return ret;
|
||||
}
|
||||
|
||||
|
||||
int
|
||||
ompi_osc_rdma_frag_flush_target(ompi_osc_rdma_module_t *module, int target)
|
||||
{
|
||||
int ret = OMPI_SUCCESS;
|
||||
|
||||
OPAL_OUTPUT_VERBOSE((50, ompi_osc_base_framework.framework_output,
|
||||
"osc rdma: frag flush target begin"));
|
||||
|
||||
/* flush the active frag */
|
||||
if (NULL != module->peers[target].active_frag) {
|
||||
ompi_osc_rdma_frag_t *frag = module->peers[target].active_frag;
|
||||
|
||||
if (0 != frag->pending) {
|
||||
/* communication going on while synchronizing; this is a bug */
|
||||
return OMPI_ERR_RMA_SYNC;
|
||||
}
|
||||
|
||||
module->peers[target].active_frag = NULL;
|
||||
|
||||
ret = ompi_osc_rdma_frag_start(module, frag);
|
||||
if (OMPI_SUCCESS != ret) return ret;
|
||||
}
|
||||
|
||||
OPAL_OUTPUT_VERBOSE((50, ompi_osc_base_framework.framework_output,
|
||||
"osc rdma: frag flush target finished active frag"));
|
||||
|
||||
/* walk through the pending list and send */
|
||||
ompi_osc_rdma_frag_t *frag, *next;
|
||||
OPAL_LIST_FOREACH_SAFE(frag, next, &module->queued_frags, ompi_osc_rdma_frag_t) {
|
||||
if (frag->target == target) {
|
||||
opal_list_remove_item(&module->queued_frags, &frag->super);
|
||||
ret = frag_send(module, frag);
|
||||
if (OMPI_SUCCESS != ret) return ret;
|
||||
}
|
||||
}
|
||||
|
||||
OPAL_OUTPUT_VERBOSE((50, ompi_osc_base_framework.framework_output,
|
||||
"osc rdma: frag flush target finished"));
|
||||
|
||||
return OMPI_SUCCESS;
|
||||
}
|
||||
|
||||
|
||||
int
|
||||
ompi_osc_rdma_frag_flush_all(ompi_osc_rdma_module_t *module)
|
||||
{
|
||||
int ret = OMPI_SUCCESS;
|
||||
int i;
|
||||
ompi_osc_rdma_frag_t *frag, *next;
|
||||
|
||||
OPAL_OUTPUT_VERBOSE((50, ompi_osc_base_framework.framework_output,
|
||||
"osc rdma: frag flush all begin"));
|
||||
|
||||
/* flush the active frag */
|
||||
for (i = 0 ; i < ompi_comm_size(module->comm) ; ++i) {
|
||||
if (NULL != module->peers[i].active_frag) {
|
||||
ompi_osc_rdma_frag_t *frag = module->peers[i].active_frag;
|
||||
|
||||
if (0 != frag->pending) {
|
||||
/* communication going on while synchronizing; this is a bug */
|
||||
return OMPI_ERR_RMA_SYNC;
|
||||
}
|
||||
|
||||
module->peers[i].active_frag = NULL;
|
||||
|
||||
ret = ompi_osc_rdma_frag_start(module, frag);
|
||||
if (OMPI_SUCCESS != ret) return ret;
|
||||
}
|
||||
}
|
||||
|
||||
OPAL_OUTPUT_VERBOSE((50, ompi_osc_base_framework.framework_output,
|
||||
"osc rdma: frag flush all finished active frag"));
|
||||
|
||||
/* try to start all the queued frags */
|
||||
OPAL_LIST_FOREACH_SAFE(frag, next, &module->queued_frags, ompi_osc_rdma_frag_t) {
|
||||
opal_list_remove_item(&module->queued_frags, &frag->super);
|
||||
ret = frag_send(module, frag);
|
||||
if (OMPI_SUCCESS != ret) {
|
||||
OPAL_OUTPUT_VERBOSE((50, ompi_osc_base_framework.framework_output,
|
||||
"osc rdma: failure for frag send: %d", ret));
|
||||
return ret;
|
||||
}
|
||||
}
|
||||
|
||||
OPAL_OUTPUT_VERBOSE((50, ompi_osc_base_framework.framework_output,
|
||||
"osc rdma: frag flush all done"));
|
||||
|
||||
return OMPI_SUCCESS;
|
||||
}
|
138
ompi/mca/osc/rdma/osc_rdma_frag.h
Обычный файл
138
ompi/mca/osc/rdma/osc_rdma_frag.h
Обычный файл
@ -0,0 +1,138 @@
|
||||
/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */
|
||||
/*
|
||||
* Copyright (c) 2012 Sandia National Laboratories. All rights reserved.
|
||||
* Copyright (c) 2014 Los Alamos National Security, LLC. All rights
|
||||
* reserved.
|
||||
* $COPYRIGHT$
|
||||
*
|
||||
* Additional copyrights may follow
|
||||
*
|
||||
* $HEADER$
|
||||
*/
|
||||
|
||||
#ifndef OSC_RDMA_FRAG_H
|
||||
#define OSC_RDMA_FRAG_H
|
||||
|
||||
#include "ompi/communicator/communicator.h"
|
||||
|
||||
#include "osc_rdma_header.h"
|
||||
#include "osc_rdma_request.h"
|
||||
#include "opal/align.h"
|
||||
|
||||
/** Communication buffer for packing messages */
|
||||
struct ompi_osc_rdma_frag_t {
|
||||
/* list item base; this is what gets appended to a module's queued_frags list */
opal_list_item_t super;
|
||||
/* target rank of buffer */
|
||||
int target;
|
||||
/* start of the packing buffer; the byte count sent is measured from here to top */
unsigned char *buffer;
|
||||
|
||||
/* space remaining in buffer */
|
||||
size_t remain_len;
|
||||
|
||||
/* start of unused space */
|
||||
char *top;
|
||||
|
||||
/* Number of operations which have started writing into the frag, but not yet completed doing so */
|
||||
int pending;
|
||||
/* NOTE(review): presumably points at the frag header stored in buffer -- confirm against the allocator */
ompi_osc_rdma_frag_header_t *header;
|
||||
/* module this frag belongs to; read by the send completion callback */
ompi_osc_rdma_module_t *module;
|
||||
};
|
||||
typedef struct ompi_osc_rdma_frag_t ompi_osc_rdma_frag_t;
|
||||
OBJ_CLASS_DECLARATION(ompi_osc_rdma_frag_t);
|
||||
|
||||
extern int ompi_osc_rdma_frag_start(ompi_osc_rdma_module_t *module, ompi_osc_rdma_frag_t *buffer);
|
||||
extern int ompi_osc_rdma_frag_flush_target(ompi_osc_rdma_module_t *module, int target);
|
||||
extern int ompi_osc_rdma_frag_flush_all(ompi_osc_rdma_module_t *module);
|
||||
|
||||
|
||||
/*
|
||||
* Note: module lock must be held during this operation
|
||||
*/
|
||||
static inline int ompi_osc_rdma_frag_alloc(ompi_osc_rdma_module_t *module, int target,
|
||||
size_t request_len, ompi_osc_rdma_frag_t **buffer,
|
||||
char **ptr)
|
||||
{
|
||||
ompi_osc_rdma_frag_t *curr = module->peers[target].active_frag;
|
||||
int ret;
|
||||
|
||||
/* osc rdma headers can have 64-bit values. these will need to be aligned
|
||||
* on an 8-byte boundary on some architectures so we up align the allocation
|
||||
* size here. */
|
||||
request_len = OPAL_ALIGN(request_len, 8, size_t);
|
||||
|
||||
if (request_len > mca_osc_rdma_component.buffer_size) {
|
||||
return OMPI_ERR_OUT_OF_RESOURCE;
|
||||
}
|
||||
|
||||
if (NULL == curr || curr->remain_len < request_len) {
|
||||
opal_free_list_item_t *item;
|
||||
|
||||
if (NULL != curr) {
|
||||
curr->remain_len = 0;
|
||||
/* If there's something pending, the pending finish will
|
||||
start the buffer. Otherwise, we need to start it now. */
|
||||
if (0 == curr->pending) {
|
||||
module->peers[target].active_frag = NULL;
|
||||
ret = ompi_osc_rdma_frag_start(module, curr);
|
||||
}
|
||||
}
|
||||
|
||||
OPAL_FREE_LIST_GET(&mca_osc_rdma_component.frags,
|
||||
item, ret);
|
||||
if (OMPI_SUCCESS != ret) return ret;
|
||||
curr = module->peers[target].active_frag =
|
||||
(ompi_osc_rdma_frag_t*) item;
|
||||
|
||||
curr->target = target;
|
||||
|
||||
curr->header = (ompi_osc_rdma_frag_header_t*) curr->buffer;
|
||||
curr->top = (char*) (curr->header + 1);
|
||||
curr->remain_len = mca_osc_rdma_component.buffer_size;
|
||||
curr->module = module;
|
||||
curr->pending = 0;
|
||||
|
||||
curr->header->base.type = OMPI_OSC_RDMA_HDR_TYPE_FRAG;
|
||||
curr->header->base.flags = OMPI_OSC_RDMA_HDR_FLAG_VALID;
|
||||
if (module->passive_target_access_epoch) {
|
||||
curr->header->base.flags |= OMPI_OSC_RDMA_HDR_FLAG_PASSIVE_TARGET;
|
||||
}
|
||||
curr->header->source = ompi_comm_rank(module->comm);
|
||||
curr->header->num_ops = 0;
|
||||
curr->header->windx = ompi_comm_get_cid(module->comm);
|
||||
|
||||
if (curr->remain_len < request_len) {
|
||||
return OMPI_ERR_TEMP_OUT_OF_RESOURCE;
|
||||
}
|
||||
}
|
||||
|
||||
*ptr = curr->top;
|
||||
*buffer = curr;
|
||||
|
||||
curr->top += request_len;
|
||||
curr->remain_len -= request_len;
|
||||
curr->pending++;
|
||||
curr->header->num_ops++;
|
||||
|
||||
return OMPI_SUCCESS;
|
||||
}
|
||||
|
||||
|
||||
/*
|
||||
* Note: module lock must be held for this operation
|
||||
*/
|
||||
static inline int ompi_osc_rdma_frag_finish(ompi_osc_rdma_module_t *module,
|
||||
ompi_osc_rdma_frag_t* buffer)
|
||||
{
|
||||
if (0 == --buffer->pending && 0 == buffer->remain_len) {
|
||||
if (OPAL_LIKELY(buffer == module->peers[buffer->target].active_frag)) {
|
||||
/* this is the active fragment. need to set the current fragment to null
|
||||
* or it will be started multiple times */
|
||||
module->peers[buffer->target].active_frag = NULL;
|
||||
}
|
||||
return ompi_osc_rdma_frag_start(module, buffer);
|
||||
}
|
||||
|
||||
return OMPI_SUCCESS;
|
||||
}
|
||||
|
||||
#endif
|
187
ompi/mca/osc/rdma/osc_rdma_header.h
Обычный файл
187
ompi/mca/osc/rdma/osc_rdma_header.h
Обычный файл
@ -0,0 +1,187 @@
|
||||
/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */
|
||||
/*
|
||||
* Copyright (c) 2004-2005 The Trustees of Indiana University.
|
||||
* All rights reserved.
|
||||
* Copyright (c) 2004-2005 The Trustees of the University of Tennessee.
|
||||
* All rights reserved.
|
||||
* Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
|
||||
* University of Stuttgart. All rights reserved.
|
||||
* Copyright (c) 2004-2005 The Regents of the University of California.
|
||||
* All rights reserved.
|
||||
* Copyright (c) 2007-2014 Los Alamos National Security, LLC. All rights
|
||||
* reserved.
|
||||
* Copyright (c) 2010 Cisco Systems, Inc. All rights reserved.
|
||||
* Copyright (c) 2010 Oracle and/or its affiliates. All rights reserved.
|
||||
* Copyright (c) 2012-2013 Sandia National Laboratories. All rights reserved.
|
||||
* $COPYRIGHT$
|
||||
*
|
||||
* Additional copyrights may follow
|
||||
*
|
||||
* $HEADER$
|
||||
*/
|
||||
|
||||
#ifndef OMPI_MCA_OSC_RDMA_HDR_H
|
||||
#define OMPI_MCA_OSC_RDMA_HDR_H
|
||||
|
||||
#ifdef HAVE_NETINET_IN_H
|
||||
#include <netinet/in.h>
|
||||
#endif
|
||||
|
||||
#include "opal/types.h"
|
||||
|
||||
enum ompi_osc_rdma_hdr_type_t {
|
||||
OMPI_OSC_RDMA_HDR_TYPE_PUT = 0x01,
|
||||
OMPI_OSC_RDMA_HDR_TYPE_PUT_LONG = 0x02,
|
||||
OMPI_OSC_RDMA_HDR_TYPE_ACC = 0x03,
|
||||
OMPI_OSC_RDMA_HDR_TYPE_ACC_LONG = 0x04,
|
||||
OMPI_OSC_RDMA_HDR_TYPE_GET = 0x05,
|
||||
OMPI_OSC_RDMA_HDR_TYPE_CSWAP = 0x06,
|
||||
OMPI_OSC_RDMA_HDR_TYPE_CSWAP_LONG = 0x07,
|
||||
OMPI_OSC_RDMA_HDR_TYPE_GET_ACC = 0x08,
|
||||
OMPI_OSC_RDMA_HDR_TYPE_GET_ACC_LONG = 0x09,
|
||||
OMPI_OSC_RDMA_HDR_TYPE_COMPLETE = 0x10,
|
||||
OMPI_OSC_RDMA_HDR_TYPE_POST = 0x11,
|
||||
OMPI_OSC_RDMA_HDR_TYPE_LOCK_REQ = 0x12,
|
||||
OMPI_OSC_RDMA_HDR_TYPE_LOCK_ACK = 0x13,
|
||||
OMPI_OSC_RDMA_HDR_TYPE_UNLOCK_REQ = 0x14,
|
||||
OMPI_OSC_RDMA_HDR_TYPE_UNLOCK_ACK = 0x15,
|
||||
OMPI_OSC_RDMA_HDR_TYPE_FLUSH_REQ = 0x16,
|
||||
OMPI_OSC_RDMA_HDR_TYPE_FLUSH_ACK = 0x17,
|
||||
OMPI_OSC_RDMA_HDR_TYPE_FRAG = 0x20,
|
||||
};
|
||||
typedef enum ompi_osc_rdma_hdr_type_t ompi_osc_rdma_hdr_type_t;
|
||||
|
||||
#define OMPI_OSC_RDMA_HDR_FLAG_NBO 0x01
|
||||
#define OMPI_OSC_RDMA_HDR_FLAG_VALID 0x02
|
||||
#define OMPI_OSC_RDMA_HDR_FLAG_PASSIVE_TARGET 0x04
|
||||
#define OMPI_OSC_RDMA_HDR_FLAG_LARGE_DATATYPE 0x08
|
||||
|
||||
struct ompi_osc_rdma_header_base_t {
|
||||
/** fragment type. 8 bits */
|
||||
uint8_t type;
|
||||
/** fragment flags. 8 bits */
|
||||
uint8_t flags;
|
||||
};
|
||||
typedef struct ompi_osc_rdma_header_base_t ompi_osc_rdma_header_base_t;
|
||||
|
||||
struct ompi_osc_rdma_header_put_t {
|
||||
ompi_osc_rdma_header_base_t base;
|
||||
|
||||
uint16_t tag;
|
||||
uint32_t count;
|
||||
uint64_t len;
|
||||
uint64_t displacement;
|
||||
};
|
||||
typedef struct ompi_osc_rdma_header_put_t ompi_osc_rdma_header_put_t;
|
||||
|
||||
struct ompi_osc_rdma_header_acc_t {
|
||||
ompi_osc_rdma_header_base_t base;
|
||||
|
||||
uint16_t tag;
|
||||
uint32_t count;
|
||||
uint32_t op;
|
||||
uint64_t len;
|
||||
uint64_t displacement;
|
||||
};
|
||||
typedef struct ompi_osc_rdma_header_acc_t ompi_osc_rdma_header_acc_t;
|
||||
|
||||
struct ompi_osc_rdma_header_get_t {
|
||||
ompi_osc_rdma_header_base_t base;
|
||||
|
||||
uint16_t tag;
|
||||
uint32_t count;
|
||||
uint64_t len;
|
||||
uint64_t displacement;
|
||||
};
|
||||
typedef struct ompi_osc_rdma_header_get_t ompi_osc_rdma_header_get_t;
|
||||
|
||||
struct ompi_osc_rdma_header_complete_t {
|
||||
ompi_osc_rdma_header_base_t base;
|
||||
int frag_count;
|
||||
};
|
||||
typedef struct ompi_osc_rdma_header_complete_t ompi_osc_rdma_header_complete_t;
|
||||
|
||||
struct ompi_osc_rdma_header_cswap_t {
|
||||
ompi_osc_rdma_header_base_t base;
|
||||
|
||||
uint16_t tag;
|
||||
|
||||
uint32_t len;
|
||||
uint64_t displacement;
|
||||
};
|
||||
typedef struct ompi_osc_rdma_header_cswap_t ompi_osc_rdma_header_cswap_t;
|
||||
|
||||
struct ompi_osc_rdma_header_post_t {
|
||||
ompi_osc_rdma_header_base_t base;
|
||||
uint16_t windx;
|
||||
};
|
||||
typedef struct ompi_osc_rdma_header_post_t ompi_osc_rdma_header_post_t;
|
||||
|
||||
struct ompi_osc_rdma_header_lock_t {
|
||||
ompi_osc_rdma_header_base_t base;
|
||||
int32_t lock_type;
|
||||
uint64_t serial_number;
|
||||
};
|
||||
typedef struct ompi_osc_rdma_header_lock_t ompi_osc_rdma_header_lock_t;
|
||||
|
||||
struct ompi_osc_rdma_header_lock_ack_t {
|
||||
ompi_osc_rdma_header_base_t base;
|
||||
uint16_t windx;
|
||||
uint32_t source;
|
||||
uint64_t serial_number;
|
||||
};
|
||||
typedef struct ompi_osc_rdma_header_lock_ack_t ompi_osc_rdma_header_lock_ack_t;
|
||||
|
||||
struct ompi_osc_rdma_header_unlock_t {
|
||||
ompi_osc_rdma_header_base_t base;
|
||||
int32_t lock_type;
|
||||
uint32_t frag_count;
|
||||
};
|
||||
typedef struct ompi_osc_rdma_header_unlock_t ompi_osc_rdma_header_unlock_t;
|
||||
|
||||
struct ompi_osc_rdma_header_unlock_ack_t {
|
||||
ompi_osc_rdma_header_base_t base;
|
||||
};
|
||||
typedef struct ompi_osc_rdma_header_unlock_ack_t ompi_osc_rdma_header_unlock_ack_t;
|
||||
|
||||
struct ompi_osc_rdma_header_flush_t {
|
||||
ompi_osc_rdma_header_base_t base;
|
||||
uint32_t frag_count;
|
||||
uint64_t serial_number;
|
||||
};
|
||||
typedef struct ompi_osc_rdma_header_flush_t ompi_osc_rdma_header_flush_t;
|
||||
|
||||
struct ompi_osc_rdma_header_flush_ack_t {
|
||||
ompi_osc_rdma_header_base_t base;
|
||||
uint64_t serial_number;
|
||||
};
|
||||
typedef struct ompi_osc_rdma_header_flush_ack_t ompi_osc_rdma_header_flush_ack_t;
|
||||
|
||||
struct ompi_osc_rdma_frag_header_t {
|
||||
ompi_osc_rdma_header_base_t base;
|
||||
uint16_t windx; /* cid of communicator backing window (our window id) */
|
||||
uint32_t source; /* rank in window of source process */
|
||||
uint16_t num_ops; /* number of operations in this buffer */
|
||||
uint16_t pad[3]; /* ensure the fragment header is a multiple of 8 bytes */
|
||||
};
|
||||
typedef struct ompi_osc_rdma_frag_header_t ompi_osc_rdma_frag_header_t;
|
||||
|
||||
union ompi_osc_rdma_header_t {
|
||||
ompi_osc_rdma_header_base_t base;
|
||||
ompi_osc_rdma_header_put_t put;
|
||||
ompi_osc_rdma_header_acc_t acc;
|
||||
ompi_osc_rdma_header_get_t get;
|
||||
ompi_osc_rdma_header_complete_t complete;
|
||||
ompi_osc_rdma_header_cswap_t cswap;
|
||||
ompi_osc_rdma_header_post_t post;
|
||||
ompi_osc_rdma_header_lock_t lock;
|
||||
ompi_osc_rdma_header_lock_ack_t lock_ack;
|
||||
ompi_osc_rdma_header_unlock_t unlock;
|
||||
ompi_osc_rdma_header_unlock_ack_t unlock_ack;
|
||||
ompi_osc_rdma_header_flush_t flush;
|
||||
ompi_osc_rdma_header_flush_ack_t flush_ack;
|
||||
ompi_osc_rdma_frag_header_t frag;
|
||||
};
|
||||
typedef union ompi_osc_rdma_header_t ompi_osc_rdma_header_t;
|
||||
|
||||
#endif /* OMPI_MCA_OSC_RDMA_HDR_H */
|
47
ompi/mca/osc/rdma/osc_rdma_obj_convert.h
Обычный файл
47
ompi/mca/osc/rdma/osc_rdma_obj_convert.h
Обычный файл
@ -0,0 +1,47 @@
|
||||
/*
|
||||
* Copyright (c) 2004-2005 The Trustees of Indiana University.
|
||||
* All rights reserved.
|
||||
* Copyright (c) 2004-2005 The Trustees of the University of Tennessee.
|
||||
* All rights reserved.
|
||||
* Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
|
||||
* University of Stuttgart. All rights reserved.
|
||||
* Copyright (c) 2004-2005 The Regents of the University of California.
|
||||
* All rights reserved.
|
||||
* Copyright (c) 2007 Los Alamos National Security, LLC. All rights
|
||||
* reserved.
|
||||
* Copyright (c) 2010 IBM Corporation. All rights reserved.
|
||||
* Copyright (c) 2012 Sandia National Laboratories. All rights reserved.
|
||||
* $COPYRIGHT$
|
||||
*
|
||||
* Additional copyrights may follow
|
||||
*
|
||||
* $HEADER$
|
||||
*/
|
||||
|
||||
/*
|
||||
* utility functions for dealing with remote datatype and op structures
|
||||
*/
|
||||
|
||||
/**
|
||||
* Convert a window index number into a module instance.
|
||||
*/
|
||||
static inline ompi_osc_rdma_module_t*
|
||||
ompi_osc_rdma_windx_to_module(uint32_t windx)
|
||||
{
|
||||
int ret;
|
||||
ompi_osc_rdma_module_t *module;
|
||||
|
||||
/* find the right module and dispatch */
|
||||
OPAL_THREAD_LOCK(&mca_osc_rdma_component.lock);
|
||||
ret = opal_hash_table_get_value_uint32(&mca_osc_rdma_component.modules,
|
||||
windx,
|
||||
(void**) (&module));
|
||||
OPAL_THREAD_UNLOCK(&mca_osc_rdma_component.lock);
|
||||
if (OMPI_SUCCESS != ret) {
|
||||
opal_output(0, "Could not translate windx %d to a local MPI_Win instance",
|
||||
windx);
|
||||
return NULL;
|
||||
}
|
||||
|
||||
return module;
|
||||
}
|
966
ompi/mca/osc/rdma/osc_rdma_passive_target.c
Обычный файл
966
ompi/mca/osc/rdma/osc_rdma_passive_target.c
Обычный файл
@ -0,0 +1,966 @@
|
||||
/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */
|
||||
/*
|
||||
* Copyright (c) 2004-2005 The Trustees of Indiana University.
|
||||
* All rights reserved.
|
||||
* Copyright (c) 2004-2005 The Trustees of the University of Tennessee.
|
||||
* All rights reserved.
|
||||
* Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
|
||||
* University of Stuttgart. All rights reserved.
|
||||
* Copyright (c) 2004-2005 The Regents of the University of California.
|
||||
* All rights reserved.
|
||||
* Copyright (c) 2007-2014 Los Alamos National Security, LLC. All rights
|
||||
* reserved.
|
||||
* Copyright (c) 2010 IBM Corporation. All rights reserved.
|
||||
* Copyright (c) 2012-2013 Sandia National Laboratories. All rights reserved.
|
||||
* $COPYRIGHT$
|
||||
*
|
||||
* Additional copyrights may follow
|
||||
*
|
||||
* $HEADER$
|
||||
*/
|
||||
|
||||
#include "ompi_config.h"
|
||||
|
||||
#include "osc_rdma.h"
|
||||
#include "osc_rdma_header.h"
|
||||
#include "osc_rdma_data_move.h"
|
||||
#include "osc_rdma_frag.h"
|
||||
|
||||
#include "mpi.h"
|
||||
#include "opal/runtime/opal_progress.h"
|
||||
#include "opal/threads/mutex.h"
|
||||
#include "ompi/communicator/communicator.h"
|
||||
#include "ompi/mca/osc/base/base.h"
|
||||
#include "opal/include/opal_stdint.h"
|
||||
|
||||
/* target-side tracking of a lock request */
|
||||
struct ompi_osc_rdma_pending_lock_t {
|
||||
opal_list_item_t super;
|
||||
int peer;
|
||||
int lock_type;
|
||||
uint64_t serial_number;
|
||||
};
|
||||
typedef struct ompi_osc_rdma_pending_lock_t ompi_osc_rdma_pending_lock_t;
|
||||
OBJ_CLASS_INSTANCE(ompi_osc_rdma_pending_lock_t, opal_list_item_t,
|
||||
NULL, NULL);
|
||||
|
||||
|
||||
/* origin-side tracking of a lock request */
|
||||
struct ompi_osc_rdma_outstanding_lock_t {
|
||||
/* list linkage; items are kept on module->outstanding_locks */
opal_list_item_t super;
|
||||
/* locked target rank; ompi_osc_rdma_lock_all uses -1 for a lock on all peers */
int target;
|
||||
/* number of lock acknowledgements counted so far */
int32_t lock_acks_received;
|
||||
/* number of unlock acknowledgements counted so far */
int32_t unlock_acks_received;
|
||||
/* number of flush acknowledgements counted so far; reset to 0 by ompi_osc_rdma_flush_lock */
int32_t flush_acks_received;
|
||||
/* taken from module->lock_serial_number; copied into lock and flush requests */
uint64_t serial_number;
|
||||
/* lock type (e.g. MPI_LOCK_SHARED or MPI_LOCK_EXCLUSIVE) */
int32_t type;
|
||||
};
|
||||
typedef struct ompi_osc_rdma_outstanding_lock_t ompi_osc_rdma_outstanding_lock_t;
|
||||
OBJ_CLASS_INSTANCE(ompi_osc_rdma_outstanding_lock_t, opal_list_item_t,
|
||||
NULL, NULL);
|
||||
|
||||
static int ompi_osc_activate_next_lock (ompi_osc_rdma_module_t *module);
|
||||
static inline int queue_lock (ompi_osc_rdma_module_t *module, int requestor,
|
||||
int lock_type, uint64_t serial_number);
|
||||
|
||||
/**
|
||||
* Find the first outstanding lock to a target.
|
||||
*
|
||||
* @param[in] module - OSC RDMA module
|
||||
* @param[in] target - Target rank
|
||||
*
|
||||
* @returns an outstanding lock on success
|
||||
*
|
||||
* This function traverses the outstanding_locks list in the module
|
||||
* looking for a lock that matches target. The caller must hold the
|
||||
* module lock.
|
||||
*/
|
||||
static inline ompi_osc_rdma_outstanding_lock_t *find_outstanding_lock (ompi_osc_rdma_module_t *module, int target)
|
||||
{
|
||||
ompi_osc_rdma_outstanding_lock_t *lock;
|
||||
|
||||
OPAL_LIST_FOREACH(lock, &module->outstanding_locks, ompi_osc_rdma_outstanding_lock_t) {
|
||||
if (lock->target == target) {
|
||||
return lock;
|
||||
}
|
||||
}
|
||||
|
||||
return NULL;
|
||||
}
|
||||
|
||||
/**
 * Look up an outstanding lock by its serial number.
 *
 * @param[in] module         - OSC RDMA module
 * @param[in] serial_number  - serial number assigned when the lock was created
 *
 * @returns the first outstanding lock with that serial number, or NULL if
 *          there is none
 */
static inline ompi_osc_rdma_outstanding_lock_t *find_outstanding_lock_by_serial (ompi_osc_rdma_module_t *module, uint64_t serial_number)
{
    ompi_osc_rdma_outstanding_lock_t *match = NULL;
    ompi_osc_rdma_outstanding_lock_t *item;

    OPAL_LIST_FOREACH(item, &module->outstanding_locks, ompi_osc_rdma_outstanding_lock_t) {
        if (serial_number == item->serial_number) {
            match = item;
            break;
        }
    }

    return match;
}
|
||||
|
||||
/**
 * Acquire a passive-target lock on the local window.
 *
 * @param[in] module - OSC RDMA module
 * @param[in] lock   - origin-side lock item describing the request
 *
 * @returns OMPI_SUCCESS once the lock is held, or the error returned by
 *          queue_lock() if the request could not be queued
 *
 * If the window state allows it the lock is taken immediately; otherwise the
 * request is queued and the function blocks on the module condition variable
 * until the lock ack is counted. The wait uses module->lock, so the caller
 * is expected to hold it.
 */
static inline int ompi_osc_rdma_lock_self (ompi_osc_rdma_module_t *module, ompi_osc_rdma_outstanding_lock_t *lock)
{
    const int my_rank = ompi_comm_rank (module->comm);

    if ((MPI_LOCK_SHARED == lock->type && MPI_LOCK_EXCLUSIVE != module->lock_status) ||
        (MPI_LOCK_EXCLUSIVE == lock->type && 0 == module->lock_status)) {
        /* we can acquire the lock immediately */
        module->lock_status = lock->type;
        if (MPI_LOCK_SHARED == lock->type) {
            module->shared_count++;
        }

        lock->lock_acks_received++;
    } else {
        /* queue the lock; if that fails nobody will ever ack it, so bail out
         * instead of waiting forever below */
        int ret = queue_lock (module, my_rank, lock->type, lock->serial_number);
        if (OPAL_UNLIKELY(OMPI_SUCCESS != ret)) {
            return ret;
        }
    }

    /* If locking local, can't be non-blocking according to the
       standard.  We need to wait for the ack here. */
    while (0 == lock->lock_acks_received) {
        opal_condition_wait(&module->cond, &module->lock);
    }

    OPAL_OUTPUT_VERBOSE((25, ompi_osc_base_framework.framework_output,
                         "local lock aquired"));

    return OMPI_SUCCESS;
}
|
||||
|
||||
/**
 * Release a passive-target lock held on the local window.
 *
 * @param[in] module - OSC RDMA module
 * @param[in] lock   - origin-side lock item being released
 *
 * A non-shared lock is released unconditionally. A shared lock only frees
 * the window when the last shared holder leaves (shared_count drops to 0);
 * releasing earlier would clear lock_status while other shared holders
 * remain, and never releasing on the last holder would leave the window
 * marked as locked. After the release the next queued lock is activated.
 */
static inline void ompi_osc_rdma_unlock_self (ompi_osc_rdma_module_t *module, ompi_osc_rdma_outstanding_lock_t *lock)
{
    if (MPI_LOCK_SHARED != lock->type || 0 == --module->shared_count) {
        module->lock_status = 0;
        ompi_osc_activate_next_lock (module);
    }

    /* need to ensure we make progress */
    opal_progress();

    lock->unlock_acks_received++;
}
|
||||
|
||||
/**
 * Send a lock request for this window to a remote target.
 *
 * @param[in] module - OSC RDMA module
 * @param[in] target - rank the request is sent to
 * @param[in] lock   - origin-side lock item (supplies type and serial number)
 *
 * @returns the result of the control send if it fails, otherwise the result
 *          of flushing the fragments queued for the target
 */
static inline int ompi_osc_rdma_lock_remote (ompi_osc_rdma_module_t *module, int target, ompi_osc_rdma_outstanding_lock_t *lock)
{
    ompi_osc_rdma_header_lock_t request;
    int rc;

    /* build the lock request header */
    request.base.type = OMPI_OSC_RDMA_HDR_TYPE_LOCK_REQ;
    request.base.flags = OMPI_OSC_RDMA_HDR_FLAG_VALID | OMPI_OSC_RDMA_HDR_FLAG_PASSIVE_TARGET;
    request.serial_number = lock->serial_number;
    request.lock_type = lock->type;

    rc = ompi_osc_rdma_control_send (module, target, &request, sizeof (request));
    if (OPAL_UNLIKELY(OMPI_SUCCESS != rc)) {
        return rc;
    }

    /* make sure the request gets sent, so we can start eager sending... */
    return ompi_osc_rdma_frag_flush_target (module, target);
}
|
||||
|
||||
/**
 * Send an unlock request for this window to a remote target.
 *
 * @param[in] module - OSC RDMA module
 * @param[in] target - rank the request is sent to
 * @param[in] lock   - origin-side lock item (supplies the lock type)
 *
 * @returns the result of the control send
 *
 * The request carries the number of fragments sent to the target during the
 * current epoch, read from module->epoch_outgoing_frag_count[target].
 */
static inline int ompi_osc_rdma_unlock_remote (ompi_osc_rdma_module_t *module, int target, ompi_osc_rdma_outstanding_lock_t *lock)
{
    ompi_osc_rdma_header_unlock_t request;

    request.base.flags = OMPI_OSC_RDMA_HDR_FLAG_VALID | OMPI_OSC_RDMA_HDR_FLAG_PASSIVE_TARGET;
    request.base.type = OMPI_OSC_RDMA_HDR_TYPE_UNLOCK_REQ;
    request.lock_type = lock->type;
    request.frag_count = module->epoch_outgoing_frag_count[target];

    /* send control message with unlock request and count */
    return ompi_osc_rdma_control_send (module, target, &request, sizeof (request));
}
|
||||
|
||||
|
||||
|
||||
/**
 * Start a passive-target access epoch on one target (MPI_Win_lock).
 *
 * @param[in] lock_type - requested lock type (e.g. MPI_LOCK_SHARED or MPI_LOCK_EXCLUSIVE)
 * @param[in] target    - rank to lock
 * @param[in] assert    - assertion flags; MPI_MODE_NOCHECK skips the lock handshake
 * @param[in] win       - window the lock applies to
 *
 * @returns OMPI_SUCCESS on success, OMPI_ERR_RMA_SYNC if locks are not
 *          available on this window or a start/complete group is set,
 *          OMPI_ERR_OUT_OF_RESOURCE if the lock item cannot be allocated, or
 *          the error from sending/acquiring the lock
 *
 * The lock item is allocated before the module lock is taken so that an
 * allocation failure cannot leave the module locked or its epoch state
 * half-updated.
 */
int ompi_osc_rdma_lock(int lock_type, int target, int assert, ompi_win_t *win)
{
    ompi_osc_rdma_module_t *module = GET_MODULE(win);
    ompi_osc_rdma_outstanding_lock_t *lock;
    ompi_osc_rdma_peer_t *peer = module->peers + target;
    int ret = OMPI_SUCCESS;

    /* Check if no_locks is set. TODO: we also need to track whether we are in an
     * active target epoch. Fence can make this tricky to track. */
    if (NULL == module->passive_eager_send_active || module->sc_group) {
        return OMPI_ERR_RMA_SYNC;
    }

    /* the parameter is named "assert", but the function-like macro still
     * expands here because the name is followed by a parenthesis */
    assert(module->epoch_outgoing_frag_count[target] == 0);

    OPAL_OUTPUT_VERBOSE((25, ompi_osc_base_framework.framework_output,
                         "osc rdma: lock %d %d", target, lock_type));

    /* create lock item (before taking the module lock, see above) */
    lock = OBJ_NEW(ompi_osc_rdma_outstanding_lock_t);
    if (OPAL_UNLIKELY(NULL == lock)) {
        return OMPI_ERR_OUT_OF_RESOURCE;
    }

    /* delay all eager sends until we've heard back.. */
    OPAL_THREAD_LOCK(&module->lock);
    module->passive_eager_send_active[target] = false;
    module->passive_target_access_epoch = true;

    /* when the lock ack returns we will be in an access epoch with this peer */
    peer->access_epoch = true;

    lock->target = target;
    lock->lock_acks_received = 0;
    lock->unlock_acks_received = 0;
    lock->flush_acks_received = 0;
    lock->serial_number = module->lock_serial_number++;
    lock->type = lock_type;
    opal_list_append(&module->outstanding_locks, &lock->super);

    if (0 == (assert & MPI_MODE_NOCHECK)) {
        if (ompi_comm_rank (module->comm) != target) {
            ret = ompi_osc_rdma_lock_remote (module, target, lock);
        } else {
            ret = ompi_osc_rdma_lock_self (module, lock);
        }

        if (OPAL_UNLIKELY(OMPI_SUCCESS != ret)) {
            goto exit_error;
        }
    } else {
        /* no handshake requested: behave as if the ack already arrived */
        lock->lock_acks_received = 1;
    }

    OPAL_THREAD_UNLOCK(&module->lock);

    return OMPI_SUCCESS;

exit_error:

    /* the list is protected by the module lock: unlink before unlocking */
    opal_list_remove_item(&module->outstanding_locks, &lock->super);
    OPAL_THREAD_UNLOCK(&module->lock);
    OBJ_RELEASE(lock);

    /* return */
    return ret;
}
|
||||
|
||||
|
||||
/**
 * End a passive-target access epoch on one target (MPI_Win_unlock).
 *
 * @param[in] target - rank that was locked
 * @param[in] win    - window the lock applies to
 *
 * @returns OMPI_SUCCESS on success, OMPI_ERR_RMA_SYNC if the target is not
 *          locked in this window, or the error from sending the unlock
 *          request / flushing fragments
 *
 * For a remote target this waits for the lock ack, sends the unlock request,
 * flushes queued fragments and then blocks until all outgoing fragments have
 * been signalled and the unlock ack has been counted.
 */
int ompi_osc_rdma_unlock(int target, ompi_win_t *win)
{
    ompi_osc_rdma_module_t *module = GET_MODULE(win);
    ompi_osc_rdma_outstanding_lock_t *lock = NULL;
    ompi_osc_rdma_peer_t *peer = module->peers + target;
    int ret = OMPI_SUCCESS;

    OPAL_THREAD_LOCK(&module->lock);

    lock = find_outstanding_lock (module, target);
    if (OPAL_UNLIKELY(NULL == lock)) {
        OPAL_OUTPUT_VERBOSE((25, ompi_osc_base_framework.framework_output,
                             "ompi_osc_rdma_unlock: target %d is not locked in window %s",
                             target, win->w_name));
        /* release the module lock taken above before bailing out */
        OPAL_THREAD_UNLOCK(&module->lock);
        return OMPI_ERR_RMA_SYNC;
    }

    if (ompi_comm_rank (module->comm) != target) {
        OPAL_OUTPUT_VERBOSE((25, ompi_osc_base_framework.framework_output,
                             "osc rdma: unlock %d, lock_acks_received = %d", target,
                             lock->lock_acks_received));

        /* wait until ack has arrived from target */
        while (0 == lock->lock_acks_received) {
            opal_condition_wait(&module->cond, &module->lock);
        }

        ret = ompi_osc_rdma_unlock_remote (module, target, lock);
        if (OPAL_UNLIKELY(OMPI_SUCCESS != ret)) {
            goto cleanup;
        }

        /* start all sendreqs to target */
        ret = ompi_osc_rdma_frag_flush_target(module, target);
        if (OPAL_UNLIKELY(OMPI_SUCCESS != ret)) {
            goto cleanup;
        }

        /* wait for all the requests and the unlock ack (meaning remote completion) */
        while (module->outgoing_frag_count != module->outgoing_frag_signal_count ||
               0 == lock->unlock_acks_received) {
            opal_condition_wait(&module->cond, &module->lock);
        }

        OPAL_OUTPUT_VERBOSE((25, ompi_osc_base_framework.framework_output,
                             "ompi_osc_rdma_unlock: unlock of %d complete", target));
    } else {
        ompi_osc_rdma_unlock_self (module, lock);
    }

    module->passive_eager_send_active[target] = false;
    module->epoch_outgoing_frag_count[target] = 0;
    module->passive_target_access_epoch = false;

    peer->access_epoch = false;

    /* delete the lock */
    opal_list_remove_item (&module->outstanding_locks, &lock->super);
    OBJ_RELEASE(lock);

 cleanup:
    OPAL_THREAD_UNLOCK(&module->lock);

    return ret;
}
|
||||
|
||||
|
||||
int ompi_osc_rdma_lock_all(int assert, struct ompi_win_t *win)
|
||||
{
|
||||
ompi_osc_rdma_module_t *module = GET_MODULE(win);
|
||||
int ret, my_rank = ompi_comm_rank (module->comm);
|
||||
ompi_osc_rdma_outstanding_lock_t *lock;
|
||||
|
||||
/* Check if no_locks is set. TODO: we also need to track whether we are in an active
|
||||
* target epoch. Fence can make this tricky to track. */
|
||||
if (NULL == module->passive_eager_send_active) {
|
||||
return OMPI_ERR_RMA_SYNC;
|
||||
}
|
||||
|
||||
/* delay all eager sends until we've heard back.. */
|
||||
OPAL_THREAD_LOCK(&module->lock);
|
||||
for (int i = 0 ; i < ompi_comm_size(module->comm) ; ++i) {
|
||||
module->passive_eager_send_active[i] = false;
|
||||
}
|
||||
module->passive_target_access_epoch = true;
|
||||
module->all_access_epoch = true;
|
||||
|
||||
/* create lock item */
|
||||
lock = OBJ_NEW(ompi_osc_rdma_outstanding_lock_t);
|
||||
lock->target = -1;
|
||||
lock->lock_acks_received = 0;
|
||||
lock->unlock_acks_received = 0;
|
||||
lock->serial_number = module->lock_serial_number++;
|
||||
lock->type = MPI_LOCK_SHARED;
|
||||
opal_list_append(&module->outstanding_locks, &lock->super);
|
||||
|
||||
/* if nocheck is not specified, send a lock request to everyone
|
||||
and wait for the local response */
|
||||
if (0 != (assert & MPI_MODE_NOCHECK)) {
|
||||
ret = ompi_osc_rdma_lock_self (module, lock);
|
||||
if (OPAL_UNLIKELY(OMPI_SUCCESS != ret)) {
|
||||
goto exit_error;
|
||||
}
|
||||
|
||||
for (int i = 0 ; i < ompi_comm_size(module->comm) ; ++i) {
|
||||
if (my_rank == i) {
|
||||
continue;
|
||||
}
|
||||
|
||||
ret = ompi_osc_rdma_lock_remote (module, i, lock);
|
||||
if (OPAL_UNLIKELY(OMPI_SUCCESS != ret)) {
|
||||
opal_list_remove_item(&module->outstanding_locks, &lock->super);
|
||||
}
|
||||
}
|
||||
} else {
|
||||
lock->lock_acks_received = ompi_comm_size(module->comm);
|
||||
}
|
||||
|
||||
OPAL_THREAD_UNLOCK(&module->lock);
|
||||
|
||||
return OMPI_SUCCESS;
|
||||
|
||||
exit_error:
|
||||
|
||||
OPAL_THREAD_UNLOCK(&module->lock);
|
||||
opal_list_remove_item(&module->outstanding_locks, &lock->super);
|
||||
OBJ_RELEASE(lock);
|
||||
|
||||
/* return */
|
||||
return ret;
|
||||
}
|
||||
|
||||
|
||||
int ompi_osc_rdma_unlock_all (struct ompi_win_t *win)
|
||||
{
|
||||
ompi_osc_rdma_module_t *module = GET_MODULE(win);
|
||||
int my_rank = ompi_comm_rank (module->comm);
|
||||
ompi_osc_rdma_outstanding_lock_t *lock;
|
||||
int ret;
|
||||
|
||||
OPAL_OUTPUT_VERBOSE((50, ompi_osc_base_framework.framework_output,
|
||||
"ompi_osc_rdma_unlock_all entering..."));
|
||||
|
||||
OPAL_THREAD_LOCK(&module->lock);
|
||||
|
||||
lock = find_outstanding_lock (module, -1);
|
||||
if (OPAL_UNLIKELY(NULL == lock)) {
|
||||
OPAL_OUTPUT_VERBOSE((25, ompi_osc_base_framework.framework_output,
|
||||
"ompi_osc_rdma_unlock_all: not locked in window %s",
|
||||
win->w_name));
|
||||
OPAL_THREAD_LOCK(&module->lock);
|
||||
return OMPI_ERR_RMA_SYNC;
|
||||
}
|
||||
|
||||
/* wait for lock acks */
|
||||
while (ompi_comm_size(module->comm) != lock->lock_acks_received) {
|
||||
opal_condition_wait(&module->cond, &module->lock);
|
||||
}
|
||||
|
||||
/* send unlock messages to all of my peers */
|
||||
for (int i = 0 ; i < ompi_comm_size(module->comm) ; ++i) {
|
||||
if (my_rank == i) {
|
||||
continue;
|
||||
}
|
||||
|
||||
ret = ompi_osc_rdma_unlock_remote (module, i, lock);
|
||||
if (OPAL_UNLIKELY(OMPI_SUCCESS != ret)) {
|
||||
goto cleanup;
|
||||
}
|
||||
}
|
||||
|
||||
/* unlock myself */
|
||||
ompi_osc_rdma_unlock_self (module, lock);
|
||||
|
||||
/* start all sendreqs to target */
|
||||
ret = ompi_osc_rdma_frag_flush_all(module);
|
||||
if (OMPI_SUCCESS != ret) goto cleanup;
|
||||
|
||||
/* wait for all the requests and the unlock ack (meaning remote completion) */
|
||||
while (module->outgoing_frag_count != module->outgoing_frag_signal_count ||
|
||||
ompi_comm_size(module->comm) != lock->unlock_acks_received) {
|
||||
opal_condition_wait(&module->cond, &module->lock);
|
||||
}
|
||||
|
||||
/* reset all fragment counters */
|
||||
memset (module->epoch_outgoing_frag_count, 0, ompi_comm_size(module->comm) * sizeof (module->epoch_outgoing_frag_count[0]));
|
||||
memset (module->passive_eager_send_active, 0, ompi_comm_size(module->comm) * sizeof (module->passive_eager_send_active[0]));
|
||||
|
||||
opal_list_remove_item (&module->outstanding_locks, &lock->super);
|
||||
OBJ_RELEASE(lock);
|
||||
|
||||
module->passive_target_access_epoch = false;
|
||||
module->all_access_epoch = false;
|
||||
|
||||
OPAL_OUTPUT_VERBOSE((50, ompi_osc_base_framework.framework_output,
|
||||
"ompi_osc_rdma_unlock_all complete"));
|
||||
|
||||
cleanup:
|
||||
OPAL_THREAD_UNLOCK(&module->lock);
|
||||
|
||||
return ret;
|
||||
}
|
||||
|
||||
|
||||
int ompi_osc_rdma_sync (struct ompi_win_t *win)
|
||||
{
|
||||
opal_progress();
|
||||
return OMPI_SUCCESS;
|
||||
}
|
||||
|
||||
static int ompi_osc_rdma_flush_lock (ompi_osc_rdma_module_t *module, ompi_osc_rdma_outstanding_lock_t *lock,
|
||||
int target)
|
||||
{
|
||||
ompi_osc_rdma_header_flush_t flush_req;
|
||||
int peer_count, ret, flush_count;
|
||||
int my_rank = ompi_comm_rank (module->comm);
|
||||
|
||||
if (-1 == lock->target) {
|
||||
peer_count = ompi_comm_size(module->comm);
|
||||
} else {
|
||||
peer_count = 1;
|
||||
}
|
||||
|
||||
/* wait until ack has arrived from target, since we need to be
|
||||
able to eager send before we can transfer all the data... */
|
||||
while (peer_count > lock->lock_acks_received) {
|
||||
opal_condition_wait(&module->cond, &module->lock);
|
||||
}
|
||||
|
||||
lock->flush_acks_received = 0;
|
||||
|
||||
flush_req.base.type = OMPI_OSC_RDMA_HDR_TYPE_FLUSH_REQ;
|
||||
flush_req.base.flags = OMPI_OSC_RDMA_HDR_FLAG_VALID | OMPI_OSC_RDMA_HDR_FLAG_PASSIVE_TARGET;
|
||||
flush_req.serial_number = lock->serial_number;
|
||||
|
||||
if (-1 == target) {
|
||||
/* NTH: no local flush */
|
||||
flush_count = ompi_comm_size(module->comm) - 1;
|
||||
for (int i = 0 ; i < ompi_comm_size(module->comm) ; ++i) {
|
||||
if (i == my_rank) {
|
||||
continue;
|
||||
}
|
||||
|
||||
flush_req.frag_count = module->epoch_outgoing_frag_count[i];
|
||||
|
||||
/* send control message with flush request and count */
|
||||
ret = ompi_osc_rdma_control_send (module, i, &flush_req, sizeof (flush_req));
|
||||
if (OPAL_UNLIKELY(OMPI_SUCCESS != ret)) {
|
||||
return ret;
|
||||
}
|
||||
|
||||
/* start all sendreqs to target */
|
||||
ret = ompi_osc_rdma_frag_flush_target (module, i);
|
||||
if (OPAL_UNLIKELY(OMPI_SUCCESS != ret)) {
|
||||
return ret;
|
||||
}
|
||||
}
|
||||
} else {
|
||||
flush_req.frag_count = module->epoch_outgoing_frag_count[target];
|
||||
flush_count = 1;
|
||||
/* send control message with flush request and count */
|
||||
ret = ompi_osc_rdma_control_send (module, target, &flush_req, sizeof (flush_req));
|
||||
if (OPAL_UNLIKELY(OMPI_SUCCESS != ret)) {
|
||||
return ret;
|
||||
}
|
||||
|
||||
/* start all sendreqs to target */
|
||||
ret = ompi_osc_rdma_frag_flush_target (module, target);
|
||||
if (OPAL_UNLIKELY(OMPI_SUCCESS != ret)) {
|
||||
return ret;
|
||||
}
|
||||
}
|
||||
|
||||
/* wait for all the requests and the flush ack (meaning remote completion) */
|
||||
while (module->outgoing_frag_count != module->outgoing_frag_signal_count ||
|
||||
flush_count != lock->flush_acks_received) {
|
||||
opal_condition_wait(&module->cond, &module->lock);
|
||||
}
|
||||
|
||||
if (-1 == target) {
|
||||
memset (module->epoch_outgoing_frag_count, 0, peer_count * sizeof (module->epoch_outgoing_frag_count[0]));
|
||||
} else {
|
||||
module->epoch_outgoing_frag_count[target] = 0;
|
||||
}
|
||||
|
||||
return OMPI_SUCCESS;
|
||||
}
|
||||
|
||||
int ompi_osc_rdma_flush (int target, struct ompi_win_t *win)
|
||||
{
|
||||
ompi_osc_rdma_module_t *module = GET_MODULE(win);
|
||||
ompi_osc_rdma_outstanding_lock_t *lock;
|
||||
int ret;
|
||||
|
||||
assert (0 <= target);
|
||||
|
||||
/* flush is only allowed from within a passive target epoch */
|
||||
if (!module->passive_target_access_epoch) {
|
||||
return OMPI_ERR_RMA_SYNC;
|
||||
}
|
||||
|
||||
OPAL_OUTPUT_VERBOSE((50, ompi_osc_base_framework.framework_output,
|
||||
"ompi_osc_rdma_flush starting..."));
|
||||
|
||||
if (ompi_comm_rank (module->comm) == target) {
|
||||
/* nothing to flush */
|
||||
opal_progress ();
|
||||
return OMPI_SUCCESS;
|
||||
}
|
||||
|
||||
OPAL_THREAD_LOCK(&module->lock);
|
||||
|
||||
lock = find_outstanding_lock (module, target);
|
||||
if (NULL == lock) {
|
||||
lock = find_outstanding_lock (module, -1);
|
||||
}
|
||||
if (OPAL_UNLIKELY(NULL == lock)) {
|
||||
OPAL_OUTPUT_VERBOSE((25, ompi_osc_base_framework.framework_output,
|
||||
"ompi_osc_rdma_flush: target %d is not locked in window %s",
|
||||
target, win->w_name));
|
||||
OPAL_THREAD_LOCK(&module->lock);
|
||||
return OMPI_ERR_RMA_SYNC;
|
||||
}
|
||||
|
||||
ret = ompi_osc_rdma_flush_lock (module, lock, target);
|
||||
|
||||
OPAL_THREAD_UNLOCK(&module->lock);
|
||||
|
||||
return ret;
|
||||
}
|
||||
|
||||
|
||||
int ompi_osc_rdma_flush_all (struct ompi_win_t *win)
|
||||
{
|
||||
ompi_osc_rdma_module_t *module = GET_MODULE(win);
|
||||
ompi_osc_rdma_outstanding_lock_t *lock;
|
||||
int ret = OMPI_SUCCESS;
|
||||
|
||||
/* flush is only allowed from within a passive target epoch */
|
||||
if (!module->passive_target_access_epoch) {
|
||||
return OMPI_ERR_RMA_SYNC;
|
||||
}
|
||||
|
||||
if (OPAL_UNLIKELY(0 == opal_list_get_size (&module->outstanding_locks))) {
|
||||
OPAL_OUTPUT_VERBOSE((25, ompi_osc_base_framework.framework_output,
|
||||
"ompi_osc_rdma_flush_all: no targets are locked in window %s",
|
||||
win->w_name));
|
||||
return OMPI_ERR_RMA_SYNC;
|
||||
}
|
||||
|
||||
OPAL_THREAD_LOCK(&module->lock);
|
||||
|
||||
OPAL_OUTPUT_VERBOSE((50, ompi_osc_base_framework.framework_output,
|
||||
"ompi_osc_rdma_flush_all entering..."));
|
||||
|
||||
/* flush all locks */
|
||||
OPAL_LIST_FOREACH(lock, &module->outstanding_locks, ompi_osc_rdma_outstanding_lock_t) {
|
||||
ret = ompi_osc_rdma_flush_lock (module, lock, lock->target);
|
||||
if (OPAL_UNLIKELY(OMPI_SUCCESS != ret)) {
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
OPAL_OUTPUT_VERBOSE((50, ompi_osc_base_framework.framework_output,
|
||||
"ompi_osc_rdma_flush_all complete"));
|
||||
|
||||
OPAL_THREAD_UNLOCK(&module->lock);
|
||||
|
||||
return ret;
|
||||
}
|
||||
|
||||
|
||||
int ompi_osc_rdma_flush_local (int target, struct ompi_win_t *win)
|
||||
{
|
||||
ompi_osc_rdma_module_t *module = GET_MODULE(win);
|
||||
int ret;
|
||||
|
||||
/* flush is only allowed from within a passive target epoch */
|
||||
if (!module->passive_target_access_epoch) {
|
||||
return OMPI_ERR_RMA_SYNC;
|
||||
}
|
||||
|
||||
OPAL_THREAD_LOCK(&module->lock);
|
||||
|
||||
ret = ompi_osc_rdma_frag_flush_target(module, target);
|
||||
if (OMPI_SUCCESS != ret) goto cleanup;
|
||||
|
||||
/* wait for all the requests */
|
||||
while (module->outgoing_frag_count != module->outgoing_frag_signal_count) {
|
||||
opal_condition_wait(&module->cond, &module->lock);
|
||||
}
|
||||
|
||||
cleanup:
|
||||
OPAL_THREAD_UNLOCK(&module->lock);
|
||||
|
||||
return ret;
|
||||
}
|
||||
|
||||
|
||||
int ompi_osc_rdma_flush_local_all (struct ompi_win_t *win)
|
||||
{
|
||||
ompi_osc_rdma_module_t *module = GET_MODULE(win);
|
||||
int ret = OMPI_SUCCESS;
|
||||
|
||||
/* flush is only allowed from within a passive target epoch */
|
||||
if (!module->passive_target_access_epoch) {
|
||||
return OMPI_ERR_RMA_SYNC;
|
||||
}
|
||||
|
||||
OPAL_THREAD_LOCK(&module->lock);
|
||||
|
||||
ret = ompi_osc_rdma_frag_flush_all(module);
|
||||
if (OMPI_SUCCESS != ret) goto cleanup;
|
||||
|
||||
/* wait for all the requests */
|
||||
while (module->outgoing_frag_count != module->outgoing_frag_signal_count) {
|
||||
opal_condition_wait(&module->cond, &module->lock);
|
||||
}
|
||||
|
||||
cleanup:
|
||||
OPAL_THREAD_UNLOCK(&module->lock);
|
||||
|
||||
return ret;
|
||||
}
|
||||
|
||||
/* target side operation to acknowledge to initiator side that the
|
||||
lock is now held by the initiator */
|
||||
static inline int activate_lock (ompi_osc_rdma_module_t *module, int requestor,
|
||||
uint64_t serial_number)
|
||||
{
|
||||
ompi_osc_rdma_outstanding_lock_t *lock;
|
||||
|
||||
if (ompi_comm_rank (module->comm) != requestor) {
|
||||
ompi_osc_rdma_header_lock_ack_t lock_ack;
|
||||
|
||||
lock_ack.base.type = OMPI_OSC_RDMA_HDR_TYPE_LOCK_ACK;
|
||||
lock_ack.base.flags = OMPI_OSC_RDMA_HDR_FLAG_VALID;
|
||||
lock_ack.source = ompi_comm_rank(module->comm);
|
||||
lock_ack.windx = ompi_comm_get_cid(module->comm);
|
||||
lock_ack.serial_number = serial_number;
|
||||
|
||||
OPAL_OUTPUT_VERBOSE((25, ompi_osc_base_framework.framework_output,
|
||||
"osc rdma: sending lock to %d", requestor));
|
||||
|
||||
/* we don't want to send any data, since we're the exposure
|
||||
epoch only, so use an unbuffered send */
|
||||
return ompi_osc_rdma_control_send_unbuffered (module, requestor, &lock_ack, sizeof (lock_ack));
|
||||
}
|
||||
|
||||
|
||||
OPAL_OUTPUT_VERBOSE((25, ompi_osc_base_framework.framework_output,
|
||||
"osc rdma: releasing local lock"));
|
||||
|
||||
lock = find_outstanding_lock (module, requestor);
|
||||
if (NULL == lock) {
|
||||
lock = find_outstanding_lock (module, -1);
|
||||
if (OPAL_UNLIKELY(NULL == lock)) {
|
||||
OPAL_OUTPUT_VERBOSE((5, ompi_osc_base_framework.framework_output,
|
||||
"lock could not be located"));
|
||||
}
|
||||
}
|
||||
|
||||
lock->lock_acks_received++;
|
||||
opal_condition_broadcast (&module->cond);
|
||||
|
||||
return OMPI_SUCCESS;
|
||||
}
|
||||
|
||||
|
||||
/* target side operation to create a pending lock request for a lock
|
||||
request that could not be satisfied */
|
||||
static inline int queue_lock (ompi_osc_rdma_module_t *module, int requestor,
|
||||
int lock_type, uint64_t serial_number)
|
||||
{
|
||||
ompi_osc_rdma_pending_lock_t *pending =
|
||||
OBJ_NEW(ompi_osc_rdma_pending_lock_t);
|
||||
if (NULL == pending) {
|
||||
return OMPI_ERR_OUT_OF_RESOURCE;
|
||||
}
|
||||
|
||||
pending->peer = requestor;
|
||||
pending->lock_type = lock_type;
|
||||
pending->serial_number = serial_number;
|
||||
|
||||
OPAL_OUTPUT_VERBOSE((25, ompi_osc_base_framework.framework_output,
|
||||
"osc rdma: queueing lock request from %d", requestor));
|
||||
|
||||
opal_list_append(&module->locks_pending, &pending->super);
|
||||
|
||||
return OMPI_SUCCESS;
|
||||
}
|
||||
|
||||
/**
 * Release as many pending lock requests as the current lock state allows.
 *
 * @param[in] module - OSC RDMA module
 *
 * Pending requests are visited in list order. Each shared request is
 * granted and removed. The walk ends at the first non-shared request:
 * that request is granted only if the lock is currently free
 * (lock_status == 0). The walk also ends as soon as activate_lock()
 * reports an error.
 *
 * @returns OMPI_SUCCESS, or the error from the last activate_lock() call.
 */
static int ompi_osc_activate_next_lock (ompi_osc_rdma_module_t *module)
{
    ompi_osc_rdma_pending_lock_t *request, *following;
    int rc = OMPI_SUCCESS;

    OPAL_LIST_FOREACH_SAFE(request, following, &module->locks_pending,
                           ompi_osc_rdma_pending_lock_t) {
        if (MPI_LOCK_SHARED != request->lock_type) {
            /* exclusive request: grant it only when nobody holds the lock */
            if (0 == module->lock_status) {
                OPAL_OUTPUT_VERBOSE((25, ompi_osc_base_framework.framework_output,
                                     "ompi_osc_activate_next_lock: release pending lock of type MPI_LOCK_EXCLUSIVE to peer %d\n",
                                     request->peer));
                module->lock_status = MPI_LOCK_EXCLUSIVE;
                rc = activate_lock(module, request->peer, request->serial_number);
                opal_list_remove_item (&module->locks_pending, &request->super);
                OBJ_RELEASE(request);
            }

            /* whether or not the exclusive lock was granted the walk is
               over: if it was not, the shared holders must finish first */
            break;
        }

        OPAL_OUTPUT_VERBOSE((25, ompi_osc_base_framework.framework_output,
                             "ompi_osc_activate_next_lock: release pending lock of type MPI_LOCK_SHARED to peer %d\n",
                             request->peer));

        /* shared request: take the lock in shared mode and count the holder */
        module->lock_status = MPI_LOCK_SHARED;
        module->shared_count++;
        rc = activate_lock(module, request->peer, request->serial_number);

        opal_list_remove_item (&module->locks_pending, &request->super);
        OBJ_RELEASE(request);

        if (OMPI_SUCCESS != rc) {
            break;
        }
    }

    return rc;
}
|
||||
|
||||
|
||||
/* target side function called when the initiator sends a lock
|
||||
request. Lock will either be activated and acknowledged or
|
||||
queued. */
|
||||
int ompi_osc_rdma_process_lock (ompi_osc_rdma_module_t* module, int source,
|
||||
ompi_osc_rdma_header_lock_t* lock_header)
|
||||
{
|
||||
int ret;
|
||||
|
||||
OPAL_OUTPUT_VERBOSE((25, ompi_osc_base_framework.framework_output,
|
||||
"ompi_osc_rdma_process_lock: processing lock request from %d. current lock state = %d, shared_count = %d",
|
||||
source, module->lock_status, module->shared_count));
|
||||
|
||||
if (MPI_LOCK_SHARED == lock_header->lock_type) {
|
||||
if (module->lock_status != MPI_LOCK_EXCLUSIVE) {
|
||||
/* acquire shared lock */
|
||||
module->lock_status = MPI_LOCK_SHARED;
|
||||
module->shared_count++;
|
||||
ret = activate_lock(module, source, lock_header->serial_number);
|
||||
} else {
|
||||
/* lock not available, queue */
|
||||
ret = queue_lock(module, source, lock_header->lock_type, lock_header->serial_number);
|
||||
}
|
||||
} else {
|
||||
if (0 == module->lock_status) {
|
||||
/* acquire exclusive lock */
|
||||
module->lock_status = MPI_LOCK_EXCLUSIVE;
|
||||
ret = activate_lock(module, source, lock_header->serial_number);
|
||||
} else {
|
||||
/* lock not available, queue */
|
||||
ret = queue_lock(module, source, lock_header->lock_type, lock_header->serial_number);
|
||||
}
|
||||
}
|
||||
|
||||
return ret;
|
||||
}
|
||||
|
||||
|
||||
/* initiator-side function called when the target acks the lock
|
||||
request. */
|
||||
void ompi_osc_rdma_process_lock_ack (ompi_osc_rdma_module_t *module,
|
||||
ompi_osc_rdma_header_lock_ack_t *lock_ack_header)
|
||||
{
|
||||
ompi_osc_rdma_outstanding_lock_t *lock, *next;
|
||||
|
||||
OPAL_LIST_FOREACH_SAFE(lock, next, &module->outstanding_locks, ompi_osc_rdma_outstanding_lock_t) {
|
||||
if (lock->serial_number == lock_ack_header->serial_number) {
|
||||
|
||||
OPAL_OUTPUT_VERBOSE((25, ompi_osc_base_framework.framework_output,
|
||||
"osc rdma: lock ack %d", lock_ack_header->source));
|
||||
|
||||
lock->lock_acks_received++;
|
||||
module->passive_eager_send_active[lock_ack_header->source] = true;
|
||||
return;
|
||||
}
|
||||
}
|
||||
|
||||
opal_output(ompi_osc_base_framework.framework_output,
|
||||
"osc rdma: lock ack %d, %ld for unfindable lock request",
|
||||
lock_ack_header->source, (unsigned long) lock_ack_header->serial_number);
|
||||
}
|
||||
|
||||
void ompi_osc_rdma_process_flush_ack (ompi_osc_rdma_module_t *module, int source,
|
||||
ompi_osc_rdma_header_flush_ack_t *flush_ack_header) {
|
||||
ompi_osc_rdma_outstanding_lock_t *lock;
|
||||
|
||||
OPAL_OUTPUT_VERBOSE((50, ompi_osc_base_framework.framework_output,
|
||||
"ompi_osc_rdma_process_unlock_ack: processing flush ack from %d for lock %" PRIu64,
|
||||
source, flush_ack_header->serial_number));
|
||||
|
||||
/* NTH: need to verify that this will work as expected */
|
||||
lock = find_outstanding_lock_by_serial (module, flush_ack_header->serial_number);
|
||||
assert (NULL != lock);
|
||||
|
||||
lock->flush_acks_received++;
|
||||
|
||||
opal_condition_broadcast(&module->cond);
|
||||
}
|
||||
|
||||
void ompi_osc_rdma_process_unlock_ack (ompi_osc_rdma_module_t *module, int source,
|
||||
ompi_osc_rdma_header_unlock_ack_t *unlock_ack_header) {
|
||||
ompi_osc_rdma_outstanding_lock_t *lock;
|
||||
|
||||
OPAL_OUTPUT_VERBOSE((50, ompi_osc_base_framework.framework_output,
|
||||
"ompi_osc_rdma_process_unlock_ack: processing unlock ack from %d",
|
||||
source));
|
||||
|
||||
/* NTH: need to verify that this will work as expected */
|
||||
lock = find_outstanding_lock (module, source);
|
||||
if (NULL == lock) {
|
||||
lock = find_outstanding_lock(module, -1);
|
||||
assert (NULL != lock);
|
||||
}
|
||||
|
||||
lock->unlock_acks_received++;
|
||||
}
|
||||
|
||||
/**
|
||||
* Process an unlock request.
|
||||
*
|
||||
* @param[in] module - OSC RDMA module
|
||||
* @param[in] source - Source rank
|
||||
* @param[in] unlock_header - Incoming unlock header
|
||||
*
|
||||
* This functions is the target-side functio for handling an unlock
|
||||
* request. Once all pending operations from the target are complete
|
||||
* this functions sends an unlock acknowledgement then attempts to
|
||||
* active a pending lock if the lock becomes free.
|
||||
*/
|
||||
int ompi_osc_rdma_process_unlock (ompi_osc_rdma_module_t *module, int source,
|
||||
ompi_osc_rdma_header_unlock_t *unlock_header)
|
||||
{
|
||||
ompi_osc_rdma_header_unlock_ack_t unlock_ack;
|
||||
int ret;
|
||||
|
||||
OPAL_OUTPUT_VERBOSE((50, ompi_osc_base_framework.framework_output,
|
||||
"ompi_osc_rdma_process_unlock entering (finished %d/%d)...",
|
||||
module->passive_incoming_frag_count[source],
|
||||
module->passive_incoming_frag_signal_count[source]));
|
||||
|
||||
/* we cannot block when processing an incoming request */
|
||||
if (module->passive_incoming_frag_signal_count[source] !=
|
||||
module->passive_incoming_frag_count[source]) {
|
||||
return OMPI_ERR_WOULD_BLOCK;
|
||||
}
|
||||
|
||||
unlock_ack.base.type = OMPI_OSC_RDMA_HDR_TYPE_UNLOCK_ACK;
|
||||
unlock_ack.base.flags = OMPI_OSC_RDMA_HDR_FLAG_VALID;
|
||||
|
||||
ret = ompi_osc_rdma_control_send_unbuffered (module, source, &unlock_ack, sizeof (unlock_ack));
|
||||
if (OPAL_UNLIKELY(OMPI_SUCCESS != ret)) {
|
||||
return ret;
|
||||
}
|
||||
|
||||
module->passive_incoming_frag_signal_count[source] = 0;
|
||||
module->passive_incoming_frag_count[source] = 0;
|
||||
|
||||
OPAL_THREAD_LOCK(&module->lock);
|
||||
|
||||
if (unlock_header->lock_type == MPI_LOCK_EXCLUSIVE || 0 == --module->shared_count) {
|
||||
module->lock_status = 0;
|
||||
|
||||
ompi_osc_activate_next_lock (module);
|
||||
}
|
||||
|
||||
OPAL_THREAD_UNLOCK(&module->lock);
|
||||
|
||||
OPAL_OUTPUT_VERBOSE((50, ompi_osc_base_framework.framework_output,
|
||||
"osc rdma: finished processing unlock fragment"));
|
||||
|
||||
return ret;
|
||||
}
|
||||
|
||||
int ompi_osc_rdma_process_flush (ompi_osc_rdma_module_t *module, int source,
|
||||
ompi_osc_rdma_header_flush_t *flush_header)
|
||||
{
|
||||
ompi_osc_rdma_header_flush_ack_t flush_ack;
|
||||
|
||||
OPAL_OUTPUT_VERBOSE((50, ompi_osc_base_framework.framework_output,
|
||||
"ompi_osc_rdma_process_flush entering (finished %d/%d)...",
|
||||
module->passive_incoming_frag_count[source],
|
||||
module->passive_incoming_frag_signal_count[source]));
|
||||
|
||||
/* we cannot block when processing an incoming request */
|
||||
if (module->passive_incoming_frag_signal_count[source] !=
|
||||
module->passive_incoming_frag_count[source]) {
|
||||
return OMPI_ERR_WOULD_BLOCK;
|
||||
}
|
||||
|
||||
module->passive_incoming_frag_signal_count[source] = 0;
|
||||
module->passive_incoming_frag_count[source] = 0;
|
||||
|
||||
flush_ack.base.type = OMPI_OSC_RDMA_HDR_TYPE_FLUSH_ACK;
|
||||
flush_ack.base.flags = OMPI_OSC_RDMA_HDR_FLAG_VALID;
|
||||
flush_ack.serial_number = flush_header->serial_number;
|
||||
|
||||
return ompi_osc_rdma_control_send_unbuffered (module, source, &flush_ack, sizeof (flush_ack));
|
||||
}
|
@ -1,12 +1,9 @@
|
||||
/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */
|
||||
/*
|
||||
* Copyright (c) 2013 Sandia National Laboratories. All rights reserved.
|
||||
* Copyright (c) 2014 Los Alamos National Security, LLC. All rights
|
||||
* reserved.
|
||||
* $COPYRIGHT$
|
||||
*
|
||||
*
|
||||
* Additional copyrights may follow
|
||||
*
|
||||
*
|
||||
* $HEADER$
|
||||
*
|
||||
* Pending frags are fragments that have been received on the target,
|
||||
@ -17,35 +14,35 @@
|
||||
* message.
|
||||
*/
|
||||
|
||||
#ifndef OSC_PT2PT_PENDING_FRAG_H
|
||||
#define OSC_PT2PT_PENDING_FRAG_H
|
||||
#ifndef OSC_RDMA_PENDING_FRAG_H
|
||||
#define OSC_RDMA_PENDING_FRAG_H
|
||||
|
||||
/** Incoming fragment that has to be queued */
|
||||
struct ompi_osc_pt2pt_pending_frag_t {
|
||||
struct ompi_osc_rdma_pending_frag_t {
|
||||
opal_list_item_t super;
|
||||
|
||||
/* This is a pointer to the top of the fragment (which is always
|
||||
the header). Save as a header to make the casting a bit less
|
||||
onerous during sequence number lookups. */
|
||||
ompi_osc_pt2pt_frag_header_t *header;
|
||||
ompi_osc_rdma_frag_header_t *header;
|
||||
};
|
||||
typedef struct ompi_osc_pt2pt_pending_frag_t ompi_osc_pt2pt_pending_frag_t;
|
||||
OBJ_CLASS_DECLARATION(ompi_osc_pt2pt_pending_frag_t);
|
||||
typedef struct ompi_osc_rdma_pending_frag_t ompi_osc_rdma_pending_frag_t;
|
||||
OBJ_CLASS_DECLARATION(ompi_osc_rdma_pending_frag_t);
|
||||
|
||||
/*
|
||||
* Note: module lock must be held during this operation
|
||||
*/
|
||||
static inline ompi_osc_pt2pt_pending_frag_t*
|
||||
ompi_osc_pt2pt_pending_frag_create(ompi_osc_pt2pt_module_t *module,
|
||||
static inline ompi_osc_rdma_pending_frag_t*
|
||||
ompi_osc_rdma_pending_frag_create(ompi_osc_rdma_module_t *module,
|
||||
void *ptr,
|
||||
size_t size)
|
||||
{
|
||||
size_t total_size = sizeof(ompi_osc_pt2pt_pending_frag_t) + size;
|
||||
ompi_osc_pt2pt_pending_frag_t *ret =
|
||||
(ompi_osc_pt2pt_pending_frag_t*) malloc(total_size);
|
||||
size_t total_size = sizeof(ompi_osc_rdma_pending_frag_t) + size;
|
||||
ompi_osc_rdma_pending_frag_t *ret =
|
||||
(ompi_osc_rdma_pending_frag_t*) malloc(total_size);
|
||||
if (NULL == ret) return NULL;
|
||||
|
||||
OBJ_CONSTRUCT(&ret, ompi_osc_pt2pt_pending_frag_t);
|
||||
OBJ_CONSTRUCT(&ret, ompi_osc_rdma_pending_frag_t);
|
||||
memcpy(ret->header, ptr, size);
|
||||
|
||||
return ret;
|
||||
@ -53,11 +50,11 @@ ompi_osc_pt2pt_pending_frag_create(ompi_osc_pt2pt_module_t *module,
|
||||
|
||||
|
||||
/*
|
||||
* Note: module lock must be held for this operation
|
||||
* Note: module lock must be held for this operation
|
||||
*/
|
||||
static inline int
|
||||
ompi_osc_pt2pt_pending_frag_destroy(ompi_osc_pt2pt_module_t *module,
|
||||
ompi_osc_pt2pt_pending_frag_t* frag)
|
||||
ompi_osc_rdma_pending_frag_destroy(ompi_osc_rdma_module_t *module,
|
||||
ompi_osc_rdma_pending_frag_t* frag)
|
||||
{
|
||||
OBJ_DESTRUCT(&frag);
|
||||
free(frag);
|
@ -1,12 +1,9 @@
|
||||
/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */
|
||||
/*
|
||||
* Copyright (c) 2011-2012 Sandia National Laboratories. All rights reserved.
|
||||
* Copyright (c) 2014 Los Alamos National Security, LLC. All rights
|
||||
* reserved.
|
||||
* $COPYRIGHT$
|
||||
*
|
||||
*
|
||||
* Additional copyrights may follow
|
||||
*
|
||||
*
|
||||
* $HEADER$
|
||||
*/
|
||||
|
||||
@ -17,8 +14,8 @@
|
||||
#include "ompi/mca/osc/base/base.h"
|
||||
#include "ompi/mca/osc/base/osc_base_obj_convert.h"
|
||||
|
||||
#include "osc_pt2pt.h"
|
||||
#include "osc_pt2pt_request.h"
|
||||
#include "osc_rdma.h"
|
||||
#include "osc_rdma_request.h"
|
||||
|
||||
static int
|
||||
request_cancel(struct ompi_request_t *request, int complete)
|
||||
@ -29,14 +26,14 @@ request_cancel(struct ompi_request_t *request, int complete)
|
||||
static int
|
||||
request_free(struct ompi_request_t **ompi_req)
|
||||
{
|
||||
ompi_osc_pt2pt_request_t *request =
|
||||
(ompi_osc_pt2pt_request_t*) *ompi_req;
|
||||
ompi_osc_rdma_request_t *request =
|
||||
(ompi_osc_rdma_request_t*) *ompi_req;
|
||||
|
||||
if (true != request->super.req_complete) {
|
||||
return MPI_ERR_REQUEST;
|
||||
}
|
||||
|
||||
OMPI_OSC_PT2PT_REQUEST_RETURN(request);
|
||||
OMPI_OSC_RDMA_REQUEST_RETURN(request);
|
||||
|
||||
*ompi_req = MPI_REQUEST_NULL;
|
||||
|
||||
@ -45,7 +42,7 @@ request_free(struct ompi_request_t **ompi_req)
|
||||
|
||||
static
|
||||
void
|
||||
request_construct(ompi_osc_pt2pt_request_t *request)
|
||||
request_construct(ompi_osc_rdma_request_t *request)
|
||||
{
|
||||
request->super.req_type = OMPI_REQUEST_WIN;
|
||||
request->super.req_status._cancelled = 0;
|
||||
@ -53,7 +50,7 @@ request_construct(ompi_osc_pt2pt_request_t *request)
|
||||
request->super.req_cancel = request_cancel;
|
||||
}
|
||||
|
||||
OBJ_CLASS_INSTANCE(ompi_osc_pt2pt_request_t,
|
||||
OBJ_CLASS_INSTANCE(ompi_osc_rdma_request_t,
|
||||
ompi_request_t,
|
||||
request_construct,
|
||||
NULL);
|
@ -4,46 +4,46 @@
|
||||
* Copyright (c) 2014 Los Alamos National Security, LLC. All rights
|
||||
* reserved.
|
||||
* $COPYRIGHT$
|
||||
*
|
||||
*
|
||||
* Additional copyrights may follow
|
||||
*
|
||||
*
|
||||
* $HEADER$
|
||||
*/
|
||||
|
||||
#ifndef OMPI_OSC_PT2PT_REQUEST_H
|
||||
#define OMPI_OSC_PT2PT_REQUEST_H
|
||||
#ifndef OMPI_OSC_RDMA_REQUEST_H
|
||||
#define OMPI_OSC_RDMA_REQUEST_H
|
||||
|
||||
#include "osc_pt2pt.h"
|
||||
#include "osc_rdma.h"
|
||||
|
||||
#include "ompi/request/request.h"
|
||||
#include "opal/util/output.h"
|
||||
|
||||
struct ompi_osc_pt2pt_request_t {
|
||||
struct ompi_osc_rdma_request_t {
|
||||
ompi_request_t super;
|
||||
|
||||
int type;
|
||||
void *origin_addr;
|
||||
int origin_count;
|
||||
struct ompi_datatype_t *origin_dt;
|
||||
ompi_osc_pt2pt_module_t* module;
|
||||
int32_t outstanding_requests;
|
||||
ompi_osc_rdma_module_t* module;
|
||||
int outstanding_requests;
|
||||
bool internal;
|
||||
};
|
||||
typedef struct ompi_osc_pt2pt_request_t ompi_osc_pt2pt_request_t;
|
||||
OBJ_CLASS_DECLARATION(ompi_osc_pt2pt_request_t);
|
||||
typedef struct ompi_osc_rdma_request_t ompi_osc_rdma_request_t;
|
||||
OBJ_CLASS_DECLARATION(ompi_osc_rdma_request_t);
|
||||
|
||||
/* REQUEST_ALLOC is only called from "top-level" functions (pt2pt_rput,
|
||||
pt2pt_rget, etc.), so it's ok to spin here... */
|
||||
#define OMPI_OSC_PT2PT_REQUEST_ALLOC(win, req) \
|
||||
/* REQUEST_ALLOC is only called from "top-level" functions (rdma_rput,
|
||||
rdma_rget, etc.), so it's ok to spin here... */
|
||||
#define OMPI_OSC_RDMA_REQUEST_ALLOC(win, req) \
|
||||
do { \
|
||||
ompi_free_list_item_t *item; \
|
||||
do { \
|
||||
OMPI_FREE_LIST_GET_MT(&mca_osc_pt2pt_component.requests, item); \
|
||||
OMPI_FREE_LIST_GET_MT(&mca_osc_rdma_component.requests, item); \
|
||||
if (NULL == item) { \
|
||||
opal_progress(); \
|
||||
} \
|
||||
} while (NULL == item); \
|
||||
req = (ompi_osc_pt2pt_request_t*) item; \
|
||||
req = (ompi_osc_rdma_request_t*) item; \
|
||||
OMPI_REQUEST_INIT(&req->super, false); \
|
||||
req->super.req_mpi_object.win = win; \
|
||||
req->super.req_complete = false; \
|
||||
@ -52,14 +52,14 @@ OBJ_CLASS_DECLARATION(ompi_osc_pt2pt_request_t);
|
||||
req->internal = false; \
|
||||
} while (0)
|
||||
|
||||
#define OMPI_OSC_PT2PT_REQUEST_RETURN(req) \
|
||||
#define OMPI_OSC_RDMA_REQUEST_RETURN(req) \
|
||||
do { \
|
||||
OMPI_REQUEST_FINI(&(req)->super); \
|
||||
OMPI_FREE_LIST_RETURN_MT(&mca_osc_pt2pt_component.requests, \
|
||||
(ompi_free_list_item_t *) (req)); \
|
||||
OMPI_REQUEST_FINI(&(req)->super); \
|
||||
OMPI_FREE_LIST_RETURN_MT(&mca_osc_rdma_component.requests, \
|
||||
(ompi_free_list_item_t *) (req)); \
|
||||
} while (0)
|
||||
|
||||
static inline void ompi_osc_pt2pt_request_complete (ompi_osc_pt2pt_request_t *request, int mpi_error)
|
||||
static inline void ompi_osc_rdma_request_complete (ompi_osc_rdma_request_t *request, int mpi_error)
|
||||
{
|
||||
if (!request->internal) {
|
||||
request->super.req_status.MPI_ERROR = mpi_error;
|
||||
@ -67,8 +67,8 @@ static inline void ompi_osc_pt2pt_request_complete (ompi_osc_pt2pt_request_t *re
|
||||
/* mark the request complete at the mpi level */
|
||||
ompi_request_complete (&request->super, true);
|
||||
} else {
|
||||
OMPI_OSC_PT2PT_REQUEST_RETURN (request);
|
||||
OMPI_OSC_RDMA_REQUEST_RETURN (request);
|
||||
}
|
||||
}
|
||||
|
||||
#endif /* OMPI_OSC_PT2PT_REQUEST_H */
|
||||
#endif /* OMPI_OSC_RDMA_REQUEST_H */
|
@ -513,7 +513,7 @@ int mca_pml_bfo_send_fin( ompi_proc_t* proc,
|
||||
fin->des_cbdata = NULL;
|
||||
|
||||
/* fill in header */
|
||||
hdr = (mca_pml_bfo_fin_hdr_t*)fin->des_segments->seg_addr.pval;
|
||||
hdr = (mca_pml_bfo_fin_hdr_t*)fin->des_local->seg_addr.pval;
|
||||
hdr->hdr_common.hdr_flags = 0;
|
||||
hdr->hdr_common.hdr_type = MCA_PML_BFO_HDR_TYPE_FIN;
|
||||
hdr->hdr_des = hdr_des;
|
||||
|
@ -284,7 +284,7 @@ void mca_pml_bfo_repost_fin(struct mca_btl_base_descriptor_t* des) {
|
||||
|
||||
proc = (ompi_proc_t*) des->des_cbdata;
|
||||
bml_endpoint = (mca_bml_base_endpoint_t*) proc->proc_endpoints[OMPI_PROC_ENDPOINT_TAG_BML];
|
||||
hdr = (mca_pml_bfo_fin_hdr_t*)des->des_segments->seg_addr.pval;
|
||||
hdr = (mca_pml_bfo_fin_hdr_t*)des->des_local->seg_addr.pval;
|
||||
|
||||
opal_output_verbose(20, mca_pml_bfo_output,
|
||||
"REPOST: BFO_HDR_TYPE_FIN: seq=%d,myrank=%d,peer=%d,hdr->hdr_fail=%d,src=%d",
|
||||
@ -376,7 +376,7 @@ void mca_pml_bfo_recv_frag_callback_rndvrestartnotify(mca_btl_base_module_t* btl
|
||||
mca_btl_base_tag_t tag,
|
||||
mca_btl_base_descriptor_t* des,
|
||||
void* cbdata ) {
|
||||
mca_btl_base_segment_t* segments = des->des_segments;
|
||||
mca_btl_base_segment_t* segments = des->des_local;
|
||||
mca_pml_bfo_hdr_t* hdr = (mca_pml_bfo_hdr_t*)segments->seg_addr.pval;
|
||||
mca_pml_bfo_recv_request_t* recvreq;
|
||||
ompi_proc_t* ompi_proc;
|
||||
@ -461,7 +461,7 @@ void mca_pml_bfo_recv_frag_callback_rndvrestartack(mca_btl_base_module_t* btl,
|
||||
mca_btl_base_tag_t tag,
|
||||
mca_btl_base_descriptor_t* des,
|
||||
void* cbdata ) {
|
||||
mca_btl_base_segment_t* segments = des->des_segments;
|
||||
mca_btl_base_segment_t* segments = des->des_local;
|
||||
mca_pml_bfo_hdr_t* hdr = (mca_pml_bfo_hdr_t*)segments->seg_addr.pval;
|
||||
mca_pml_bfo_send_request_t* sendreq;
|
||||
|
||||
@ -522,7 +522,7 @@ void mca_pml_bfo_recv_frag_callback_recverrnotify(mca_btl_base_module_t* btl,
|
||||
mca_btl_base_tag_t tag,
|
||||
mca_btl_base_descriptor_t* des,
|
||||
void* cbdata ) {
|
||||
mca_btl_base_segment_t* segments = des->des_segments;
|
||||
mca_btl_base_segment_t* segments = des->des_local;
|
||||
mca_pml_bfo_hdr_t* hdr = (mca_pml_bfo_hdr_t*)segments->seg_addr.pval;
|
||||
mca_pml_bfo_send_request_t* sendreq;
|
||||
|
||||
@ -607,7 +607,7 @@ void mca_pml_bfo_recv_frag_callback_rndvrestartnack(mca_btl_base_module_t* btl,
|
||||
mca_btl_base_descriptor_t* des,
|
||||
void* cbdata ) {
|
||||
|
||||
mca_btl_base_segment_t* segments = des->des_segments;
|
||||
mca_btl_base_segment_t* segments = des->des_local;
|
||||
mca_pml_bfo_hdr_t* hdr = (mca_pml_bfo_hdr_t*)segments->seg_addr.pval;
|
||||
mca_pml_bfo_send_request_t* sendreq;
|
||||
|
||||
@ -701,7 +701,7 @@ void mca_pml_bfo_send_request_rndvrestartnotify(mca_pml_bfo_send_request_t* send
|
||||
}
|
||||
|
||||
/* fill out header */
|
||||
restart = (mca_pml_bfo_restart_hdr_t*)des->des_segments->seg_addr.pval;
|
||||
restart = (mca_pml_bfo_restart_hdr_t*)des->des_local->seg_addr.pval;
|
||||
restart->hdr_match.hdr_common.hdr_flags = 0;
|
||||
restart->hdr_match.hdr_common.hdr_type = MCA_PML_BFO_HDR_TYPE_RNDVRESTARTNOTIFY;
|
||||
restart->hdr_match.hdr_ctx = sendreq->req_send.req_base.req_comm->c_contextid;
|
||||
@ -915,7 +915,7 @@ void mca_pml_bfo_repost_match_fragment(struct mca_btl_base_descriptor_t* des)
|
||||
mca_btl_base_segment_t* oldseg;
|
||||
mca_btl_base_segment_t* newseg;
|
||||
|
||||
oldseg = des->des_segments;
|
||||
oldseg = des->des_local;
|
||||
/* The alloc routine must be called with the MCA_BTL_NO_ORDER
|
||||
* flag so that the allocation routine works. The allocation
|
||||
* will fill in the order flag in the descriptor. */
|
||||
@ -928,7 +928,7 @@ void mca_pml_bfo_repost_match_fragment(struct mca_btl_base_descriptor_t* des)
|
||||
__FILE__, __LINE__);
|
||||
ompi_rte_abort(-1, NULL);
|
||||
}
|
||||
newseg = newdes->des_segments;
|
||||
newseg = newdes->des_local;
|
||||
/* Copy over all the data that is actually sent over the wire */
|
||||
memcpy(newseg->seg_addr.pval, oldseg->seg_addr.pval, oldseg->seg_len);
|
||||
newseg->seg_len = oldseg->seg_len;
|
||||
@ -972,7 +972,7 @@ mca_pml_bfo_rndvrestartnotify_completion(mca_btl_base_module_t* btl,
|
||||
mca_pml_bfo_restart_hdr_t* restart;
|
||||
mca_pml_bfo_send_request_t* sendreq;
|
||||
|
||||
restart = (mca_pml_bfo_restart_hdr_t*)des->des_segments->seg_addr.pval;
|
||||
restart = (mca_pml_bfo_restart_hdr_t*)des->des_local->seg_addr.pval;
|
||||
sendreq = (mca_pml_bfo_send_request_t*) restart->hdr_src_req.pval;
|
||||
|
||||
/* Need to resend this message in the case that it fails */
|
||||
@ -1061,7 +1061,7 @@ void mca_pml_bfo_recv_request_recverrnotify(mca_pml_bfo_recv_request_t* recvreq,
|
||||
}
|
||||
|
||||
/* fill out header */
|
||||
restart = (mca_pml_bfo_restart_hdr_t*)des->des_segments->seg_addr.pval;
|
||||
restart = (mca_pml_bfo_restart_hdr_t*)des->des_local->seg_addr.pval;
|
||||
restart->hdr_match.hdr_common.hdr_flags = 0;
|
||||
restart->hdr_match.hdr_common.hdr_type = MCA_PML_BFO_HDR_TYPE_RECVERRNOTIFY;
|
||||
restart->hdr_match.hdr_ctx = recvreq->req_recv.req_base.req_comm->c_contextid;
|
||||
@ -1145,7 +1145,7 @@ void mca_pml_bfo_recv_request_rndvrestartack(mca_pml_bfo_recv_request_t* recvreq
|
||||
}
|
||||
|
||||
/* fill out header */
|
||||
restart = (mca_pml_bfo_restart_hdr_t*)des->des_segments->seg_addr.pval;
|
||||
restart = (mca_pml_bfo_restart_hdr_t*)des->des_local->seg_addr.pval;
|
||||
restart->hdr_match.hdr_common.hdr_flags = 0;
|
||||
restart->hdr_match.hdr_common.hdr_type = MCA_PML_BFO_HDR_TYPE_RNDVRESTARTACK;
|
||||
restart->hdr_match.hdr_ctx = recvreq->req_recv.req_base.req_comm->c_contextid;
|
||||
@ -1208,7 +1208,7 @@ void mca_pml_bfo_recv_request_rndvrestartnack(mca_btl_base_descriptor_t* olddes,
|
||||
ompi_proc = olddes->des_cbdata;
|
||||
}
|
||||
|
||||
segments = olddes->des_segments;
|
||||
segments = olddes->des_local;
|
||||
hdr = (mca_pml_bfo_restart_hdr_t*)segments->seg_addr.pval;
|
||||
|
||||
bml_endpoint = ompi_proc->proc_endpoints[OMPI_PROC_ENDPOINT_TAG_BML];
|
||||
@ -1226,7 +1226,7 @@ void mca_pml_bfo_recv_request_rndvrestartnack(mca_btl_base_descriptor_t* olddes,
|
||||
}
|
||||
|
||||
/* fill out header */
|
||||
nack = (mca_pml_bfo_restart_hdr_t*)des->des_segments->seg_addr.pval;
|
||||
nack = (mca_pml_bfo_restart_hdr_t*)des->des_local->seg_addr.pval;
|
||||
nack->hdr_match.hdr_common.hdr_flags = 0;
|
||||
nack->hdr_match.hdr_common.hdr_type = MCA_PML_BFO_HDR_TYPE_RNDVRESTARTNACK;
|
||||
nack->hdr_match.hdr_ctx = hdr->hdr_match.hdr_ctx;
|
||||
@ -1317,13 +1317,13 @@ void mca_pml_bfo_recv_restart_completion( mca_btl_base_module_t* btl,
|
||||
int status )
|
||||
{
|
||||
if(OPAL_UNLIKELY(OMPI_SUCCESS != status)) {
|
||||
mca_pml_bfo_common_hdr_t* common = des->des_segments->seg_addr.pval;
|
||||
mca_pml_bfo_common_hdr_t* common = des->des_local->seg_addr.pval;
|
||||
mca_pml_bfo_restart_hdr_t* restart; /* RESTART header */
|
||||
mca_pml_bfo_recv_request_t* recvreq;
|
||||
|
||||
switch (common->hdr_type) {
|
||||
case MCA_PML_BFO_HDR_TYPE_RNDVRESTARTACK:
|
||||
restart = (mca_pml_bfo_restart_hdr_t*)des->des_segments->seg_addr.pval;
|
||||
restart = (mca_pml_bfo_restart_hdr_t*)des->des_local->seg_addr.pval;
|
||||
recvreq = (mca_pml_bfo_recv_request_t*) restart->hdr_dst_req.pval;
|
||||
opal_output_verbose(30, mca_pml_bfo_output,
|
||||
"RNDVRESTARTACK: completion failed: try again "
|
||||
@ -1351,7 +1351,7 @@ void mca_pml_bfo_recv_restart_completion( mca_btl_base_module_t* btl,
|
||||
mca_pml_bfo_recv_request_rndvrestartnack(des, NULL, true);
|
||||
break;
|
||||
case MCA_PML_BFO_HDR_TYPE_RECVERRNOTIFY:
|
||||
restart = (mca_pml_bfo_restart_hdr_t*)des->des_segments->seg_addr.pval;
|
||||
restart = (mca_pml_bfo_restart_hdr_t*)des->des_local->seg_addr.pval;
|
||||
recvreq = (mca_pml_bfo_recv_request_t*) restart->hdr_dst_req.pval;
|
||||
/* With just two BTLs, this should never happen as we are
|
||||
* typically sending the RECVERRNOTIFY message on the
|
||||
@ -1759,7 +1759,7 @@ void mca_pml_bfo_check_recv_ctl_completion_status(mca_btl_base_module_t* btl,
|
||||
struct mca_btl_base_descriptor_t* des,
|
||||
int status)
|
||||
{
|
||||
mca_pml_bfo_common_hdr_t * common = des->des_segments->seg_addr.pval;
|
||||
mca_pml_bfo_common_hdr_t * common = des->des_local->seg_addr.pval;
|
||||
mca_pml_bfo_rdma_hdr_t* hdr; /* PUT header */
|
||||
struct mca_btl_base_descriptor_t* rdma_des;
|
||||
mca_pml_bfo_recv_request_t* recvreq;
|
||||
@ -1789,7 +1789,7 @@ void mca_pml_bfo_check_recv_ctl_completion_status(mca_btl_base_module_t* btl,
|
||||
break;
|
||||
|
||||
case MCA_PML_BFO_HDR_TYPE_PUT:
|
||||
hdr = (mca_pml_bfo_rdma_hdr_t*)des->des_segments->seg_addr.pval;
|
||||
hdr = (mca_pml_bfo_rdma_hdr_t*)des->des_local->seg_addr.pval;
|
||||
rdma_des = hdr->hdr_des.pval;
|
||||
recvreq = des->des_cbdata;
|
||||
if ((NULL != rdma_des->des_cbdata) && (recvreq == rdma_des->des_cbdata)) {
|
||||
@ -1947,14 +1947,14 @@ void mca_pml_bfo_update_eager_bml_btl_recv_ctl(mca_bml_base_btl_t** bml_btl,
|
||||
struct mca_btl_base_descriptor_t* des)
|
||||
{
|
||||
if ((*bml_btl)->btl != btl) {
|
||||
mca_pml_bfo_common_hdr_t * common = des->des_segments->seg_addr.pval;
|
||||
mca_pml_bfo_common_hdr_t * common = des->des_local->seg_addr.pval;
|
||||
mca_pml_bfo_ack_hdr_t* ack; /* ACK header */
|
||||
mca_pml_bfo_recv_request_t* recvreq = NULL;
|
||||
char *type = NULL;
|
||||
|
||||
switch (common->hdr_type) {
|
||||
case MCA_PML_BFO_HDR_TYPE_ACK:
|
||||
ack = (mca_pml_bfo_ack_hdr_t*)des->des_segments->seg_addr.pval;
|
||||
ack = (mca_pml_bfo_ack_hdr_t*)des->des_local->seg_addr.pval;
|
||||
recvreq = (mca_pml_bfo_recv_request_t*) ack->hdr_dst_req.pval;
|
||||
type = "ACK";
|
||||
break;
|
||||
@ -2106,11 +2106,11 @@ void mca_pml_bfo_find_recvreq_rdma_bml_btl(mca_bml_base_btl_t** bml_btl,
|
||||
bool mca_pml_bfo_rndv_completion_status_error(struct mca_btl_base_descriptor_t* des,
|
||||
mca_pml_bfo_send_request_t* sendreq)
|
||||
{
|
||||
assert(((mca_pml_bfo_hdr_t*)((des)->des_segments->seg_addr.pval))->hdr_match.hdr_ctx ==
|
||||
assert(((mca_pml_bfo_hdr_t*)((des)->des_local->seg_addr.pval))->hdr_match.hdr_ctx ==
|
||||
(sendreq)->req_send.req_base.req_comm->c_contextid);
|
||||
assert(((mca_pml_bfo_hdr_t*)((des)->des_segments->seg_addr.pval))->hdr_match.hdr_src ==
|
||||
assert(((mca_pml_bfo_hdr_t*)((des)->des_local->seg_addr.pval))->hdr_match.hdr_src ==
|
||||
(sendreq)->req_send.req_base.req_comm->c_my_rank);
|
||||
assert(((mca_pml_bfo_hdr_t*)((des)->des_segments->seg_addr.pval))->hdr_match.hdr_seq ==
|
||||
assert(((mca_pml_bfo_hdr_t*)((des)->des_local->seg_addr.pval))->hdr_match.hdr_seq ==
|
||||
(uint16_t)(sendreq)->req_send.req_base.req_sequence);
|
||||
if ((!(sendreq)->req_error) && (NULL == (sendreq)->req_recv.pval)) {
|
||||
(sendreq)->req_events--;
|
||||
@ -2157,7 +2157,7 @@ void mca_pml_bfo_completion_sendreq_has_error(mca_pml_bfo_send_request_t* sendre
|
||||
void mca_pml_bfo_send_ctl_completion_status_error(struct mca_btl_base_descriptor_t* des)
|
||||
{
|
||||
mca_pml_bfo_send_request_t* sendreq = (mca_pml_bfo_send_request_t*)des->des_cbdata;
|
||||
mca_pml_bfo_hdr_t* hdr = des->des_segments->seg_addr.pval;
|
||||
mca_pml_bfo_hdr_t* hdr = des->des_local->seg_addr.pval;
|
||||
switch (hdr->hdr_common.hdr_type) {
|
||||
case MCA_PML_BFO_HDR_TYPE_RGET:
|
||||
if ((hdr->hdr_match.hdr_ctx != sendreq->req_send.req_base.req_comm->c_contextid) ||
|
||||
|
@ -104,13 +104,13 @@ void mca_pml_bfo_recv_frag_callback_match(mca_btl_base_module_t* btl,
|
||||
mca_btl_base_descriptor_t* des,
|
||||
void* cbdata )
|
||||
{
|
||||
mca_btl_base_segment_t* segments = des->des_segments;
|
||||
mca_btl_base_segment_t* segments = des->des_local;
|
||||
mca_pml_bfo_match_hdr_t* hdr = (mca_pml_bfo_match_hdr_t*)segments->seg_addr.pval;
|
||||
ompi_communicator_t *comm_ptr;
|
||||
mca_pml_bfo_recv_request_t *match = NULL;
|
||||
mca_pml_bfo_comm_t *comm;
|
||||
mca_pml_bfo_comm_proc_t *proc;
|
||||
size_t num_segments = des->des_segment_count;
|
||||
size_t num_segments = des->des_local_count;
|
||||
size_t bytes_received = 0;
|
||||
|
||||
assert(num_segments <= MCA_BTL_DES_MAX_SEGMENTS);
|
||||
@ -257,7 +257,7 @@ void mca_pml_bfo_recv_frag_callback_rndv(mca_btl_base_module_t* btl,
|
||||
mca_btl_base_descriptor_t* des,
|
||||
void* cbdata )
|
||||
{
|
||||
mca_btl_base_segment_t* segments = des->des_segments;
|
||||
mca_btl_base_segment_t* segments = des->des_local;
|
||||
mca_pml_bfo_hdr_t* hdr = (mca_pml_bfo_hdr_t*)segments->seg_addr.pval;
|
||||
|
||||
if( OPAL_UNLIKELY(segments->seg_len < sizeof(mca_pml_bfo_common_hdr_t)) ) {
|
||||
@ -265,7 +265,7 @@ void mca_pml_bfo_recv_frag_callback_rndv(mca_btl_base_module_t* btl,
|
||||
}
|
||||
bfo_hdr_ntoh(hdr, MCA_PML_BFO_HDR_TYPE_RNDV);
|
||||
mca_pml_bfo_recv_frag_match(btl, &hdr->hdr_match, segments,
|
||||
des->des_segment_count, MCA_PML_BFO_HDR_TYPE_RNDV);
|
||||
des->des_local_count, MCA_PML_BFO_HDR_TYPE_RNDV);
|
||||
return;
|
||||
}
|
||||
|
||||
@ -274,7 +274,7 @@ void mca_pml_bfo_recv_frag_callback_rget(mca_btl_base_module_t* btl,
|
||||
mca_btl_base_descriptor_t* des,
|
||||
void* cbdata )
|
||||
{
|
||||
mca_btl_base_segment_t* segments = des->des_segments;
|
||||
mca_btl_base_segment_t* segments = des->des_local;
|
||||
mca_pml_bfo_hdr_t* hdr = (mca_pml_bfo_hdr_t*)segments->seg_addr.pval;
|
||||
|
||||
if( OPAL_UNLIKELY(segments->seg_len < sizeof(mca_pml_bfo_common_hdr_t)) ) {
|
||||
@ -282,7 +282,7 @@ void mca_pml_bfo_recv_frag_callback_rget(mca_btl_base_module_t* btl,
|
||||
}
|
||||
bfo_hdr_ntoh(hdr, MCA_PML_BFO_HDR_TYPE_RGET);
|
||||
mca_pml_bfo_recv_frag_match(btl, &hdr->hdr_match, segments,
|
||||
des->des_segment_count, MCA_PML_BFO_HDR_TYPE_RGET);
|
||||
des->des_local_count, MCA_PML_BFO_HDR_TYPE_RGET);
|
||||
return;
|
||||
}
|
||||
|
||||
@ -293,7 +293,7 @@ void mca_pml_bfo_recv_frag_callback_ack(mca_btl_base_module_t* btl,
|
||||
mca_btl_base_descriptor_t* des,
|
||||
void* cbdata )
|
||||
{
|
||||
mca_btl_base_segment_t* segments = des->des_segments;
|
||||
mca_btl_base_segment_t* segments = des->des_local;
|
||||
mca_pml_bfo_hdr_t* hdr = (mca_pml_bfo_hdr_t*)segments->seg_addr.pval;
|
||||
mca_pml_bfo_send_request_t* sendreq;
|
||||
|
||||
@ -341,7 +341,7 @@ void mca_pml_bfo_recv_frag_callback_frag(mca_btl_base_module_t* btl,
|
||||
mca_btl_base_tag_t tag,
|
||||
mca_btl_base_descriptor_t* des,
|
||||
void* cbdata ) {
|
||||
mca_btl_base_segment_t* segments = des->des_segments;
|
||||
mca_btl_base_segment_t* segments = des->des_local;
|
||||
mca_pml_bfo_hdr_t* hdr = (mca_pml_bfo_hdr_t*)segments->seg_addr.pval;
|
||||
mca_pml_bfo_recv_request_t* recvreq;
|
||||
|
||||
@ -353,7 +353,7 @@ void mca_pml_bfo_recv_frag_callback_frag(mca_btl_base_module_t* btl,
|
||||
#if PML_BFO
|
||||
MCA_PML_BFO_ERROR_CHECK_ON_FRAG_CALLBACK(recvreq);
|
||||
#endif /* PML_BFO */
|
||||
mca_pml_bfo_recv_request_progress_frag(recvreq,btl,segments,des->des_segment_count);
|
||||
mca_pml_bfo_recv_request_progress_frag(recvreq,btl,segments,des->des_local_count);
|
||||
|
||||
return;
|
||||
}
|
||||
@ -363,7 +363,7 @@ void mca_pml_bfo_recv_frag_callback_put(mca_btl_base_module_t* btl,
|
||||
mca_btl_base_tag_t tag,
|
||||
mca_btl_base_descriptor_t* des,
|
||||
void* cbdata ) {
|
||||
mca_btl_base_segment_t* segments = des->des_segments;
|
||||
mca_btl_base_segment_t* segments = des->des_local;
|
||||
mca_pml_bfo_hdr_t* hdr = (mca_pml_bfo_hdr_t*)segments->seg_addr.pval;
|
||||
mca_pml_bfo_send_request_t* sendreq;
|
||||
|
||||
@ -386,7 +386,7 @@ void mca_pml_bfo_recv_frag_callback_fin(mca_btl_base_module_t* btl,
|
||||
mca_btl_base_tag_t tag,
|
||||
mca_btl_base_descriptor_t* des,
|
||||
void* cbdata ) {
|
||||
mca_btl_base_segment_t* segments = des->des_segments;
|
||||
mca_btl_base_segment_t* segments = des->des_local;
|
||||
mca_pml_bfo_hdr_t* hdr = (mca_pml_bfo_hdr_t*)segments->seg_addr.pval;
|
||||
mca_btl_base_descriptor_t* rdma;
|
||||
|
||||
|
@ -246,7 +246,7 @@ int mca_pml_bfo_recv_request_ack_send_btl(
|
||||
}
|
||||
|
||||
/* fill out header */
|
||||
ack = (mca_pml_bfo_ack_hdr_t*)des->des_segments->seg_addr.pval;
|
||||
ack = (mca_pml_bfo_ack_hdr_t*)des->des_local->seg_addr.pval;
|
||||
ack->hdr_common.hdr_type = MCA_PML_BFO_HDR_TYPE_ACK;
|
||||
ack->hdr_common.hdr_flags = nordma ? MCA_PML_BFO_HDR_FLAGS_NORDMA : 0;
|
||||
ack->hdr_src_req.lval = hdr_src_req;
|
||||
@ -851,7 +851,7 @@ int mca_pml_bfo_recv_request_schedule_once( mca_pml_bfo_recv_request_t* recvreq,
|
||||
dst->des_cbfunc = mca_pml_bfo_put_completion;
|
||||
dst->des_cbdata = recvreq;
|
||||
|
||||
seg_size = btl->btl_seg_size * dst->des_segment_count;
|
||||
seg_size = btl->btl_seg_size * dst->des_local_count;
|
||||
|
||||
/* prepare a descriptor for rdma control message */
|
||||
mca_bml_base_alloc(bml_btl, &ctl, MCA_BTL_NO_ORDER, sizeof(mca_pml_bfo_rdma_hdr_t) + seg_size,
|
||||
@ -867,7 +867,7 @@ int mca_pml_bfo_recv_request_schedule_once( mca_pml_bfo_recv_request_t* recvreq,
|
||||
#endif /* PML_BFO */
|
||||
|
||||
/* fill in rdma header */
|
||||
hdr = (mca_pml_bfo_rdma_hdr_t*)ctl->des_segments->seg_addr.pval;
|
||||
hdr = (mca_pml_bfo_rdma_hdr_t*)ctl->des_local->seg_addr.pval;
|
||||
hdr->hdr_common.hdr_type = MCA_PML_BFO_HDR_TYPE_PUT;
|
||||
hdr->hdr_common.hdr_flags =
|
||||
(!recvreq->req_ack_sent) ? MCA_PML_BFO_HDR_TYPE_ACK : 0;
|
||||
@ -877,10 +877,10 @@ int mca_pml_bfo_recv_request_schedule_once( mca_pml_bfo_recv_request_t* recvreq,
|
||||
#endif /* PML_BFO */
|
||||
hdr->hdr_des.pval = dst;
|
||||
hdr->hdr_rdma_offset = recvreq->req_rdma_offset;
|
||||
hdr->hdr_seg_cnt = dst->des_segment_count;
|
||||
hdr->hdr_seg_cnt = dst->des_local_count;
|
||||
|
||||
/* copy segments */
|
||||
memmove (hdr + 1, dst->des_segments, seg_size);
|
||||
memmove (hdr + 1, dst->des_local, seg_size);
|
||||
|
||||
if(!recvreq->req_ack_sent)
|
||||
recvreq->req_ack_sent = true;
|
||||
|
@ -257,8 +257,8 @@ mca_pml_bfo_rndv_completion( mca_btl_base_module_t* btl,
|
||||
* have to be atomic.
|
||||
*/
|
||||
req_bytes_delivered = mca_pml_bfo_compute_segment_length (btl->btl_seg_size,
|
||||
(void *) des->des_segments,
|
||||
des->des_segment_count,
|
||||
(void *) des->des_local,
|
||||
des->des_local_count,
|
||||
sizeof(mca_pml_bfo_rendezvous_hdr_t));
|
||||
|
||||
#if PML_BFO
|
||||
@ -287,8 +287,8 @@ mca_pml_bfo_rget_completion( mca_btl_base_module_t* btl,
|
||||
|
||||
/* count bytes of user data actually delivered and check for request completion */
|
||||
req_bytes_delivered = mca_pml_bfo_compute_segment_length (btl->btl_seg_size,
|
||||
(void *) des->des_segments,
|
||||
des->des_segment_count, 0);
|
||||
(void *) des->des_local,
|
||||
des->des_local_count, 0);
|
||||
OPAL_THREAD_ADD_SIZE_T(&sendreq->req_bytes_delivered, req_bytes_delivered);
|
||||
|
||||
send_request_pml_complete_check(sendreq);
|
||||
@ -357,8 +357,8 @@ mca_pml_bfo_frag_completion( mca_btl_base_module_t* btl,
|
||||
|
||||
/* count bytes of user data actually delivered */
|
||||
req_bytes_delivered = mca_pml_bfo_compute_segment_length (btl->btl_seg_size,
|
||||
(void *) des->des_segments,
|
||||
des->des_segment_count,
|
||||
(void *) des->des_local,
|
||||
des->des_local_count,
|
||||
sizeof(mca_pml_bfo_frag_hdr_t));
|
||||
|
||||
OPAL_THREAD_ADD_SIZE_T(&sendreq->req_pipeline_depth, -1);
|
||||
@ -409,7 +409,7 @@ int mca_pml_bfo_send_request_start_buffered(
|
||||
if( OPAL_UNLIKELY(NULL == des) ) {
|
||||
return OMPI_ERR_OUT_OF_RESOURCE;
|
||||
}
|
||||
segment = des->des_segments;
|
||||
segment = des->des_local;
|
||||
|
||||
/* pack the data into the BTL supplied buffer */
|
||||
iov.iov_base = (IOVBASE_TYPE*)((unsigned char*)segment->seg_addr.pval +
|
||||
@ -562,7 +562,7 @@ int mca_pml_bfo_send_request_start_copy( mca_pml_bfo_send_request_t* sendreq,
|
||||
return OMPI_ERR_OUT_OF_RESOURCE;
|
||||
}
|
||||
|
||||
segment = des->des_segments;
|
||||
segment = des->des_local;
|
||||
|
||||
if(size > 0) {
|
||||
/* pack the data into the supplied buffer */
|
||||
@ -657,7 +657,7 @@ int mca_pml_bfo_send_request_start_prepare( mca_pml_bfo_send_request_t* sendreq,
|
||||
if( OPAL_UNLIKELY(NULL == des) ) {
|
||||
return OMPI_ERR_OUT_OF_RESOURCE;
|
||||
}
|
||||
segment = des->des_segments;
|
||||
segment = des->des_local;
|
||||
|
||||
/* build match header */
|
||||
hdr = (mca_pml_bfo_hdr_t*)segment->seg_addr.pval;
|
||||
@ -747,7 +747,7 @@ int mca_pml_bfo_send_request_start_rdma( mca_pml_bfo_send_request_t* sendreq,
|
||||
src->des_cbfunc = mca_pml_bfo_rget_completion;
|
||||
src->des_cbdata = sendreq;
|
||||
|
||||
seg_size = bml_btl->btl->btl_seg_size * src->des_segment_count;
|
||||
seg_size = bml_btl->btl->btl_seg_size * src->des_local_count;
|
||||
|
||||
/* allocate space for get hdr + segment list */
|
||||
mca_bml_base_alloc(bml_btl, &des, MCA_BTL_NO_ORDER,
|
||||
@ -759,7 +759,7 @@ int mca_pml_bfo_send_request_start_rdma( mca_pml_bfo_send_request_t* sendreq,
|
||||
mca_bml_base_free(bml_btl, src);
|
||||
return OMPI_ERR_OUT_OF_RESOURCE;
|
||||
}
|
||||
segment = des->des_segments;
|
||||
segment = des->des_local;
|
||||
|
||||
/* build match header */
|
||||
hdr = (mca_pml_bfo_hdr_t*)segment->seg_addr.pval;
|
||||
@ -775,13 +775,13 @@ int mca_pml_bfo_send_request_start_rdma( mca_pml_bfo_send_request_t* sendreq,
|
||||
MCA_PML_BFO_CHECK_FOR_RNDV_RESTART(hdr, sendreq, "RGET");
|
||||
#endif /* PML_BFO */
|
||||
hdr->hdr_rget.hdr_des.pval = src;
|
||||
hdr->hdr_rget.hdr_seg_cnt = src->des_segment_count;
|
||||
hdr->hdr_rget.hdr_seg_cnt = src->des_local_count;
|
||||
|
||||
bfo_hdr_hton(hdr, MCA_PML_BFO_HDR_TYPE_RGET,
|
||||
sendreq->req_send.req_base.req_proc);
|
||||
|
||||
/* copy segment data */
|
||||
memmove (&hdr->hdr_rget + 1, src->des_segments, seg_size);
|
||||
memmove (&hdr->hdr_rget + 1, src->des_local, seg_size);
|
||||
|
||||
des->des_cbfunc = mca_pml_bfo_send_ctl_completion;
|
||||
|
||||
@ -808,7 +808,7 @@ int mca_pml_bfo_send_request_start_rdma( mca_pml_bfo_send_request_t* sendreq,
|
||||
if( OPAL_UNLIKELY(NULL == des)) {
|
||||
return OMPI_ERR_OUT_OF_RESOURCE;
|
||||
}
|
||||
segment = des->des_segments;
|
||||
segment = des->des_local;
|
||||
|
||||
/* build hdr */
|
||||
hdr = (mca_pml_bfo_hdr_t*)segment->seg_addr.pval;
|
||||
@ -912,7 +912,7 @@ int mca_pml_bfo_send_request_start_rndv( mca_pml_bfo_send_request_t* sendreq,
|
||||
if( OPAL_UNLIKELY(NULL == des) ) {
|
||||
return OMPI_ERR_OUT_OF_RESOURCE;
|
||||
}
|
||||
segment = des->des_segments;
|
||||
segment = des->des_local;
|
||||
|
||||
/* build hdr */
|
||||
hdr = (mca_pml_bfo_hdr_t*)segment->seg_addr.pval;
|
||||
@ -1145,7 +1145,7 @@ cannot_pack:
|
||||
des->des_cbdata = sendreq;
|
||||
|
||||
/* setup header */
|
||||
hdr = (mca_pml_bfo_frag_hdr_t*)des->des_segments->seg_addr.pval;
|
||||
hdr = (mca_pml_bfo_frag_hdr_t*)des->des_local->seg_addr.pval;
|
||||
hdr->hdr_common.hdr_flags = 0;
|
||||
hdr->hdr_common.hdr_type = MCA_PML_BFO_HDR_TYPE_FRAG;
|
||||
hdr->hdr_frag_offset = range->range_send_offset;
|
||||
|
@ -14,7 +14,7 @@
|
||||
* Copyright (c) 2006-2008 University of Houston. All rights reserved.
|
||||
* Copyright (c) 2009-2010 Oracle and/or its affiliates. All rights reserved
|
||||
* Copyright (c) 2011 Sandia National Laboratories. All rights reserved.
|
||||
* Copyright (c) 2011-2014 Los Alamos National Security, LLC. All rights
|
||||
* Copyright (c) 2011-2012 Los Alamos National Security, LLC. All rights
|
||||
* reserved.
|
||||
* Copyright (c) 2012 Cisco Systems, Inc. All rights reserved.
|
||||
* $COPYRIGHT$
|
||||
@ -500,17 +500,17 @@ static void mca_pml_ob1_dump_hdr(mca_pml_ob1_hdr_t* hdr)
|
||||
case MCA_PML_OB1_HDR_TYPE_RGET:
|
||||
type = "RGET";
|
||||
snprintf( header, 128, "ctx %5d src %d tag %d seq %d msg_length %" PRIu64
|
||||
"frag %" PRIu64 " src_ptr %" PRIu64,
|
||||
"seg_cnt %d hdr_des %" PRIu64,
|
||||
hdr->hdr_rndv.hdr_match.hdr_ctx, hdr->hdr_rndv.hdr_match.hdr_src,
|
||||
hdr->hdr_rndv.hdr_match.hdr_tag, hdr->hdr_rndv.hdr_match.hdr_seq,
|
||||
hdr->hdr_rndv.hdr_msg_length, hdr->hdr_rget.hdr_frag.lval,
|
||||
hdr->hdr_rget.hdr_src_ptr);
|
||||
hdr->hdr_rndv.hdr_msg_length,
|
||||
hdr->hdr_rget.hdr_seg_cnt, hdr->hdr_rget.hdr_des.lval);
|
||||
break;
|
||||
case MCA_PML_OB1_HDR_TYPE_ACK:
|
||||
type = "ACK";
|
||||
snprintf( header, 128, "src_req %p dst_req %p offset %" PRIu64 " size %" PRIu64,
|
||||
snprintf( header, 128, "src_req %p dst_req %p offset %" PRIu64,
|
||||
hdr->hdr_ack.hdr_src_req.pval, hdr->hdr_ack.hdr_dst_req.pval,
|
||||
hdr->hdr_ack.hdr_send_offset, hdr->hdr_ack.hdr_send_size);
|
||||
hdr->hdr_ack.hdr_send_offset);
|
||||
break;
|
||||
case MCA_PML_OB1_HDR_TYPE_FRAG:
|
||||
type = "FRAG";
|
||||
@ -520,11 +520,10 @@ static void mca_pml_ob1_dump_hdr(mca_pml_ob1_hdr_t* hdr)
|
||||
break;
|
||||
case MCA_PML_OB1_HDR_TYPE_PUT:
|
||||
type = "PUT";
|
||||
snprintf( header, 128, "dst_req %p src_frag %p recv_req %p offset %" PRIu64
|
||||
" dst_ptr %" PRIu64 " dst_size %" PRIu64,
|
||||
hdr->hdr_rdma.hdr_req.pval, hdr->hdr_rdma.hdr_frag.pval,
|
||||
snprintf( header, 128, "seg_cnt %d dst_req %p src_des %p recv_req %p offset %" PRIu64 " [%p %" PRIu64 "]",
|
||||
hdr->hdr_rdma.hdr_seg_cnt, hdr->hdr_rdma.hdr_req.pval, hdr->hdr_rdma.hdr_des.pval,
|
||||
hdr->hdr_rdma.hdr_recv_req.pval, hdr->hdr_rdma.hdr_rdma_offset,
|
||||
hdr->hdr_rdma.hdr_dst_ptr, hdr->hdr_rdma.hdr_dst_size);
|
||||
hdr->hdr_rdma.hdr_segs[0].seg_addr.pval, hdr->hdr_rdma.hdr_segs[0].seg_len);
|
||||
break;
|
||||
case MCA_PML_OB1_HDR_TYPE_FIN:
|
||||
type = "FIN";
|
||||
@ -639,32 +638,37 @@ static void mca_pml_ob1_fin_completion( mca_btl_base_module_t* btl,
|
||||
*/
|
||||
int mca_pml_ob1_send_fin( ompi_proc_t* proc,
|
||||
mca_bml_base_btl_t* bml_btl,
|
||||
opal_ptr_t hdr_frag,
|
||||
uint64_t rdma_size,
|
||||
opal_ptr_t hdr_des,
|
||||
uint8_t order,
|
||||
int status )
|
||||
uint32_t status )
|
||||
{
|
||||
mca_btl_base_descriptor_t* fin;
|
||||
mca_pml_ob1_fin_hdr_t* hdr;
|
||||
int rc;
|
||||
|
||||
mca_bml_base_alloc(bml_btl, &fin, order, sizeof(mca_pml_ob1_fin_hdr_t),
|
||||
MCA_BTL_DES_FLAGS_PRIORITY | MCA_BTL_DES_FLAGS_BTL_OWNERSHIP);
|
||||
|
||||
if(NULL == fin) {
|
||||
MCA_PML_OB1_ADD_FIN_TO_PENDING(proc, hdr_frag, rdma_size, bml_btl, order, status);
|
||||
MCA_PML_OB1_ADD_FIN_TO_PENDING(proc, hdr_des, bml_btl, order, status);
|
||||
return OMPI_ERR_OUT_OF_RESOURCE;
|
||||
}
|
||||
fin->des_cbfunc = mca_pml_ob1_fin_completion;
|
||||
fin->des_cbdata = NULL;
|
||||
|
||||
/* fill in header */
|
||||
mca_pml_ob1_fin_hdr_prepare ((mca_pml_ob1_fin_hdr_t *) fin->des_segments->seg_addr.pval,
|
||||
0, hdr_frag.lval, status ? status : (int64_t) rdma_size);
|
||||
hdr = (mca_pml_ob1_fin_hdr_t*)fin->des_local->seg_addr.pval;
|
||||
hdr->hdr_common.hdr_flags = 0;
|
||||
hdr->hdr_common.hdr_type = MCA_PML_OB1_HDR_TYPE_FIN;
|
||||
hdr->hdr_des = hdr_des;
|
||||
hdr->hdr_fail = status;
|
||||
|
||||
ob1_hdr_hton(hdr, MCA_PML_OB1_HDR_TYPE_FIN, proc);
|
||||
|
||||
/* queue request */
|
||||
rc = mca_bml_base_send( bml_btl, fin, MCA_PML_OB1_HDR_TYPE_FIN );
|
||||
rc = mca_bml_base_send( bml_btl,
|
||||
fin,
|
||||
MCA_PML_OB1_HDR_TYPE_FIN );
|
||||
if( OPAL_LIKELY( rc >= 0 ) ) {
|
||||
if( OPAL_LIKELY( 1 == rc ) ) {
|
||||
MCA_PML_OB1_PROGRESS_PENDING(bml_btl);
|
||||
@ -672,7 +676,7 @@ int mca_pml_ob1_send_fin( ompi_proc_t* proc,
|
||||
return OMPI_SUCCESS;
|
||||
}
|
||||
mca_bml_base_free(bml_btl, fin);
|
||||
MCA_PML_OB1_ADD_FIN_TO_PENDING(proc, hdr_frag, rdma_size, bml_btl, order, status);
|
||||
MCA_PML_OB1_ADD_FIN_TO_PENDING(proc, hdr_des, bml_btl, order, status);
|
||||
return OMPI_ERR_OUT_OF_RESOURCE;
|
||||
}
|
||||
|
||||
@ -713,7 +717,6 @@ void mca_pml_ob1_process_pending_packets(mca_bml_base_btl_t* bml_btl)
|
||||
pckt->hdr.hdr_ack.hdr_src_req.lval,
|
||||
pckt->hdr.hdr_ack.hdr_dst_req.pval,
|
||||
pckt->hdr.hdr_ack.hdr_send_offset,
|
||||
pckt->hdr.hdr_ack.hdr_send_size,
|
||||
pckt->hdr.hdr_common.hdr_flags & MCA_PML_OB1_HDR_FLAGS_NORDMA);
|
||||
if( OPAL_UNLIKELY(OMPI_ERR_OUT_OF_RESOURCE == rc) ) {
|
||||
OPAL_THREAD_LOCK(&mca_pml_ob1.lock);
|
||||
@ -725,10 +728,9 @@ void mca_pml_ob1_process_pending_packets(mca_bml_base_btl_t* bml_btl)
|
||||
break;
|
||||
case MCA_PML_OB1_HDR_TYPE_FIN:
|
||||
rc = mca_pml_ob1_send_fin(pckt->proc, send_dst,
|
||||
pckt->hdr.hdr_fin.hdr_frag,
|
||||
pckt->hdr.hdr_fin.hdr_size,
|
||||
pckt->hdr.hdr_fin.hdr_des,
|
||||
pckt->order,
|
||||
pckt->status);
|
||||
pckt->hdr.hdr_fin.hdr_fail);
|
||||
if( OPAL_UNLIKELY(OMPI_ERR_OUT_OF_RESOURCE == rc) ) {
|
||||
return;
|
||||
}
|
||||
|
@ -216,7 +216,6 @@ struct mca_pml_ob1_pckt_pending_t {
|
||||
mca_pml_ob1_hdr_t hdr;
|
||||
struct mca_bml_base_btl_t *bml_btl;
|
||||
uint8_t order;
|
||||
int status;
|
||||
};
|
||||
typedef struct mca_pml_ob1_pckt_pending_t mca_pml_ob1_pckt_pending_t;
|
||||
OBJ_CLASS_DECLARATION(mca_pml_ob1_pckt_pending_t);
|
||||
@ -235,17 +234,17 @@ do { \
|
||||
(ompi_free_list_item_t*)pckt); \
|
||||
} while(0)
|
||||
|
||||
#define MCA_PML_OB1_ADD_FIN_TO_PENDING(P, D, Sz, B, O, S) \
|
||||
#define MCA_PML_OB1_ADD_FIN_TO_PENDING(P, D, B, O, S) \
|
||||
do { \
|
||||
mca_pml_ob1_pckt_pending_t *_pckt; \
|
||||
\
|
||||
MCA_PML_OB1_PCKT_PENDING_ALLOC(_pckt); \
|
||||
mca_pml_ob1_fin_hdr_prepare (&_pckt->hdr.hdr_fin, 0, \
|
||||
(D).lval, (Sz)); \
|
||||
_pckt->hdr.hdr_common.hdr_type = MCA_PML_OB1_HDR_TYPE_FIN; \
|
||||
_pckt->hdr.hdr_fin.hdr_des = (D); \
|
||||
_pckt->hdr.hdr_fin.hdr_fail = (S); \
|
||||
_pckt->proc = (P); \
|
||||
_pckt->bml_btl = (B); \
|
||||
_pckt->order = (O); \
|
||||
_pckt->status = (S); \
|
||||
OPAL_THREAD_LOCK(&mca_pml_ob1.lock); \
|
||||
opal_list_append(&mca_pml_ob1.pckt_pending, \
|
||||
(opal_list_item_t*)_pckt); \
|
||||
@ -254,7 +253,7 @@ do { \
|
||||
|
||||
|
||||
int mca_pml_ob1_send_fin(ompi_proc_t* proc, mca_bml_base_btl_t* bml_btl,
|
||||
opal_ptr_t hdr_frag, uint64_t size, uint8_t order, int status);
|
||||
opal_ptr_t hdr_des, uint8_t order, uint32_t status);
|
||||
|
||||
/* This function tries to resend FIN/ACK packets from pckt_pending queue.
|
||||
* Packets are added to the queue when sending of FIN or ACK is failed due to
|
||||
@ -284,6 +283,20 @@ void mca_pml_ob1_process_pending_rdma(void);
|
||||
/*
|
||||
* Compute the total number of bytes on supplied descriptor
|
||||
*/
|
||||
static inline size_t
|
||||
mca_pml_ob1_compute_segment_length(size_t seg_size, void *segments,
|
||||
size_t count, size_t hdrlen)
|
||||
{
|
||||
size_t i, length = 0;
|
||||
mca_btl_base_segment_t *segment = (mca_btl_base_segment_t*)segments;
|
||||
|
||||
for (i = 0; i < count ; ++i) {
|
||||
length += segment->seg_len;
|
||||
segment = (mca_btl_base_segment_t *)((char *)segment + seg_size);
|
||||
}
|
||||
return (length - hdrlen);
|
||||
}
|
||||
|
||||
static inline size_t
|
||||
mca_pml_ob1_compute_segment_length_base(mca_btl_base_segment_t *segments,
|
||||
size_t count, size_t hdrlen)
|
||||
@ -325,7 +338,7 @@ mca_pml_ob1_compute_segment_length_remote (size_t seg_size, void *segments,
|
||||
/* represent BTL chosen for sending request */
|
||||
struct mca_pml_ob1_com_btl_t {
|
||||
mca_bml_base_btl_t *bml_btl;
|
||||
struct mca_btl_base_registration_handle_t *btl_reg;
|
||||
struct mca_mpool_base_registration_t* btl_reg;
|
||||
size_t length;
|
||||
};
|
||||
typedef struct mca_pml_ob1_com_btl_t mca_pml_ob1_com_btl_t;
|
||||
|
@ -11,7 +11,7 @@
|
||||
* Copyright (c) 2004-2005 The Regents of the University of California.
|
||||
* All rights reserved.
|
||||
* Copyright (c) 2009 IBM Corporation. All rights reserved.
|
||||
* Copyright (c) 2012-2014 Los Alamos National Security, LLC. All rights
|
||||
* Copyright (c) 2012 Los Alamos National Security, LLC. All rights
|
||||
* reserved.
|
||||
* $COPYRIGHT$
|
||||
*
|
||||
@ -63,13 +63,6 @@ struct mca_pml_ob1_common_hdr_t {
|
||||
};
|
||||
typedef struct mca_pml_ob1_common_hdr_t mca_pml_ob1_common_hdr_t;
|
||||
|
||||
static inline void mca_pml_ob1_common_hdr_prepare (mca_pml_ob1_common_hdr_t *hdr, uint8_t hdr_type,
|
||||
uint8_t hdr_flags)
|
||||
{
|
||||
hdr->hdr_type = hdr_type;
|
||||
hdr->hdr_flags = hdr_flags;
|
||||
}
|
||||
|
||||
#define MCA_PML_OB1_COMMON_HDR_NTOH(h)
|
||||
#define MCA_PML_OB1_COMMON_HDR_HTON(h)
|
||||
|
||||
@ -95,19 +88,15 @@ struct mca_pml_ob1_match_hdr_t {
|
||||
|
||||
typedef struct mca_pml_ob1_match_hdr_t mca_pml_ob1_match_hdr_t;
|
||||
|
||||
static inline void mca_pml_ob1_match_hdr_prepare (mca_pml_ob1_match_hdr_t *hdr, uint8_t hdr_type, uint8_t hdr_flags,
|
||||
uint16_t hdr_ctx, int32_t hdr_src, int32_t hdr_tag, uint16_t hdr_seq)
|
||||
{
|
||||
mca_pml_ob1_common_hdr_prepare (&hdr->hdr_common, hdr_type, hdr_flags);
|
||||
hdr->hdr_ctx = hdr_ctx;
|
||||
hdr->hdr_src = hdr_src;
|
||||
hdr->hdr_tag = hdr_tag;
|
||||
hdr->hdr_seq = hdr_seq;
|
||||
#if OPAL_ENABLE_HETEROGENEOUS_SUPPORT && OPAL_ENABLE_DEBUG
|
||||
hdr->hdr_padding[0] = 0;
|
||||
hdr->hdr_padding[1] = 0;
|
||||
#endif
|
||||
}
|
||||
#define MCA_PML_OB1_MATCH_HDR_FILL(h) \
|
||||
do { \
|
||||
(h).hdr_padding[0] = 0; \
|
||||
(h).hdr_padding[1] = 0; \
|
||||
} while(0)
|
||||
#else
|
||||
#define MCA_PML_OB1_MATCH_HDR_FILL(h)
|
||||
#endif /* OPAL_ENABLE_HETEROGENEOUS_SUPPORT && OPAL_ENABLE_DEBUG */
|
||||
|
||||
#define MCA_PML_OB1_MATCH_HDR_NTOH(h) \
|
||||
do { \
|
||||
@ -121,6 +110,7 @@ do { \
|
||||
#define MCA_PML_OB1_MATCH_HDR_HTON(h) \
|
||||
do { \
|
||||
MCA_PML_OB1_COMMON_HDR_HTON((h).hdr_common); \
|
||||
MCA_PML_OB1_MATCH_HDR_FILL(h); \
|
||||
(h).hdr_ctx = htons((h).hdr_ctx); \
|
||||
(h).hdr_src = htonl((h).hdr_src); \
|
||||
(h).hdr_tag = htonl((h).hdr_tag); \
|
||||
@ -139,14 +129,12 @@ struct mca_pml_ob1_rendezvous_hdr_t {
|
||||
};
|
||||
typedef struct mca_pml_ob1_rendezvous_hdr_t mca_pml_ob1_rendezvous_hdr_t;
|
||||
|
||||
static inline void mca_pml_ob1_rendezvous_hdr_prepare (mca_pml_ob1_rendezvous_hdr_t *hdr, uint8_t hdr_type, uint8_t hdr_flags,
|
||||
uint16_t hdr_ctx, int32_t hdr_src, int32_t hdr_tag, uint16_t hdr_seq,
|
||||
uint64_t hdr_msg_length, void *hdr_src_req)
|
||||
{
|
||||
mca_pml_ob1_match_hdr_prepare (&hdr->hdr_match, hdr_type, hdr_flags, hdr_ctx, hdr_src, hdr_tag, hdr_seq);
|
||||
hdr->hdr_msg_length = hdr_msg_length;
|
||||
hdr->hdr_src_req.pval = hdr_src_req;
|
||||
}
|
||||
#if OPAL_ENABLE_HETEROGENEOUS_SUPPORT && OPAL_ENABLE_DEBUG
|
||||
#define MCA_PML_OB1_RNDV_HDR_FILL(h) \
|
||||
MCA_PML_OB1_MATCH_HDR_FILL((h).hdr_match)
|
||||
#else
|
||||
#define MCA_PML_OB1_RNDV_HDR_FILL(h)
|
||||
#endif /* OPAL_ENABLE_HETEROGENEOUS_SUPPORT && OPAL_ENABLE_DEBUG */
|
||||
|
||||
/* Note that hdr_src_req is not put in network byte order because it
|
||||
is never processed by the receiver, other than being copied into
|
||||
@ -160,6 +148,7 @@ static inline void mca_pml_ob1_rendezvous_hdr_prepare (mca_pml_ob1_rendezvous_hd
|
||||
#define MCA_PML_OB1_RNDV_HDR_HTON(h) \
|
||||
do { \
|
||||
MCA_PML_OB1_MATCH_HDR_HTON((h).hdr_match); \
|
||||
MCA_PML_OB1_RNDV_HDR_FILL(h); \
|
||||
(h).hdr_msg_length = hton64((h).hdr_msg_length); \
|
||||
} while (0)
|
||||
|
||||
@ -168,47 +157,38 @@ static inline void mca_pml_ob1_rendezvous_hdr_prepare (mca_pml_ob1_rendezvous_hd
|
||||
*/
|
||||
struct mca_pml_ob1_rget_hdr_t {
|
||||
mca_pml_ob1_rendezvous_hdr_t hdr_rndv;
|
||||
uint32_t hdr_seg_cnt; /**< number of segments for rdma */
|
||||
#if OPAL_ENABLE_HETEROGENEOUS_SUPPORT
|
||||
uint8_t hdr_padding[4];
|
||||
#endif
|
||||
opal_ptr_t hdr_frag; /**< source fragment (for fin) */
|
||||
uint64_t hdr_src_ptr; /**< source pointer */
|
||||
/* btl registration handle data follows */
|
||||
opal_ptr_t hdr_des; /**< source descriptor */
|
||||
};
|
||||
typedef struct mca_pml_ob1_rget_hdr_t mca_pml_ob1_rget_hdr_t;
|
||||
|
||||
static inline void mca_pml_ob1_rget_hdr_prepare (mca_pml_ob1_rget_hdr_t *hdr, uint8_t hdr_flags,
|
||||
uint16_t hdr_ctx, int32_t hdr_src, int32_t hdr_tag, uint16_t hdr_seq,
|
||||
uint64_t hdr_msg_length, void *hdr_src_req, void *hdr_frag,
|
||||
void *hdr_src_ptr, void *local_handle, size_t local_handle_size)
|
||||
{
|
||||
mca_pml_ob1_rendezvous_hdr_prepare (&hdr->hdr_rndv, MCA_PML_OB1_HDR_TYPE_RGET, hdr_flags,
|
||||
hdr_ctx, hdr_src, hdr_tag, hdr_seq, hdr_msg_length, hdr_src_req);
|
||||
#if OPAL_ENABLE_HETEROGENEOUS_SUPPORT && OPAL_ENABLE_DEBUG
|
||||
hdr->hdr_padding[0] = 0;
|
||||
hdr->hdr_padding[1] = 0;
|
||||
hdr->hdr_padding[2] = 0;
|
||||
hdr->hdr_padding[3] = 0;
|
||||
#endif
|
||||
hdr->hdr_frag.pval = hdr_frag;
|
||||
hdr->hdr_src_ptr = (uint64_t)(intptr_t) hdr_src_ptr;
|
||||
#define MCA_PML_OB1_RGET_HDR_FILL(h) \
|
||||
do { \
|
||||
MCA_PML_OB1_RNDV_HDR_FILL((h).hdr_rndv); \
|
||||
(h).hdr_padding[0] = 0; \
|
||||
(h).hdr_padding[1] = 0; \
|
||||
(h).hdr_padding[2] = 0; \
|
||||
(h).hdr_padding[3] = 0; \
|
||||
} while(0)
|
||||
#else
|
||||
#define MCA_PML_OB1_RGET_HDR_FILL(h)
|
||||
#endif /* OPAL_ENABLE_HETEROGENEOUS_SUPPORT && OPAL_ENABLE_DEBUG */
|
||||
|
||||
/* copy registration handle */
|
||||
memcpy (hdr + 1, local_handle, local_handle_size);
|
||||
}
|
||||
|
||||
#define MCA_PML_OB1_RGET_HDR_NTOH(h) \
|
||||
do { \
|
||||
MCA_PML_OB1_RNDV_HDR_NTOH((h).hdr_rndv); \
|
||||
(h).hdr_seg_cnt = ntohl((h).hdr_seg_cnt); \
|
||||
(h).hdr_src_ptr = ntoh64((h).hdr_src_ptr); \
|
||||
#define MCA_PML_OB1_RGET_HDR_NTOH(h) \
|
||||
do { \
|
||||
MCA_PML_OB1_RNDV_HDR_NTOH((h).hdr_rndv); \
|
||||
(h).hdr_seg_cnt = ntohl((h).hdr_seg_cnt); \
|
||||
} while (0)
|
||||
|
||||
#define MCA_PML_OB1_RGET_HDR_HTON(h) \
|
||||
do { \
|
||||
MCA_PML_OB1_RNDV_HDR_HTON((h).hdr_rndv); \
|
||||
(h).hdr_seg_cnt = htonl((h).hdr_seg_cnt); \
|
||||
(h).hdr_src_ptr = hton64((h).hdr_src_ptr); \
|
||||
#define MCA_PML_OB1_RGET_HDR_HTON(h) \
|
||||
do { \
|
||||
MCA_PML_OB1_RNDV_HDR_HTON((h).hdr_rndv); \
|
||||
MCA_PML_OB1_RGET_HDR_FILL(h); \
|
||||
(h).hdr_seg_cnt = htonl((h).hdr_seg_cnt); \
|
||||
} while (0)
|
||||
|
||||
/**
|
||||
@ -225,23 +205,19 @@ struct mca_pml_ob1_frag_hdr_t {
|
||||
};
|
||||
typedef struct mca_pml_ob1_frag_hdr_t mca_pml_ob1_frag_hdr_t;
|
||||
|
||||
static inline void mca_pml_ob1_frag_hdr_prepare (mca_pml_ob1_frag_hdr_t *hdr, uint8_t hdr_flags,
|
||||
uint64_t hdr_frag_offset, void *hdr_src_req,
|
||||
uint64_t hdr_dst_req)
|
||||
{
|
||||
mca_pml_ob1_common_hdr_prepare (&hdr->hdr_common, MCA_PML_OB1_HDR_TYPE_FRAG, hdr_flags);
|
||||
#if OPAL_ENABLE_HETEROGENEOUS_SUPPORT && OPAL_ENABLE_DEBUG
|
||||
hdr->hdr_padding[0] = 0;
|
||||
hdr->hdr_padding[1] = 0;
|
||||
hdr->hdr_padding[2] = 0;
|
||||
hdr->hdr_padding[3] = 0;
|
||||
hdr->hdr_padding[4] = 0;
|
||||
hdr->hdr_padding[5] = 0;
|
||||
#endif
|
||||
hdr->hdr_frag_offset = hdr_frag_offset;
|
||||
hdr->hdr_src_req.pval = hdr_src_req;
|
||||
hdr->hdr_dst_req.lval = hdr_dst_req;
|
||||
}
|
||||
#define MCA_PML_OB1_FRAG_HDR_FILL(h) \
|
||||
do { \
|
||||
(h).hdr_padding[0] = 0; \
|
||||
(h).hdr_padding[1] = 0; \
|
||||
(h).hdr_padding[2] = 0; \
|
||||
(h).hdr_padding[3] = 0; \
|
||||
(h).hdr_padding[4] = 0; \
|
||||
(h).hdr_padding[5] = 0; \
|
||||
} while(0)
|
||||
#else
|
||||
#define MCA_PML_OB1_FRAG_HDR_FILL(h)
|
||||
#endif /* OPAL_ENABLE_HETEROGENEOUS_SUPPORT && OPAL_ENABLE_DEBUG */
|
||||
|
||||
#define MCA_PML_OB1_FRAG_HDR_NTOH(h) \
|
||||
do { \
|
||||
@ -252,6 +228,7 @@ static inline void mca_pml_ob1_frag_hdr_prepare (mca_pml_ob1_frag_hdr_t *hdr, ui
|
||||
#define MCA_PML_OB1_FRAG_HDR_HTON(h) \
|
||||
do { \
|
||||
MCA_PML_OB1_COMMON_HDR_HTON((h).hdr_common); \
|
||||
MCA_PML_OB1_FRAG_HDR_FILL(h); \
|
||||
(h).hdr_frag_offset = hton64((h).hdr_frag_offset); \
|
||||
} while (0)
|
||||
|
||||
@ -267,45 +244,38 @@ struct mca_pml_ob1_ack_hdr_t {
|
||||
opal_ptr_t hdr_src_req; /**< source request */
|
||||
opal_ptr_t hdr_dst_req; /**< matched receive request */
|
||||
uint64_t hdr_send_offset; /**< starting point of copy in/out */
|
||||
uint64_t hdr_send_size; /**< number of bytes requested (0: all remaining) */
|
||||
};
|
||||
typedef struct mca_pml_ob1_ack_hdr_t mca_pml_ob1_ack_hdr_t;
|
||||
|
||||
static inline void mca_pml_ob1_ack_hdr_prepare (mca_pml_ob1_ack_hdr_t *hdr, uint8_t hdr_flags,
|
||||
uint64_t hdr_src_req, void *hdr_dst_req,
|
||||
uint64_t hdr_send_offset, uint64_t hdr_send_size)
|
||||
{
|
||||
mca_pml_ob1_common_hdr_prepare (&hdr->hdr_common, MCA_PML_OB1_HDR_TYPE_ACK, hdr_flags);
|
||||
#if OPAL_ENABLE_HETEROGENEOUS_SUPPORT && OPAL_ENABLE_DEBUG
|
||||
hdr->hdr_padding[0] = 0;
|
||||
hdr->hdr_padding[1] = 0;
|
||||
hdr->hdr_padding[2] = 0;
|
||||
hdr->hdr_padding[3] = 0;
|
||||
hdr->hdr_padding[4] = 0;
|
||||
hdr->hdr_padding[5] = 0;
|
||||
#endif
|
||||
hdr->hdr_src_req.lval = hdr_src_req;
|
||||
hdr->hdr_dst_req.pval = hdr_dst_req;
|
||||
hdr->hdr_send_offset = hdr_send_offset;
|
||||
hdr->hdr_send_size = hdr_send_size;
|
||||
}
|
||||
#define MCA_PML_OB1_ACK_HDR_FILL(h) \
|
||||
do { \
|
||||
(h).hdr_padding[0] = 0; \
|
||||
(h).hdr_padding[1] = 0; \
|
||||
(h).hdr_padding[2] = 0; \
|
||||
(h).hdr_padding[3] = 0; \
|
||||
(h).hdr_padding[4] = 0; \
|
||||
(h).hdr_padding[5] = 0; \
|
||||
} while (0)
|
||||
#else
|
||||
#define MCA_PML_OB1_ACK_HDR_FILL(h)
|
||||
#endif /* OPAL_ENABLE_HETEROGENEOUS_SUPPORT && OPAL_ENABLE_DEBUG */
|
||||
|
||||
/* Note that the request headers are not put in NBO because the
|
||||
src_req is already in receiver's byte order and the dst_req is not
|
||||
used by the receiver for anything other than backpointers in return
|
||||
headers */
|
||||
#define MCA_PML_OB1_ACK_HDR_NTOH(h) \
|
||||
do { \
|
||||
MCA_PML_OB1_COMMON_HDR_NTOH((h).hdr_common); \
|
||||
#define MCA_PML_OB1_ACK_HDR_NTOH(h) \
|
||||
do { \
|
||||
MCA_PML_OB1_COMMON_HDR_NTOH((h).hdr_common); \
|
||||
(h).hdr_send_offset = ntoh64((h).hdr_send_offset); \
|
||||
(h).hdr_send_size = ntoh64((h).hdr_send_size); \
|
||||
} while (0)
|
||||
|
||||
#define MCA_PML_OB1_ACK_HDR_HTON(h) \
|
||||
do { \
|
||||
MCA_PML_OB1_COMMON_HDR_HTON((h).hdr_common); \
|
||||
#define MCA_PML_OB1_ACK_HDR_HTON(h) \
|
||||
do { \
|
||||
MCA_PML_OB1_COMMON_HDR_HTON((h).hdr_common); \
|
||||
MCA_PML_OB1_ACK_HDR_FILL(h); \
|
||||
(h).hdr_send_offset = hton64((h).hdr_send_offset); \
|
||||
(h).hdr_send_size = hton64((h).hdr_send_size); \
|
||||
} while (0)
|
||||
|
||||
/**
|
||||
@ -317,55 +287,38 @@ struct mca_pml_ob1_rdma_hdr_t {
|
||||
#if OPAL_ENABLE_HETEROGENEOUS_SUPPORT
|
||||
uint8_t hdr_padding[2]; /** two to pad out the hdr to a 4 byte alignment. hdr_req will then be 8 byte aligned after 4 for hdr_seg_cnt */
|
||||
#endif
|
||||
/* TODO: add real support for multiple destination segments */
|
||||
uint32_t hdr_seg_cnt; /**< number of segments for rdma */
|
||||
opal_ptr_t hdr_req; /**< destination request */
|
||||
opal_ptr_t hdr_frag; /**< receiver fragment */
|
||||
opal_ptr_t hdr_des; /**< source descriptor */
|
||||
opal_ptr_t hdr_recv_req; /**< receive request (NTH: needed for put fallback on send) */
|
||||
uint64_t hdr_rdma_offset; /**< current offset into user buffer */
|
||||
uint64_t hdr_dst_ptr; /**< destination address */
|
||||
uint64_t hdr_dst_size; /**< destination size */
|
||||
/* registration data follows */
|
||||
uint64_t hdr_rdma_offset; /**< current offset into user buffer */
|
||||
mca_btl_base_segment_t hdr_segs[1]; /**< list of segments for rdma */
|
||||
};
|
||||
typedef struct mca_pml_ob1_rdma_hdr_t mca_pml_ob1_rdma_hdr_t;
|
||||
|
||||
static inline void mca_pml_ob1_rdma_hdr_prepare (mca_pml_ob1_rdma_hdr_t *hdr, uint8_t hdr_flags,
|
||||
uint64_t hdr_req, void *hdr_frag, void *hdr_recv_req,
|
||||
uint64_t hdr_rdma_offset, void *hdr_dst_ptr,
|
||||
uint64_t hdr_dst_size, void *local_handle,
|
||||
size_t local_handle_size)
|
||||
{
|
||||
mca_pml_ob1_common_hdr_prepare (&hdr->hdr_common, MCA_PML_OB1_HDR_TYPE_PUT, hdr_flags);
|
||||
#if OPAL_ENABLE_HETEROGENEOUS_SUPPORT && OPAL_ENABLE_DEBUG
|
||||
hdr->hdr_padding[0] = 0;
|
||||
hdr->hdr_padding[1] = 0;
|
||||
#endif
|
||||
hdr->hdr_req.lval = hdr_req;
|
||||
hdr->hdr_frag.pval = hdr_frag;
|
||||
hdr->hdr_recv_req.pval = hdr_recv_req;
|
||||
hdr->hdr_rdma_offset = hdr_rdma_offset;
|
||||
hdr->hdr_dst_ptr = (uint64_t)(intptr_t) hdr_dst_ptr;
|
||||
hdr->hdr_dst_size = hdr_dst_size;
|
||||
#define MCA_PML_OB1_RDMA_HDR_FILL(h) \
|
||||
do { \
|
||||
(h).hdr_padding[0] = 0; \
|
||||
(h).hdr_padding[1] = 0; \
|
||||
} while(0)
|
||||
#else
|
||||
#define MCA_PML_OB1_RDMA_HDR_FILL(h)
|
||||
#endif /* OPAL_ENABLE_HETEROGENEOUS_SUPPORT && OPAL_ENABLE_DEBUG */
|
||||
|
||||
/* copy segments */
|
||||
memcpy (hdr + 1, local_handle, local_handle_size);
|
||||
}
|
||||
|
||||
#define MCA_PML_OB1_RDMA_HDR_NTOH(h) \
|
||||
do { \
|
||||
MCA_PML_OB1_COMMON_HDR_NTOH((h).hdr_common); \
|
||||
(h).hdr_seg_cnt = ntohl((h).hdr_seg_cnt); \
|
||||
#define MCA_PML_OB1_RDMA_HDR_NTOH(h) \
|
||||
do { \
|
||||
MCA_PML_OB1_COMMON_HDR_NTOH((h).hdr_common); \
|
||||
(h).hdr_seg_cnt = ntohl((h).hdr_seg_cnt); \
|
||||
(h).hdr_rdma_offset = ntoh64((h).hdr_rdma_offset); \
|
||||
(h).hdr_dst_ptr = ntoh64((h).hdr_dst_ptr); \
|
||||
(h).hdr_dst_size = ntoh64((h).hdr_dst_size); \
|
||||
} while (0)
|
||||
|
||||
#define MCA_PML_OB1_RDMA_HDR_HTON(h) \
|
||||
do { \
|
||||
MCA_PML_OB1_COMMON_HDR_HTON((h).hdr_common); \
|
||||
(h).hdr_seg_cnt = htonl((h).hdr_seg_cnt); \
|
||||
#define MCA_PML_OB1_RDMA_HDR_HTON(h) \
|
||||
do { \
|
||||
MCA_PML_OB1_COMMON_HDR_HTON((h).hdr_common); \
|
||||
MCA_PML_OB1_RDMA_HDR_FILL(h); \
|
||||
(h).hdr_seg_cnt = htonl((h).hdr_seg_cnt); \
|
||||
(h).hdr_rdma_offset = hton64((h).hdr_rdma_offset); \
|
||||
(h).hdr_dst_ptr = hton64((h).hdr_dst_ptr); \
|
||||
(h).hdr_dst_size = hton64((h).hdr_dst_size); \
|
||||
} while (0)
|
||||
|
||||
/**
|
||||
@ -377,34 +330,31 @@ struct mca_pml_ob1_fin_hdr_t {
|
||||
#if OPAL_ENABLE_HETEROGENEOUS_SUPPORT
|
||||
uint8_t hdr_padding[2];
|
||||
#endif
|
||||
int64_t hdr_size; /**< number of bytes completed (positive), error code (negative) */
|
||||
opal_ptr_t hdr_frag; /**< completed RDMA fragment */
|
||||
uint32_t hdr_fail; /**< RDMA operation failed */
|
||||
opal_ptr_t hdr_des; /**< completed descriptor */
|
||||
};
|
||||
typedef struct mca_pml_ob1_fin_hdr_t mca_pml_ob1_fin_hdr_t;
|
||||
|
||||
static inline void mca_pml_ob1_fin_hdr_prepare (mca_pml_ob1_fin_hdr_t *hdr, uint8_t hdr_flags,
|
||||
uint64_t hdr_frag, int64_t hdr_size)
|
||||
{
|
||||
mca_pml_ob1_common_hdr_prepare (&hdr->hdr_common, MCA_PML_OB1_HDR_TYPE_FIN, hdr_flags);
|
||||
#if OPAL_ENABLE_HETEROGENEOUS_SUPPORT && OPAL_ENABLE_DEBUG
|
||||
hdr->hdr_padding[0] = 0;
|
||||
hdr->hdr_padding[1] = 0;
|
||||
#endif
|
||||
hdr->hdr_frag.lval = hdr_frag;
|
||||
hdr->hdr_size = hdr_size;
|
||||
}
|
||||
#define MCA_PML_OB1_FIN_HDR_FILL(h) \
|
||||
do { \
|
||||
(h).hdr_padding[0] = 0; \
|
||||
(h).hdr_padding[1] = 0; \
|
||||
} while (0)
|
||||
#else
|
||||
#define MCA_PML_OB1_FIN_HDR_FILL(h)
|
||||
#endif /* OPAL_ENABLE_HETEROGENEOUS_SUPPORT && OPAL_ENABLE_DEBUG */
|
||||
|
||||
#define MCA_PML_OB1_FIN_HDR_NTOH(h) \
|
||||
do { \
|
||||
#define MCA_PML_OB1_FIN_HDR_NTOH(h) \
|
||||
do { \
|
||||
MCA_PML_OB1_COMMON_HDR_NTOH((h).hdr_common); \
|
||||
(h).hdr_size = ntoh64((h).hdr_size); \
|
||||
} while (0)
|
||||
|
||||
#define MCA_PML_OB1_FIN_HDR_HTON(h) \
|
||||
do { \
|
||||
#define MCA_PML_OB1_FIN_HDR_HTON(h) \
|
||||
do { \
|
||||
MCA_PML_OB1_COMMON_HDR_HTON((h).hdr_common); \
|
||||
(h).hdr_size = hton64((h).hdr_size); \
|
||||
} while (0)
|
||||
MCA_PML_OB1_FIN_HDR_FILL(h); \
|
||||
} while (0)
|
||||
|
||||
/**
|
||||
* Union of defined hdr types.
|
||||
|
@ -66,6 +66,7 @@ static inline int mca_pml_ob1_send_inline (void *buf, size_t count,
|
||||
ompi_proc_t *dst_proc, mca_bml_base_endpoint_t* endpoint,
|
||||
ompi_communicator_t * comm)
|
||||
{
|
||||
mca_btl_base_descriptor_t *des = NULL;
|
||||
mca_pml_ob1_match_hdr_t match;
|
||||
mca_bml_base_btl_t *bml_btl;
|
||||
OPAL_PTRDIFF_TYPE lb, extent;
|
||||
@ -93,21 +94,28 @@ static inline int mca_pml_ob1_send_inline (void *buf, size_t count,
|
||||
opal_convertor_get_packed_size (&convertor, &size);
|
||||
}
|
||||
|
||||
mca_pml_ob1_match_hdr_prepare (&match, MCA_PML_OB1_HDR_TYPE_MATCH, 0,
|
||||
comm->c_contextid, comm->c_my_rank,
|
||||
tag, seqn);
|
||||
match.hdr_common.hdr_flags = 0;
|
||||
match.hdr_common.hdr_type = MCA_PML_OB1_HDR_TYPE_MATCH;
|
||||
match.hdr_ctx = comm->c_contextid;
|
||||
match.hdr_src = comm->c_my_rank;
|
||||
match.hdr_tag = tag;
|
||||
match.hdr_seq = seqn;
|
||||
|
||||
ob1_hdr_hton(&match, MCA_PML_OB1_HDR_TYPE_MATCH, dst_proc);
|
||||
|
||||
/* try to send immediately */
|
||||
rc = mca_bml_base_sendi (bml_btl, &convertor, &match, OMPI_PML_OB1_MATCH_HDR_LEN,
|
||||
size, MCA_BTL_NO_ORDER, MCA_BTL_DES_FLAGS_PRIORITY | MCA_BTL_DES_FLAGS_BTL_OWNERSHIP,
|
||||
MCA_PML_OB1_HDR_TYPE_MATCH, NULL);
|
||||
MCA_PML_OB1_HDR_TYPE_MATCH, &des);
|
||||
if (count > 0) {
|
||||
opal_convertor_cleanup (&convertor);
|
||||
}
|
||||
|
||||
if (OPAL_UNLIKELY(OMPI_SUCCESS != rc)) {
|
||||
if (des) {
|
||||
mca_bml_base_free (bml_btl, des);
|
||||
}
|
||||
|
||||
return rc;
|
||||
}
|
||||
|
||||
@ -212,7 +220,7 @@ int mca_pml_ob1_send(void *buf,
|
||||
|
||||
OBJ_CONSTRUCT(sendreq, mca_pml_ob1_send_request_t);
|
||||
sendreq->req_send.req_base.req_proc = dst_proc;
|
||||
sendreq->rdma_frag = NULL;
|
||||
sendreq->src_des = NULL;
|
||||
|
||||
MCA_PML_OB1_SEND_REQUEST_INIT(sendreq,
|
||||
buf,
|
||||
|
@ -1,4 +1,3 @@
|
||||
/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */
|
||||
/*
|
||||
* Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
|
||||
* University Research and Technology
|
||||
@ -10,8 +9,6 @@
|
||||
* University of Stuttgart. All rights reserved.
|
||||
* Copyright (c) 2004-2005 The Regents of the University of California.
|
||||
* All rights reserved.
|
||||
* Copyright (c) 2014 Los Alamos National Security, LLC. All rights
|
||||
* reserved.
|
||||
* $COPYRIGHT$
|
||||
*
|
||||
* Additional copyrights may follow
|
||||
@ -30,6 +27,11 @@
|
||||
#include "pml_ob1.h"
|
||||
#include "pml_ob1_rdma.h"
|
||||
|
||||
/* Use this registration if no registration needed for a BTL instead of NULL.
|
||||
* This will help other code to distinguish case when memory is not registered
|
||||
* from case when registration is not needed */
|
||||
static mca_mpool_base_registration_t pml_ob1_dummy_reg;
|
||||
|
||||
/*
|
||||
* Check to see if memory is registered or can be registered. Build a
|
||||
* set of registrations on the request.
|
||||
@ -43,7 +45,7 @@ size_t mca_pml_ob1_rdma_btls(
|
||||
{
|
||||
int num_btls = mca_bml_base_btl_array_get_size(&bml_endpoint->btl_rdma);
|
||||
double weight_total = 0;
|
||||
int num_btls_used = 0;
|
||||
int num_btls_used = 0, n;
|
||||
|
||||
/* shortcut when there are no rdma capable btls */
|
||||
if(num_btls == 0) {
|
||||
@ -51,25 +53,29 @@ size_t mca_pml_ob1_rdma_btls(
|
||||
}
|
||||
|
||||
/* check to see if memory is registered */
|
||||
for (int n = 0; n < num_btls && num_btls_used < mca_pml_ob1.max_rdma_per_request; n++) {
|
||||
for(n = 0; n < num_btls && num_btls_used < mca_pml_ob1.max_rdma_per_request;
|
||||
n++) {
|
||||
mca_bml_base_btl_t* bml_btl =
|
||||
mca_bml_base_btl_array_get_index(&bml_endpoint->btl_rdma,
|
||||
(bml_endpoint->btl_rdma_index + n) % num_btls);
|
||||
mca_btl_base_registration_handle_t *reg_handle = NULL;
|
||||
mca_btl_base_module_t *btl = bml_btl->btl;
|
||||
(bml_endpoint->btl_rdma_index + n) % num_btls);
|
||||
mca_mpool_base_registration_t* reg = &pml_ob1_dummy_reg;
|
||||
mca_mpool_base_module_t *btl_mpool = bml_btl->btl->btl_mpool;
|
||||
|
||||
if (btl->btl_register_mem) {
|
||||
/* try to register the memory with the btl */
|
||||
reg_handle = btl->btl_register_mem (btl, bml_btl->btl_endpoint, base,
|
||||
size, MCA_BTL_REG_FLAG_REMOTE_READ);
|
||||
if (NULL == reg_handle) {
|
||||
/* btl requires registration but the registration failed */
|
||||
continue;
|
||||
if( NULL != btl_mpool ) {
|
||||
if(!mca_pml_ob1.leave_pinned) {
|
||||
/* look through existing registrations */
|
||||
btl_mpool->mpool_find(btl_mpool, base, size, ®);
|
||||
} else {
|
||||
/* register the memory */
|
||||
btl_mpool->mpool_register(btl_mpool, base, size, 0, ®);
|
||||
}
|
||||
} /* else no registration is needed */
|
||||
|
||||
if(NULL == reg)
|
||||
continue;
|
||||
}
|
||||
|
||||
rdma_btls[num_btls_used].bml_btl = bml_btl;
|
||||
rdma_btls[num_btls_used].btl_reg = reg_handle;
|
||||
rdma_btls[num_btls_used].btl_reg = reg;
|
||||
weight_total += bml_btl->btl_weight;
|
||||
num_btls_used++;
|
||||
}
|
||||
@ -77,7 +83,7 @@ size_t mca_pml_ob1_rdma_btls(
|
||||
/* if we don't use leave_pinned and all BTLs that already have this memory
|
||||
* registered amount to less then half of available bandwidth - fall back to
|
||||
* pipeline protocol */
|
||||
if (0 == num_btls_used || (!mca_pml_ob1.leave_pinned && weight_total < 0.5))
|
||||
if(0 == num_btls_used || (!mca_pml_ob1.leave_pinned && weight_total < 0.5))
|
||||
return 0;
|
||||
|
||||
mca_pml_ob1_calc_weighted_length(rdma_btls, num_btls_used, size,
|
||||
@ -97,6 +103,10 @@ size_t mca_pml_ob1_rdma_pipeline_btls( mca_bml_base_endpoint_t* bml_endpoint,
|
||||
for(i = 0; i < num_btls && i < mca_pml_ob1.max_rdma_per_request; i++) {
|
||||
rdma_btls[i].bml_btl =
|
||||
mca_bml_base_btl_array_get_next(&bml_endpoint->btl_rdma);
|
||||
if(NULL != rdma_btls[i].bml_btl->btl->btl_mpool)
|
||||
rdma_btls[i].btl_reg = NULL;
|
||||
else
|
||||
rdma_btls[i].btl_reg = &pml_ob1_dummy_reg;
|
||||
|
||||
weight_total += rdma_btls[i].bml_btl->btl_weight;
|
||||
}
|
||||
|
@ -1,4 +1,3 @@
|
||||
/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */
|
||||
/*
|
||||
* Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
|
||||
* University Research and Technology
|
||||
@ -10,8 +9,6 @@
|
||||
* University of Stuttgart. All rights reserved.
|
||||
* Copyright (c) 2004-2005 The Regents of the University of California.
|
||||
* All rights reserved.
|
||||
* Copyright (c) 2014 Los Alamos National Security, LLC. All rights
|
||||
* reserved.
|
||||
* $COPYRIGHT$
|
||||
*
|
||||
* Additional copyrights may follow
|
||||
@ -24,13 +21,9 @@
|
||||
#include "pml_ob1.h"
|
||||
#include "pml_ob1_rdmafrag.h"
|
||||
|
||||
static void mca_pml_ob1_rdma_frag_constructor (mca_pml_ob1_rdma_frag_t *frag)
|
||||
{
|
||||
frag->local_handle = NULL;
|
||||
}
|
||||
|
||||
OBJ_CLASS_INSTANCE(
|
||||
mca_pml_ob1_rdma_frag_t,
|
||||
ompi_free_list_item_t,
|
||||
mca_pml_ob1_rdma_frag_constructor,
|
||||
NULL,
|
||||
NULL);
|
||||
|
@ -10,8 +10,6 @@
|
||||
* University of Stuttgart. All rights reserved.
|
||||
* Copyright (c) 2004-2005 The Regents of the University of California.
|
||||
* All rights reserved.
|
||||
* Copyright (c) 2014 Los Alamos National Security, LLC. All rights
|
||||
* reserved.
|
||||
* $COPYRIGHT$
|
||||
*
|
||||
* Additional copyrights may follow
|
||||
@ -34,52 +32,38 @@ typedef enum {
|
||||
MCA_PML_OB1_RDMA_GET
|
||||
} mca_pml_ob1_rdma_state_t;
|
||||
|
||||
struct mca_pml_ob1_rdma_frag_t;
|
||||
|
||||
typedef void (*mca_pml_ob1_rdma_frag_callback_t)(struct mca_pml_ob1_rdma_frag_t *frag, int64_t rdma_length);
|
||||
|
||||
/**
|
||||
* Used to keep track of local and remote RDMA operations.
|
||||
*/
|
||||
struct mca_pml_ob1_rdma_frag_t {
|
||||
ompi_free_list_item_t super;
|
||||
mca_bml_base_btl_t *rdma_bml;
|
||||
mca_bml_base_btl_t* rdma_bml;
|
||||
mca_pml_ob1_hdr_t rdma_hdr;
|
||||
mca_pml_ob1_rdma_state_t rdma_state;
|
||||
size_t rdma_length;
|
||||
uint8_t rdma_segs[MCA_BTL_SEG_MAX_SIZE * MCA_BTL_DES_MAX_SEGMENTS];
|
||||
void *rdma_req;
|
||||
struct mca_bml_base_endpoint_t* rdma_ep;
|
||||
opal_convertor_t convertor;
|
||||
mca_mpool_base_registration_t* reg;
|
||||
uint32_t retries;
|
||||
mca_pml_ob1_rdma_frag_callback_t cbfunc;
|
||||
|
||||
uint64_t rdma_offset;
|
||||
void *local_address;
|
||||
mca_btl_base_registration_handle_t *local_handle;
|
||||
|
||||
uint64_t remote_address;
|
||||
uint8_t remote_handle[MCA_BTL_REG_HANDLE_MAX_SIZE];
|
||||
};
|
||||
typedef struct mca_pml_ob1_rdma_frag_t mca_pml_ob1_rdma_frag_t;
|
||||
|
||||
OBJ_CLASS_DECLARATION(mca_pml_ob1_rdma_frag_t);
|
||||
|
||||
|
||||
#define MCA_PML_OB1_RDMA_FRAG_ALLOC(frag) \
|
||||
do { \
|
||||
ompi_free_list_item_t* item; \
|
||||
#define MCA_PML_OB1_RDMA_FRAG_ALLOC(frag) \
|
||||
do { \
|
||||
ompi_free_list_item_t* item; \
|
||||
OMPI_FREE_LIST_WAIT_MT(&mca_pml_ob1.rdma_frags, item); \
|
||||
frag = (mca_pml_ob1_rdma_frag_t*)item; \
|
||||
frag = (mca_pml_ob1_rdma_frag_t*)item; \
|
||||
} while(0)
|
||||
|
||||
#define MCA_PML_OB1_RDMA_FRAG_RETURN(frag) \
|
||||
do { \
|
||||
/* return fragment */ \
|
||||
OMPI_FREE_LIST_RETURN_MT(&mca_pml_ob1.rdma_frags, \
|
||||
(ompi_free_list_item_t*)frag); \
|
||||
} while(0)
|
||||
|
||||
#define MCA_PML_OB1_RDMA_FRAG_RETURN(frag) \
|
||||
do { \
|
||||
/* return fragment */ \
|
||||
if (frag->local_handle) { \
|
||||
mca_bml_base_deregister_mem (frag->rdma_bml, frag->local_handle); \
|
||||
frag->local_handle = NULL; \
|
||||
} \
|
||||
OMPI_FREE_LIST_RETURN_MT(&mca_pml_ob1.rdma_frags, \
|
||||
(ompi_free_list_item_t*)frag); \
|
||||
} while (0)
|
||||
|
||||
END_C_DECLS
|
||||
|
||||
|
@ -108,13 +108,13 @@ void mca_pml_ob1_recv_frag_callback_match(mca_btl_base_module_t* btl,
|
||||
mca_btl_base_descriptor_t* des,
|
||||
void* cbdata )
|
||||
{
|
||||
mca_btl_base_segment_t* segments = des->des_segments;
|
||||
mca_btl_base_segment_t* segments = des->des_local;
|
||||
mca_pml_ob1_match_hdr_t* hdr = (mca_pml_ob1_match_hdr_t*)segments->seg_addr.pval;
|
||||
ompi_communicator_t *comm_ptr;
|
||||
mca_pml_ob1_recv_request_t *match = NULL;
|
||||
mca_pml_ob1_comm_t *comm;
|
||||
mca_pml_ob1_comm_proc_t *proc;
|
||||
size_t num_segments = des->des_segment_count;
|
||||
size_t num_segments = des->des_local_count;
|
||||
size_t bytes_received = 0;
|
||||
|
||||
assert(num_segments <= MCA_BTL_DES_MAX_SEGMENTS);
|
||||
@ -256,7 +256,7 @@ void mca_pml_ob1_recv_frag_callback_rndv(mca_btl_base_module_t* btl,
|
||||
mca_btl_base_descriptor_t* des,
|
||||
void* cbdata )
|
||||
{
|
||||
mca_btl_base_segment_t* segments = des->des_segments;
|
||||
mca_btl_base_segment_t* segments = des->des_local;
|
||||
mca_pml_ob1_hdr_t* hdr = (mca_pml_ob1_hdr_t*)segments->seg_addr.pval;
|
||||
|
||||
if( OPAL_UNLIKELY(segments->seg_len < sizeof(mca_pml_ob1_common_hdr_t)) ) {
|
||||
@ -264,7 +264,7 @@ void mca_pml_ob1_recv_frag_callback_rndv(mca_btl_base_module_t* btl,
|
||||
}
|
||||
ob1_hdr_ntoh(hdr, MCA_PML_OB1_HDR_TYPE_RNDV);
|
||||
mca_pml_ob1_recv_frag_match(btl, &hdr->hdr_match, segments,
|
||||
des->des_segment_count, MCA_PML_OB1_HDR_TYPE_RNDV);
|
||||
des->des_local_count, MCA_PML_OB1_HDR_TYPE_RNDV);
|
||||
return;
|
||||
}
|
||||
|
||||
@ -273,7 +273,7 @@ void mca_pml_ob1_recv_frag_callback_rget(mca_btl_base_module_t* btl,
|
||||
mca_btl_base_descriptor_t* des,
|
||||
void* cbdata )
|
||||
{
|
||||
mca_btl_base_segment_t* segments = des->des_segments;
|
||||
mca_btl_base_segment_t* segments = des->des_local;
|
||||
mca_pml_ob1_hdr_t* hdr = (mca_pml_ob1_hdr_t*)segments->seg_addr.pval;
|
||||
|
||||
if( OPAL_UNLIKELY(segments->seg_len < sizeof(mca_pml_ob1_common_hdr_t)) ) {
|
||||
@ -281,7 +281,7 @@ void mca_pml_ob1_recv_frag_callback_rget(mca_btl_base_module_t* btl,
|
||||
}
|
||||
ob1_hdr_ntoh(hdr, MCA_PML_OB1_HDR_TYPE_RGET);
|
||||
mca_pml_ob1_recv_frag_match(btl, &hdr->hdr_match, segments,
|
||||
des->des_segment_count, MCA_PML_OB1_HDR_TYPE_RGET);
|
||||
des->des_local_count, MCA_PML_OB1_HDR_TYPE_RGET);
|
||||
return;
|
||||
}
|
||||
|
||||
@ -292,10 +292,9 @@ void mca_pml_ob1_recv_frag_callback_ack(mca_btl_base_module_t* btl,
|
||||
mca_btl_base_descriptor_t* des,
|
||||
void* cbdata )
|
||||
{
|
||||
mca_btl_base_segment_t* segments = des->des_segments;
|
||||
mca_btl_base_segment_t* segments = des->des_local;
|
||||
mca_pml_ob1_hdr_t* hdr = (mca_pml_ob1_hdr_t*)segments->seg_addr.pval;
|
||||
mca_pml_ob1_send_request_t* sendreq;
|
||||
size_t size;
|
||||
|
||||
if( OPAL_UNLIKELY(segments->seg_len < sizeof(mca_pml_ob1_common_hdr_t)) ) {
|
||||
return;
|
||||
@ -308,25 +307,19 @@ void mca_pml_ob1_recv_frag_callback_ack(mca_btl_base_module_t* btl,
|
||||
/* if the request should be delivered entirely by copy in/out
|
||||
* then throttle sends */
|
||||
if(hdr->hdr_common.hdr_flags & MCA_PML_OB1_HDR_FLAGS_NORDMA) {
|
||||
if (NULL != sendreq->rdma_frag) {
|
||||
if (NULL != sendreq->rdma_frag->local_handle) {
|
||||
mca_bml_base_deregister_mem (sendreq->req_rdma[0].bml_btl, sendreq->rdma_frag->local_handle);
|
||||
sendreq->rdma_frag->local_handle = NULL;
|
||||
}
|
||||
MCA_PML_OB1_RDMA_FRAG_RETURN(sendreq->rdma_frag);
|
||||
sendreq->rdma_frag = NULL;
|
||||
if (NULL != sendreq->src_des) {
|
||||
/* release registered memory */
|
||||
mca_bml_base_free (sendreq->req_rdma[0].bml_btl, sendreq->src_des);
|
||||
sendreq->src_des = NULL;
|
||||
}
|
||||
|
||||
sendreq->req_throttle_sends = true;
|
||||
}
|
||||
|
||||
if (hdr->hdr_ack.hdr_send_size) {
|
||||
size = hdr->hdr_ack.hdr_send_size;
|
||||
} else {
|
||||
size = sendreq->req_send.req_bytes_packed - hdr->hdr_ack.hdr_send_offset;
|
||||
}
|
||||
|
||||
mca_pml_ob1_send_request_copy_in_out(sendreq, hdr->hdr_ack.hdr_send_offset, size);
|
||||
|
||||
mca_pml_ob1_send_request_copy_in_out(sendreq,
|
||||
hdr->hdr_ack.hdr_send_offset,
|
||||
sendreq->req_send.req_bytes_packed -
|
||||
hdr->hdr_ack.hdr_send_offset);
|
||||
|
||||
if (sendreq->req_state != 0) {
|
||||
/* Typical receipt of an ACK message causes req_state to be
|
||||
@ -362,14 +355,13 @@ void mca_pml_ob1_recv_frag_callback_frag(mca_btl_base_module_t* btl,
|
||||
mca_btl_base_tag_t tag,
|
||||
mca_btl_base_descriptor_t* des,
|
||||
void* cbdata ) {
|
||||
mca_btl_base_segment_t* segments = des->des_segments;
|
||||
mca_btl_base_segment_t* segments = des->des_local;
|
||||
mca_pml_ob1_hdr_t* hdr = (mca_pml_ob1_hdr_t*)segments->seg_addr.pval;
|
||||
mca_pml_ob1_recv_request_t* recvreq;
|
||||
|
||||
if( OPAL_UNLIKELY(segments->seg_len < sizeof(mca_pml_ob1_common_hdr_t)) ) {
|
||||
return;
|
||||
}
|
||||
|
||||
ob1_hdr_ntoh(hdr, MCA_PML_OB1_HDR_TYPE_FRAG);
|
||||
recvreq = (mca_pml_ob1_recv_request_t*)hdr->hdr_frag.hdr_dst_req.pval;
|
||||
#if OPAL_CUDA_SUPPORT /* CUDA_ASYNC_RECV */
|
||||
@ -380,7 +372,7 @@ void mca_pml_ob1_recv_frag_callback_frag(mca_btl_base_module_t* btl,
|
||||
assert(btl->btl_flags & MCA_BTL_FLAGS_CUDA_COPY_ASYNC_RECV);
|
||||
|
||||
/* This will trigger the opal_convertor_pack to start asynchronous copy. */
|
||||
mca_pml_ob1_recv_request_frag_copy_start(recvreq,btl,segments,des->des_segment_count,des);
|
||||
mca_pml_ob1_recv_request_frag_copy_start(recvreq,btl,segments,des->des_local_count,des);
|
||||
|
||||
/* Let BTL know that it CANNOT free the frag */
|
||||
des->des_flags |= MCA_BTL_DES_FLAGS_CUDA_COPY_ASYNC;
|
||||
@ -388,8 +380,7 @@ void mca_pml_ob1_recv_frag_callback_frag(mca_btl_base_module_t* btl,
|
||||
return;
|
||||
}
|
||||
#endif /* OPAL_CUDA_SUPPORT */
|
||||
|
||||
mca_pml_ob1_recv_request_progress_frag(recvreq,btl,segments,des->des_segment_count);
|
||||
mca_pml_ob1_recv_request_progress_frag(recvreq,btl,segments,des->des_local_count);
|
||||
|
||||
return;
|
||||
}
|
||||
@ -399,7 +390,7 @@ void mca_pml_ob1_recv_frag_callback_put(mca_btl_base_module_t* btl,
|
||||
mca_btl_base_tag_t tag,
|
||||
mca_btl_base_descriptor_t* des,
|
||||
void* cbdata ) {
|
||||
mca_btl_base_segment_t* segments = des->des_segments;
|
||||
mca_btl_base_segment_t* segments = des->des_local;
|
||||
mca_pml_ob1_hdr_t* hdr = (mca_pml_ob1_hdr_t*)segments->seg_addr.pval;
|
||||
mca_pml_ob1_send_request_t* sendreq;
|
||||
|
||||
@ -419,17 +410,20 @@ void mca_pml_ob1_recv_frag_callback_fin(mca_btl_base_module_t* btl,
|
||||
mca_btl_base_tag_t tag,
|
||||
mca_btl_base_descriptor_t* des,
|
||||
void* cbdata ) {
|
||||
mca_btl_base_segment_t* segments = des->des_segments;
|
||||
mca_pml_ob1_fin_hdr_t* hdr = (mca_pml_ob1_fin_hdr_t *) segments->seg_addr.pval;
|
||||
mca_pml_ob1_rdma_frag_t *frag;
|
||||
mca_btl_base_segment_t* segments = des->des_local;
|
||||
mca_pml_ob1_hdr_t* hdr = (mca_pml_ob1_hdr_t*)segments->seg_addr.pval;
|
||||
mca_btl_base_descriptor_t* rdma;
|
||||
|
||||
if( OPAL_UNLIKELY(segments->seg_len < sizeof(mca_pml_ob1_fin_hdr_t)) ) {
|
||||
if( OPAL_UNLIKELY(segments->seg_len < sizeof(mca_pml_ob1_common_hdr_t)) ) {
|
||||
return;
|
||||
}
|
||||
|
||||
|
||||
ob1_hdr_ntoh(hdr, MCA_PML_OB1_HDR_TYPE_FIN);
|
||||
frag = (mca_pml_ob1_rdma_frag_t *) hdr->hdr_frag.pval;
|
||||
frag->cbfunc (frag, hdr->hdr_size);
|
||||
rdma = (mca_btl_base_descriptor_t*)hdr->hdr_fin.hdr_des.pval;
|
||||
rdma->des_cbfunc(btl, NULL, rdma,
|
||||
hdr->hdr_fin.hdr_fail ? OMPI_ERROR : OMPI_SUCCESS);
|
||||
|
||||
return;
|
||||
}
|
||||
|
||||
|
||||
@ -705,7 +699,7 @@ out_of_order_match:
|
||||
OPAL_THREAD_UNLOCK(&comm->matching_lock);
|
||||
|
||||
if(OPAL_LIKELY(match)) {
|
||||
switch(type) {
|
||||
switch(type) {
|
||||
case MCA_PML_OB1_HDR_TYPE_MATCH:
|
||||
mca_pml_ob1_recv_request_progress_match(match, btl, segments, num_segments);
|
||||
break;
|
||||
|
@ -13,7 +13,7 @@
|
||||
* Copyright (c) 2008 UT-Battelle, LLC. All rights reserved.
|
||||
* Copyright (c) 2011 Sandia National Laboratories. All rights reserved.
|
||||
* Copyright (c) 2012-2013 NVIDIA Corporation. All rights reserved.
|
||||
* Copyright (c) 2011-2014 Los Alamos National Security, LLC. All rights
|
||||
* Copyright (c) 2011-2012 Los Alamos National Security, LLC. All rights
|
||||
* reserved.
|
||||
* Copyright (c) 2012 FUJITSU LIMITED. All rights reserved.
|
||||
* Copyright (c) 2014 Research Organization for Information Science
|
||||
@ -150,17 +150,12 @@ static void mca_pml_ob1_recv_request_construct(mca_pml_ob1_recv_request_t* reque
|
||||
request->req_recv.req_base.req_ompi.req_free = mca_pml_ob1_recv_request_free;
|
||||
request->req_recv.req_base.req_ompi.req_cancel = mca_pml_ob1_recv_request_cancel;
|
||||
request->req_rdma_cnt = 0;
|
||||
request->local_handle = NULL;
|
||||
OBJ_CONSTRUCT(&request->lock, opal_mutex_t);
|
||||
}
|
||||
|
||||
static void mca_pml_ob1_recv_request_destruct(mca_pml_ob1_recv_request_t* request)
|
||||
{
|
||||
OBJ_DESTRUCT(&request->lock);
|
||||
if (OPAL_UNLIKELY(request->local_handle)) {
|
||||
mca_bml_base_deregister_mem (request->rdma_bml, request->local_handle);
|
||||
request->local_handle = NULL;
|
||||
}
|
||||
}
|
||||
|
||||
OBJ_CLASS_INSTANCE(
|
||||
@ -188,27 +183,31 @@ static void mca_pml_ob1_recv_ctl_completion( mca_btl_base_module_t* btl,
|
||||
* Put operation has completed remotely - update request status
|
||||
*/
|
||||
|
||||
static void mca_pml_ob1_put_completion (mca_pml_ob1_rdma_frag_t *frag, int64_t rdma_size)
|
||||
static void mca_pml_ob1_put_completion( mca_btl_base_module_t* btl,
|
||||
struct mca_btl_base_endpoint_t* ep,
|
||||
struct mca_btl_base_descriptor_t* des,
|
||||
int status )
|
||||
{
|
||||
mca_pml_ob1_recv_request_t* recvreq = (mca_pml_ob1_recv_request_t *) frag->rdma_req;
|
||||
mca_bml_base_btl_t *bml_btl = frag->rdma_bml;
|
||||
mca_bml_base_btl_t* bml_btl = (mca_bml_base_btl_t*)des->des_context;
|
||||
mca_pml_ob1_recv_request_t* recvreq = (mca_pml_ob1_recv_request_t*)des->des_cbdata;
|
||||
size_t bytes_received = 0;
|
||||
|
||||
if( OPAL_LIKELY(status == OMPI_SUCCESS) ) {
|
||||
bytes_received = mca_pml_ob1_compute_segment_length (btl->btl_seg_size,
|
||||
(void *) des->des_local,
|
||||
des->des_local_count, 0);
|
||||
}
|
||||
OPAL_THREAD_ADD_SIZE_T(&recvreq->req_pipeline_depth,-1);
|
||||
|
||||
MCA_PML_OB1_RDMA_FRAG_RETURN(frag);
|
||||
mca_bml_base_free(bml_btl, des);
|
||||
|
||||
if (OPAL_LIKELY(0 < rdma_size)) {
|
||||
assert ((uint64_t) rdma_size == frag->rdma_length);
|
||||
|
||||
/* check completion status */
|
||||
OPAL_THREAD_ADD_SIZE_T(&recvreq->req_bytes_received, (size_t) rdma_size);
|
||||
if (recv_request_pml_complete_check(recvreq) == false &&
|
||||
/* check completion status */
|
||||
OPAL_THREAD_ADD_SIZE_T(&recvreq->req_bytes_received, bytes_received);
|
||||
if(recv_request_pml_complete_check(recvreq) == false &&
|
||||
recvreq->req_rdma_offset < recvreq->req_send_offset) {
|
||||
/* schedule additional rdma operations */
|
||||
mca_pml_ob1_recv_request_schedule(recvreq, bml_btl);
|
||||
}
|
||||
/* schedule additional rdma operations */
|
||||
mca_pml_ob1_recv_request_schedule(recvreq, bml_btl);
|
||||
}
|
||||
|
||||
MCA_PML_OB1_PROGRESS_PENDING(bml_btl);
|
||||
}
|
||||
|
||||
@ -219,7 +218,7 @@ static void mca_pml_ob1_put_completion (mca_pml_ob1_rdma_frag_t *frag, int64_t r
|
||||
int mca_pml_ob1_recv_request_ack_send_btl(
|
||||
ompi_proc_t* proc, mca_bml_base_btl_t* bml_btl,
|
||||
uint64_t hdr_src_req, void *hdr_dst_req, uint64_t hdr_send_offset,
|
||||
uint64_t size, bool nordma)
|
||||
bool nordma)
|
||||
{
|
||||
mca_btl_base_descriptor_t* des;
|
||||
mca_pml_ob1_ack_hdr_t* ack;
|
||||
@ -234,9 +233,12 @@ int mca_pml_ob1_recv_request_ack_send_btl(
|
||||
}
|
||||
|
||||
/* fill out header */
|
||||
ack = (mca_pml_ob1_ack_hdr_t*)des->des_segments->seg_addr.pval;
|
||||
mca_pml_ob1_ack_hdr_prepare (ack, nordma ? MCA_PML_OB1_HDR_FLAGS_NORDMA : 0,
|
||||
hdr_src_req, hdr_dst_req, hdr_send_offset, size);
|
||||
ack = (mca_pml_ob1_ack_hdr_t*)des->des_local->seg_addr.pval;
|
||||
ack->hdr_common.hdr_type = MCA_PML_OB1_HDR_TYPE_ACK;
|
||||
ack->hdr_common.hdr_flags = nordma ? MCA_PML_OB1_HDR_FLAGS_NORDMA : 0;
|
||||
ack->hdr_src_req.lval = hdr_src_req;
|
||||
ack->hdr_dst_req.pval = hdr_dst_req;
|
||||
ack->hdr_send_offset = hdr_send_offset;
|
||||
|
||||
ob1_hdr_hton(ack, MCA_PML_OB1_HDR_TYPE_ACK, proc);
|
||||
|
||||
@ -310,99 +312,63 @@ static int mca_pml_ob1_recv_request_ack(
|
||||
if(recvreq->req_send_offset == hdr->hdr_msg_length)
|
||||
return OMPI_SUCCESS;
|
||||
}
|
||||
|
||||
/* let know to shedule function there is no need to put ACK flag */
|
||||
recvreq->req_ack_sent = true;
|
||||
return mca_pml_ob1_recv_request_ack_send(proc, hdr->hdr_src_req.lval,
|
||||
recvreq, recvreq->req_send_offset, 0,
|
||||
recvreq, recvreq->req_send_offset,
|
||||
recvreq->req_send_offset == bytes_received);
|
||||
}
|
||||
|
||||
static int mca_pml_ob1_recv_request_put_frag (mca_pml_ob1_rdma_frag_t *frag);
|
||||
|
||||
static int mca_pml_ob1_recv_request_get_frag_failed (mca_pml_ob1_rdma_frag_t *frag, int rc)
|
||||
{
|
||||
mca_pml_ob1_recv_request_t *recvreq = (mca_pml_ob1_recv_request_t *) frag->rdma_req;
|
||||
ompi_proc_t *proc = (ompi_proc_t *) recvreq->req_recv.req_base.req_proc;
|
||||
|
||||
if (OMPI_ERR_NOT_AVAILABLE == rc) {
|
||||
/* get isn't supported for this transfer. tell peer to fallback on put */
|
||||
rc = mca_pml_ob1_recv_request_put_frag (frag);
|
||||
if (OMPI_ERR_OUT_OF_RESOURCE == rc) {
|
||||
OPAL_THREAD_LOCK(&mca_pml_ob1.lock);
|
||||
opal_list_append (&mca_pml_ob1.rdma_pending, (opal_list_item_t*)frag);
|
||||
OPAL_THREAD_UNLOCK(&mca_pml_ob1.lock);
|
||||
|
||||
return OMPI_SUCCESS;
|
||||
}
|
||||
}
|
||||
|
||||
if (++frag->retries < mca_pml_ob1.rdma_retries_limit &&
|
||||
OMPI_ERR_OUT_OF_RESOURCE == rc) {
|
||||
OPAL_THREAD_LOCK(&mca_pml_ob1.lock);
|
||||
opal_list_append(&mca_pml_ob1.rdma_pending, (opal_list_item_t*)frag);
|
||||
OPAL_THREAD_UNLOCK(&mca_pml_ob1.lock);
|
||||
|
||||
return OMPI_SUCCESS;
|
||||
}
|
||||
|
||||
/* tell peer to fall back on send for this region */
|
||||
rc = mca_pml_ob1_recv_request_ack_send(proc, frag->rdma_hdr.hdr_rget.hdr_rndv.hdr_src_req.lval,
|
||||
recvreq, frag->rdma_offset, frag->rdma_length, false);
|
||||
MCA_PML_OB1_RDMA_FRAG_RETURN(frag);
|
||||
return rc;
|
||||
}
|
||||
|
||||
/**
|
||||
* Return resources used by the RDMA
|
||||
*/
|
||||
|
||||
static void mca_pml_ob1_rget_completion (mca_btl_base_module_t* btl, struct mca_btl_base_endpoint_t* ep,
|
||||
void *local_address, mca_btl_base_registration_handle_t *local_handle,
|
||||
void *context, void *cbdata, int status)
|
||||
static void mca_pml_ob1_rget_completion( mca_btl_base_module_t* btl,
|
||||
struct mca_btl_base_endpoint_t* ep,
|
||||
struct mca_btl_base_descriptor_t* des,
|
||||
int status )
|
||||
{
|
||||
mca_bml_base_btl_t *bml_btl = (mca_bml_base_btl_t *) context;
|
||||
mca_pml_ob1_rdma_frag_t *frag = (mca_pml_ob1_rdma_frag_t *) cbdata;
|
||||
mca_pml_ob1_recv_request_t *recvreq = (mca_pml_ob1_recv_request_t *) frag->rdma_req;
|
||||
mca_bml_base_btl_t* bml_btl = (mca_bml_base_btl_t*)des->des_context;
|
||||
mca_pml_ob1_rdma_frag_t* frag = (mca_pml_ob1_rdma_frag_t*)des->des_cbdata;
|
||||
mca_pml_ob1_recv_request_t* recvreq = (mca_pml_ob1_recv_request_t*)frag->rdma_req;
|
||||
|
||||
/* check completion status */
|
||||
if (OPAL_UNLIKELY(OMPI_SUCCESS != status)) {
|
||||
status = mca_pml_ob1_recv_request_get_frag_failed (frag, status);
|
||||
if (OPAL_UNLIKELY(OMPI_SUCCESS != status)) {
|
||||
/* TSW - FIX */
|
||||
OMPI_ERROR_LOG(status);
|
||||
ompi_rte_abort(-1, NULL);
|
||||
}
|
||||
} else {
|
||||
/* is receive request complete */
|
||||
OPAL_THREAD_ADD_SIZE_T(&recvreq->req_bytes_received, frag->rdma_length);
|
||||
/* TODO: re-add order */
|
||||
mca_pml_ob1_send_fin (recvreq->req_recv.req_base.req_proc,
|
||||
bml_btl, frag->rdma_hdr.hdr_rget.hdr_frag,
|
||||
frag->rdma_length, 0, 0);
|
||||
|
||||
recv_request_pml_complete_check(recvreq);
|
||||
|
||||
MCA_PML_OB1_RDMA_FRAG_RETURN(frag);
|
||||
if( OPAL_UNLIKELY(OMPI_SUCCESS != status) ) {
|
||||
/* TSW - FIX */
|
||||
OMPI_ERROR_LOG(status);
|
||||
ompi_rte_abort(-1, NULL);
|
||||
}
|
||||
|
||||
/* is receive request complete */
|
||||
OPAL_THREAD_ADD_SIZE_T(&recvreq->req_bytes_received, frag->rdma_length);
|
||||
if (recvreq->req_recv.req_bytes_packed <= recvreq->req_bytes_received) {
|
||||
mca_pml_ob1_send_fin(recvreq->req_recv.req_base.req_proc,
|
||||
bml_btl,
|
||||
frag->rdma_hdr.hdr_rget.hdr_des,
|
||||
des->order, 0);
|
||||
}
|
||||
|
||||
recv_request_pml_complete_check(recvreq);
|
||||
|
||||
MCA_PML_OB1_RDMA_FRAG_RETURN(frag);
|
||||
|
||||
MCA_PML_OB1_PROGRESS_PENDING(bml_btl);
|
||||
}
|
||||
|
||||
|
||||
static int mca_pml_ob1_recv_request_put_frag (mca_pml_ob1_rdma_frag_t *frag)
|
||||
{
|
||||
static int mca_pml_ob1_init_get_fallback (mca_pml_ob1_rdma_frag_t *frag,
|
||||
mca_btl_base_descriptor_t *dst) {
|
||||
mca_pml_ob1_recv_request_t *recvreq = (mca_pml_ob1_recv_request_t *) frag->rdma_req;
|
||||
mca_bml_base_btl_t *bml_btl = frag->rdma_bml;
|
||||
mca_btl_base_descriptor_t *ctl;
|
||||
mca_pml_ob1_rdma_hdr_t *hdr;
|
||||
size_t reg_size;
|
||||
size_t seg_size;
|
||||
int rc;
|
||||
|
||||
reg_size = bml_btl->btl->btl_registration_handle_size;
|
||||
seg_size = bml_btl->btl->btl_seg_size * dst->des_local_count;
|
||||
|
||||
/* prepare a descriptor for rdma control message */
|
||||
mca_bml_base_alloc (bml_btl, &ctl, MCA_BTL_NO_ORDER, sizeof (mca_pml_ob1_rdma_hdr_t) + reg_size,
|
||||
mca_bml_base_alloc (bml_btl, &ctl, MCA_BTL_NO_ORDER, sizeof (mca_pml_ob1_rdma_hdr_t) + seg_size,
|
||||
MCA_BTL_DES_FLAGS_PRIORITY | MCA_BTL_DES_FLAGS_BTL_OWNERSHIP |
|
||||
MCA_BTL_DES_SEND_ALWAYS_CALLBACK);
|
||||
if (OPAL_UNLIKELY(NULL == ctl)) {
|
||||
@ -411,19 +377,26 @@ static int mca_pml_ob1_recv_request_put_frag (mca_pml_ob1_rdma_frag_t *frag)
|
||||
ctl->des_cbfunc = mca_pml_ob1_recv_ctl_completion;
|
||||
|
||||
/* fill in rdma header */
|
||||
hdr = (mca_pml_ob1_rdma_hdr_t *) ctl->des_segments->seg_addr.pval;
|
||||
mca_pml_ob1_rdma_hdr_prepare (hdr, (!recvreq->req_ack_sent) ? MCA_PML_OB1_HDR_TYPE_ACK : 0,
|
||||
recvreq->remote_req_send.lval, frag, recvreq, frag->rdma_offset,
|
||||
frag->local_address, frag->rdma_length, frag->local_handle,
|
||||
reg_size);
|
||||
hdr = (mca_pml_ob1_rdma_hdr_t *) ctl->des_local->seg_addr.pval;
|
||||
hdr->hdr_common.hdr_type = MCA_PML_OB1_HDR_TYPE_PUT;
|
||||
hdr->hdr_common.hdr_flags =
|
||||
(!recvreq->req_ack_sent) ? MCA_PML_OB1_HDR_TYPE_ACK : 0;
|
||||
|
||||
frag->cbfunc = mca_pml_ob1_put_completion;
|
||||
hdr->hdr_req = frag->rdma_hdr.hdr_rget.hdr_rndv.hdr_src_req;
|
||||
hdr->hdr_rdma_offset = recvreq->req_rdma_offset;
|
||||
hdr->hdr_des.pval = dst;
|
||||
hdr->hdr_recv_req.pval = recvreq;
|
||||
|
||||
recvreq->req_ack_sent = true;
|
||||
hdr->hdr_seg_cnt = dst->des_local_count;
|
||||
|
||||
PERUSE_TRACE_COMM_OMPI_EVENT( PERUSE_COMM_REQ_XFER_CONTINUE,
|
||||
&(recvreq->req_recv.req_base), size,
|
||||
PERUSE_RECV);
|
||||
/* copy segments */
|
||||
memcpy (hdr + 1, dst->des_local, seg_size);
|
||||
|
||||
dst->des_cbfunc = mca_pml_ob1_put_completion;
|
||||
dst->des_cbdata = recvreq;
|
||||
|
||||
if (!recvreq->req_ack_sent)
|
||||
recvreq->req_ack_sent = true;
|
||||
|
||||
/* send rdma request to peer */
|
||||
rc = mca_bml_base_send (bml_btl, ctl, MCA_PML_OB1_HDR_TYPE_PUT);
|
||||
@ -438,38 +411,71 @@ static int mca_pml_ob1_recv_request_put_frag (mca_pml_ob1_rdma_frag_t *frag)
|
||||
/*
|
||||
*
|
||||
*/
|
||||
int mca_pml_ob1_recv_request_get_frag (mca_pml_ob1_rdma_frag_t *frag)
|
||||
int mca_pml_ob1_recv_request_get_frag( mca_pml_ob1_rdma_frag_t* frag )
|
||||
{
|
||||
mca_pml_ob1_recv_request_t *recvreq = (mca_pml_ob1_recv_request_t *) frag->rdma_req;
|
||||
mca_btl_base_registration_handle_t *local_handle = NULL;
|
||||
mca_bml_base_btl_t *bml_btl = frag->rdma_bml;
|
||||
mca_pml_ob1_recv_request_t* recvreq = (mca_pml_ob1_recv_request_t*)frag->rdma_req;
|
||||
mca_bml_base_btl_t* bml_btl = frag->rdma_bml;
|
||||
mca_btl_base_descriptor_t* descriptor;
|
||||
size_t save_size = frag->rdma_length;
|
||||
int rc;
|
||||
|
||||
/* prepare descriptor */
|
||||
if (bml_btl->btl->btl_register_mem && !frag->local_handle && !recvreq->local_handle) {
|
||||
mca_bml_base_register_mem (bml_btl, frag->local_address, frag->rdma_length, MCA_BTL_REG_FLAG_LOCAL_WRITE |
|
||||
MCA_BTL_REG_FLAG_REMOTE_WRITE, &frag->local_handle);
|
||||
if (OPAL_UNLIKELY(NULL == frag->local_handle)) {
|
||||
return mca_pml_ob1_recv_request_get_frag_failed (frag, OMPI_ERR_OUT_OF_RESOURCE);
|
||||
mca_bml_base_prepare_dst( bml_btl,
|
||||
NULL,
|
||||
&recvreq->req_recv.req_base.req_convertor,
|
||||
MCA_BTL_NO_ORDER,
|
||||
0,
|
||||
&frag->rdma_length,
|
||||
MCA_BTL_DES_FLAGS_BTL_OWNERSHIP | MCA_BTL_DES_SEND_ALWAYS_CALLBACK |
|
||||
MCA_BTL_DES_FLAGS_GET,
|
||||
&descriptor );
|
||||
if( OPAL_UNLIKELY(NULL == descriptor) ) {
|
||||
if (frag->retries < mca_pml_ob1.rdma_retries_limit) {
|
||||
frag->rdma_length = save_size;
|
||||
OPAL_THREAD_LOCK(&mca_pml_ob1.lock);
|
||||
opal_list_append(&mca_pml_ob1.rdma_pending, (opal_list_item_t*)frag);
|
||||
OPAL_THREAD_UNLOCK(&mca_pml_ob1.lock);
|
||||
return OMPI_ERR_OUT_OF_RESOURCE;
|
||||
} else {
|
||||
ompi_proc_t *proc = (ompi_proc_t *) recvreq->req_recv.req_base.req_proc;
|
||||
|
||||
/* tell peer to fall back on send */
|
||||
recvreq->req_send_offset = 0;
|
||||
rc = mca_pml_ob1_recv_request_ack_send(proc, frag->rdma_hdr.hdr_rget.hdr_rndv.hdr_src_req.lval,
|
||||
recvreq, recvreq->req_send_offset, true);
|
||||
MCA_PML_OB1_RDMA_FRAG_RETURN(frag);
|
||||
return rc;
|
||||
}
|
||||
}
|
||||
|
||||
if (frag->local_handle) {
|
||||
local_handle = frag->local_handle;
|
||||
} else if (recvreq->local_handle) {
|
||||
local_handle = recvreq->local_handle;
|
||||
}
|
||||
descriptor->des_remote = (mca_btl_base_segment_t *) frag->rdma_segs;
|
||||
descriptor->des_remote_count = frag->rdma_hdr.hdr_rdma.hdr_seg_cnt;
|
||||
descriptor->des_cbfunc = mca_pml_ob1_rget_completion;
|
||||
descriptor->des_cbdata = frag;
|
||||
|
||||
PERUSE_TRACE_COMM_OMPI_EVENT(PERUSE_COMM_REQ_XFER_CONTINUE,
|
||||
&(((mca_pml_ob1_recv_request_t *) frag->rdma_req)->req_recv.req_base),
|
||||
&(recvreq->req_recv.req_base),
|
||||
frag->rdma_length, PERUSE_RECV);
|
||||
|
||||
/* queue up get request */
|
||||
rc = mca_bml_base_get (bml_btl, frag->local_address, frag->remote_address, local_handle,
|
||||
(mca_btl_base_registration_handle_t *) frag->remote_handle, frag->rdma_length,
|
||||
0, MCA_BTL_NO_ORDER, mca_pml_ob1_rget_completion, frag);
|
||||
rc = mca_bml_base_get(bml_btl,descriptor);
|
||||
if( OPAL_UNLIKELY(OMPI_SUCCESS != rc) ) {
|
||||
return mca_pml_ob1_recv_request_get_frag_failed (frag, OMPI_ERR_OUT_OF_RESOURCE);
|
||||
if (OPAL_UNLIKELY(OMPI_ERR_NOT_AVAILABLE == rc)) {
|
||||
/* get isn't supported for this transfer. tell peer to fallback on put */
|
||||
rc = mca_pml_ob1_init_get_fallback (frag, descriptor);
|
||||
}
|
||||
|
||||
if(OMPI_ERR_OUT_OF_RESOURCE == rc) {
|
||||
mca_bml_base_free(bml_btl, descriptor);
|
||||
OPAL_THREAD_LOCK(&mca_pml_ob1.lock);
|
||||
opal_list_append(&mca_pml_ob1.rdma_pending,
|
||||
(opal_list_item_t*)frag);
|
||||
OPAL_THREAD_UNLOCK(&mca_pml_ob1.lock);
|
||||
return OMPI_ERR_OUT_OF_RESOURCE;
|
||||
} else if (OPAL_UNLIKELY(OMPI_SUCCESS != rc)) {
|
||||
OMPI_ERROR_LOG(rc);
|
||||
ompi_rte_abort(-1, NULL);
|
||||
}
|
||||
}
|
||||
|
||||
return OMPI_SUCCESS;
|
||||
@ -495,7 +501,6 @@ void mca_pml_ob1_recv_request_progress_frag( mca_pml_ob1_recv_request_t* recvreq
|
||||
bytes_received = mca_pml_ob1_compute_segment_length_base (segments, num_segments,
|
||||
sizeof(mca_pml_ob1_frag_hdr_t));
|
||||
data_offset = hdr->hdr_frag.hdr_frag_offset;
|
||||
|
||||
/*
|
||||
* Make user buffer accessible(defined) before unpacking.
|
||||
*/
|
||||
@ -623,6 +628,7 @@ void mca_pml_ob1_recv_request_progress_rget( mca_pml_ob1_recv_request_t* recvreq
|
||||
mca_pml_ob1_rget_hdr_t* hdr = (mca_pml_ob1_rget_hdr_t*)segments->seg_addr.pval;
|
||||
mca_bml_base_endpoint_t* bml_endpoint = NULL;
|
||||
size_t bytes_remaining, prev_sent, offset;
|
||||
mca_btl_base_segment_t *r_segments;
|
||||
mca_pml_ob1_rdma_frag_t *frag;
|
||||
mca_bml_base_btl_t *rdma_bml;
|
||||
int rc;
|
||||
@ -630,7 +636,6 @@ void mca_pml_ob1_recv_request_progress_rget( mca_pml_ob1_recv_request_t* recvreq
|
||||
prev_sent = offset = 0;
|
||||
bytes_remaining = hdr->hdr_rndv.hdr_msg_length;
|
||||
recvreq->req_recv.req_bytes_packed = hdr->hdr_rndv.hdr_msg_length;
|
||||
recvreq->req_send_offset = 0;
|
||||
|
||||
MCA_PML_OB1_RECV_REQUEST_MATCHED(recvreq, &hdr->hdr_rndv.hdr_match);
|
||||
|
||||
@ -674,28 +679,8 @@ void mca_pml_ob1_recv_request_progress_rget( mca_pml_ob1_recv_request_t* recvreq
|
||||
ompi_rte_abort(-1, NULL);
|
||||
}
|
||||
|
||||
bytes_remaining = hdr->hdr_rndv.hdr_msg_length;
|
||||
|
||||
/* save the request for put fallback */
|
||||
recvreq->remote_req_send = hdr->hdr_rndv.hdr_src_req;
|
||||
recvreq->rdma_bml = rdma_bml;
|
||||
|
||||
/* try to register the entire buffer */
|
||||
if (rdma_bml->btl->btl_register_mem) {
|
||||
void *data_ptr;
|
||||
|
||||
offset = 0;
|
||||
|
||||
OPAL_THREAD_LOCK(&recvreq->lock);
|
||||
opal_convertor_set_position( &recvreq->req_recv.req_base.req_convertor, &offset);
|
||||
opal_convertor_get_current_pointer (&recvreq->req_recv.req_base.req_convertor, &data_ptr);
|
||||
OPAL_THREAD_UNLOCK(&recvreq->lock);
|
||||
|
||||
mca_bml_base_register_mem (rdma_bml, data_ptr, bytes_remaining, MCA_BTL_REG_FLAG_LOCAL_WRITE |
|
||||
MCA_BTL_REG_FLAG_REMOTE_WRITE, &recvreq->local_handle);
|
||||
/* It is not an error if the memory region can not be registered here. The registration will
|
||||
* be attempted again for each get fragment. */
|
||||
}
|
||||
bytes_remaining = mca_pml_ob1_compute_segment_length_remote (btl->btl_seg_size, (void *)(hdr + 1),
|
||||
hdr->hdr_seg_cnt, recvreq->req_recv.req_base.req_proc);
|
||||
|
||||
/* The while loop adds a fragmentation mechanism. The variable bytes_remaining holds the num
|
||||
* of bytes left to be send. In each iteration we send the max possible bytes supported
|
||||
@ -704,12 +689,7 @@ void mca_pml_ob1_recv_request_progress_rget( mca_pml_ob1_recv_request_t* recvreq
|
||||
* the next iteration with the updated size.
|
||||
* Also - In each iteration we update the location in the buffer to be used for writing
|
||||
* the message ,and the location to read from. This is done using the offset variable that
|
||||
* accumulates the number of bytes that were sent so far.
|
||||
*
|
||||
* NTH: This fragmentation may go away if we change the btls to require them to handle
|
||||
* get fragmentation internally. This is a reasonable solution since some btls do not
|
||||
* need any fragmentation (sm, vader, self, etc). Remove this loop if this ends up
|
||||
* being the case. */
|
||||
* accumulates the number of bytes that were sent so far. */
|
||||
while (bytes_remaining > 0) {
|
||||
/* allocate/initialize a fragment */
|
||||
MCA_PML_OB1_RDMA_FRAG_ALLOC(frag);
|
||||
@ -719,31 +699,29 @@ void mca_pml_ob1_recv_request_progress_rget( mca_pml_ob1_recv_request_t* recvreq
|
||||
ompi_rte_abort(-1, NULL);
|
||||
}
|
||||
|
||||
memcpy (frag->remote_handle, hdr + 1, btl->btl_registration_handle_size);
|
||||
assert (btl->btl_seg_size * hdr->hdr_seg_cnt <= sizeof (frag->rdma_segs));
|
||||
|
||||
/* update the read location */
|
||||
frag->remote_address = hdr->hdr_src_ptr + offset;
|
||||
memcpy (frag->rdma_segs, hdr + 1, btl->btl_seg_size * hdr->hdr_seg_cnt);
|
||||
|
||||
/* update the read location -- NTH: note this will only work if there is exactly one
|
||||
segment. TODO -- make this work with multiple segments */
|
||||
r_segments = (mca_btl_base_segment_t *) frag->rdma_segs;
|
||||
r_segments->seg_addr.lval += offset;
|
||||
|
||||
/* updating the write location */
|
||||
OPAL_THREAD_LOCK(&recvreq->lock);
|
||||
opal_convertor_set_position( &recvreq->req_recv.req_base.req_convertor, &offset);
|
||||
opal_convertor_get_current_pointer (&recvreq->req_recv.req_base.req_convertor, &frag->local_address);
|
||||
OPAL_THREAD_UNLOCK(&recvreq->lock);
|
||||
|
||||
frag->rdma_bml = rdma_bml;
|
||||
|
||||
frag->rdma_hdr.hdr_rget = *hdr;
|
||||
frag->retries = 0;
|
||||
frag->rdma_req = recvreq;
|
||||
frag->rdma_state = MCA_PML_OB1_RDMA_GET;
|
||||
frag->local_handle = NULL;
|
||||
frag->rdma_offset = offset;
|
||||
|
||||
if (bytes_remaining > rdma_bml->btl->btl_get_limit) {
|
||||
frag->rdma_length = rdma_bml->btl->btl_get_limit;
|
||||
} else {
|
||||
frag->rdma_length = bytes_remaining;
|
||||
}
|
||||
frag->retries = 0;
|
||||
frag->rdma_req = recvreq;
|
||||
frag->rdma_ep = bml_endpoint;
|
||||
frag->rdma_state = MCA_PML_OB1_RDMA_GET;
|
||||
frag->reg = NULL;
|
||||
frag->rdma_length = bytes_remaining;
|
||||
|
||||
/* NTH: TODO -- handle error conditions gracefully */
|
||||
rc = mca_pml_ob1_recv_request_get_frag(frag);
|
||||
@ -942,11 +920,13 @@ int mca_pml_ob1_recv_request_schedule_once( mca_pml_ob1_recv_request_t* recvreq,
|
||||
|
||||
while(bytes_remaining > 0 &&
|
||||
recvreq->req_pipeline_depth < mca_pml_ob1.recv_pipeline_depth) {
|
||||
mca_pml_ob1_rdma_frag_t *frag = NULL;
|
||||
mca_btl_base_module_t *btl;
|
||||
size_t size, seg_size;
|
||||
mca_pml_ob1_rdma_hdr_t* hdr;
|
||||
mca_btl_base_descriptor_t* dst;
|
||||
mca_btl_base_descriptor_t* ctl;
|
||||
mca_mpool_base_registration_t * reg = NULL;
|
||||
mca_btl_base_module_t* btl;
|
||||
int rc, rdma_idx;
|
||||
void *data_ptr;
|
||||
size_t size;
|
||||
|
||||
if(prev_bytes_remaining == bytes_remaining) {
|
||||
if(++num_fail == num_tries) {
|
||||
@ -967,62 +947,85 @@ int mca_pml_ob1_recv_request_schedule_once( mca_pml_ob1_recv_request_t* recvreq,
|
||||
do {
|
||||
rdma_idx = recvreq->req_rdma_idx;
|
||||
bml_btl = recvreq->req_rdma[rdma_idx].bml_btl;
|
||||
reg = recvreq->req_rdma[rdma_idx].btl_reg;
|
||||
size = recvreq->req_rdma[rdma_idx].length;
|
||||
if(++recvreq->req_rdma_idx >= recvreq->req_rdma_cnt)
|
||||
recvreq->req_rdma_idx = 0;
|
||||
} while(!size);
|
||||
btl = bml_btl->btl;
|
||||
|
||||
/* NTH: This conditional used to check if there was a registration in
|
||||
* recvreq->req_rdma[rdma_idx].btl_reg. If once existed it was due to
|
||||
* the btl not needed registration (equivalent to btl->btl_register_mem
|
||||
* != NULL. This new check is equivalent. Note: I feel this protocol
|
||||
* needs work to better improve resource usage when running with a
|
||||
* leave pinned protocol. */
|
||||
if (btl->btl_register_mem && (btl->btl_rdma_pipeline_frag_size != 0) &&
|
||||
(size > btl->btl_rdma_pipeline_frag_size)) {
|
||||
/* makes sure that we don't exceed BTL max rdma size
|
||||
* if memory is not pinned already */
|
||||
if( (NULL == reg) && (btl->btl_rdma_pipeline_frag_size != 0) &&
|
||||
(size > btl->btl_rdma_pipeline_frag_size)) {
|
||||
size = btl->btl_rdma_pipeline_frag_size;
|
||||
}
|
||||
|
||||
MCA_PML_OB1_RDMA_FRAG_ALLOC(frag);
|
||||
if (OPAL_UNLIKELY(NULL == frag)) {
|
||||
/* take lock to protect converter against concurrent access
|
||||
* from unpack */
|
||||
OPAL_THREAD_LOCK(&recvreq->lock);
|
||||
opal_convertor_set_position( &recvreq->req_recv.req_base.req_convertor,
|
||||
&recvreq->req_rdma_offset );
|
||||
|
||||
/* prepare a descriptor for RDMA */
|
||||
mca_bml_base_prepare_dst(bml_btl, reg,
|
||||
&recvreq->req_recv.req_base.req_convertor,
|
||||
MCA_BTL_NO_ORDER, 0, &size, MCA_BTL_DES_FLAGS_BTL_OWNERSHIP |
|
||||
MCA_BTL_DES_FLAGS_PUT, &dst);
|
||||
OPAL_THREAD_UNLOCK(&recvreq->lock);
|
||||
|
||||
if(OPAL_UNLIKELY(dst == NULL)) {
|
||||
continue;
|
||||
}
|
||||
|
||||
/* take lock to protect convertor against concurrent access
|
||||
* from unpack */
|
||||
OPAL_THREAD_LOCK(&recvreq->lock);
|
||||
opal_convertor_set_position (&recvreq->req_recv.req_base.req_convertor,
|
||||
&recvreq->req_rdma_offset);
|
||||
opal_convertor_get_current_pointer (&recvreq->req_recv.req_base.req_convertor, &data_ptr);
|
||||
OPAL_THREAD_UNLOCK(&recvreq->lock);
|
||||
dst->des_cbfunc = mca_pml_ob1_put_completion;
|
||||
dst->des_cbdata = recvreq;
|
||||
|
||||
if (btl->btl_register_mem) {
|
||||
mca_bml_base_register_mem (bml_btl, data_ptr, size, MCA_BTL_REG_FLAG_REMOTE_WRITE,
|
||||
&frag->local_handle);
|
||||
if (OPAL_UNLIKELY(NULL == frag->local_handle)) {
|
||||
MCA_PML_OB1_RDMA_FRAG_RETURN(frag);
|
||||
continue;
|
||||
}
|
||||
seg_size = btl->btl_seg_size * dst->des_local_count;
|
||||
|
||||
/* prepare a descriptor for rdma control message */
|
||||
mca_bml_base_alloc(bml_btl, &ctl, MCA_BTL_NO_ORDER, sizeof(mca_pml_ob1_rdma_hdr_t) + seg_size,
|
||||
MCA_BTL_DES_FLAGS_PRIORITY | MCA_BTL_DES_FLAGS_BTL_OWNERSHIP | MCA_BTL_DES_SEND_ALWAYS_CALLBACK);
|
||||
|
||||
if( OPAL_UNLIKELY(NULL == ctl) ) {
|
||||
mca_bml_base_free(bml_btl,dst);
|
||||
continue;
|
||||
}
|
||||
ctl->des_cbfunc = mca_pml_ob1_recv_ctl_completion;
|
||||
|
||||
/* fill in rdma header */
|
||||
hdr = (mca_pml_ob1_rdma_hdr_t*)ctl->des_local->seg_addr.pval;
|
||||
hdr->hdr_common.hdr_type = MCA_PML_OB1_HDR_TYPE_PUT;
|
||||
hdr->hdr_common.hdr_flags =
|
||||
(!recvreq->req_ack_sent) ? MCA_PML_OB1_HDR_TYPE_ACK : 0;
|
||||
hdr->hdr_req = recvreq->remote_req_send;
|
||||
hdr->hdr_des.pval = dst;
|
||||
hdr->hdr_recv_req.pval = recvreq;
|
||||
hdr->hdr_rdma_offset = recvreq->req_rdma_offset;
|
||||
hdr->hdr_seg_cnt = dst->des_local_count;
|
||||
|
||||
/* fill in the minimum information needed to handle the fin message */
|
||||
frag->cbfunc = mca_pml_ob1_put_completion;
|
||||
frag->rdma_length = size;
|
||||
frag->rdma_req = recvreq;
|
||||
frag->rdma_bml = bml_btl;
|
||||
frag->local_address = data_ptr;
|
||||
frag->rdma_offset = recvreq->req_rdma_offset;
|
||||
/* copy segments */
|
||||
memmove (hdr + 1, dst->des_local, seg_size);
|
||||
|
||||
rc = mca_pml_ob1_recv_request_put_frag (frag);
|
||||
if (OPAL_LIKELY(OMPI_SUCCESS == rc)) {
|
||||
if(!recvreq->req_ack_sent)
|
||||
recvreq->req_ack_sent = true;
|
||||
ob1_hdr_hton(hdr, MCA_PML_OB1_HDR_TYPE_PUT, recvreq->req_recv.req_base.req_proc);
|
||||
|
||||
PERUSE_TRACE_COMM_OMPI_EVENT( PERUSE_COMM_REQ_XFER_CONTINUE,
|
||||
&(recvreq->req_recv.req_base), size,
|
||||
PERUSE_RECV);
|
||||
|
||||
/* send rdma request to peer */
|
||||
rc = mca_bml_base_send(bml_btl, ctl, MCA_PML_OB1_HDR_TYPE_PUT);
|
||||
if( OPAL_LIKELY( rc >= 0 ) ) {
|
||||
/* update request state */
|
||||
recvreq->req_rdma_offset += size;
|
||||
OPAL_THREAD_ADD_SIZE_T(&recvreq->req_pipeline_depth, 1);
|
||||
recvreq->req_rdma[rdma_idx].length -= size;
|
||||
bytes_remaining -= size;
|
||||
} else {
|
||||
MCA_PML_OB1_RDMA_FRAG_RETURN(frag);
|
||||
mca_bml_base_free(bml_btl,ctl);
|
||||
mca_bml_base_free(bml_btl,dst);
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -1,4 +1,3 @@
|
||||
/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */
|
||||
/*
|
||||
* Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
|
||||
* University Research and Technology
|
||||
@ -11,7 +10,7 @@
|
||||
* Copyright (c) 2004-2005 The Regents of the University of California.
|
||||
* All rights reserved.
|
||||
* Copyright (c) 2008 UT-Battelle, LLC. All rights reserved.
|
||||
* Copyright (c) 2011-2014 Los Alamos National Security, LLC. All rights
|
||||
* Copyright (c) 2011-2012 Los Alamos National Security, LLC. All rights
|
||||
* reserved.
|
||||
* Copyright (c) 2014 Research Organization for Information Science
|
||||
* and Technology (RIST). All rights reserved.
|
||||
@ -53,8 +52,6 @@ struct mca_pml_ob1_recv_request_t {
|
||||
bool req_ack_sent; /**< whether ack was sent to the sender */
|
||||
bool req_match_received; /**< Prevent request to be completed prematurely */
|
||||
opal_mutex_t lock;
|
||||
mca_bml_base_btl_t *rdma_bml;
|
||||
mca_btl_base_registration_handle_t *local_handle;
|
||||
mca_pml_ob1_com_btl_t req_rdma[1];
|
||||
};
|
||||
typedef struct mca_pml_ob1_recv_request_t mca_pml_ob1_recv_request_t;
|
||||
@ -134,12 +131,8 @@ do { \
|
||||
#define MCA_PML_OB1_RECV_REQUEST_RETURN(recvreq) \
|
||||
{ \
|
||||
MCA_PML_BASE_RECV_REQUEST_FINI(&(recvreq)->req_recv); \
|
||||
if ((recvreq)->local_handle) { \
|
||||
mca_bml_base_deregister_mem ((recvreq)->rdma_bml, (recvreq)->local_handle); \
|
||||
(recvreq)->local_handle = NULL; \
|
||||
} \
|
||||
OMPI_FREE_LIST_RETURN_MT( &mca_pml_base_recv_requests, \
|
||||
(ompi_free_list_item_t*)(recvreq)); \
|
||||
OMPI_FREE_LIST_RETURN_MT( &mca_pml_base_recv_requests, \
|
||||
(ompi_free_list_item_t*)(recvreq)); \
|
||||
}
|
||||
|
||||
/**
|
||||
@ -161,11 +154,9 @@ recv_request_pml_complete(mca_pml_ob1_recv_request_t *recvreq)
|
||||
}
|
||||
|
||||
for(i = 0; i < recvreq->req_rdma_cnt; i++) {
|
||||
struct mca_btl_base_registration_handle_t *handle = recvreq->req_rdma[i].btl_reg;
|
||||
mca_bml_base_btl_t *bml_btl = recvreq->req_rdma[i].bml_btl;
|
||||
|
||||
if (NULL != handle) {
|
||||
mca_bml_base_deregister_mem (bml_btl, handle);
|
||||
mca_mpool_base_registration_t* btl_reg = recvreq->req_rdma[i].btl_reg;
|
||||
if( NULL != btl_reg && btl_reg->mpool != NULL) {
|
||||
btl_reg->mpool->mpool_deregister( btl_reg->mpool, btl_reg );
|
||||
}
|
||||
}
|
||||
recvreq->req_rdma_cnt = 0;
|
||||
@ -187,10 +178,6 @@ recv_request_pml_complete(mca_pml_ob1_recv_request_t *recvreq)
|
||||
recvreq->req_recv.req_base.req_ompi.req_status.MPI_ERROR =
|
||||
MPI_ERR_TRUNCATE;
|
||||
}
|
||||
if (OPAL_UNLIKELY(recvreq->local_handle)) {
|
||||
mca_bml_base_deregister_mem (recvreq->rdma_bml, recvreq->local_handle);
|
||||
recvreq->local_handle = NULL;
|
||||
}
|
||||
MCA_PML_OB1_RECV_REQUEST_MPI_COMPLETE(recvreq);
|
||||
}
|
||||
OPAL_THREAD_UNLOCK(&ompi_request_lock);
|
||||
@ -400,7 +387,7 @@ static inline void mca_pml_ob1_recv_request_schedule(
|
||||
(void)mca_pml_ob1_recv_request_schedule_exclusive(req, start_bml_btl);
|
||||
}
|
||||
|
||||
#define MCA_PML_OB1_ADD_ACK_TO_PENDING(P, S, D, O, Sz) \
|
||||
#define MCA_PML_OB1_ADD_ACK_TO_PENDING(P, S, D, O) \
|
||||
do { \
|
||||
mca_pml_ob1_pckt_pending_t *_pckt; \
|
||||
\
|
||||
@ -409,7 +396,6 @@ static inline void mca_pml_ob1_recv_request_schedule(
|
||||
_pckt->hdr.hdr_ack.hdr_src_req.lval = (S); \
|
||||
_pckt->hdr.hdr_ack.hdr_dst_req.pval = (D); \
|
||||
_pckt->hdr.hdr_ack.hdr_send_offset = (O); \
|
||||
_pckt->hdr.hdr_ack.hdr_send_size = (Sz); \
|
||||
_pckt->proc = (P); \
|
||||
_pckt->bml_btl = NULL; \
|
||||
OPAL_THREAD_LOCK(&mca_pml_ob1.lock); \
|
||||
@ -420,11 +406,11 @@ static inline void mca_pml_ob1_recv_request_schedule(
|
||||
|
||||
int mca_pml_ob1_recv_request_ack_send_btl(ompi_proc_t* proc,
|
||||
mca_bml_base_btl_t* bml_btl, uint64_t hdr_src_req, void *hdr_dst_req,
|
||||
uint64_t hdr_rdma_offset, uint64_t size, bool nordma);
|
||||
uint64_t hdr_rdma_offset, bool nordma);
|
||||
|
||||
static inline int mca_pml_ob1_recv_request_ack_send(ompi_proc_t* proc,
|
||||
uint64_t hdr_src_req, void *hdr_dst_req, uint64_t hdr_send_offset,
|
||||
uint64_t size, bool nordma)
|
||||
bool nordma)
|
||||
{
|
||||
size_t i;
|
||||
mca_bml_base_btl_t* bml_btl;
|
||||
@ -434,12 +420,12 @@ static inline int mca_pml_ob1_recv_request_ack_send(ompi_proc_t* proc,
|
||||
for(i = 0; i < mca_bml_base_btl_array_get_size(&endpoint->btl_eager); i++) {
|
||||
bml_btl = mca_bml_base_btl_array_get_next(&endpoint->btl_eager);
|
||||
if(mca_pml_ob1_recv_request_ack_send_btl(proc, bml_btl, hdr_src_req,
|
||||
hdr_dst_req, hdr_send_offset, size, nordma) == OMPI_SUCCESS)
|
||||
hdr_dst_req, hdr_send_offset, nordma) == OMPI_SUCCESS)
|
||||
return OMPI_SUCCESS;
|
||||
}
|
||||
|
||||
MCA_PML_OB1_ADD_ACK_TO_PENDING(proc, hdr_src_req, hdr_dst_req,
|
||||
hdr_send_offset, size);
|
||||
hdr_send_offset);
|
||||
|
||||
return OMPI_ERR_OUT_OF_RESOURCE;
|
||||
}
|
||||
|
@ -137,7 +137,6 @@ static void mca_pml_ob1_send_request_construct(mca_pml_ob1_send_request_t* req)
|
||||
req->req_send.req_base.req_ompi.req_cancel = mca_pml_ob1_send_request_cancel;
|
||||
req->req_rdma_cnt = 0;
|
||||
req->req_throttle_sends = false;
|
||||
req->rdma_frag = NULL;
|
||||
OBJ_CONSTRUCT(&req->req_send_ranges, opal_list_t);
|
||||
OBJ_CONSTRUCT(&req->req_send_range_lock, opal_mutex_t);
|
||||
}
|
||||
@ -146,10 +145,6 @@ static void mca_pml_ob1_send_request_destruct(mca_pml_ob1_send_request_t* req)
|
||||
{
|
||||
OBJ_DESTRUCT(&req->req_send_ranges);
|
||||
OBJ_DESTRUCT(&req->req_send_range_lock);
|
||||
if (req->rdma_frag) {
|
||||
MCA_PML_OB1_RDMA_FRAG_RETURN(req->rdma_frag);
|
||||
req->rdma_frag = NULL;
|
||||
}
|
||||
}
|
||||
|
||||
OBJ_CLASS_INSTANCE( mca_pml_ob1_send_request_t,
|
||||
@ -241,9 +236,10 @@ mca_pml_ob1_rndv_completion( mca_btl_base_module_t* btl,
|
||||
* happens in one thread, the increase of the req_bytes_delivered does not
|
||||
* have to be atomic.
|
||||
*/
|
||||
req_bytes_delivered = mca_pml_ob1_compute_segment_length_base ((void *) des->des_segments,
|
||||
des->des_segment_count,
|
||||
sizeof(mca_pml_ob1_rendezvous_hdr_t));
|
||||
req_bytes_delivered = mca_pml_ob1_compute_segment_length (btl->btl_seg_size,
|
||||
(void *) des->des_local,
|
||||
des->des_local_count,
|
||||
sizeof(mca_pml_ob1_rendezvous_hdr_t));
|
||||
|
||||
mca_pml_ob1_rndv_completion_request( bml_btl, sendreq, req_bytes_delivered );
|
||||
}
|
||||
@ -254,18 +250,27 @@ mca_pml_ob1_rndv_completion( mca_btl_base_module_t* btl,
|
||||
*/
|
||||
|
||||
static void
|
||||
mca_pml_ob1_rget_completion (mca_pml_ob1_rdma_frag_t *frag, int64_t rdma_length)
|
||||
mca_pml_ob1_rget_completion( mca_btl_base_module_t* btl,
|
||||
struct mca_btl_base_endpoint_t* ep,
|
||||
struct mca_btl_base_descriptor_t* des,
|
||||
int status )
|
||||
{
|
||||
mca_pml_ob1_send_request_t *sendreq = (mca_pml_ob1_send_request_t *) frag->rdma_req;
|
||||
mca_bml_base_btl_t *bml_btl = frag->rdma_bml;
|
||||
mca_pml_ob1_send_request_t* sendreq = (mca_pml_ob1_send_request_t*)des->des_cbdata;
|
||||
mca_bml_base_btl_t* bml_btl = (mca_bml_base_btl_t*)des->des_context;
|
||||
size_t req_bytes_delivered;
|
||||
|
||||
/* count bytes of user data actually delivered and check for request completion */
|
||||
if (OPAL_LIKELY(0 < rdma_length)) {
|
||||
OPAL_THREAD_ADD_SIZE_T(&sendreq->req_bytes_delivered, (size_t) rdma_length);
|
||||
if (OPAL_LIKELY(OMPI_SUCCESS == status)) {
|
||||
req_bytes_delivered = mca_pml_ob1_compute_segment_length (btl->btl_seg_size,
|
||||
(void *) des->des_local,
|
||||
des->des_local_count, 0);
|
||||
OPAL_THREAD_ADD_SIZE_T(&sendreq->req_bytes_delivered, req_bytes_delivered);
|
||||
}
|
||||
sendreq->src_des = NULL;
|
||||
|
||||
send_request_pml_complete_check(sendreq);
|
||||
|
||||
/* free the descriptor */
|
||||
mca_bml_base_free(bml_btl, des);
|
||||
MCA_PML_OB1_PROGRESS_PENDING(bml_btl);
|
||||
}
|
||||
|
||||
@ -309,9 +314,10 @@ mca_pml_ob1_frag_completion( mca_btl_base_module_t* btl,
|
||||
}
|
||||
|
||||
/* count bytes of user data actually delivered */
|
||||
req_bytes_delivered = mca_pml_ob1_compute_segment_length_base ((void *) des->des_segments,
|
||||
des->des_segment_count,
|
||||
sizeof(mca_pml_ob1_frag_hdr_t));
|
||||
req_bytes_delivered = mca_pml_ob1_compute_segment_length (btl->btl_seg_size,
|
||||
(void *) des->des_local,
|
||||
des->des_local_count,
|
||||
sizeof(mca_pml_ob1_frag_hdr_t));
|
||||
|
||||
OPAL_THREAD_ADD_SIZE_T(&sendreq->req_pipeline_depth, -1);
|
||||
OPAL_THREAD_ADD_SIZE_T(&sendreq->req_bytes_delivered, req_bytes_delivered);
|
||||
@ -382,7 +388,7 @@ int mca_pml_ob1_send_request_start_buffered(
|
||||
if( OPAL_UNLIKELY(NULL == des) ) {
|
||||
return OMPI_ERR_OUT_OF_RESOURCE;
|
||||
}
|
||||
segment = des->des_segments;
|
||||
segment = des->des_local;
|
||||
|
||||
/* pack the data into the BTL supplied buffer */
|
||||
iov.iov_base = (IOVBASE_TYPE*)((unsigned char*)segment->seg_addr.pval +
|
||||
@ -401,14 +407,17 @@ int mca_pml_ob1_send_request_start_buffered(
|
||||
|
||||
/* build rendezvous header */
|
||||
hdr = (mca_pml_ob1_hdr_t*)segment->seg_addr.pval;
|
||||
mca_pml_ob1_rendezvous_hdr_prepare (&hdr->hdr_rndv, MCA_PML_OB1_HDR_TYPE_RNDV, 0,
|
||||
sendreq->req_send.req_base.req_comm->c_contextid,
|
||||
sendreq->req_send.req_base.req_comm->c_my_rank,
|
||||
sendreq->req_send.req_base.req_tag,
|
||||
(uint16_t)sendreq->req_send.req_base.req_sequence,
|
||||
sendreq->req_send.req_bytes_packed, sendreq);
|
||||
hdr->hdr_common.hdr_flags = 0;
|
||||
hdr->hdr_common.hdr_type = MCA_PML_OB1_HDR_TYPE_RNDV;
|
||||
hdr->hdr_match.hdr_ctx = sendreq->req_send.req_base.req_comm->c_contextid;
|
||||
hdr->hdr_match.hdr_src = sendreq->req_send.req_base.req_comm->c_my_rank;
|
||||
hdr->hdr_match.hdr_tag = sendreq->req_send.req_base.req_tag;
|
||||
hdr->hdr_match.hdr_seq = (uint16_t)sendreq->req_send.req_base.req_sequence;
|
||||
hdr->hdr_rndv.hdr_msg_length = sendreq->req_send.req_bytes_packed;
|
||||
hdr->hdr_rndv.hdr_src_req.pval = sendreq;
|
||||
|
||||
ob1_hdr_hton(hdr, MCA_PML_OB1_HDR_TYPE_RNDV, sendreq->req_send.req_base.req_proc);
|
||||
ob1_hdr_hton(hdr, MCA_PML_OB1_HDR_TYPE_RNDV,
|
||||
sendreq->req_send.req_base.req_proc);
|
||||
|
||||
/* update lengths */
|
||||
segment->seg_len = sizeof(mca_pml_ob1_rendezvous_hdr_t) + max_data;
|
||||
@ -481,13 +490,15 @@ int mca_pml_ob1_send_request_start_copy( mca_pml_ob1_send_request_t* sendreq,
|
||||
|
||||
if(NULL != bml_btl->btl->btl_sendi) {
|
||||
mca_pml_ob1_match_hdr_t match;
|
||||
mca_pml_ob1_match_hdr_prepare (&match, MCA_PML_OB1_HDR_TYPE_MATCH, 0,
|
||||
sendreq->req_send.req_base.req_comm->c_contextid,
|
||||
sendreq->req_send.req_base.req_comm->c_my_rank,
|
||||
sendreq->req_send.req_base.req_tag,
|
||||
(uint16_t)sendreq->req_send.req_base.req_sequence);
|
||||
match.hdr_common.hdr_flags = 0;
|
||||
match.hdr_common.hdr_type = MCA_PML_OB1_HDR_TYPE_MATCH;
|
||||
match.hdr_ctx = sendreq->req_send.req_base.req_comm->c_contextid;
|
||||
match.hdr_src = sendreq->req_send.req_base.req_comm->c_my_rank;
|
||||
match.hdr_tag = sendreq->req_send.req_base.req_tag;
|
||||
match.hdr_seq = (uint16_t)sendreq->req_send.req_base.req_sequence;
|
||||
|
||||
ob1_hdr_hton (&match, MCA_PML_OB1_HDR_TYPE_MATCH, sendreq->req_send.req_base.req_proc);
|
||||
ob1_hdr_hton(&match, MCA_PML_OB1_HDR_TYPE_MATCH,
|
||||
sendreq->req_send.req_base.req_proc);
|
||||
|
||||
/* try to send immediately */
|
||||
rc = mca_bml_base_sendi( bml_btl, &sendreq->req_send.req_base.req_convertor,
|
||||
@ -520,7 +531,7 @@ int mca_pml_ob1_send_request_start_copy( mca_pml_ob1_send_request_t* sendreq,
|
||||
return OMPI_ERR_OUT_OF_RESOURCE;
|
||||
}
|
||||
|
||||
segment = des->des_segments;
|
||||
segment = des->des_local;
|
||||
|
||||
if(size > 0) {
|
||||
/* pack the data into the supplied buffer */
|
||||
@ -554,13 +565,15 @@ int mca_pml_ob1_send_request_start_copy( mca_pml_ob1_send_request_t* sendreq,
|
||||
|
||||
/* build match header */
|
||||
hdr = (mca_pml_ob1_hdr_t*)segment->seg_addr.pval;
|
||||
mca_pml_ob1_match_hdr_prepare (&hdr->hdr_match, MCA_PML_OB1_HDR_TYPE_MATCH, 0,
|
||||
sendreq->req_send.req_base.req_comm->c_contextid,
|
||||
sendreq->req_send.req_base.req_comm->c_my_rank,
|
||||
sendreq->req_send.req_base.req_tag,
|
||||
(uint16_t)sendreq->req_send.req_base.req_sequence);
|
||||
hdr->hdr_common.hdr_flags = 0;
|
||||
hdr->hdr_common.hdr_type = MCA_PML_OB1_HDR_TYPE_MATCH;
|
||||
hdr->hdr_match.hdr_ctx = sendreq->req_send.req_base.req_comm->c_contextid;
|
||||
hdr->hdr_match.hdr_src = sendreq->req_send.req_base.req_comm->c_my_rank;
|
||||
hdr->hdr_match.hdr_tag = sendreq->req_send.req_base.req_tag;
|
||||
hdr->hdr_match.hdr_seq = (uint16_t)sendreq->req_send.req_base.req_sequence;
|
||||
|
||||
ob1_hdr_hton(hdr, MCA_PML_OB1_HDR_TYPE_MATCH, sendreq->req_send.req_base.req_proc);
|
||||
ob1_hdr_hton(hdr, MCA_PML_OB1_HDR_TYPE_MATCH,
|
||||
sendreq->req_send.req_base.req_proc);
|
||||
|
||||
/* update lengths */
|
||||
segment->seg_len = OMPI_PML_OB1_MATCH_HDR_LEN + max_data;
|
||||
@ -604,6 +617,7 @@ int mca_pml_ob1_send_request_start_prepare( mca_pml_ob1_send_request_t* sendreq,
|
||||
|
||||
/* prepare descriptor */
|
||||
mca_bml_base_prepare_src( bml_btl,
|
||||
NULL,
|
||||
&sendreq->req_send.req_base.req_convertor,
|
||||
MCA_BTL_NO_ORDER,
|
||||
OMPI_PML_OB1_MATCH_HDR_LEN,
|
||||
@ -613,17 +627,19 @@ int mca_pml_ob1_send_request_start_prepare( mca_pml_ob1_send_request_t* sendreq,
|
||||
if( OPAL_UNLIKELY(NULL == des) ) {
|
||||
return OMPI_ERR_OUT_OF_RESOURCE;
|
||||
}
|
||||
segment = des->des_segments;
|
||||
segment = des->des_local;
|
||||
|
||||
/* build match header */
|
||||
hdr = (mca_pml_ob1_hdr_t*)segment->seg_addr.pval;
|
||||
mca_pml_ob1_match_hdr_prepare (&hdr->hdr_match, MCA_PML_OB1_HDR_TYPE_MATCH, 0,
|
||||
sendreq->req_send.req_base.req_comm->c_contextid,
|
||||
sendreq->req_send.req_base.req_comm->c_my_rank,
|
||||
sendreq->req_send.req_base.req_tag,
|
||||
(uint16_t)sendreq->req_send.req_base.req_sequence);
|
||||
hdr->hdr_common.hdr_flags = 0;
|
||||
hdr->hdr_common.hdr_type = MCA_PML_OB1_HDR_TYPE_MATCH;
|
||||
hdr->hdr_match.hdr_ctx = sendreq->req_send.req_base.req_comm->c_contextid;
|
||||
hdr->hdr_match.hdr_src = sendreq->req_send.req_base.req_comm->c_my_rank;
|
||||
hdr->hdr_match.hdr_tag = sendreq->req_send.req_base.req_tag;
|
||||
hdr->hdr_match.hdr_seq = (uint16_t)sendreq->req_send.req_base.req_sequence;
|
||||
|
||||
ob1_hdr_hton(hdr, MCA_PML_OB1_HDR_TYPE_MATCH, sendreq->req_send.req_base.req_proc);
|
||||
ob1_hdr_hton(hdr, MCA_PML_OB1_HDR_TYPE_MATCH,
|
||||
sendreq->req_send.req_base.req_proc);
|
||||
|
||||
/* short message */
|
||||
des->des_cbfunc = mca_pml_ob1_match_completion_free;
|
||||
@ -657,67 +673,79 @@ int mca_pml_ob1_send_request_start_rdma( mca_pml_ob1_send_request_t* sendreq,
|
||||
* one RDMA capable BTLs). This way round robin distribution of RDMA
|
||||
* operation is achieved.
|
||||
*/
|
||||
mca_btl_base_registration_handle_t *local_handle;
|
||||
mca_btl_base_descriptor_t *des;
|
||||
mca_pml_ob1_rdma_frag_t *frag;
|
||||
|
||||
mca_btl_base_descriptor_t *des, *src = NULL;
|
||||
mca_pml_ob1_rget_hdr_t *hdr;
|
||||
size_t reg_size;
|
||||
void *data_ptr;
|
||||
size_t seg_size;
|
||||
int rc;
|
||||
|
||||
sendreq->src_des = NULL;
|
||||
|
||||
bml_btl = sendreq->req_rdma[0].bml_btl;
|
||||
if (!(bml_btl->btl_flags & (MCA_BTL_FLAGS_GET | MCA_BTL_FLAGS_CUDA_GET))) {
|
||||
sendreq->rdma_frag = NULL;
|
||||
/* This BTL does not support get. Use rendezvous to start the RDMA operation using put instead. */
|
||||
return mca_pml_ob1_send_request_start_rndv (sendreq, bml_btl, 0, MCA_PML_OB1_HDR_FLAGS_CONTIG |
|
||||
MCA_PML_OB1_HDR_FLAGS_PIN);
|
||||
}
|
||||
|
||||
/* at this time ob1 does not support non-contiguous gets. the convertor represents a
|
||||
* contiguous block of memory */
|
||||
opal_convertor_get_current_pointer (&sendreq->req_send.req_base.req_convertor, &data_ptr);
|
||||
|
||||
local_handle = sendreq->req_rdma[0].btl_reg;
|
||||
|
||||
/* allocate an rdma fragment to keep track of the request size for use in the fin message */
|
||||
MCA_PML_OB1_RDMA_FRAG_ALLOC(frag);
|
||||
if (OPAL_UNLIKELY(NULL == frag)) {
|
||||
return OPAL_ERR_OUT_OF_RESOURCE;
|
||||
MEMCHECKER(
|
||||
memchecker_call(&opal_memchecker_base_mem_defined,
|
||||
sendreq->req_send.req_base.req_addr,
|
||||
sendreq->req_send.req_base.req_count,
|
||||
sendreq->req_send.req_base.req_datatype);
|
||||
);
|
||||
/* prepare source descriptor/segment(s) */
|
||||
/* PML owns this descriptor and will free it in */
|
||||
/* mca_pml_ob1_rget_completion */
|
||||
mca_bml_base_prepare_src( bml_btl, sendreq->req_rdma[0].btl_reg,
|
||||
&sendreq->req_send.req_base.req_convertor,
|
||||
MCA_BTL_NO_ORDER, 0, &size, MCA_BTL_DES_FLAGS_GET |
|
||||
MCA_BTL_DES_FLAGS_BTL_OWNERSHIP, &src );
|
||||
MEMCHECKER(
|
||||
memchecker_call(&opal_memchecker_base_mem_noaccess,
|
||||
sendreq->req_send.req_base.req_addr,
|
||||
sendreq->req_send.req_base.req_count,
|
||||
sendreq->req_send.req_base.req_datatype);
|
||||
);
|
||||
if( OPAL_UNLIKELY(NULL == src) ) {
|
||||
return OMPI_ERR_OUT_OF_RESOURCE;
|
||||
}
|
||||
|
||||
src->des_cbfunc = mca_pml_ob1_rget_completion;
|
||||
src->des_cbdata = sendreq;
|
||||
|
||||
/* fill in necessary fragment data */
|
||||
frag->rdma_req = sendreq;
|
||||
frag->rdma_bml = bml_btl;
|
||||
frag->rdma_length = size;
|
||||
frag->cbfunc = mca_pml_ob1_rget_completion;
|
||||
/* do not store the local handle in the fragment. it will be released by mca_pml_ob1_free_rdma_resources */
|
||||
sendreq->src_des = src;
|
||||
|
||||
/* save the fragment for get->put fallback */
|
||||
sendreq->rdma_frag = frag;
|
||||
|
||||
reg_size = bml_btl->btl->btl_registration_handle_size;
|
||||
seg_size = bml_btl->btl->btl_seg_size * src->des_local_count;
|
||||
|
||||
/* allocate space for get hdr + segment list */
|
||||
mca_bml_base_alloc(bml_btl, &des, MCA_BTL_NO_ORDER, sizeof (*hdr) + reg_size,
|
||||
mca_bml_base_alloc(bml_btl, &des, MCA_BTL_NO_ORDER, sizeof (*hdr) + seg_size,
|
||||
MCA_BTL_DES_FLAGS_PRIORITY | MCA_BTL_DES_FLAGS_BTL_OWNERSHIP);
|
||||
if( OPAL_UNLIKELY(NULL == des) ) {
|
||||
/* NTH: no need to reset the converter here. it will be reset before it is retried */
|
||||
mca_bml_base_free(bml_btl, src);
|
||||
return OMPI_ERR_OUT_OF_RESOURCE;
|
||||
}
|
||||
|
||||
/* build match header */
|
||||
hdr = (mca_pml_ob1_rget_hdr_t *) des->des_segments->seg_addr.pval;
|
||||
/* TODO -- Add support for multiple segments for get */
|
||||
mca_pml_ob1_rget_hdr_prepare (hdr, MCA_PML_OB1_HDR_FLAGS_CONTIG | MCA_PML_OB1_HDR_FLAGS_PIN,
|
||||
sendreq->req_send.req_base.req_comm->c_contextid,
|
||||
sendreq->req_send.req_base.req_comm->c_my_rank,
|
||||
sendreq->req_send.req_base.req_tag,
|
||||
(uint16_t)sendreq->req_send.req_base.req_sequence,
|
||||
sendreq->req_send.req_bytes_packed, sendreq,
|
||||
frag, data_ptr, local_handle, reg_size);
|
||||
hdr = (mca_pml_ob1_rget_hdr_t *) des->des_local->seg_addr.pval;
|
||||
|
||||
hdr->hdr_rndv.hdr_match.hdr_common.hdr_flags = MCA_PML_OB1_HDR_FLAGS_CONTIG|MCA_PML_OB1_HDR_FLAGS_PIN;
|
||||
hdr->hdr_rndv.hdr_match.hdr_common.hdr_type = MCA_PML_OB1_HDR_TYPE_RGET;
|
||||
hdr->hdr_rndv.hdr_match.hdr_ctx = sendreq->req_send.req_base.req_comm->c_contextid;
|
||||
hdr->hdr_rndv.hdr_match.hdr_src = sendreq->req_send.req_base.req_comm->c_my_rank;
|
||||
hdr->hdr_rndv.hdr_match.hdr_tag = sendreq->req_send.req_base.req_tag;
|
||||
hdr->hdr_rndv.hdr_match.hdr_seq = (uint16_t)sendreq->req_send.req_base.req_sequence;
|
||||
hdr->hdr_rndv.hdr_msg_length = sendreq->req_send.req_bytes_packed;
|
||||
hdr->hdr_rndv.hdr_src_req.pval = sendreq;
|
||||
hdr->hdr_des.pval = src;
|
||||
hdr->hdr_seg_cnt = src->des_local_count;
|
||||
|
||||
ob1_hdr_hton(hdr, MCA_PML_OB1_HDR_TYPE_RGET, sendreq->req_send.req_base.req_proc);
|
||||
|
||||
/* copy segment data */
|
||||
memcpy (hdr + 1, src->des_local, seg_size);
|
||||
|
||||
des->des_cbfunc = mca_pml_ob1_send_ctl_completion;
|
||||
des->des_cbdata = sendreq;
|
||||
|
||||
@ -735,6 +763,12 @@ int mca_pml_ob1_send_request_start_rdma( mca_pml_ob1_send_request_t* sendreq,
|
||||
rc = mca_bml_base_send(bml_btl, des, MCA_PML_OB1_HDR_TYPE_RGET);
|
||||
if (OPAL_UNLIKELY(rc < 0)) {
|
||||
mca_bml_base_free(bml_btl, des);
|
||||
|
||||
if (sendreq->src_des) {
|
||||
mca_bml_base_free (bml_btl, sendreq->src_des);
|
||||
sendreq->src_des = NULL;
|
||||
}
|
||||
|
||||
return rc;
|
||||
}
|
||||
|
||||
@ -772,6 +806,7 @@ int mca_pml_ob1_send_request_start_rndv( mca_pml_ob1_send_request_t* sendreq,
|
||||
sendreq->req_send.req_base.req_datatype);
|
||||
);
|
||||
mca_bml_base_prepare_src( bml_btl,
|
||||
NULL,
|
||||
&sendreq->req_send.req_base.req_convertor,
|
||||
MCA_BTL_NO_ORDER,
|
||||
sizeof(mca_pml_ob1_rendezvous_hdr_t),
|
||||
@ -789,18 +824,21 @@ int mca_pml_ob1_send_request_start_rndv( mca_pml_ob1_send_request_t* sendreq,
|
||||
if( OPAL_UNLIKELY(NULL == des) ) {
|
||||
return OMPI_ERR_OUT_OF_RESOURCE;
|
||||
}
|
||||
segment = des->des_segments;
|
||||
segment = des->des_local;
|
||||
|
||||
/* build hdr */
|
||||
hdr = (mca_pml_ob1_hdr_t*)segment->seg_addr.pval;
|
||||
mca_pml_ob1_rendezvous_hdr_prepare (&hdr->hdr_rndv, MCA_PML_OB1_HDR_TYPE_RNDV, flags,
|
||||
sendreq->req_send.req_base.req_comm->c_contextid,
|
||||
sendreq->req_send.req_base.req_comm->c_my_rank,
|
||||
sendreq->req_send.req_base.req_tag,
|
||||
(uint16_t)sendreq->req_send.req_base.req_sequence,
|
||||
sendreq->req_send.req_bytes_packed, sendreq);
|
||||
hdr->hdr_common.hdr_flags = flags;
|
||||
hdr->hdr_common.hdr_type = MCA_PML_OB1_HDR_TYPE_RNDV;
|
||||
hdr->hdr_match.hdr_ctx = sendreq->req_send.req_base.req_comm->c_contextid;
|
||||
hdr->hdr_match.hdr_src = sendreq->req_send.req_base.req_comm->c_my_rank;
|
||||
hdr->hdr_match.hdr_tag = sendreq->req_send.req_base.req_tag;
|
||||
hdr->hdr_match.hdr_seq = (uint16_t)sendreq->req_send.req_base.req_sequence;
|
||||
hdr->hdr_rndv.hdr_msg_length = sendreq->req_send.req_bytes_packed;
|
||||
hdr->hdr_rndv.hdr_src_req.pval = sendreq;
|
||||
|
||||
ob1_hdr_hton(hdr, MCA_PML_OB1_HDR_TYPE_RNDV, sendreq->req_send.req_base.req_proc);
|
||||
ob1_hdr_hton(hdr, MCA_PML_OB1_HDR_TYPE_RNDV,
|
||||
sendreq->req_send.req_base.req_proc);
|
||||
|
||||
/* first fragment of a long message */
|
||||
des->des_cbdata = sendreq;
|
||||
@ -981,8 +1019,10 @@ cannot_pack:
|
||||
sendreq->req_send.req_base.req_count,
|
||||
sendreq->req_send.req_base.req_datatype);
|
||||
);
|
||||
mca_bml_base_prepare_src(bml_btl, &sendreq->req_send.req_base.req_convertor,
|
||||
MCA_BTL_NO_ORDER, sizeof(mca_pml_ob1_frag_hdr_t),
|
||||
mca_bml_base_prepare_src(bml_btl, NULL,
|
||||
&sendreq->req_send.req_base.req_convertor,
|
||||
MCA_BTL_NO_ORDER,
|
||||
sizeof(mca_pml_ob1_frag_hdr_t),
|
||||
&size, MCA_BTL_DES_FLAGS_BTL_OWNERSHIP | MCA_BTL_DES_SEND_ALWAYS_CALLBACK, &des);
|
||||
MEMCHECKER(
|
||||
memchecker_call(&opal_memchecker_base_mem_noaccess,
|
||||
@ -1006,9 +1046,12 @@ cannot_pack:
|
||||
des->des_cbdata = sendreq;
|
||||
|
||||
/* setup header */
|
||||
hdr = (mca_pml_ob1_frag_hdr_t*)des->des_segments->seg_addr.pval;
|
||||
mca_pml_ob1_frag_hdr_prepare (hdr, 0, range->range_send_offset, sendreq,
|
||||
sendreq->req_recv.lval);
|
||||
hdr = (mca_pml_ob1_frag_hdr_t*)des->des_local->seg_addr.pval;
|
||||
hdr->hdr_common.hdr_flags = 0;
|
||||
hdr->hdr_common.hdr_type = MCA_PML_OB1_HDR_TYPE_FRAG;
|
||||
hdr->hdr_frag_offset = range->range_send_offset;
|
||||
hdr->hdr_src_req.pval = sendreq;
|
||||
hdr->hdr_dst_req = sendreq->req_recv;
|
||||
|
||||
ob1_hdr_hton(hdr, MCA_PML_OB1_HDR_TYPE_FRAG,
|
||||
sendreq->req_send.req_base.req_proc);
|
||||
@ -1065,66 +1108,38 @@ cannot_pack:
|
||||
}
|
||||
|
||||
|
||||
/**
|
||||
* A put fragment could not be started. Queue the fragment to be retried later or
|
||||
* fall back on send/recv.
|
||||
*/
|
||||
static void mca_pml_ob1_send_request_put_frag_failed (mca_pml_ob1_rdma_frag_t *frag, int rc)
|
||||
{
|
||||
mca_pml_ob1_send_request_t* sendreq = (mca_pml_ob1_send_request_t *) frag->rdma_req;
|
||||
mca_bml_base_btl_t *bml_btl = frag->rdma_bml;
|
||||
|
||||
if (++frag->retries < mca_pml_ob1.rdma_retries_limit && OMPI_ERR_OUT_OF_RESOURCE == rc) {
|
||||
/* queue the frag for later if there was a resource error */
|
||||
OPAL_THREAD_LOCK(&mca_pml_ob1.lock);
|
||||
opal_list_append(&mca_pml_ob1.rdma_pending, (opal_list_item_t*)frag);
|
||||
OPAL_THREAD_UNLOCK(&mca_pml_ob1.lock);
|
||||
} else {
|
||||
/* tell receiver to deregister memory */
|
||||
mca_pml_ob1_send_fin (sendreq->req_send.req_base.req_proc, bml_btl,
|
||||
frag->rdma_hdr.hdr_rdma.hdr_frag, 0, MCA_BTL_NO_ORDER,
|
||||
OPAL_ERR_TEMP_OUT_OF_RESOURCE);
|
||||
|
||||
/* send fragment by copy in/out */
|
||||
mca_pml_ob1_send_request_copy_in_out(sendreq, frag->rdma_hdr.hdr_rdma.hdr_rdma_offset,
|
||||
frag->rdma_length);
|
||||
/* if a pointer to a receive request is not set it means that
|
||||
* ACK was not yet received. Don't schedule sends before ACK */
|
||||
if (NULL != sendreq->req_recv.pval)
|
||||
mca_pml_ob1_send_request_schedule (sendreq);
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* An RDMA put operation has completed:
|
||||
* (1) Update request status and if required set completed
|
||||
* (2) Send FIN control message to the destination
|
||||
* (2) Send FIN control message to the destination
|
||||
*/
|
||||
|
||||
static void mca_pml_ob1_put_completion (mca_btl_base_module_t* btl, struct mca_btl_base_endpoint_t* ep,
|
||||
void *local_address, mca_btl_base_registration_handle_t *local_handle,
|
||||
void *context, void *cbdata, int status)
|
||||
static void mca_pml_ob1_put_completion( mca_btl_base_module_t* btl,
|
||||
struct mca_btl_base_endpoint_t* ep,
|
||||
struct mca_btl_base_descriptor_t* des,
|
||||
int status )
|
||||
{
|
||||
mca_pml_ob1_rdma_frag_t *frag = (mca_pml_ob1_rdma_frag_t *) cbdata;
|
||||
mca_pml_ob1_send_request_t *sendreq = (mca_pml_ob1_send_request_t *) frag->rdma_req;
|
||||
mca_bml_base_btl_t *bml_btl = (mca_bml_base_btl_t *) context;
|
||||
mca_pml_ob1_rdma_frag_t* frag = (mca_pml_ob1_rdma_frag_t*)des->des_cbdata;
|
||||
mca_pml_ob1_send_request_t* sendreq = (mca_pml_ob1_send_request_t*)frag->rdma_req;
|
||||
mca_bml_base_btl_t* bml_btl = (mca_bml_base_btl_t*) des->des_context;
|
||||
|
||||
/* check completion status */
|
||||
if( OPAL_UNLIKELY(OMPI_SUCCESS == status) ) {
|
||||
/* TODO -- readd ordering */
|
||||
mca_pml_ob1_send_fin (sendreq->req_send.req_base.req_proc, bml_btl,
|
||||
frag->rdma_hdr.hdr_rdma.hdr_frag, frag->rdma_length,
|
||||
0, 0);
|
||||
|
||||
/* check for request completion */
|
||||
OPAL_THREAD_ADD_SIZE_T(&sendreq->req_bytes_delivered, frag->rdma_length);
|
||||
|
||||
send_request_pml_complete_check(sendreq);
|
||||
} else {
|
||||
/* try to fall back on send/recv */
|
||||
mca_pml_ob1_send_request_put_frag_failed (frag, status);
|
||||
if( OPAL_UNLIKELY(OMPI_SUCCESS != status) ) {
|
||||
/* TSW - FIX */
|
||||
OMPI_ERROR_LOG(status);
|
||||
ompi_rte_abort(-1, NULL);
|
||||
}
|
||||
|
||||
mca_pml_ob1_send_fin(sendreq->req_send.req_base.req_proc,
|
||||
bml_btl,
|
||||
frag->rdma_hdr.hdr_rdma.hdr_des,
|
||||
des->order, 0);
|
||||
|
||||
/* check for request completion */
|
||||
OPAL_THREAD_ADD_SIZE_T(&sendreq->req_bytes_delivered, frag->rdma_length);
|
||||
|
||||
send_request_pml_complete_check(sendreq);
|
||||
|
||||
MCA_PML_OB1_RDMA_FRAG_RETURN(frag);
|
||||
|
||||
MCA_PML_OB1_PROGRESS_PENDING(bml_btl);
|
||||
@ -1132,45 +1147,81 @@ static void mca_pml_ob1_put_completion (mca_btl_base_module_t* btl, struct mca_b
|
||||
|
||||
int mca_pml_ob1_send_request_put_frag( mca_pml_ob1_rdma_frag_t *frag )
|
||||
{
|
||||
mca_pml_ob1_send_request_t *sendreq = (mca_pml_ob1_send_request_t *) frag->rdma_req;
|
||||
mca_btl_base_registration_handle_t *local_handle = NULL;
|
||||
mca_pml_ob1_send_request_t* sendreq = (mca_pml_ob1_send_request_t*)frag->rdma_req;
|
||||
mca_mpool_base_registration_t *reg = NULL;
|
||||
mca_bml_base_btl_t *bml_btl = frag->rdma_bml;
|
||||
mca_btl_base_descriptor_t *des;
|
||||
size_t save_size = frag->rdma_length;
|
||||
int rc;
|
||||
|
||||
if (bml_btl->btl->btl_register_mem && NULL == frag->local_handle) {
|
||||
/* Check if the segment is already registered */
|
||||
for (size_t i = 0 ; i < sendreq->req_rdma_cnt ; ++i) {
|
||||
if (sendreq->req_rdma[i].bml_btl == frag->rdma_bml) {
|
||||
/* do not copy the handle to the fragment to avoid deregistring it twice */
|
||||
local_handle = sendreq->req_rdma[i].btl_reg;
|
||||
break;
|
||||
}
|
||||
}
|
||||
if (OPAL_LIKELY(NULL == sendreq->src_des)) {
|
||||
/* setup descriptor */
|
||||
mca_bml_base_prepare_src( bml_btl,
|
||||
reg,
|
||||
&frag->convertor,
|
||||
MCA_BTL_NO_ORDER,
|
||||
0,
|
||||
&frag->rdma_length,
|
||||
MCA_BTL_DES_FLAGS_BTL_OWNERSHIP |
|
||||
MCA_BTL_DES_FLAGS_PUT,
|
||||
&des );
|
||||
|
||||
if( OPAL_UNLIKELY(NULL == des) ) {
|
||||
if(frag->retries < mca_pml_ob1.rdma_retries_limit) {
|
||||
size_t offset = (size_t)frag->rdma_hdr.hdr_rdma.hdr_rdma_offset;
|
||||
frag->rdma_length = save_size;
|
||||
opal_convertor_set_position(&frag->convertor, &offset);
|
||||
OPAL_THREAD_LOCK(&mca_pml_ob1.lock);
|
||||
opal_list_append(&mca_pml_ob1.rdma_pending, (opal_list_item_t*)frag);
|
||||
OPAL_THREAD_UNLOCK(&mca_pml_ob1.lock);
|
||||
} else {
|
||||
mca_pml_ob1_send_request_t *sendreq =
|
||||
(mca_pml_ob1_send_request_t*)frag->rdma_req;
|
||||
|
||||
if (NULL == frag->local_handle) {
|
||||
/* Not already registered. Register the region with the BTL. */
|
||||
mca_bml_base_register_mem (bml_btl, frag->local_address, frag->rdma_length, 0,
|
||||
&frag->local_handle);
|
||||
/* tell receiver to unregister memory */
|
||||
mca_pml_ob1_send_fin(sendreq->req_send.req_base.req_proc,
|
||||
bml_btl, frag->rdma_hdr.hdr_rdma.hdr_des,
|
||||
MCA_BTL_NO_ORDER, 1);
|
||||
|
||||
if (OPAL_UNLIKELY(NULL == frag->local_handle)) {
|
||||
mca_pml_ob1_send_request_put_frag_failed (frag, OMPI_ERR_OUT_OF_RESOURCE);
|
||||
|
||||
return OMPI_ERR_OUT_OF_RESOURCE;
|
||||
/* send fragment by copy in/out */
|
||||
mca_pml_ob1_send_request_copy_in_out(sendreq,
|
||||
frag->rdma_hdr.hdr_rdma.hdr_rdma_offset, frag->rdma_length);
|
||||
/* if a pointer to a receive request is not set it means that
|
||||
* ACK was not yet received. Don't schedule sends before ACK */
|
||||
if(NULL != sendreq->req_recv.pval)
|
||||
mca_pml_ob1_send_request_schedule(sendreq);
|
||||
}
|
||||
|
||||
local_handle = frag->local_handle;
|
||||
return OMPI_ERR_OUT_OF_RESOURCE;
|
||||
}
|
||||
} else {
|
||||
/* already have a source descriptor */
|
||||
des = sendreq->src_des;
|
||||
sendreq->src_des = NULL;
|
||||
}
|
||||
|
||||
des->des_remote = (mca_btl_base_segment_t *) frag->rdma_segs;
|
||||
des->des_remote_count = frag->rdma_hdr.hdr_rdma.hdr_seg_cnt;
|
||||
des->des_cbfunc = mca_pml_ob1_put_completion;
|
||||
des->des_cbdata = frag;
|
||||
|
||||
PERUSE_TRACE_COMM_OMPI_EVENT( PERUSE_COMM_REQ_XFER_CONTINUE,
|
||||
&(((mca_pml_ob1_send_request_t*)frag->rdma_req)->req_send.req_base), save_size, PERUSE_SEND );
|
||||
|
||||
rc = mca_bml_base_put (bml_btl, frag->local_address, frag->remote_address, local_handle,
|
||||
(mca_btl_base_registration_handle_t *) frag->remote_handle, frag->rdma_length,
|
||||
0, MCA_BTL_NO_ORDER, mca_pml_ob1_put_completion, frag);
|
||||
rc = mca_bml_base_put(bml_btl, des);
|
||||
if (OPAL_UNLIKELY(OMPI_SUCCESS != rc)) {
|
||||
mca_pml_ob1_send_request_put_frag_failed (frag, rc);
|
||||
return rc;
|
||||
mca_bml_base_free(bml_btl, des);
|
||||
frag->rdma_length = save_size;
|
||||
if(OMPI_ERR_OUT_OF_RESOURCE == rc) {
|
||||
OPAL_THREAD_LOCK(&mca_pml_ob1.lock);
|
||||
opal_list_append(&mca_pml_ob1.rdma_pending, (opal_list_item_t*)frag);
|
||||
OPAL_THREAD_UNLOCK(&mca_pml_ob1.lock);
|
||||
return OMPI_ERR_OUT_OF_RESOURCE;
|
||||
} else {
|
||||
/* TSW - FIX */
|
||||
OMPI_ERROR_LOG(rc);
|
||||
ompi_rte_abort(-1, NULL);
|
||||
}
|
||||
}
|
||||
|
||||
return OMPI_SUCCESS;
|
||||
@ -1184,11 +1235,12 @@ int mca_pml_ob1_send_request_put_frag( mca_pml_ob1_rdma_frag_t *frag )
|
||||
*/
|
||||
|
||||
void mca_pml_ob1_send_request_put( mca_pml_ob1_send_request_t* sendreq,
|
||||
mca_btl_base_module_t* btl,
|
||||
mca_btl_base_module_t* btl,
|
||||
mca_pml_ob1_rdma_hdr_t* hdr )
|
||||
{
|
||||
mca_bml_base_endpoint_t *bml_endpoint = sendreq->req_endpoint;
|
||||
mca_pml_ob1_rdma_frag_t* frag;
|
||||
size_t i, size = 0;
|
||||
|
||||
if(hdr->hdr_common.hdr_flags & MCA_PML_OB1_HDR_TYPE_ACK) {
|
||||
OPAL_THREAD_ADD32(&sendreq->req_state, -1);
|
||||
@ -1196,36 +1248,61 @@ void mca_pml_ob1_send_request_put( mca_pml_ob1_send_request_t* sendreq,
|
||||
|
||||
sendreq->req_recv.pval = hdr->hdr_recv_req.pval;
|
||||
|
||||
if (NULL == sendreq->rdma_frag) {
|
||||
MCA_PML_OB1_RDMA_FRAG_ALLOC(frag);
|
||||
MCA_PML_OB1_RDMA_FRAG_ALLOC(frag);
|
||||
|
||||
if( OPAL_UNLIKELY(NULL == frag) ) {
|
||||
/* TSW - FIX */
|
||||
OMPI_ERROR_LOG(OMPI_ERR_OUT_OF_RESOURCE);
|
||||
ompi_rte_abort(-1, NULL);
|
||||
}
|
||||
} else {
|
||||
/* rget fallback on put */
|
||||
frag = sendreq->rdma_frag;
|
||||
sendreq->rdma_frag = NULL;
|
||||
sendreq->req_state = 0;
|
||||
if( OPAL_UNLIKELY(NULL == frag) ) {
|
||||
/* TSW - FIX */
|
||||
OMPI_ERROR_LOG(OMPI_ERR_OUT_OF_RESOURCE);
|
||||
ompi_rte_abort(-1, NULL);
|
||||
}
|
||||
|
||||
/* copy registration data */
|
||||
memcpy (frag->remote_handle, hdr + 1, btl->btl_registration_handle_size);
|
||||
assert (btl->btl_seg_size * hdr->hdr_seg_cnt <= sizeof (frag->rdma_segs));
|
||||
|
||||
/* setup fragment */
|
||||
memcpy (frag->rdma_segs, hdr + 1, btl->btl_seg_size * hdr->hdr_seg_cnt);
|
||||
|
||||
for( i = 0; i < hdr->hdr_seg_cnt; i++ ) {
|
||||
mca_btl_base_segment_t *seg = (mca_btl_base_segment_t *) ((uintptr_t)(frag->rdma_segs) + i * btl->btl_seg_size);
|
||||
|
||||
#if OPAL_ENABLE_HETEROGENEOUS_SUPPORT
|
||||
if ((sendreq->req_send.req_base.req_proc->super.proc_arch & OPAL_ARCH_ISBIGENDIAN) !=
|
||||
(ompi_proc_local()->super.proc_arch & OPAL_ARCH_ISBIGENDIAN)) {
|
||||
size += opal_swap_bytes4(seg->seg_len);
|
||||
} else
|
||||
#endif
|
||||
{
|
||||
size += seg->seg_len;
|
||||
}
|
||||
}
|
||||
|
||||
frag->rdma_bml = mca_bml_base_btl_array_find(&bml_endpoint->btl_rdma, btl);
|
||||
frag->rdma_hdr.hdr_rdma = *hdr;
|
||||
frag->rdma_req = sendreq;
|
||||
frag->rdma_length = hdr->hdr_dst_size;
|
||||
frag->rdma_ep = bml_endpoint;
|
||||
frag->rdma_length = size;
|
||||
frag->rdma_state = MCA_PML_OB1_RDMA_PUT;
|
||||
frag->remote_address = hdr->hdr_dst_ptr;
|
||||
frag->reg = NULL;
|
||||
frag->retries = 0;
|
||||
|
||||
/* Get the address of the current offset. Note: at this time ob1 CAN NOT handle
|
||||
* non-contiguous RDMA. If that changes this code will be wrong. */
|
||||
opal_convertor_get_offset_pointer (&sendreq->req_send.req_base.req_convertor,
|
||||
hdr->hdr_rdma_offset, &frag->local_address);
|
||||
if (OPAL_UNLIKELY(NULL != sendreq->src_des)) {
|
||||
/* get fallback path */
|
||||
sendreq->req_state = 0;
|
||||
}
|
||||
|
||||
/* lookup the corresponding registration */
|
||||
for(i=0; i<sendreq->req_rdma_cnt; i++) {
|
||||
if(sendreq->req_rdma[i].bml_btl == frag->rdma_bml) {
|
||||
frag->reg = sendreq->req_rdma[i].btl_reg;
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
/* RDMA writes may proceed in parallel to send and to each other, so
|
||||
* create clone of the convertor for each RDMA fragment
|
||||
*/
|
||||
size = hdr->hdr_rdma_offset;
|
||||
opal_convertor_clone_with_position(&sendreq->req_send.req_base.req_convertor,
|
||||
&frag->convertor, 0, &size);
|
||||
|
||||
mca_pml_ob1_send_request_put_frag(frag);
|
||||
}
|
||||
|
@ -12,7 +12,7 @@
|
||||
* All rights reserved.
|
||||
* Copyright (c) 2009 Sun Microsystems, Inc. All rights reserved.
|
||||
* Copyright (c) 2011-2012 NVIDIA Corporation. All rights reserved.
|
||||
* Copyright (c) 2011-2014 Los Alamos National Security, LLC. All rights
|
||||
* Copyright (c) 2011-2012 Los Alamos National Security, LLC. All rights
|
||||
* reserved.
|
||||
* $COPYRIGHT$
|
||||
*
|
||||
@ -54,7 +54,7 @@ struct mca_pml_ob1_send_request_t {
|
||||
mca_pml_ob1_send_pending_t req_pending;
|
||||
opal_mutex_t req_send_range_lock;
|
||||
opal_list_t req_send_ranges;
|
||||
mca_pml_ob1_rdma_frag_t *rdma_frag;
|
||||
mca_btl_base_descriptor_t *src_des;
|
||||
mca_pml_ob1_com_btl_t req_rdma[1];
|
||||
};
|
||||
typedef struct mca_pml_ob1_send_request_t mca_pml_ob1_send_request_t;
|
||||
@ -124,9 +124,10 @@ get_request_from_send_pending(mca_pml_ob1_send_pending_t *type)
|
||||
ompi_free_list_item_t* item; \
|
||||
\
|
||||
if( OPAL_LIKELY(NULL != proc) ) { \
|
||||
OMPI_FREE_LIST_WAIT_MT(&mca_pml_base_send_requests, item); \
|
||||
OMPI_FREE_LIST_WAIT_MT(&mca_pml_base_send_requests, item); \
|
||||
sendreq = (mca_pml_ob1_send_request_t*)item; \
|
||||
sendreq->req_send.req_base.req_proc = proc; \
|
||||
sendreq->src_des = NULL; \
|
||||
} \
|
||||
}
|
||||
|
||||
@ -162,18 +163,15 @@ get_request_from_send_pending(mca_pml_ob1_send_pending_t *type)
|
||||
assert( 0 == _position ); \
|
||||
}
|
||||
|
||||
static inline void mca_pml_ob1_free_rdma_resources (mca_pml_ob1_send_request_t* sendreq)
|
||||
static inline void mca_pml_ob1_free_rdma_resources(mca_pml_ob1_send_request_t* sendreq)
|
||||
{
|
||||
size_t r;
|
||||
|
||||
/* return mpool resources */
|
||||
for(r = 0; r < sendreq->req_rdma_cnt; r++) {
|
||||
struct mca_btl_base_registration_handle_t *handle = sendreq->req_rdma[r].btl_reg;
|
||||
mca_bml_base_btl_t *bml_btl = sendreq->req_rdma[r].bml_btl;
|
||||
|
||||
if (NULL != handle) {
|
||||
mca_bml_base_deregister_mem (bml_btl, handle);
|
||||
sendreq->req_rdma[r].btl_reg = NULL;
|
||||
mca_mpool_base_registration_t* reg = sendreq->req_rdma[r].btl_reg;
|
||||
if( NULL != reg && reg->mpool != NULL ) {
|
||||
reg->mpool->mpool_deregister(reg->mpool, reg);
|
||||
}
|
||||
}
|
||||
sendreq->req_rdma_cnt = 0;
|
||||
@ -220,14 +218,10 @@ do {
|
||||
|
||||
#define MCA_PML_OB1_SEND_REQUEST_RETURN(sendreq) \
|
||||
do { \
|
||||
/* Let the base handle the reference counts */ \
|
||||
MCA_PML_BASE_SEND_REQUEST_FINI((&(sendreq)->req_send)); \
|
||||
if (sendreq->rdma_frag) { \
|
||||
MCA_PML_OB1_RDMA_FRAG_RETURN (sendreq->rdma_frag); \
|
||||
sendreq->rdma_frag = NULL; \
|
||||
} \
|
||||
OMPI_FREE_LIST_RETURN_MT( &mca_pml_base_send_requests, \
|
||||
(ompi_free_list_item_t*)sendreq); \
|
||||
/* Let the base handle the reference counts */ \
|
||||
MCA_PML_BASE_SEND_REQUEST_FINI((&(sendreq)->req_send)); \
|
||||
OMPI_FREE_LIST_RETURN_MT( &mca_pml_base_send_requests, \
|
||||
(ompi_free_list_item_t*)sendreq); \
|
||||
} while(0)
|
||||
|
||||
|
||||
|
@ -1,4 +1,4 @@
|
||||
/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */
|
||||
/* -*- Mode: C; c-basic-offset:4 ; -*- */
|
||||
/*
|
||||
* Copyright (c) 2004-2006 The Trustees of Indiana University and Indiana
|
||||
* University Research and Technology
|
||||
@ -217,14 +217,6 @@ static inline void opal_convertor_get_current_pointer( const opal_convertor_t* p
|
||||
*position = (void*)base;
|
||||
}
|
||||
|
||||
static inline void opal_convertor_get_offset_pointer( const opal_convertor_t* pConv,
|
||||
size_t offset, void** position )
|
||||
{
|
||||
unsigned char* base = pConv->pBaseBuf + offset + pConv->pDesc->true_lb;
|
||||
*position = (void*)base;
|
||||
}
|
||||
|
||||
|
||||
/*
|
||||
*
|
||||
*/
|
||||
|
@ -36,8 +36,10 @@ mca_btl_active_message_callback_t mca_btl_base_active_message_trigger[MCA_BTL_TA
|
||||
|
||||
static void mca_btl_base_descriptor_constructor(mca_btl_base_descriptor_t* des)
|
||||
{
|
||||
des->des_segments = NULL;
|
||||
des->des_segment_count = 0;
|
||||
des->des_local = NULL;
|
||||
des->des_local_count = 0;
|
||||
des->des_remote = NULL;
|
||||
des->des_remote_count = 0;
|
||||
des->des_cbfunc = NULL;
|
||||
des->des_cbdata = NULL;
|
||||
des->des_flags = 0;
|
||||
|
@ -1,4 +1,3 @@
|
||||
/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */
|
||||
/*
|
||||
* Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
|
||||
* University Research and Technology
|
||||
@ -46,15 +45,13 @@ int mca_btl_base_param_register(mca_base_component_t *version,
|
||||
MCA_BASE_VAR_SCOPE_READONLY,
|
||||
&module->btl_exclusivity);
|
||||
|
||||
asprintf(&msg, "BTL bit flags (general flags: SEND=%d, PUT=%d, GET=%d, SEND_INPLACE=%d, HETEROGENEOUS_RDMA=%d, "
|
||||
"ATOMIC_OPS=%d; flags only used by the \"dr\" PML (ignored by others): ACK=%d, CHECKSUM=%d, "
|
||||
"RDMA_COMPLETION=%d; flags only used by the \"bfo\" PML (ignored by others): FAILOVER_SUPPORT=%d)",
|
||||
asprintf(&msg, "BTL bit flags (general flags: SEND=%d, PUT=%d, GET=%d, SEND_INPLACE=%d, RDMA_MATCHED=%d, HETEROGENEOUS_RDMA=%d; flags only used by the \"dr\" PML (ignored by others): ACK=%d, CHECKSUM=%d, RDMA_COMPLETION=%d; flags only used by the \"bfo\" PML (ignored by others): FAILOVER_SUPPORT=%d)",
|
||||
MCA_BTL_FLAGS_SEND,
|
||||
MCA_BTL_FLAGS_PUT,
|
||||
MCA_BTL_FLAGS_GET,
|
||||
MCA_BTL_FLAGS_SEND_INPLACE,
|
||||
MCA_BTL_FLAGS_RDMA_MATCHED,
|
||||
MCA_BTL_FLAGS_HETEROGENEOUS_RDMA,
|
||||
MCA_BTL_FLAGS_ATOMIC_OPS,
|
||||
MCA_BTL_FLAGS_NEED_ACK,
|
||||
MCA_BTL_FLAGS_NEED_CSUM,
|
||||
MCA_BTL_FLAGS_RDMA_COMPLETION,
|
||||
@ -66,14 +63,6 @@ int mca_btl_base_param_register(mca_base_component_t *version,
|
||||
&module->btl_flags);
|
||||
free(msg);
|
||||
|
||||
asprintf (&msg, "BTL atomic bit flags (general flags: ADD=%d, AND=%d, OR=%d, XOR=%d",
|
||||
MCA_BTL_ATOMIC_SUPPORTS_ADD, MCA_BTL_ATOMIC_SUPPORTS_AND, MCA_BTL_ATOMIC_SUPPORTS_OR,
|
||||
MCA_BTL_ATOMIC_SUPPORTS_XOR);
|
||||
(void) mca_base_component_var_register(version, "atomic_flags", msg, MCA_BASE_VAR_TYPE_UNSIGNED_INT,
|
||||
NULL, 0, MCA_BASE_VAR_FLAG_DEFAULT_ONLY, OPAL_INFO_LVL_5,
|
||||
MCA_BASE_VAR_SCOPE_CONSTANT, &module->btl_atomic_flags);
|
||||
free(msg);
|
||||
|
||||
(void) mca_base_component_var_register(version, "rndv_eager_limit", "Size (in bytes, including header) of \"phase 1\" fragment sent for all large messages (must be >= 0 and <= eager_limit)",
|
||||
MCA_BASE_VAR_TYPE_SIZE_T, NULL, 0, 0,
|
||||
OPAL_INFO_LVL_4,
|
||||
@ -85,39 +74,6 @@ int mca_btl_base_param_register(mca_base_component_t *version,
|
||||
OPAL_INFO_LVL_4,
|
||||
MCA_BASE_VAR_SCOPE_READONLY,
|
||||
&module->btl_eager_limit);
|
||||
|
||||
if ((module->btl_flags & MCA_BTL_FLAGS_GET) && module->btl_get) {
|
||||
if (0 == module->btl_get_limit) {
|
||||
module->btl_get_limit = SIZE_MAX;
|
||||
}
|
||||
|
||||
(void) mca_base_component_var_register(version, "get_limit", "Maximum size (in bytes) for btl get",
|
||||
MCA_BASE_VAR_TYPE_SIZE_T, NULL, 0, 0, OPAL_INFO_LVL_4,
|
||||
MCA_BASE_VAR_SCOPE_READONLY, &module->btl_get_limit);
|
||||
|
||||
/* Allow the user to set the alignment. The BTL should double-check the alignment in its open
|
||||
* function. */
|
||||
(void) mca_base_component_var_register(version, "get_alignment", "Alignment required for btl get",
|
||||
MCA_BASE_VAR_TYPE_SIZE_T, NULL, 0, 0, OPAL_INFO_LVL_6,
|
||||
MCA_BASE_VAR_SCOPE_CONSTANT, &module->btl_get_alignment);
|
||||
}
|
||||
|
||||
if ((module->btl_flags & MCA_BTL_FLAGS_PUT) && module->btl_put) {
|
||||
if (0 == module->btl_put_limit) {
|
||||
module->btl_put_limit = SIZE_MAX;
|
||||
}
|
||||
(void) mca_base_component_var_register(version, "put_limit", "Maximum size (in bytes) for btl put",
|
||||
MCA_BASE_VAR_TYPE_SIZE_T, NULL, 0, 0, OPAL_INFO_LVL_4,
|
||||
MCA_BASE_VAR_SCOPE_READONLY, &module->btl_put_limit);
|
||||
|
||||
/* Allow the user to set the alignment. The BTL should double-check the alignment in its open
|
||||
* function. */
|
||||
(void) mca_base_component_var_register(version, "put_alignment", "Alignment required for btl put",
|
||||
MCA_BASE_VAR_TYPE_SIZE_T, NULL, 0, 0, OPAL_INFO_LVL_6,
|
||||
MCA_BASE_VAR_SCOPE_CONSTANT, &module->btl_put_alignment);
|
||||
}
|
||||
|
||||
|
||||
#if OPAL_CUDA_GDR_SUPPORT
|
||||
/* If no CUDA RDMA support, zero them out */
|
||||
if (!(MCA_BTL_FLAGS_CUDA_GET & module->btl_flags)) {
|
||||
@ -188,17 +144,5 @@ int mca_btl_base_param_verify(mca_btl_base_module_t *module)
|
||||
module->btl_flags &= ~MCA_BTL_FLAGS_GET;
|
||||
}
|
||||
|
||||
if (0 == module->btl_atomic_flags) {
|
||||
module->btl_flags &= ~MCA_BTL_FLAGS_ATOMIC_OPS;
|
||||
}
|
||||
|
||||
if (0 == module->btl_get_limit) {
|
||||
module->btl_get_limit = SIZE_MAX;
|
||||
}
|
||||
|
||||
if (0 == module->btl_put_limit) {
|
||||
module->btl_put_limit = SIZE_MAX;
|
||||
}
|
||||
|
||||
return OPAL_SUCCESS;
|
||||
}
|
||||
|
@ -134,23 +134,6 @@ struct mca_btl_base_descriptor_t;
|
||||
struct mca_mpool_base_resources_t;
|
||||
struct opal_proc_t;
|
||||
|
||||
/**
|
||||
* Opaque registration handle for executing RDMA and atomic
|
||||
* operations on a memory region.
|
||||
*
|
||||
* The data inside this handle is appropriate for passing
|
||||
* to remote peers to execute RDMA and atomic operations. The
|
||||
* size needed to send the registration handle can be
|
||||
* obtained from the btl via the btl_registration_handle_size
|
||||
* member. If this size is 0 then no registration data is
|
||||
* needed to execute RDMA or atomic operations.
|
||||
*/
|
||||
struct mca_btl_base_registration_handle_t;
|
||||
typedef struct mca_btl_base_registration_handle_t mca_btl_base_registration_handle_t;
|
||||
|
||||
|
||||
/* Wildcard endpoint for use in the register_mem function */
|
||||
#define MCA_BTL_ENDPOINT_ANY (struct mca_btl_base_endpoint_t *) -1
|
||||
|
||||
/* send/recv operations require tag matching */
|
||||
typedef uint8_t mca_btl_base_tag_t;
|
||||
@ -190,9 +173,6 @@ typedef uint8_t mca_btl_base_tag_t;
|
||||
#define MCA_BTL_FLAGS_SEND 0x0001
|
||||
#define MCA_BTL_FLAGS_PUT 0x0002
|
||||
#define MCA_BTL_FLAGS_GET 0x0004
|
||||
/* btls that set the MCA_BTL_FLAGS_RDMA will always get added to the BML
|
||||
* rdma_btls list. This allows the updated one-sided component to
|
||||
* use btls that are not otherwise used for send/recv. */
|
||||
#define MCA_BTL_FLAGS_RDMA (MCA_BTL_FLAGS_GET|MCA_BTL_FLAGS_PUT)
|
||||
|
||||
/* btl can send directly from user buffer w/out registration */
|
||||
@ -229,12 +209,6 @@ typedef uint8_t mca_btl_base_tag_t;
|
||||
*/
|
||||
#define MCA_BTL_FLAGS_SIGNALED 0x4000
|
||||
|
||||
|
||||
/** The BTL supports network atomic operations */
|
||||
#define MCA_BTL_FLAGS_ATOMIC_OPS 0x08000
|
||||
/** The BTL supports fetching network atomic operations */
|
||||
#define MCA_BTL_FLAGS_ATOMIC_FOPS 0x10000
|
||||
|
||||
/* Default exclusivity levels */
|
||||
#define MCA_BTL_EXCLUSIVITY_HIGH (64*1024) /* internal loopback */
|
||||
#define MCA_BTL_EXCLUSIVITY_DEFAULT 1024 /* GM/IB/etc. */
|
||||
@ -245,62 +219,6 @@ typedef uint8_t mca_btl_base_tag_t;
|
||||
#define MCA_BTL_ERROR_FLAGS_NONFATAL 0x2
|
||||
#define MCA_BTL_ERROR_FLAGS_ADD_CUDA_IPC 0x4
|
||||
|
||||
/** registration flags */
|
||||
enum {
|
||||
/** Allow local write on the registered region. If a region is registered
|
||||
* with this flag the registration can be used as the local handle for a
|
||||
* btl_get operation. */
|
||||
MCA_BTL_REG_FLAG_LOCAL_WRITE = 0x00000001,
|
||||
/** Allow remote read on the registered region. If a region is registered
|
||||
* with this flag the registration can be used as the remote handle for a
|
||||
* btl_get operation. */
|
||||
MCA_BTL_REG_FLAG_REMOTE_READ = 0x00000002,
|
||||
/** Allow remote write on the registered region. If a region is registered
|
||||
* with this flag the registration can be used as the remote handle for a
|
||||
* btl_put operation. */
|
||||
MCA_BTL_REG_FLAG_REMOTE_WRITE = 0x00000004,
|
||||
/** Allow remote atomic operations on the registered region. If a region is
|
||||
* registered with this flag the registration can be used as the remote
|
||||
* handle for a btl_atomic_op or btl_atomic_fop operation. */
|
||||
MCA_BTL_REG_FLAG_REMOTE_ATOMIC = 0x00000008,
|
||||
/** Allow any btl operation on the registered region. If a region is registered
|
||||
* with this flag the registration can be used as the local or remote handle for
|
||||
* any btl operation. */
|
||||
MCA_BTL_REG_FLAG_ACCESS_ANY = 0x0000000f,
|
||||
#if OPAL_CUDA_GDR_SUPPORT
|
||||
/** Region is in GPU memory */
|
||||
MCA_BTL_REG_FLAG_CUDA_GPU_MEM = 0x00010000,
|
||||
#endif
|
||||
};
|
||||
|
||||
/** supported atomic operations */
|
||||
enum {
|
||||
/** The btl supports atomic add */
|
||||
MCA_BTL_ATOMIC_SUPPORTS_ADD = 0x00000001,
|
||||
/** The btl supports atomic bitwise and */
|
||||
MCA_BTL_ATOMIC_SUPPORTS_AND = 0x00000200,
|
||||
/** The btl supports atomic bitwise or */
|
||||
MCA_BTL_ATOMIC_SUPPORTS_OR = 0x00000400,
|
||||
/** The btl supports atomic bitwise exclusive or */
|
||||
MCA_BTL_ATOMIC_SUPPORTS_XOR = 0x00000800,
|
||||
/** The btl supports atomic compare-and-swap */
|
||||
MCA_BTL_ATOMIC_SUPPORTS_CSWAP = 0x10000000,
|
||||
/** The btl guarantees global atomicity (can mix btl atomics with cpu atomics) */
|
||||
MCA_BTL_ATOMIC_SUPPORTS_GLOB = 0x20000000,
|
||||
};
|
||||
|
||||
enum mca_btl_base_atomic_op_t {
|
||||
/** Atomic add: (*remote_address) = (*remote_address) + operand */
|
||||
MCA_BTL_ATOMIC_ADD = 0x0001,
|
||||
/** Atomic and: (*remote_address) = (*remote_address) & operand */
|
||||
MCA_BTL_ATOMIC_AND = 0x0011,
|
||||
/** Atomic or: (*remote_address) = (*remote_address) | operand */
|
||||
MCA_BTL_ATOMIC_OR = 0x0012,
|
||||
/** Atomic xor: (*remote_address) = (*remote_address) ^ operand */
|
||||
MCA_BTL_ATOMIC_XOR = 0x0014,
|
||||
};
|
||||
typedef enum mca_btl_base_atomic_op_t mca_btl_base_atomic_op_t;
|
||||
|
||||
/**
|
||||
* Asynchronous callback function on completion of an operation.
|
||||
* Completion Semantics: The descriptor can be reused or returned to the
|
||||
@ -319,32 +237,6 @@ typedef void (*mca_btl_base_completion_fn_t)(
|
||||
struct mca_btl_base_descriptor_t* descriptor,
|
||||
int status);
|
||||
|
||||
|
||||
/**
|
||||
* Asynchronous callback function on completion of an rdma or atomic operation.
|
||||
* Completion Semantics: The rdma or atomic memory operation has completed
|
||||
* remotely (i.e., it is remotely visible) and the caller is free to deregister
|
||||
* the local_handle or modify the memory in local_address.
|
||||
*
|
||||
* @param[IN] module the BTL module
|
||||
* @param[IN] endpoint the BTL endpoint
|
||||
* @param[IN] local_address local address for the operation (if any)
|
||||
* @param[IN] local_handle local handle associated with the local_address
|
||||
* @param[IN] context callback context supplied to the rdma/atomic operation
|
||||
* @param[IN] cbdata callback data supplied to the rdma/atomic operation
|
||||
* @param[IN] status status of the operation
|
||||
*
|
||||
*/
|
||||
typedef void (*mca_btl_base_rdma_completion_fn_t)(
|
||||
struct mca_btl_base_module_t* module,
|
||||
struct mca_btl_base_endpoint_t* endpoint,
|
||||
void *local_address,
|
||||
struct mca_btl_base_registration_handle_t *local_handle,
|
||||
void *context,
|
||||
void *cbdata,
|
||||
int status);
|
||||
|
||||
|
||||
/**
|
||||
* Describes a region/segment of memory that is addressable
|
||||
* by a BTL.
|
||||
@ -370,19 +262,20 @@ struct mca_btl_base_segment_t {
|
||||
};
|
||||
typedef struct mca_btl_base_segment_t mca_btl_base_segment_t;
|
||||
|
||||
|
||||
/**
|
||||
* A descriptor that holds the parameters to a send/put/get
|
||||
* operation along w/ a callback routine that is called on
|
||||
* completion of the request.
|
||||
* Note: receive callbacks will store the incoming data segments in
|
||||
* des_segments
|
||||
* des_local
|
||||
*/
|
||||
|
||||
struct mca_btl_base_descriptor_t {
|
||||
ompi_free_list_item_t super;
|
||||
mca_btl_base_segment_t *des_segments; /**< local segments */
|
||||
size_t des_segment_count; /**< number of local segments */
|
||||
mca_btl_base_segment_t *des_local; /**< local segments */
|
||||
size_t des_local_count; /**< number of local segments */
|
||||
mca_btl_base_segment_t *des_remote; /**< remote segments */
|
||||
size_t des_remote_count; /**< number of destination segments */
|
||||
mca_btl_base_completion_fn_t des_cbfunc; /**< local callback function */
|
||||
void* des_cbdata; /**< opaque callback data */
|
||||
void* des_context; /**< more opaque callback data */
|
||||
@ -436,11 +329,6 @@ OPAL_DECLSPEC OBJ_CLASS_DECLARATION(mca_btl_base_descriptor_t);
|
||||
*/
|
||||
#define MCA_BTL_SEG_MAX_SIZE 256
|
||||
|
||||
/**
|
||||
* Maximum size of a BTL registration handle in bytes
|
||||
*/
|
||||
#define MCA_BTL_REG_HANDLE_MAX_SIZE 256
|
||||
|
||||
/*
|
||||
* BTL base header, stores the tag at a minimum
|
||||
*/
|
||||
@ -507,7 +395,7 @@ typedef int (*mca_btl_base_component_progress_fn_t)(void);
|
||||
* completion function, this implies that all data payload in the
|
||||
* mca_btl_base_descriptor_t must be copied out within this callback or
|
||||
* forfeited back to the BTL.
|
||||
* Note also that descriptor segments (des_segments) must be base
|
||||
* Note also that descriptor segments (des_local) must be base
|
||||
* segments for all callbacks.
|
||||
*
|
||||
* @param[IN] btl BTL module
|
||||
@ -759,6 +647,7 @@ typedef int (*mca_btl_base_module_free_fn_t)(
|
||||
typedef struct mca_btl_base_descriptor_t* (*mca_btl_base_module_prepare_fn_t)(
|
||||
struct mca_btl_base_module_t* btl,
|
||||
struct mca_btl_base_endpoint_t* endpoint,
|
||||
mca_mpool_base_registration_t* registration,
|
||||
struct opal_convertor_t* convertor,
|
||||
uint8_t order,
|
||||
size_t reserve,
|
||||
@ -766,43 +655,6 @@ typedef struct mca_btl_base_descriptor_t* (*mca_btl_base_module_prepare_fn_t)(
|
||||
uint32_t flags
|
||||
);
|
||||
|
||||
/**
|
||||
* @brief Register a memory region for put/get/atomic operations.
|
||||
*
|
||||
* @param btl (IN) BTL module
|
||||
* @param endpoint(IN) BTL addressing information (or NULL for all endpoints)
|
||||
* @param base (IN) Pointer to start of region
|
||||
* @param size (IN) Size of region
|
||||
* @param flags (IN) Flags indicating what operation will be performed. Valid
|
||||
* values are MCA_BTL_DES_FLAGS_PUT, MCA_BTL_DES_FLAGS_GET,
|
||||
* and MCA_BTL_DES_FLAGS_ATOMIC
|
||||
*
|
||||
* @returns a memory registration handle valid for both local and remote operations
|
||||
* @returns NULL if the region could not be registered
|
||||
*
|
||||
* This function registers the specified region with the hardware for use with
|
||||
* the btl_put, btl_get, btl_atomic_cas, btl_atomic_op, and btl_atomic_fop
|
||||
* functions. Care should be taken to not hold an excessive number of registrations
|
||||
* as they may use limited system/NIC resources.
|
||||
*/
|
||||
typedef struct mca_btl_base_registration_handle_t *(*mca_btl_base_module_register_mem_fn_t)(
|
||||
struct mca_btl_base_module_t* btl, struct mca_btl_base_endpoint_t *endpoint, void *base,
|
||||
size_t size, uint32_t flags);
|
||||
|
||||
/**
|
||||
* @brief Deregister a memory region
|
||||
*
|
||||
* @param btl (IN) BTL module region was registered with
|
||||
* @param handle (IN) BTL registration handle to deregister
|
||||
*
|
||||
* This function deregisters the memory region associated with the specified handle. Care
|
||||
* should be taken to not perform any RDMA or atomic operation on this memory region
|
||||
* after it is deregistered. It is erroneous to specify a memory handle associated with
|
||||
* a remote node.
|
||||
*/
|
||||
typedef int (*mca_btl_base_module_deregister_mem_fn_t)(
|
||||
struct mca_btl_base_module_t* btl, struct mca_btl_base_registration_handle_t *handle);
|
||||
|
||||
/**
|
||||
* Initiate an asynchronous send.
|
||||
* Completion Semantics: the descriptor has been queued for a send operation
|
||||
@ -846,8 +698,7 @@ typedef int (*mca_btl_base_module_send_fn_t)(
|
||||
* @param flags (IN) Flags.
|
||||
* @param tag (IN) The tag value used to notify the peer.
|
||||
* @param descriptor (OUT) The descriptor to be returned if the data is unable to be sent immediately
|
||||
* (may be NULL).
|
||||
*
|
||||
|
||||
* @retval OPAL_SUCCESS The send was successfully queued
|
||||
* @retval OPAL_ERROR The send failed
|
||||
* @retval OPAL_ERR_UNREACH The endpoint is not reachable
|
||||
@ -871,210 +722,58 @@ typedef int (*mca_btl_base_module_sendi_fn_t)(
|
||||
|
||||
/**
|
||||
* Initiate an asynchronous put.
|
||||
* Completion Semantics: if this function returns a 1 then the operation
|
||||
* is complete. a return of OPAL_SUCCESS indicates
|
||||
* the put operation has been queued with the
|
||||
* network. the local_handle can not be deregistered
|
||||
* until all outstanding operations on that handle
|
||||
* have been completed.
|
||||
*
|
||||
* @param btl (IN) BTL module
|
||||
* @param endpoint (IN) BTL addressing information
|
||||
* @param local_address (IN) Local address to put from (registered)
|
||||
* @param remote_address (IN) Remote address to put to (registered remotely)
|
||||
* @param local_handle (IN) Registration handle for region containing
|
||||
* (local_address, local_address + size)
|
||||
* @param remote_handle (IN) Remote registration handle for region containing
|
||||
* (remote_address, remote_address + size)
|
||||
* @param size (IN) Number of bytes to put
|
||||
* @param flags (IN) Flags for this put operation
|
||||
* @param order (IN) Ordering
|
||||
* @param cbfunc (IN) Function to call on completion (if queued)
|
||||
* @param cbcontext (IN) Context for the callback
|
||||
* @param cbdata (IN) Data for callback
|
||||
* Completion Semantics: the descriptor has been queued for a put operation
|
||||
* the BTL now controls the descriptor until local
|
||||
* completion callback is made on the descriptor
|
||||
*
|
||||
* BTLs that do not have the MCA_BTL_FLAGS_RDMA_MATCHED flag set
|
||||
* allow multiple concurrent put operations on the same descriptor.
|
||||
* BTLs that do have the MCA_BTL_FLAGS_RDMA_MATCHED flag set require
|
||||
* a corresponding prepare_src/dst call for each put operation and
|
||||
* therefore prohibit multiple concurrent put operations.
|
||||
*
|
||||
* @param btl (IN) BTL module
|
||||
* @param endpoint (IN) BTL addressing information
|
||||
* @param descriptor (IN) Description of the data to be transferred
|
||||
*
|
||||
* @retval OPAL_SUCCESS The descriptor was successfully queued for a put
|
||||
* @retval OPAL_ERROR The descriptor was NOT successfully queued for a put
|
||||
* @retval OPAL_ERR_OUT_OF_RESOURCE Insufficient resources to queue the put
|
||||
* operation. Try again later
|
||||
* @retval OPAL_ERR_NOT_AVAILABLE Put can not be performed due to size or
|
||||
* alignment restrictions.
|
||||
*/
|
||||
typedef int (*mca_btl_base_module_put_fn_t) (struct mca_btl_base_module_t *btl,
|
||||
struct mca_btl_base_endpoint_t *endpoint, void *local_address,
|
||||
uint64_t remote_address, struct mca_btl_base_registration_handle_t *local_handle,
|
||||
struct mca_btl_base_registration_handle_t *remote_handle, size_t size, int flags,
|
||||
int order, mca_btl_base_rdma_completion_fn_t cbfunc, void *cbcontext, void *cbdata);
|
||||
|
||||
typedef int (*mca_btl_base_module_put_fn_t)(
|
||||
struct mca_btl_base_module_t* btl,
|
||||
struct mca_btl_base_endpoint_t* endpoint,
|
||||
struct mca_btl_base_descriptor_t* descriptor
|
||||
);
|
||||
|
||||
/**
|
||||
* Initiate an asynchronous get.
|
||||
* Completion Semantics: if this function returns a 1 then the operation
|
||||
* is complete. a return of OPAL_SUCCESS indicates
|
||||
* the get operation has been queued with the
|
||||
* network. the local_handle can not be deregistered
|
||||
* until all outstanding operations on that handle
|
||||
* have been completed.
|
||||
*
|
||||
* @param btl (IN) BTL module
|
||||
* @param endpoint (IN) BTL addressing information
|
||||
* @param local_address (IN) Local address to get into (registered)
|
||||
* @param remote_address (IN) Remote address to get from (registered remotely)
|
||||
* @param local_handle (IN) Registration handle for region containing
|
||||
* (local_address, local_address + size)
|
||||
* @param remote_handle (IN) Remote registration handle for region containing
|
||||
* (remote_address, remote_address + size)
|
||||
* @param size (IN) Number of bytes to get
|
||||
* @param flags (IN) Flags for this get operation
|
||||
* @param order (IN) Ordering
|
||||
* @param cbfunc (IN) Function to call on completion (if queued)
|
||||
* @param cbcontext (IN) Context for the callback
|
||||
* @param cbdata (IN) Data for callback
|
||||
* Completion Semantics: the descriptor has been queued for a get operation
|
||||
* the BTL now controls the descriptor until local
|
||||
* completion callback is made on the descriptor
|
||||
*
|
||||
* BTLs that do not have the MCA_BTL_FLAGS_RDMA_MATCHED flag set
|
||||
* allow multiple concurrent get operations on the same descriptor.
|
||||
* BTLs that do have the MCA_BTL_FLAGS_RDMA_MATCHED flag set require
|
||||
* a corresponding prepare_src/dst call for each get operation and
|
||||
* therefore prohibit multiple concurrent get operations.
|
||||
*
|
||||
* @param btl (IN) BTL module
|
||||
* @param endpoint (IN) BTL addressing information
|
||||
* @param descriptor (IN) Description of the data to be transferred
|
||||
*
|
||||
* @retval OPAL_SUCCESS The descriptor was successfully queued for a get
|
||||
* @retval OPAL_ERROR The descriptor was NOT successfully queued for a get
|
||||
*
|
||||
* @retval OPAL_SUCCESS The descriptor was successfully queued for a put
|
||||
* @retval OPAL_ERROR The descriptor was NOT successfully queued for a put
|
||||
* @retval OPAL_ERR_OUT_OF_RESOURCE Insufficient resources to queue the put
|
||||
* operation. Try again later
|
||||
* @retval OPAL_ERR_NOT_AVAILABLE Put can not be performed due to size or
|
||||
* alignment restrictions.
|
||||
*/
|
||||
typedef int (*mca_btl_base_module_get_fn_t) (struct mca_btl_base_module_t *btl,
|
||||
struct mca_btl_base_endpoint_t *endpoint, void *local_address,
|
||||
uint64_t remote_address, struct mca_btl_base_registration_handle_t *local_handle,
|
||||
struct mca_btl_base_registration_handle_t *remote_handle, size_t size, int flags,
|
||||
int order, mca_btl_base_rdma_completion_fn_t cbfunc, void *cbcontext, void *cbdata);
|
||||
|
||||
/**
|
||||
* Initiate an asynchronous atomic operation.
|
||||
* Completion Semantics: if this function returns a 1 then the operation
|
||||
* is complete. a return of OPAL_SUCCESS indicates
|
||||
* the atomic operation has been queued with the
|
||||
* network.
|
||||
*
|
||||
* @param btl (IN) BTL module
|
||||
* @param endpoint (IN) BTL addressing information
|
||||
* @param remote_address (IN) Remote address to perform the operation on (registered remotely)
|
||||
* @param remote_handle (IN) Remote registration handle for region containing
|
||||
* (remote_address, remote_address + 8)
|
||||
* @param op (IN) Operation to perform
|
||||
* @param operand (IN) Operand for the operation
|
||||
* @param flags (IN) Flags for this atomic operation
|
||||
* @param order (IN) Ordering
|
||||
* @param cbfunc (IN) Function to call on completion (if queued)
|
||||
* @param cbcontext (IN) Context for the callback
|
||||
* @param cbdata (IN) Data for callback
|
||||
*
|
||||
* @retval OPAL_SUCCESS The operation was successfully queued
|
||||
* @retval 1 The operation is complete
|
||||
* @retval OPAL_ERROR The operation was NOT successfully queued
|
||||
* @retval OPAL_ERR_OUT_OF_RESOURCE Insufficient resources to queue the atomic
|
||||
* operation. Try again later
|
||||
* @retval OPAL_ERR_NOT_AVAILABLE Atomic operation can not be performed due to
|
||||
* alignment restrictions or the operation {op} is not supported
|
||||
* by the hardware.
|
||||
*
|
||||
* After the operation is complete the remote address specified by {remote_address} and
|
||||
* {remote_handle} will be updated with (*remote_address) = (*remote_address) op operand.
|
||||
* The btl will guarantee consistency of atomic operations performed via the btl. Note,
|
||||
* however, that not all btls will provide consistency between btl atomic operations and
|
||||
* cpu atomics.
|
||||
*/
|
||||
typedef int (*mca_btl_base_module_atomic_op64_fn_t) (struct mca_btl_base_module_t *btl,
|
||||
struct mca_btl_base_endpoint_t *endpoint, uint64_t remote_address,
|
||||
struct mca_btl_base_registration_handle_t *remote_handle, mca_btl_base_atomic_op_t op,
|
||||
uint64_t operand, int flags, int order, mca_btl_base_rdma_completion_fn_t cbfunc,
|
||||
void *cbcontext, void *cbdata);
|
||||
typedef int (*mca_btl_base_module_get_fn_t)(
|
||||
struct mca_btl_base_module_t* btl,
|
||||
struct mca_btl_base_endpoint_t* endpoint,
|
||||
struct mca_btl_base_descriptor_t* descriptor
|
||||
);
|
||||
|
||||
/**
|
||||
* Initiate an asynchronous fetching atomic operation.
|
||||
* Completion Semantics: if this function returns a 1 then the operation
|
||||
* is complete. a return of OPAL_SUCCESS indicates
|
||||
* the atomic operation has been queued with the
|
||||
* network.
|
||||
*
|
||||
* @param btl (IN) BTL module
|
||||
* @param endpoint (IN) BTL addressing information
|
||||
* @param local_address (OUT) Local address to store the result in
|
||||
* @param remote_address (IN) Remote address to perform the operation on (registered remotely)
|
||||
* @param local_handle (IN) Local registration handle for region containing
|
||||
* (local_address, local_address + 8)
|
||||
* @param remote_handle (IN) Remote registration handle for region containing
|
||||
* (remote_address, remote_address + 8)
|
||||
* @param op (IN) Operation to perform
|
||||
* @param operand (IN) Operand for the operation
|
||||
* @param flags (IN) Flags for this atomic operation
|
||||
* @param order (IN) Ordering
|
||||
* @param cbfunc (IN) Function to call on completion (if queued)
|
||||
* @param cbcontext (IN) Context for the callback
|
||||
* @param cbdata (IN) Data for callback
|
||||
*
|
||||
* @retval OPAL_SUCCESS The operation was successfully queued
|
||||
* @retval 1 The operation is complete
|
||||
* @retval OPAL_ERROR The operation was NOT successfully queued
|
||||
* @retval OPAL_ERR_OUT_OF_RESOURCE Insufficient resources to queue the atomic
|
||||
* operation. Try again later
|
||||
* @retval OPAL_ERR_NOT_AVAILABLE Atomic operation can not be performed due to
|
||||
* alignment restrictions or the operation {op} is not supported
|
||||
* by the hardware.
|
||||
*
|
||||
* After the operation is complete the remote address specified by {remote_address} and
|
||||
* {remote_handle} will be updated with (*remote_address) = (*remote_address) op operand.
|
||||
* {local_address} will be updated with the previous value stored in {remote_address}.
|
||||
* The btl will guarantee consistency of atomic operations performed via the btl. Note,
|
||||
* however, that not all btls will provide consistency between btl atomic operations and
|
||||
* cpu atomics.
|
||||
*/
|
||||
typedef int (*mca_btl_base_module_atomic_fop64_fn_t) (struct mca_btl_base_module_t *btl,
|
||||
struct mca_btl_base_endpoint_t *endpoint, void *local_address, uint64_t remote_address,
|
||||
struct mca_btl_base_registration_handle_t *local_handle,
|
||||
struct mca_btl_base_registration_handle_t *remote_handle, mca_btl_base_atomic_op_t op,
|
||||
uint64_t operand, int flags, int order, mca_btl_base_rdma_completion_fn_t cbfunc,
|
||||
void *cbcontext, void *cbdata);
|
||||
|
||||
/**
|
||||
* Initiate an asynchronous compare and swap operation.
|
||||
* Completion Semantics: if this function returns a 1 then the operation
|
||||
* is complete. a return of OPAL_SUCCESS indicates
|
||||
* the atomic operation has been queued with the
|
||||
* network.
|
||||
*
|
||||
* @param btl (IN) BTL module
|
||||
* @param endpoint (IN) BTL addressing information
|
||||
* @param local_address (OUT) Local address to store the result in
|
||||
* @param remote_address (IN) Remote address to perform the operation on (registered remotely)
|
||||
* @param local_handle (IN) Local registration handle for region containing
|
||||
* (local_address, local_address + 8)
|
||||
* @param remote_handle (IN) Remote registration handle for region containing
|
||||
* (remote_address, remote_address + 8)
|
||||
* @param compare (IN) Value to compare against the contents of remote_address
|
||||
* @param value (IN) Value to store on success
|
||||
* @param flags (IN) Flags for this atomic operation
|
||||
* @param order (IN) Ordering
|
||||
* @param cbfunc (IN) Function to call on completion (if queued)
|
||||
* @param cbcontext (IN) Context for the callback
|
||||
* @param cbdata (IN) Data for callback
|
||||
*
|
||||
* @retval OPAL_SUCCESS The operation was successfully queued
|
||||
* @retval 1 The operation is complete
|
||||
* @retval OPAL_ERROR The operation was NOT successfully queued
|
||||
* @retval OPAL_ERR_OUT_OF_RESOURCE Insufficient resources to queue the atomic
|
||||
* operation. Try again later
|
||||
* @retval OPAL_ERR_NOT_AVAILABLE Atomic operation can not be performed due to
|
||||
* alignment restrictions or the operation {op} is not supported
|
||||
* by the hardware.
|
||||
*
|
||||
* After the operation is complete the remote address specified by {remote_address} and
|
||||
* {remote_handle} will be updated with {value} if *remote_address == compare.
|
||||
* {local_address} will be updated with the previous value stored in {remote_address}.
|
||||
* The btl will guarantee consistency of atomic operations performed via the btl. Note,
|
||||
* however, that not all btls will provide consistency between btl atomic operations and
|
||||
* cpu atomics.
|
||||
*/
|
||||
typedef int (*mca_btl_base_module_atomic_cswap_fn_t) (struct mca_btl_base_module_t *btl,
|
||||
struct mca_btl_base_endpoint_t *endpoint, void *local_address, uint64_t remote_address,
|
||||
struct mca_btl_base_registration_handle_t *local_handle,
|
||||
struct mca_btl_base_registration_handle_t *remote_handle, uint64_t compare,
|
||||
uint64_t value, int flags, int order, mca_btl_base_rdma_completion_fn_t cbfunc,
|
||||
void *cbcontext, void *cbdata);
|
||||
|
||||
/**
|
||||
* Diagnostic dump of btl state.
|
||||
@ -1114,14 +813,7 @@ struct mca_btl_base_module_t {
|
||||
uint32_t btl_latency; /**< relative ranking of latency used to prioritize btls */
|
||||
uint32_t btl_bandwidth; /**< bandwidth (Mbytes/sec) supported by each endpoint */
|
||||
uint32_t btl_flags; /**< flags (put/get...) */
|
||||
uint32_t btl_atomic_flags; /**< atomic operations supported (add, and, xor, etc) */
|
||||
size_t btl_registration_handle_size; /**< size of the BTLs registration handles */
|
||||
|
||||
/* One-sided limitations (0 for no alignment, SIZE_MAX for no limit ) */
|
||||
size_t btl_get_limit; /**< maximum size supported by the btl_get function */
|
||||
size_t btl_get_alignment; /**< minimum alignment/size needed by btl_get (power of 2) */
|
||||
size_t btl_put_limit; /**< maximum size supported by the btl_put function */
|
||||
size_t btl_put_alignment; /**< minimum alignment/size needed by btl_put (power of 2) */
|
||||
size_t btl_seg_size; /**< size of a btl segment */
|
||||
|
||||
/* BTL function table */
|
||||
mca_btl_base_module_add_procs_fn_t btl_add_procs;
|
||||
@ -1132,21 +824,13 @@ struct mca_btl_base_module_t {
|
||||
mca_btl_base_module_alloc_fn_t btl_alloc;
|
||||
mca_btl_base_module_free_fn_t btl_free;
|
||||
mca_btl_base_module_prepare_fn_t btl_prepare_src;
|
||||
mca_btl_base_module_prepare_fn_t btl_prepare_dst;
|
||||
mca_btl_base_module_send_fn_t btl_send;
|
||||
mca_btl_base_module_sendi_fn_t btl_sendi;
|
||||
mca_btl_base_module_put_fn_t btl_put;
|
||||
mca_btl_base_module_get_fn_t btl_get;
|
||||
mca_btl_base_module_dump_fn_t btl_dump;
|
||||
|
||||
/* atomic operations */
|
||||
mca_btl_base_module_atomic_op64_fn_t btl_atomic_op;
|
||||
mca_btl_base_module_atomic_fop64_fn_t btl_atomic_fop;
|
||||
mca_btl_base_module_atomic_cswap_fn_t btl_atomic_cswap;
|
||||
|
||||
/* new memory registration functions */
|
||||
mca_btl_base_module_register_mem_fn_t btl_register_mem; /**< memory registration function (NULL if not needed) */
|
||||
mca_btl_base_module_deregister_mem_fn_t btl_deregister_mem; /**< memory deregistration function (NULL if not needed) */
|
||||
|
||||
|
||||
/** the mpool associated with this btl (optional) */
|
||||
mca_mpool_base_module_t* btl_mpool;
|
||||
/** register a default error handler */
|
||||
|
@ -59,9 +59,6 @@ sources = \
|
||||
btl_openib_fd.c \
|
||||
btl_openib_ip.h \
|
||||
btl_openib_ip.c \
|
||||
btl_openib_put.c \
|
||||
btl_openib_get.c \
|
||||
btl_openib_atomic.c \
|
||||
connect/base.h \
|
||||
connect/btl_openib_connect_base.c \
|
||||
connect/btl_openib_connect_empty.c \
|
||||
|
@ -91,11 +91,6 @@
|
||||
#define MIN(a,b) ((a)<(b)?(a):(b))
|
||||
#endif
|
||||
|
||||
static mca_btl_base_registration_handle_t *mca_btl_openib_register_mem (mca_btl_base_module_t *btl,
|
||||
mca_btl_base_endpoint_t *endpoint,
|
||||
void *base, size_t size, uint32_t flags);
|
||||
static int mca_btl_openib_deregister_mem (mca_btl_base_module_t *btl, mca_btl_base_registration_handle_t *handle);
|
||||
|
||||
mca_btl_openib_module_t mca_btl_openib_module = {
|
||||
.super = {
|
||||
.btl_component = &mca_btl_openib_component.super,
|
||||
@ -106,19 +101,14 @@ mca_btl_openib_module_t mca_btl_openib_module = {
|
||||
.btl_alloc = mca_btl_openib_alloc,
|
||||
.btl_free = mca_btl_openib_free,
|
||||
.btl_prepare_src = mca_btl_openib_prepare_src,
|
||||
.btl_prepare_dst = mca_btl_openib_prepare_dst,
|
||||
.btl_send = mca_btl_openib_send,
|
||||
.btl_sendi = mca_btl_openib_sendi, /* send immediate */
|
||||
.btl_put = mca_btl_openib_put,
|
||||
.btl_get = mca_btl_openib_get,
|
||||
.btl_dump = mca_btl_base_dump,
|
||||
.btl_register_error = mca_btl_openib_register_error_cb, /* error call back registration */
|
||||
.btl_ft_event = mca_btl_openib_ft_event,
|
||||
.btl_register_mem = mca_btl_openib_register_mem,
|
||||
.btl_deregister_mem = mca_btl_openib_deregister_mem,
|
||||
#if HAVE_DECL_IBV_ATOMIC_HCA
|
||||
.btl_atomic_fop = mca_btl_openib_atomic_fop,
|
||||
.btl_atomic_cswap = mca_btl_openib_atomic_cswap,
|
||||
#endif
|
||||
.btl_ft_event = mca_btl_openib_ft_event
|
||||
}
|
||||
};
|
||||
|
||||
@ -524,12 +514,10 @@ static int mca_btl_openib_tune_endpoint(mca_btl_openib_module_t* openib_btl,
|
||||
if a user distributes different INI files or parameters for different node/procs,
|
||||
it is their own responsibility */
|
||||
switch(mca_btl_openib_component.receive_queues_source) {
|
||||
case MCA_BASE_VAR_SOURCE_COMMAND_LINE:
|
||||
case MCA_BASE_VAR_SOURCE_ENV:
|
||||
case MCA_BASE_VAR_SOURCE_FILE:
|
||||
case MCA_BASE_VAR_SOURCE_SET:
|
||||
case MCA_BASE_VAR_SOURCE_OVERRIDE:
|
||||
break;
|
||||
case MCA_BASE_VAR_SOURCE_COMMAND_LINE:
|
||||
case MCA_BASE_VAR_SOURCE_ENV:
|
||||
case MCA_BASE_VAR_SOURCE_MAX:
|
||||
break;
|
||||
|
||||
/* If the queues configuration was set from command line
|
||||
(with --mca btl_openib_receive_queues parameter) => both sides have the same configuration */
|
||||
@ -538,38 +526,40 @@ static int mca_btl_openib_tune_endpoint(mca_btl_openib_module_t* openib_btl,
|
||||
not possible that remote side got its queues configuration from command line =>
|
||||
(by prio) the configuration was set from INI file or (if not configure)
|
||||
by default queues configuration */
|
||||
case BTL_OPENIB_RQ_SOURCE_DEVICE_INI:
|
||||
if(NULL != values.receive_queues) {
|
||||
recv_qps = values.receive_queues;
|
||||
} else {
|
||||
recv_qps = mca_btl_openib_component.default_recv_qps;
|
||||
}
|
||||
case MCA_BASE_VAR_SOURCE_FILE:
|
||||
case MCA_BASE_VAR_SOURCE_SET:
|
||||
case MCA_BASE_VAR_SOURCE_OVERRIDE:
|
||||
if(NULL != values.receive_queues) {
|
||||
recv_qps = values.receive_queues;
|
||||
} else {
|
||||
recv_qps = mca_btl_openib_component.default_recv_qps;
|
||||
}
|
||||
|
||||
if(0 != strcmp(mca_btl_openib_component.receive_queues,
|
||||
recv_qps)) {
|
||||
opal_show_help("help-mpi-btl-openib.txt",
|
||||
"unsupported queues configuration", true,
|
||||
opal_process_info.nodename,
|
||||
ibv_get_device_name(openib_btl->device->ib_dev),
|
||||
(openib_btl->device->ib_dev_attr).vendor_id,
|
||||
(openib_btl->device->ib_dev_attr).vendor_part_id,
|
||||
mca_btl_openib_component.receive_queues,
|
||||
opal_get_proc_hostname(endpoint->endpoint_proc->proc_opal),
|
||||
endpoint->rem_info.rem_vendor_id,
|
||||
endpoint->rem_info.rem_vendor_part_id,
|
||||
recv_qps);
|
||||
if(0 != strcmp(mca_btl_openib_component.receive_queues,
|
||||
recv_qps)) {
|
||||
opal_show_help("help-mpi-btl-openib.txt",
|
||||
"unsupported queues configuration", true,
|
||||
opal_process_info.nodename,
|
||||
ibv_get_device_name(openib_btl->device->ib_dev),
|
||||
(openib_btl->device->ib_dev_attr).vendor_id,
|
||||
(openib_btl->device->ib_dev_attr).vendor_part_id,
|
||||
mca_btl_openib_component.receive_queues,
|
||||
opal_get_proc_hostname(endpoint->endpoint_proc->proc_opal),
|
||||
endpoint->rem_info.rem_vendor_id,
|
||||
endpoint->rem_info.rem_vendor_part_id,
|
||||
recv_qps);
|
||||
|
||||
return OPAL_ERROR;
|
||||
}
|
||||
break;
|
||||
return OPAL_ERROR;
|
||||
}
|
||||
break;
|
||||
|
||||
/* If the local queues configuration was set
|
||||
by default queues => check all possible cases for remote side and compare */
|
||||
case MCA_BASE_VAR_SOURCE_DEFAULT:
|
||||
if(NULL != values.receive_queues) {
|
||||
if(0 != strcmp(mca_btl_openib_component.receive_queues,
|
||||
values.receive_queues)) {
|
||||
opal_show_help("help-mpi-btl-openib.txt",
|
||||
case MCA_BASE_VAR_SOURCE_DEFAULT:
|
||||
if(NULL != values.receive_queues) {
|
||||
if(0 != strcmp(mca_btl_openib_component.receive_queues,
|
||||
values.receive_queues)) {
|
||||
opal_show_help("help-mpi-btl-openib.txt",
|
||||
"unsupported queues configuration", true,
|
||||
opal_process_info.nodename,
|
||||
ibv_get_device_name(openib_btl->device->ib_dev),
|
||||
@ -581,10 +571,10 @@ static int mca_btl_openib_tune_endpoint(mca_btl_openib_module_t* openib_btl,
|
||||
endpoint->rem_info.rem_vendor_part_id,
|
||||
values.receive_queues);
|
||||
|
||||
return OPAL_ERROR;
|
||||
return OPAL_ERROR;
|
||||
}
|
||||
}
|
||||
}
|
||||
break;
|
||||
break;
|
||||
}
|
||||
|
||||
return OPAL_SUCCESS;
|
||||
@ -734,7 +724,7 @@ static int prepare_device_for_use (mca_btl_openib_device_t *device)
|
||||
|
||||
#if OPAL_HAVE_THREADS
|
||||
if(mca_btl_openib_component.use_async_event_thread) {
|
||||
mca_btl_openib_async_cmd_t async_command;
|
||||
mca_btl_openib_async_cmd_t async_command;
|
||||
|
||||
/* start the async event thread if it is not already started */
|
||||
if (start_async_event_thread() != OPAL_SUCCESS)
|
||||
@ -742,8 +732,8 @@ static int prepare_device_for_use (mca_btl_openib_device_t *device)
|
||||
|
||||
device->got_fatal_event = false;
|
||||
device->got_port_event = false;
|
||||
async_command.a_cmd = OPENIB_ASYNC_CMD_FD_ADD;
|
||||
async_command.fd = device->ib_dev_context->async_fd;
|
||||
async_command.a_cmd = OPENIB_ASYNC_CMD_FD_ADD;
|
||||
async_command.fd = device->ib_dev_context->async_fd;
|
||||
if (write(mca_btl_openib_component.async_pipe[1],
|
||||
&async_command, sizeof(mca_btl_openib_async_cmd_t))<0){
|
||||
BTL_ERROR(("Failed to write to pipe [%d]",errno));
|
||||
@ -958,12 +948,6 @@ int mca_btl_openib_add_procs(
|
||||
return rc;
|
||||
}
|
||||
|
||||
rc = mca_btl_openib_size_queues(openib_btl, nprocs);
|
||||
if (OPAL_SUCCESS != rc) {
|
||||
BTL_ERROR(("error creating cqs"));
|
||||
return rc;
|
||||
}
|
||||
|
||||
for (i = 0, local_procs = 0 ; i < (int) nprocs; i++) {
|
||||
struct opal_proc_t* proc = procs[i];
|
||||
mca_btl_openib_proc_t* ib_proc;
|
||||
@ -975,6 +959,11 @@ int mca_btl_openib_add_procs(
|
||||
local_procs ++;
|
||||
}
|
||||
|
||||
/* OOB, XOOB, and RDMACM do not support SELF communication, so
|
||||
* mark the proc as unreachable by openib btl */
|
||||
if (0 == opal_compare_proc(OPAL_PROC_MY_NAME, proc->proc_name)) {
|
||||
continue;
|
||||
}
|
||||
#if defined(HAVE_STRUCT_IBV_DEVICE_TRANSPORT_TYPE)
|
||||
/* Most current iWARP adapters (June 2008) cannot handle
|
||||
talking to other processes on the same host (!) -- so mark
|
||||
@ -1144,7 +1133,7 @@ int mca_btl_openib_add_procs(
|
||||
return OPAL_ERROR;
|
||||
}
|
||||
|
||||
return OPAL_SUCCESS;
|
||||
return mca_btl_openib_size_queues(openib_btl, nprocs);
|
||||
}
|
||||
|
||||
/*
|
||||
@ -1237,16 +1226,15 @@ ib_frag_alloc(mca_btl_openib_module_t *btl, size_t size, uint8_t order,
|
||||
|
||||
/* check if pending fragment has enough space for coalescing */
|
||||
static mca_btl_openib_send_frag_t *check_coalescing(opal_list_t *frag_list,
|
||||
opal_mutex_t *lock, struct mca_btl_base_endpoint_t *ep, size_t size,
|
||||
mca_btl_openib_coalesced_frag_t **cfrag)
|
||||
opal_mutex_t *lock, mca_btl_base_endpoint_t *ep, size_t size)
|
||||
{
|
||||
mca_btl_openib_send_frag_t *frag = NULL;
|
||||
|
||||
if (opal_list_is_empty(frag_list))
|
||||
if(opal_list_is_empty(frag_list))
|
||||
return NULL;
|
||||
|
||||
OPAL_THREAD_LOCK(lock);
|
||||
if (!opal_list_is_empty(frag_list)) {
|
||||
if(!opal_list_is_empty(frag_list)) {
|
||||
int qp;
|
||||
size_t total_length;
|
||||
opal_list_item_t *i = opal_list_get_first(frag_list);
|
||||
@ -1263,20 +1251,10 @@ static mca_btl_openib_send_frag_t *check_coalescing(opal_list_t *frag_list,
|
||||
|
||||
qp = to_base_frag(frag)->base.order;
|
||||
|
||||
if(total_length <= mca_btl_openib_component.qp_infos[qp].size) {
|
||||
/* make sure we can allocate a coalescing frag before returning success */
|
||||
*cfrag = alloc_coalesced_frag();
|
||||
if (OPAL_LIKELY(NULL != cfrag)) {
|
||||
(*cfrag)->send_frag = frag;
|
||||
(*cfrag)->sent = false;
|
||||
|
||||
opal_list_remove_first(frag_list);
|
||||
} else {
|
||||
frag = NULL;
|
||||
}
|
||||
} else {
|
||||
if(total_length <= mca_btl_openib_component.qp_infos[qp].size)
|
||||
opal_list_remove_first(frag_list);
|
||||
else
|
||||
frag = NULL;
|
||||
}
|
||||
}
|
||||
OPAL_THREAD_UNLOCK(lock);
|
||||
|
||||
@ -1303,7 +1281,7 @@ mca_btl_base_descriptor_t* mca_btl_openib_alloc(
|
||||
mca_btl_openib_module_t *obtl = (mca_btl_openib_module_t*)btl;
|
||||
int qp = frag_size_to_order(obtl, size);
|
||||
mca_btl_openib_send_frag_t *sfrag = NULL;
|
||||
mca_btl_openib_coalesced_frag_t *cfrag = NULL;
|
||||
mca_btl_openib_coalesced_frag_t *cfrag;
|
||||
|
||||
assert(qp != MCA_BTL_NO_ORDER);
|
||||
|
||||
@ -1312,25 +1290,26 @@ mca_btl_base_descriptor_t* mca_btl_openib_alloc(
|
||||
int prio = !(flags & MCA_BTL_DES_FLAGS_PRIORITY);
|
||||
|
||||
sfrag = check_coalescing(&ep->qps[qp].no_wqe_pending_frags[prio],
|
||||
&ep->endpoint_lock, ep, size, &cfrag);
|
||||
&ep->endpoint_lock, ep, size);
|
||||
|
||||
if (NULL == sfrag) {
|
||||
if(NULL == sfrag) {
|
||||
if(BTL_OPENIB_QP_TYPE_PP(qp)) {
|
||||
sfrag = check_coalescing(&ep->qps[qp].no_credits_pending_frags[prio],
|
||||
&ep->endpoint_lock, ep, size, &cfrag);
|
||||
&ep->endpoint_lock, ep, size);
|
||||
} else {
|
||||
sfrag = check_coalescing(
|
||||
&obtl->qps[qp].u.srq_qp.pending_frags[prio],
|
||||
&obtl->ib_lock, ep, size, &cfrag);
|
||||
&obtl->ib_lock, ep, size);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if (NULL == sfrag) {
|
||||
if(NULL == sfrag)
|
||||
return ib_frag_alloc((mca_btl_openib_module_t*)btl, size, order, flags);
|
||||
}
|
||||
|
||||
/* begin coalescing message */
|
||||
cfrag = alloc_coalesced_frag();
|
||||
cfrag->send_frag = sfrag;
|
||||
|
||||
/* fix up new coalescing header if this is the first coalesced frag */
|
||||
if(sfrag->hdr != sfrag->chdr) {
|
||||
@ -1364,9 +1343,10 @@ mca_btl_base_descriptor_t* mca_btl_openib_alloc(
|
||||
to_base_frag(cfrag)->segment.base.seg_addr.pval = cfrag->hdr + 1;
|
||||
to_base_frag(cfrag)->segment.base.seg_len = size;
|
||||
|
||||
/* NTH: there is no reason to append the coalesced fragment here. No more
|
||||
* fragments will be added until either send or free has been called on
|
||||
* the coalesced frag. */
|
||||
/* save coalesced fragment on a main fragment; we will need it after send
|
||||
* completion to free it and to call upper layer callback */
|
||||
opal_list_append(&sfrag->coalesced_frags, (opal_list_item_t*)cfrag);
|
||||
sfrag->coalesced_length += (size+sizeof(mca_btl_openib_header_coalesced_t));
|
||||
|
||||
to_base_frag(cfrag)->base.des_flags = flags;
|
||||
|
||||
@ -1383,6 +1363,18 @@ int mca_btl_openib_free(
|
||||
struct mca_btl_base_module_t* btl,
|
||||
mca_btl_base_descriptor_t* des)
|
||||
{
|
||||
/* is this fragment pointing at user memory? */
|
||||
if(MCA_BTL_OPENIB_FRAG_SEND_USER == openib_frag_type(des) ||
|
||||
MCA_BTL_OPENIB_FRAG_RECV_USER == openib_frag_type(des)) {
|
||||
mca_btl_openib_com_frag_t* frag = to_com_frag(des);
|
||||
|
||||
if(frag->registration != NULL) {
|
||||
btl->btl_mpool->mpool_deregister(btl->btl_mpool,
|
||||
(mca_mpool_base_registration_t*)frag->registration);
|
||||
frag->registration = NULL;
|
||||
}
|
||||
}
|
||||
|
||||
/* reset those field on free so we will not have to do it on alloc */
|
||||
to_base_frag(des)->base.des_flags = 0;
|
||||
switch(openib_frag_type(des)) {
|
||||
@ -1398,18 +1390,15 @@ int mca_btl_openib_free(
|
||||
to_send_frag(des)->hdr + 1;
|
||||
assert(!opal_list_get_size(&to_send_frag(des)->coalesced_frags));
|
||||
/* fall through */
|
||||
case MCA_BTL_OPENIB_FRAG_RECV:
|
||||
case MCA_BTL_OPENIB_FRAG_RECV_USER:
|
||||
case MCA_BTL_OPENIB_FRAG_SEND_USER:
|
||||
to_base_frag(des)->base.des_remote = NULL;
|
||||
to_base_frag(des)->base.des_remote_count = 0;
|
||||
break;
|
||||
default:
|
||||
break;
|
||||
}
|
||||
|
||||
if (openib_frag_type(des) == MCA_BTL_OPENIB_FRAG_COALESCED && !to_coalesced_frag(des)->sent) {
|
||||
mca_btl_openib_send_frag_t *sfrag = to_coalesced_frag(des)->send_frag;
|
||||
|
||||
/* the coalesced fragment would have sent the original fragment but that
|
||||
* will not happen so send the fragment now */
|
||||
mca_btl_openib_endpoint_send(to_com_frag(sfrag)->endpoint, sfrag);
|
||||
}
|
||||
|
||||
MCA_BTL_IB_FRAG_RETURN(des);
|
||||
|
||||
return OPAL_SUCCESS;
|
||||
@ -1441,6 +1430,7 @@ int mca_btl_openib_free(
|
||||
mca_btl_base_descriptor_t* mca_btl_openib_prepare_src(
|
||||
struct mca_btl_base_module_t* btl,
|
||||
struct mca_btl_base_endpoint_t* endpoint,
|
||||
mca_mpool_base_registration_t* registration,
|
||||
struct opal_convertor_t* convertor,
|
||||
uint8_t order,
|
||||
size_t reserve,
|
||||
@ -1448,6 +1438,7 @@ mca_btl_base_descriptor_t* mca_btl_openib_prepare_src(
|
||||
uint32_t flags)
|
||||
{
|
||||
mca_btl_openib_module_t *openib_btl;
|
||||
mca_btl_openib_reg_t *openib_reg;
|
||||
mca_btl_openib_com_frag_t *frag = NULL;
|
||||
struct iovec iov;
|
||||
uint32_t iov_count = 1;
|
||||
@ -1457,19 +1448,82 @@ mca_btl_base_descriptor_t* mca_btl_openib_prepare_src(
|
||||
|
||||
openib_btl = (mca_btl_openib_module_t*)btl;
|
||||
|
||||
#if OPAL_CUDA_GDR_SUPPORT
|
||||
if(opal_convertor_cuda_need_buffers(convertor) == false && 0 == reserve) {
|
||||
#else
|
||||
if(opal_convertor_need_buffers(convertor) == false && 0 == reserve) {
|
||||
#endif /* OPAL_CUDA_GDR_SUPPORT */
|
||||
/* GMS bloody HACK! */
|
||||
if(registration != NULL || max_data > btl->btl_max_send_size) {
|
||||
frag = alloc_send_user_frag();
|
||||
if(NULL == frag) {
|
||||
return NULL;
|
||||
}
|
||||
|
||||
iov.iov_len = max_data;
|
||||
iov.iov_base = NULL;
|
||||
|
||||
opal_convertor_pack(convertor, &iov, &iov_count, &max_data);
|
||||
|
||||
*size = max_data;
|
||||
|
||||
if(NULL == registration) {
|
||||
rc = btl->btl_mpool->mpool_register(btl->btl_mpool,
|
||||
iov.iov_base, max_data, 0, &registration);
|
||||
if(OPAL_SUCCESS != rc || NULL == registration) {
|
||||
MCA_BTL_IB_FRAG_RETURN(frag);
|
||||
return NULL;
|
||||
}
|
||||
/* keep track of the registration we did */
|
||||
to_com_frag(frag)->registration =
|
||||
(mca_btl_openib_reg_t*)registration;
|
||||
}
|
||||
openib_reg = (mca_btl_openib_reg_t*)registration;
|
||||
|
||||
frag->sg_entry.length = max_data;
|
||||
frag->sg_entry.lkey = openib_reg->mr->lkey;
|
||||
frag->sg_entry.addr = (uint64_t)(uintptr_t)iov.iov_base;
|
||||
|
||||
to_base_frag(frag)->base.order = order;
|
||||
to_base_frag(frag)->base.des_flags = flags;
|
||||
to_base_frag(frag)->segment.base.seg_len = max_data;
|
||||
to_base_frag(frag)->segment.base.seg_addr.lval = (uint64_t)(uintptr_t) iov.iov_base;
|
||||
to_base_frag(frag)->segment.key = frag->sg_entry.lkey;
|
||||
|
||||
assert(MCA_BTL_NO_ORDER == order);
|
||||
|
||||
BTL_VERBOSE(("frag->sg_entry.lkey = %" PRIu32 " .addr = %" PRIx64,
|
||||
frag->sg_entry.lkey, frag->sg_entry.addr));
|
||||
|
||||
return &to_base_frag(frag)->base;
|
||||
}
|
||||
}
|
||||
|
||||
assert(MCA_BTL_NO_ORDER == order);
|
||||
|
||||
if (max_data + reserve > btl->btl_max_send_size) {
|
||||
if(max_data + reserve > btl->btl_max_send_size) {
|
||||
max_data = btl->btl_max_send_size - reserve;
|
||||
}
|
||||
|
||||
frag = (mca_btl_openib_com_frag_t *) mca_btl_openib_alloc (btl, endpoint, order,
|
||||
max_data + reserve, flags);
|
||||
if (NULL == frag) {
|
||||
return NULL;
|
||||
}
|
||||
if (OPAL_UNLIKELY(0 == reserve)) {
|
||||
frag = (mca_btl_openib_com_frag_t *) ib_frag_alloc(openib_btl, max_data, order, flags);
|
||||
if(NULL == frag)
|
||||
return NULL;
|
||||
|
||||
ptr = to_base_frag(frag)->segment.base.seg_addr.pval;
|
||||
/* NTH: this frag will be used for either a get or put so we need to set the lval to be
|
||||
consistent with the usage in get and put. the pval will be restored in mca_btl_openib_free */
|
||||
ptr = to_base_frag(frag)->segment.base.seg_addr.pval;
|
||||
to_base_frag(frag)->segment.base.seg_addr.lval =
|
||||
(uint64_t)(uintptr_t) ptr;
|
||||
} else {
|
||||
frag =
|
||||
(mca_btl_openib_com_frag_t *) mca_btl_openib_alloc(btl, endpoint, order,
|
||||
max_data + reserve, flags);
|
||||
if(NULL == frag)
|
||||
return NULL;
|
||||
|
||||
ptr = to_base_frag(frag)->segment.base.seg_addr.pval;
|
||||
}
|
||||
|
||||
iov.iov_len = max_data;
|
||||
iov.iov_base = (IOVBASE_TYPE *) ( (unsigned char*) ptr + reserve );
|
||||
@ -1493,6 +1547,103 @@ mca_btl_base_descriptor_t* mca_btl_openib_prepare_src(
|
||||
return &to_base_frag(frag)->base;
|
||||
}
|
||||
|
||||
/**
|
||||
* Prepare the dst buffer
|
||||
*
|
||||
* @param btl (IN) BTL module
|
||||
* @param peer (IN) BTL peer addressing
|
||||
* prepare dest's behavior depends on the following:
|
||||
* Has a valid memory registration been passed to prepare_src?
|
||||
* if so we attempt to use the pre-registered user-buffer, if the memory registration
|
||||
* is too small (only a portion of the user buffer) then we must reregister the user buffer
|
||||
* Has the user requested the memory to be left pinned?
|
||||
* if so we insert the memory registration into a memory tree for later lookup, we
|
||||
* may also remove a previous registration if a MRU (most recently used) list of
|
||||
* registrations is full, this prevents resources from being exhausted.
|
||||
*/
|
||||
mca_btl_base_descriptor_t* mca_btl_openib_prepare_dst(
|
||||
struct mca_btl_base_module_t* btl,
|
||||
struct mca_btl_base_endpoint_t* endpoint,
|
||||
mca_mpool_base_registration_t* registration,
|
||||
struct opal_convertor_t* convertor,
|
||||
uint8_t order,
|
||||
size_t reserve,
|
||||
size_t* size,
|
||||
uint32_t flags)
|
||||
{
|
||||
mca_btl_openib_module_t *openib_btl;
|
||||
mca_btl_openib_component_t *openib_component;
|
||||
mca_btl_openib_com_frag_t *frag;
|
||||
mca_btl_openib_reg_t *openib_reg;
|
||||
uint32_t max_msg_sz;
|
||||
int rc;
|
||||
void *buffer;
|
||||
|
||||
openib_btl = (mca_btl_openib_module_t*)btl;
|
||||
openib_component = (mca_btl_openib_component_t*)btl->btl_component;
|
||||
|
||||
frag = alloc_recv_user_frag();
|
||||
if(NULL == frag) {
|
||||
return NULL;
|
||||
}
|
||||
|
||||
/* max_msg_sz is the maximum message size of the HCA (hw limitation)
|
||||
set the minimum between local max_msg_sz and the remote */
|
||||
max_msg_sz = MIN(openib_btl->ib_port_attr.max_msg_sz,
|
||||
endpoint->endpoint_btl->ib_port_attr.max_msg_sz);
|
||||
|
||||
/* check if user has explicitly limited the max message size */
|
||||
if (openib_component->max_hw_msg_size > 0 &&
|
||||
max_msg_sz > (size_t)openib_component->max_hw_msg_size) {
|
||||
max_msg_sz = openib_component->max_hw_msg_size;
|
||||
}
|
||||
|
||||
/* limit the message size to max_msg_sz */
|
||||
if (*size > (size_t)max_msg_sz) {
|
||||
*size = (size_t)max_msg_sz;
|
||||
BTL_VERBOSE(("message size limited to %" PRIsize_t "\n", *size));
|
||||
}
|
||||
|
||||
opal_convertor_get_current_pointer(convertor, &buffer);
|
||||
|
||||
if(NULL == registration){
|
||||
/* we didn't get a memory registration passed in, so we have to
|
||||
* register the region ourselves
|
||||
*/
|
||||
uint32_t mflags = 0;
|
||||
#if OPAL_CUDA_GDR_SUPPORT
|
||||
if (convertor->flags & CONVERTOR_CUDA) {
|
||||
mflags |= MCA_MPOOL_FLAGS_CUDA_GPU_MEM;
|
||||
}
|
||||
#endif /* OPAL_CUDA_GDR_SUPPORT */
|
||||
rc = btl->btl_mpool->mpool_register(btl->btl_mpool, buffer, *size, mflags,
|
||||
&registration);
|
||||
if(OPAL_SUCCESS != rc || NULL == registration) {
|
||||
MCA_BTL_IB_FRAG_RETURN(frag);
|
||||
return NULL;
|
||||
}
|
||||
/* keep track of the registration we did */
|
||||
frag->registration = (mca_btl_openib_reg_t*)registration;
|
||||
}
|
||||
openib_reg = (mca_btl_openib_reg_t*)registration;
|
||||
|
||||
frag->sg_entry.length = *size;
|
||||
frag->sg_entry.lkey = openib_reg->mr->lkey;
|
||||
frag->sg_entry.addr = (uint64_t)(uintptr_t)buffer;
|
||||
|
||||
to_base_frag(frag)->segment.base.seg_addr.lval = (uint64_t)(uintptr_t) buffer;
|
||||
to_base_frag(frag)->segment.base.seg_len = *size;
|
||||
to_base_frag(frag)->segment.key = openib_reg->mr->rkey;
|
||||
to_base_frag(frag)->base.order = order;
|
||||
to_base_frag(frag)->base.des_flags = flags;
|
||||
|
||||
BTL_VERBOSE(("frag->sg_entry.lkey = %" PRIu32 " .addr = %" PRIx64 " "
|
||||
"rkey = %" PRIu32, frag->sg_entry.lkey, frag->sg_entry.addr,
|
||||
openib_reg->mr->rkey));
|
||||
|
||||
return &to_base_frag(frag)->base;
|
||||
}
|
||||
|
||||
static int mca_btl_openib_finalize_resources(struct mca_btl_base_module_t* btl) {
|
||||
mca_btl_openib_module_t* openib_btl;
|
||||
mca_btl_openib_endpoint_t* endpoint;
|
||||
@ -1645,15 +1796,16 @@ int mca_btl_openib_sendi( struct mca_btl_base_module_t* btl,
|
||||
{
|
||||
mca_btl_openib_module_t *obtl = (mca_btl_openib_module_t*)btl;
|
||||
size_t size = payload_size + header_size;
|
||||
size_t eager_limit;
|
||||
int qp = frag_size_to_order(obtl, size),
|
||||
prio = !(flags & MCA_BTL_DES_FLAGS_PRIORITY),
|
||||
ib_rc;
|
||||
int32_t cm_return;
|
||||
bool do_rdma = false;
|
||||
ompi_free_list_item_t* item = NULL;
|
||||
mca_btl_openib_frag_t *frag;
|
||||
mca_btl_openib_header_t *hdr;
|
||||
int send_signaled;
|
||||
int rc;
|
||||
|
||||
OPAL_THREAD_LOCK(&ep->endpoint_lock);
|
||||
|
||||
@ -1675,26 +1827,45 @@ int mca_btl_openib_sendi( struct mca_btl_base_module_t* btl,
|
||||
|
||||
/* Allocate WQE */
|
||||
if(OPAL_UNLIKELY(qp_get_wqe(ep, qp) < 0)) {
|
||||
goto cant_send_wqe;
|
||||
goto no_credits_or_wqe;
|
||||
}
|
||||
|
||||
/* eager rdma or send ? Check eager rdma credits */
|
||||
/* Note: Maybe we want to implement isend only for eager rdma ?*/
|
||||
eager_limit = mca_btl_openib_component.eager_limit +
|
||||
sizeof(mca_btl_openib_header_coalesced_t) +
|
||||
sizeof(mca_btl_openib_control_header_t);
|
||||
|
||||
if(OPAL_LIKELY(size <= eager_limit)) {
|
||||
if(acquire_eager_rdma_send_credit(ep) == OPAL_SUCCESS) {
|
||||
do_rdma = true;
|
||||
}
|
||||
}
|
||||
|
||||
/* if(!do_rdma && acquire_send_credit(ep, frag) != OPAL_SUCCESS) { */
|
||||
/* Check send credits if it is no rdma */
|
||||
if(!do_rdma) {
|
||||
if(BTL_OPENIB_QP_TYPE_PP(qp)) {
|
||||
if(OPAL_UNLIKELY(OPAL_THREAD_ADD32(&ep->qps[qp].u.pp_qp.sd_credits, -1) < 0)){
|
||||
OPAL_THREAD_ADD32(&ep->qps[qp].u.pp_qp.sd_credits, 1);
|
||||
goto no_credits_or_wqe;
|
||||
}
|
||||
} else {
|
||||
if(OPAL_UNLIKELY(OPAL_THREAD_ADD32(&obtl->qps[qp].u.srq_qp.sd_credits, -1) < 0)){
|
||||
OPAL_THREAD_ADD32(&obtl->qps[qp].u.srq_qp.sd_credits, 1);
|
||||
goto no_credits_or_wqe;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/* Allocate fragment */
|
||||
OMPI_FREE_LIST_GET_MT(&obtl->device->qps[qp].send_free, item);
|
||||
if(OPAL_UNLIKELY(NULL == item)) {
|
||||
/* we don't return NULL because maybe later we will try to coalesce */
|
||||
goto cant_send_wqe;
|
||||
goto no_frags;
|
||||
}
|
||||
frag = to_base_frag(item);
|
||||
hdr = to_send_frag(item)->hdr;
|
||||
|
||||
/* eager rdma or send ? Check eager rdma credits */
|
||||
/* Note: Maybe we want to implement isend only for eager rdma ?*/
|
||||
rc = mca_btl_openib_endpoint_credit_acquire (ep, qp, prio, size, &do_rdma,
|
||||
to_send_frag(frag), false);
|
||||
if (OPAL_UNLIKELY(OPAL_SUCCESS != rc)) {
|
||||
goto cant_send_frag;
|
||||
}
|
||||
|
||||
frag->segment.base.seg_len = size;
|
||||
frag->base.order = qp;
|
||||
frag->base.des_flags = flags;
|
||||
@ -1719,6 +1890,29 @@ int mca_btl_openib_sendi( struct mca_btl_base_module_t* btl,
|
||||
assert(max_data == payload_size);
|
||||
}
|
||||
|
||||
/* Set all credits */
|
||||
BTL_OPENIB_GET_CREDITS(ep->eager_rdma_local.credits, hdr->credits);
|
||||
if(hdr->credits)
|
||||
hdr->credits |= BTL_OPENIB_RDMA_CREDITS_FLAG;
|
||||
|
||||
if(!do_rdma) {
|
||||
if(BTL_OPENIB_QP_TYPE_PP(qp) && 0 == hdr->credits) {
|
||||
BTL_OPENIB_GET_CREDITS(ep->qps[qp].u.pp_qp.rd_credits, hdr->credits);
|
||||
}
|
||||
} else {
|
||||
hdr->credits |= (qp << 11);
|
||||
}
|
||||
|
||||
BTL_OPENIB_GET_CREDITS(ep->qps[qp].u.pp_qp.cm_return, cm_return);
|
||||
/* cm_seen is only 8 bits, but cm_return is 32 bits */
|
||||
if(cm_return > 255) {
|
||||
hdr->cm_seen = 255;
|
||||
cm_return -= 255;
|
||||
OPAL_THREAD_ADD32(&ep->qps[qp].u.pp_qp.cm_return, cm_return);
|
||||
} else {
|
||||
hdr->cm_seen = cm_return;
|
||||
}
|
||||
|
||||
#if BTL_OPENIB_FAILOVER_ENABLED
|
||||
send_signaled = 1;
|
||||
#else
|
||||
@ -1726,7 +1920,7 @@ int mca_btl_openib_sendi( struct mca_btl_base_module_t* btl,
|
||||
#endif
|
||||
ib_rc = post_send(ep, to_send_frag(item), do_rdma, send_signaled);
|
||||
|
||||
if (!ib_rc) {
|
||||
if(!ib_rc) {
|
||||
if (0 == send_signaled) {
|
||||
MCA_BTL_IB_FRAG_RETURN(frag);
|
||||
}
|
||||
@ -1737,28 +1931,37 @@ int mca_btl_openib_sendi( struct mca_btl_base_module_t* btl,
|
||||
}
|
||||
#endif
|
||||
OPAL_THREAD_UNLOCK(&ep->endpoint_lock);
|
||||
|
||||
return OPAL_SUCCESS;
|
||||
}
|
||||
|
||||
/* Failed to send, do clean up all allocated resources */
|
||||
if (ep->nbo) {
|
||||
if(ep->nbo) {
|
||||
BTL_OPENIB_HEADER_NTOH(*hdr);
|
||||
}
|
||||
|
||||
mca_btl_openib_endpoint_credit_release (ep, qp, do_rdma, to_send_frag(frag));
|
||||
|
||||
cant_send_frag:
|
||||
MCA_BTL_IB_FRAG_RETURN(frag);
|
||||
cant_send_wqe:
|
||||
qp_put_wqe (ep, qp);
|
||||
if(BTL_OPENIB_IS_RDMA_CREDITS(hdr->credits)) {
|
||||
OPAL_THREAD_ADD32(&ep->eager_rdma_local.credits,
|
||||
BTL_OPENIB_CREDITS(hdr->credits));
|
||||
}
|
||||
if (!do_rdma && BTL_OPENIB_QP_TYPE_PP(qp)) {
|
||||
OPAL_THREAD_ADD32(&ep->qps[qp].u.pp_qp.rd_credits,
|
||||
hdr->credits);
|
||||
}
|
||||
no_frags:
|
||||
if(do_rdma) {
|
||||
OPAL_THREAD_ADD32(&ep->eager_rdma_remote.tokens, 1);
|
||||
} else {
|
||||
if(BTL_OPENIB_QP_TYPE_PP(qp)) {
|
||||
OPAL_THREAD_ADD32(&ep->qps[qp].u.pp_qp.sd_credits, 1);
|
||||
} else if BTL_OPENIB_QP_TYPE_SRQ(qp){
|
||||
OPAL_THREAD_ADD32(&obtl->qps[qp].u.srq_qp.sd_credits, 1);
|
||||
}
|
||||
}
|
||||
no_credits_or_wqe:
|
||||
qp_put_wqe(ep, qp);
|
||||
cant_send:
|
||||
OPAL_THREAD_UNLOCK(&ep->endpoint_lock);
|
||||
/* We can not send the data directly, so we just return descriptor */
|
||||
if (NULL != descriptor) {
|
||||
*descriptor = mca_btl_openib_alloc(btl, ep, order, size, flags);
|
||||
}
|
||||
|
||||
*descriptor = mca_btl_openib_alloc(btl, ep, order, size, flags);
|
||||
return OPAL_ERR_RESOURCE_BUSY;
|
||||
}
|
||||
/*
|
||||
@ -1778,19 +1981,11 @@ int mca_btl_openib_send(
|
||||
openib_frag_type(des) == MCA_BTL_OPENIB_FRAG_COALESCED);
|
||||
|
||||
if(openib_frag_type(des) == MCA_BTL_OPENIB_FRAG_COALESCED) {
|
||||
frag = to_coalesced_frag(des)->send_frag;
|
||||
|
||||
/* save coalesced fragment on a main fragment; we will need it after send
|
||||
* completion to free it and to call upper layer callback */
|
||||
opal_list_append(&frag->coalesced_frags, (opal_list_item_t*) des);
|
||||
frag->coalesced_length += to_coalesced_frag(des)->hdr->alloc_size +
|
||||
sizeof(mca_btl_openib_header_coalesced_t);
|
||||
|
||||
to_coalesced_frag(des)->sent = true;
|
||||
to_coalesced_frag(des)->hdr->tag = tag;
|
||||
to_coalesced_frag(des)->hdr->size = des->des_segments->seg_len;
|
||||
to_coalesced_frag(des)->hdr->size = des->des_local->seg_len;
|
||||
if(ep->nbo)
|
||||
BTL_OPENIB_HEADER_COALESCED_HTON(*to_coalesced_frag(des)->hdr);
|
||||
frag = to_coalesced_frag(des)->send_frag;
|
||||
} else {
|
||||
frag = to_send_frag(des);
|
||||
to_com_frag(des)->endpoint = ep;
|
||||
@ -1802,34 +1997,161 @@ int mca_btl_openib_send(
|
||||
return mca_btl_openib_endpoint_send(ep, frag);
|
||||
}
|
||||
|
||||
static mca_btl_base_registration_handle_t *mca_btl_openib_register_mem (mca_btl_base_module_t *btl,
|
||||
mca_btl_base_endpoint_t *endpoint,
|
||||
void *base, size_t size, uint32_t flags)
|
||||
/*
|
||||
* RDMA WRITE local buffer to remote buffer address.
|
||||
*/
|
||||
|
||||
int mca_btl_openib_put( mca_btl_base_module_t* btl,
|
||||
mca_btl_base_endpoint_t* ep,
|
||||
mca_btl_base_descriptor_t* descriptor)
|
||||
{
|
||||
mca_btl_openib_reg_t *reg;
|
||||
uint32_t mflags = 0;
|
||||
int rc;
|
||||
mca_btl_openib_segment_t *src_seg = (mca_btl_openib_segment_t *) descriptor->des_local;
|
||||
mca_btl_openib_segment_t *dst_seg = (mca_btl_openib_segment_t *) descriptor->des_remote;
|
||||
struct ibv_send_wr* bad_wr;
|
||||
mca_btl_openib_out_frag_t* frag = to_out_frag(descriptor);
|
||||
int qp = descriptor->order;
|
||||
uint64_t rem_addr = dst_seg->base.seg_addr.lval;
|
||||
uint32_t rkey = dst_seg->key;
|
||||
|
||||
#if OPAL_CUDA_GDR_SUPPORT
|
||||
if (flags & MCA_BTL_REG_FLAG_CUDA_GPU_MEM) {
|
||||
mflags |= MCA_MPOOL_FLAGS_CUDA_GPU_MEM;
|
||||
}
|
||||
#endif /* OPAL_CUDA_GDR_SUPPORT */
|
||||
assert(openib_frag_type(frag) == MCA_BTL_OPENIB_FRAG_SEND_USER ||
|
||||
openib_frag_type(frag) == MCA_BTL_OPENIB_FRAG_SEND);
|
||||
|
||||
rc = btl->btl_mpool->mpool_register (btl->btl_mpool, base, size, mflags,
|
||||
(mca_mpool_base_registration_t **) &reg);
|
||||
if (OPAL_UNLIKELY(OPAL_SUCCESS != rc || NULL == reg)) {
|
||||
return NULL;
|
||||
descriptor->des_flags |= MCA_BTL_DES_SEND_ALWAYS_CALLBACK;
|
||||
|
||||
if(ep->endpoint_state != MCA_BTL_IB_CONNECTED) {
|
||||
int rc;
|
||||
OPAL_THREAD_LOCK(&ep->endpoint_lock);
|
||||
rc = check_endpoint_state(ep, descriptor, &ep->pending_put_frags);
|
||||
OPAL_THREAD_UNLOCK(&ep->endpoint_lock);
|
||||
if(OPAL_ERR_RESOURCE_BUSY == rc)
|
||||
return OPAL_SUCCESS;
|
||||
if(OPAL_SUCCESS != rc)
|
||||
return rc;
|
||||
}
|
||||
|
||||
return &reg->btl_handle;
|
||||
if(MCA_BTL_NO_ORDER == qp)
|
||||
qp = mca_btl_openib_component.rdma_qp;
|
||||
|
||||
/* check for a send wqe */
|
||||
if (qp_get_wqe(ep, qp) < 0) {
|
||||
qp_put_wqe(ep, qp);
|
||||
OPAL_THREAD_LOCK(&ep->endpoint_lock);
|
||||
opal_list_append(&ep->pending_put_frags, (opal_list_item_t*)frag);
|
||||
OPAL_THREAD_UNLOCK(&ep->endpoint_lock);
|
||||
return OPAL_SUCCESS;
|
||||
}
|
||||
/* post descriptor */
|
||||
#if OPAL_ENABLE_HETEROGENEOUS_SUPPORT
|
||||
if((ep->endpoint_proc->proc_opal->proc_arch & OPAL_ARCH_ISBIGENDIAN)
|
||||
!= (opal_proc_local_get()->proc_arch & OPAL_ARCH_ISBIGENDIAN)) {
|
||||
rem_addr = opal_swap_bytes8(rem_addr);
|
||||
rkey = opal_swap_bytes4(rkey);
|
||||
}
|
||||
#endif
|
||||
frag->sr_desc.wr.rdma.remote_addr = rem_addr;
|
||||
frag->sr_desc.wr.rdma.rkey = rkey;
|
||||
|
||||
to_com_frag(frag)->sg_entry.addr = src_seg->base.seg_addr.lval;
|
||||
to_com_frag(frag)->sg_entry.length = src_seg->base.seg_len;
|
||||
to_com_frag(frag)->endpoint = ep;
|
||||
#if HAVE_XRC
|
||||
if (MCA_BTL_XRC_ENABLED && BTL_OPENIB_QP_TYPE_XRC(qp))
|
||||
frag->sr_desc.xrc_remote_srq_num=ep->rem_info.rem_srqs[qp].rem_srq_num;
|
||||
#endif
|
||||
|
||||
descriptor->order = qp;
|
||||
/* Setting opcode on a frag constructor isn't enough since prepare_src
|
||||
* may return send_frag instead of put_frag */
|
||||
frag->sr_desc.opcode = IBV_WR_RDMA_WRITE;
|
||||
frag->sr_desc.send_flags = ib_send_flags(src_seg->base.seg_len, &(ep->qps[qp]), 1);
|
||||
|
||||
qp_inflight_wqe_to_frag(ep, qp, to_com_frag(frag));
|
||||
qp_reset_signal_count(ep, qp);
|
||||
|
||||
if(ibv_post_send(ep->qps[qp].qp->lcl_qp, &frag->sr_desc, &bad_wr))
|
||||
return OPAL_ERROR;
|
||||
|
||||
return OPAL_SUCCESS;
|
||||
}
|
||||
|
||||
static int mca_btl_openib_deregister_mem (mca_btl_base_module_t *btl, mca_btl_base_registration_handle_t *handle)
|
||||
{
|
||||
mca_btl_openib_reg_t *reg = (mca_btl_openib_reg_t *)((intptr_t) handle - offsetof (mca_btl_openib_reg_t, btl_handle));
|
||||
/*
|
||||
* RDMA READ remote buffer to local buffer address.
|
||||
*/
|
||||
|
||||
btl->btl_mpool->mpool_deregister (btl->btl_mpool, (mca_mpool_base_registration_t *) reg);
|
||||
int mca_btl_openib_get(mca_btl_base_module_t* btl,
|
||||
mca_btl_base_endpoint_t* ep,
|
||||
mca_btl_base_descriptor_t* descriptor)
|
||||
{
|
||||
mca_btl_openib_segment_t *src_seg = (mca_btl_openib_segment_t *) descriptor->des_remote;
|
||||
mca_btl_openib_segment_t *dst_seg = (mca_btl_openib_segment_t *) descriptor->des_local;
|
||||
struct ibv_send_wr* bad_wr;
|
||||
mca_btl_openib_get_frag_t* frag = to_get_frag(descriptor);
|
||||
int qp = descriptor->order;
|
||||
uint64_t rem_addr = src_seg->base.seg_addr.lval;
|
||||
uint32_t rkey = src_seg->key;
|
||||
|
||||
assert(openib_frag_type(frag) == MCA_BTL_OPENIB_FRAG_RECV_USER);
|
||||
|
||||
descriptor->des_flags |= MCA_BTL_DES_SEND_ALWAYS_CALLBACK;
|
||||
|
||||
if(ep->endpoint_state != MCA_BTL_IB_CONNECTED) {
|
||||
int rc;
|
||||
OPAL_THREAD_LOCK(&ep->endpoint_lock);
|
||||
rc = check_endpoint_state(ep, descriptor, &ep->pending_get_frags);
|
||||
OPAL_THREAD_UNLOCK(&ep->endpoint_lock);
|
||||
if(OPAL_ERR_RESOURCE_BUSY == rc)
|
||||
return OPAL_SUCCESS;
|
||||
if(OPAL_SUCCESS != rc)
|
||||
return rc;
|
||||
}
|
||||
|
||||
if(MCA_BTL_NO_ORDER == qp)
|
||||
qp = mca_btl_openib_component.rdma_qp;
|
||||
|
||||
/* check for a send wqe */
|
||||
if (qp_get_wqe(ep, qp) < 0) {
|
||||
qp_put_wqe(ep, qp);
|
||||
OPAL_THREAD_LOCK(&ep->endpoint_lock);
|
||||
opal_list_append(&ep->pending_get_frags, (opal_list_item_t*)frag);
|
||||
OPAL_THREAD_UNLOCK(&ep->endpoint_lock);
|
||||
return OPAL_SUCCESS;
|
||||
}
|
||||
|
||||
/* check for a get token */
|
||||
if(OPAL_THREAD_ADD32(&ep->get_tokens,-1) < 0) {
|
||||
qp_put_wqe(ep, qp);
|
||||
OPAL_THREAD_ADD32(&ep->get_tokens,1);
|
||||
OPAL_THREAD_LOCK(&ep->endpoint_lock);
|
||||
opal_list_append(&ep->pending_get_frags, (opal_list_item_t*)frag);
|
||||
OPAL_THREAD_UNLOCK(&ep->endpoint_lock);
|
||||
return OPAL_SUCCESS;
|
||||
}
|
||||
|
||||
#if OPAL_ENABLE_HETEROGENEOUS_SUPPORT
|
||||
if((ep->endpoint_proc->proc_opal->proc_arch & OPAL_ARCH_ISBIGENDIAN)
|
||||
!= (opal_proc_local_get()->proc_arch & OPAL_ARCH_ISBIGENDIAN)) {
|
||||
rem_addr = opal_swap_bytes8(rem_addr);
|
||||
rkey = opal_swap_bytes4(rkey);
|
||||
}
|
||||
#endif
|
||||
frag->sr_desc.wr.rdma.remote_addr = rem_addr;
|
||||
frag->sr_desc.wr.rdma.rkey = rkey;
|
||||
|
||||
to_com_frag(frag)->sg_entry.addr = dst_seg->base.seg_addr.lval;
|
||||
to_com_frag(frag)->sg_entry.length = dst_seg->base.seg_len;
|
||||
to_com_frag(frag)->endpoint = ep;
|
||||
|
||||
#if HAVE_XRC
|
||||
if (MCA_BTL_XRC_ENABLED && BTL_OPENIB_QP_TYPE_XRC(qp))
|
||||
frag->sr_desc.xrc_remote_srq_num=ep->rem_info.rem_srqs[qp].rem_srq_num;
|
||||
#endif
|
||||
descriptor->order = qp;
|
||||
|
||||
qp_inflight_wqe_to_frag(ep, qp, to_com_frag(frag));
|
||||
qp_reset_signal_count(ep, qp);
|
||||
|
||||
if(ibv_post_send(ep->qps[qp].qp->lcl_qp, &frag->sr_desc, &bad_wr))
|
||||
return OPAL_ERROR;
|
||||
|
||||
return OPAL_SUCCESS;
|
||||
}
|
||||
|
@ -126,7 +126,10 @@ struct mca_btl_openib_qp_info_t {
|
||||
(BTL_OPENIB_QP_TYPE(Q) == MCA_BTL_OPENIB_XRC_QP)
|
||||
|
||||
typedef enum {
|
||||
BTL_OPENIB_RQ_SOURCE_DEVICE_INI = MCA_BASE_VAR_SOURCE_MAX,
|
||||
BTL_OPENIB_RQ_SOURCE_DEFAULT,
|
||||
BTL_OPENIB_RQ_SOURCE_MCA,
|
||||
BTL_OPENIB_RQ_SOURCE_DEVICE_INI,
|
||||
BTL_OPENIB_RQ_SOURCE_MAX
|
||||
} btl_openib_receive_queues_source_t;
|
||||
|
||||
typedef enum {
|
||||
@ -494,15 +497,9 @@ typedef struct mca_btl_openib_module_t mca_btl_openib_module_t;
|
||||
|
||||
extern mca_btl_openib_module_t mca_btl_openib_module;
|
||||
|
||||
struct mca_btl_base_registration_handle_t {
|
||||
uint32_t rkey;
|
||||
uint32_t lkey;
|
||||
};
|
||||
|
||||
struct mca_btl_openib_reg_t {
|
||||
mca_mpool_base_registration_t base;
|
||||
struct ibv_mr *mr;
|
||||
mca_btl_base_registration_handle_t btl_handle;
|
||||
};
|
||||
typedef struct mca_btl_openib_reg_t mca_btl_openib_reg_t;
|
||||
|
||||
@ -615,182 +612,32 @@ extern int mca_btl_openib_sendi( struct mca_btl_base_module_t* btl,
|
||||
mca_btl_base_descriptor_t** descriptor
|
||||
);
|
||||
|
||||
/* forward declaration for internal put/get */
|
||||
struct mca_btl_openib_put_frag_t;
|
||||
struct mca_btl_openib_get_frag_t;
|
||||
|
||||
/**
|
||||
* @brief Schedule a put fragment with the HCA (internal)
|
||||
* PML->BTL Initiate a put of the specified size.
|
||||
*
|
||||
* @param btl (IN) BTL instance
|
||||
* @param ep (IN) BTL endpoint
|
||||
* @param frag (IN) Fragment prepared by mca_btl_openib_put
|
||||
*
|
||||
* If the fragment can not be scheduled due to resource limitations then
|
||||
* the fragment will be put on the pending put fragment list and retried
|
||||
* when another get/put fragment has completed.
|
||||
* @param btl_peer (IN) BTL peer addressing
|
||||
* @param descriptor (IN) Descriptor of data to be transmitted.
|
||||
*/
|
||||
int mca_btl_openib_put_internal (mca_btl_base_module_t *btl, struct mca_btl_base_endpoint_t *ep,
|
||||
struct mca_btl_openib_put_frag_t *frag);
|
||||
extern int mca_btl_openib_put(
|
||||
struct mca_btl_base_module_t* btl,
|
||||
struct mca_btl_base_endpoint_t* btl_peer,
|
||||
struct mca_btl_base_descriptor_t* descriptor
|
||||
);
|
||||
|
||||
/**
|
||||
* @brief Schedule an RDMA write with the HCA
|
||||
* PML->BTL Initiate a get of the specified size.
|
||||
*
|
||||
* @param btl (IN) BTL instance
|
||||
* @param ep (IN) BTL endpoint
|
||||
* @param local_address (IN) Source address
|
||||
* @param remote_address (IN) Destination address
|
||||
* @param local_handle (IN) Registration handle for region containing the region {local_address, size}
|
||||
* @param remote_handle (IN) Registration handle for region containing the region {remote_address, size}
|
||||
* @param size (IN) Number of bytes to write
|
||||
* @param flags (IN) Transfer flags
|
||||
* @param order (IN) Ordering
|
||||
* @param cbfunc (IN) Function to call on completion
|
||||
* @param cbcontext (IN) Context for completion callback
|
||||
* @param cbdata (IN) Data for completion callback
|
||||
*
|
||||
* @return OPAL_ERR_BAD_PARAM if a bad parameter was passed
|
||||
* @return OPAL_SUCCESS if the operation was successfully scheduled
|
||||
*
|
||||
* This function will attempt to schedule a put operation with the HCA.
|
||||
* @param btl_base_peer (IN) BTL peer addressing
|
||||
* @param descriptor (IN) Descriptor of data to be transmitted.
|
||||
*/
|
||||
int mca_btl_openib_put (mca_btl_base_module_t *btl, struct mca_btl_base_endpoint_t *endpoint, void *local_address,
|
||||
uint64_t remote_address, mca_btl_base_registration_handle_t *local_handle,
|
||||
mca_btl_base_registration_handle_t *remote_handle, size_t size, int flags,
|
||||
int order, mca_btl_base_rdma_completion_fn_t cbfunc, void *cbcontext, void *cbdata);
|
||||
extern int mca_btl_openib_get(
|
||||
struct mca_btl_base_module_t* btl,
|
||||
struct mca_btl_base_endpoint_t* btl_peer,
|
||||
struct mca_btl_base_descriptor_t* descriptor
|
||||
);
|
||||
|
||||
/**
|
||||
* @brief Schedule a get fragment with the HCA (internal)
|
||||
*
|
||||
* @param btl (IN) BTL instance
|
||||
* @param ep (IN) BTL endpoint
|
||||
* @param qp (IN) ID of queue pair to schedule the get on
|
||||
* @param frag (IN) Fragment prepared by mca_btl_openib_get
|
||||
*
|
||||
* If the fragment can not be scheduled due to resource limitations then
|
||||
* the fragment will be put on the pending get fragment list and retried
|
||||
* when another get/put fragment has completed.
|
||||
*/
|
||||
int mca_btl_openib_get_internal (mca_btl_base_module_t *btl, struct mca_btl_base_endpoint_t *ep,
|
||||
struct mca_btl_openib_get_frag_t *frag);
|
||||
|
||||
/**
|
||||
* @brief Schedule an RDMA read with the HCA
|
||||
*
|
||||
* @param btl (IN) BTL instance
|
||||
* @param ep (IN) BTL endpoint
|
||||
* @param local_address (IN) Destination address
|
||||
* @param remote_address (IN) Source address
|
||||
* @param local_handle (IN) Registration handle for region containing the region {local_address, size}
|
||||
* @param remote_handle (IN) Registration handle for region containing the region {remote_address, size}
|
||||
* @param size (IN) Number of bytes to read
|
||||
* @param flags (IN) Transfer flags
|
||||
* @param order (IN) Ordering
|
||||
* @param cbfunc (IN) Function to call on completion
|
||||
* @param cbcontext (IN) Context for completion callback
|
||||
* @param cbdata (IN) Data for completion callback
|
||||
*
|
||||
* @return OPAL_ERR_BAD_PARAM if a bad parameter was passed
|
||||
* @return OPAL_SUCCESS if the operation was successfully scheduled
|
||||
*
|
||||
* This function will attempt to schedule a get operation with the HCA.
|
||||
*/
|
||||
int mca_btl_openib_get (mca_btl_base_module_t *btl, struct mca_btl_base_endpoint_t *endpoint, void *local_address,
|
||||
uint64_t remote_address, mca_btl_base_registration_handle_t *local_handle,
|
||||
mca_btl_base_registration_handle_t *remote_handle, size_t size, int flags,
|
||||
int order, mca_btl_base_rdma_completion_fn_t cbfunc, void *cbcontext, void *cbdata);
|
||||
|
||||
/**
|
||||
* Initiate an asynchronous fetching atomic operation.
|
||||
* Completion Semantics: if this function returns a 1 then the operation
|
||||
* is complete. a return of OPAL_SUCCESS indicates
|
||||
* the atomic operation has been queued with the
|
||||
* network.
|
||||
*
|
||||
* @param btl (IN) BTL module
|
||||
* @param endpoint (IN) BTL addressing information
|
||||
* @param local_address (OUT) Local address to store the result in
|
||||
* @param remote_address (IN) Remote address to perform the operation on (registered remotely)
|
||||
* @param local_handle (IN) Local registration handle for region containing
|
||||
* (local_address, local_address + 8)
|
||||
* @param remote_handle (IN) Remote registration handle for region containing
|
||||
* (remote_address, remote_address + 8)
|
||||
* @param op (IN) Operation to perform
|
||||
* @param operand (IN) Operand for the operation
|
||||
* @param flags (IN) Flags for this put operation
|
||||
* @param order (IN) Ordering
|
||||
* @param cbfunc (IN) Function to call on completion (if queued)
|
||||
* @param cbcontext (IN) Context for the callback
|
||||
* @param cbdata (IN) Data for callback
|
||||
*
|
||||
* @retval OPAL_SUCCESS The operation was successfully queued
|
||||
* @retval 1 The operation is complete
|
||||
* @retval OPAL_ERROR The operation was NOT successfully queued
|
||||
* @retval OPAL_ERR_OUT_OF_RESOURCE Insufficient resources to queue the atomic
|
||||
* operation. Try again later
|
||||
* @retval OPAL_ERR_NOT_AVAILABLE Atomic operation can not be performed due to
|
||||
* alignment restrictions or the operation {op} is not supported
|
||||
* by the hardware.
|
||||
*
|
||||
* After the operation is complete the remote address specified by {remote_address} and
|
||||
* {remote_handle} will be updated with (*remote_address) = (*remote_address) op operand.
|
||||
* {local_address} will be updated with the previous value stored in {remote_address}.
|
||||
* The btl will guarantee consistency of atomic operations performed via the btl. Note,
|
||||
* however, that not all btls will provide consistency between btl atomic operations and
|
||||
* cpu atomics.
|
||||
*/
|
||||
int mca_btl_openib_atomic_fop (struct mca_btl_base_module_t *btl, struct mca_btl_base_endpoint_t *endpoint,
|
||||
void *local_address, uint64_t remote_address,
|
||||
struct mca_btl_base_registration_handle_t *local_handle,
|
||||
struct mca_btl_base_registration_handle_t *remote_handle, mca_btl_base_atomic_op_t op,
|
||||
int64_t operand, int flags, int order, mca_btl_base_rdma_completion_fn_t cbfunc,
|
||||
void *cbcontext, void *cbdata);
|
||||
|
||||
/**
|
||||
* Initiate an asynchronous compare and swap operation.
|
||||
* Completion Semantics: if this function returns a 1 then the operation
|
||||
* is complete. a return of OPAL_SUCCESS indicates
|
||||
* the atomic operation has been queued with the
|
||||
* network.
|
||||
*
|
||||
* @param btl (IN) BTL module
|
||||
* @param endpoint (IN) BTL addressing information
|
||||
* @param local_address (OUT) Local address to store the result in
|
||||
* @param remote_address (IN) Remote address to perform the operation on (registered remotely)
|
||||
* @param local_handle (IN) Local registration handle for region containing
|
||||
* (local_address, local_address + 8)
|
||||
* @param remote_handle (IN) Remote registration handle for region containing
|
||||
* (remote_address, remote_address + 8)
|
||||
* @param compare (IN) Operand for the operation
|
||||
* @param value (IN) Value to store on success
|
||||
* @param flags (IN) Flags for this put operation
|
||||
* @param order (IN) Ordering
|
||||
* @param cbfunc (IN) Function to call on completion (if queued)
|
||||
* @param cbcontext (IN) Context for the callback
|
||||
* @param cbdata (IN) Data for callback
|
||||
*
|
||||
* @retval OPAL_SUCCESS The operation was successfully queued
|
||||
* @retval 1 The operation is complete
|
||||
* @retval OPAL_ERROR The operation was NOT successfully queued
|
||||
* @retval OPAL_ERR_OUT_OF_RESOURCE Insufficient resources to queue the atomic
|
||||
* operation. Try again later
|
||||
* @retval OPAL_ERR_NOT_AVAILABLE Atomic operation can not be performed due to
|
||||
* alignment restrictions or the operation {op} is not supported
|
||||
* by the hardware.
|
||||
*
|
||||
* After the operation is complete the remote address specified by {remote_address} and
|
||||
* {remote_handle} will be updated with {value} if *remote_address == compare.
|
||||
* {local_address} will be updated with the previous value stored in {remote_address}.
|
||||
* The btl will guarantee consistency of atomic operations performed via the btl. Note,
|
||||
* however, that not all btls will provide consistency between btl atomic operations and
|
||||
* cpu atomics.
|
||||
*/
|
||||
int mca_btl_openib_atomic_cswap (struct mca_btl_base_module_t *btl, struct mca_btl_base_endpoint_t *endpoint,
|
||||
void *local_address, uint64_t remote_address,
|
||||
struct mca_btl_base_registration_handle_t *local_handle,
|
||||
struct mca_btl_base_registration_handle_t *remote_handle, int64_t compare,
|
||||
int64_t value, int flags, int order, mca_btl_base_rdma_completion_fn_t cbfunc,
|
||||
void *cbcontext, void *cbdata);
|
||||
|
||||
/**
|
||||
* Allocate a descriptor.
|
||||
@ -827,6 +674,7 @@ extern int mca_btl_openib_free(
|
||||
mca_btl_base_descriptor_t* mca_btl_openib_prepare_src(
|
||||
struct mca_btl_base_module_t* btl,
|
||||
struct mca_btl_base_endpoint_t* peer,
|
||||
mca_mpool_base_registration_t* registration,
|
||||
struct opal_convertor_t* convertor,
|
||||
uint8_t order,
|
||||
size_t reserve,
|
||||
@ -834,6 +682,22 @@ mca_btl_base_descriptor_t* mca_btl_openib_prepare_src(
|
||||
uint32_t flags
|
||||
);
|
||||
|
||||
/**
|
||||
* Allocate a descriptor initialized for RDMA write.
|
||||
*
|
||||
* @param btl (IN) BTL module
|
||||
* @param peer (IN) BTL peer addressing
|
||||
*/
|
||||
extern mca_btl_base_descriptor_t* mca_btl_openib_prepare_dst(
|
||||
struct mca_btl_base_module_t* btl,
|
||||
struct mca_btl_base_endpoint_t* peer,
|
||||
mca_mpool_base_registration_t* registration,
|
||||
struct opal_convertor_t* convertor,
|
||||
uint8_t order,
|
||||
size_t reserve,
|
||||
size_t* size,
|
||||
uint32_t flags);
|
||||
|
||||
extern void mca_btl_openib_frag_progress_pending_put_get(
|
||||
struct mca_btl_base_endpoint_t*, const int);
|
||||
|
||||
|
@ -1,135 +0,0 @@
|
||||
/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */
|
||||
/*
|
||||
* Copyright (c) 2014 Los Alamos National Security, LLC. All rights
|
||||
* reserved.
|
||||
*
|
||||
* $COPYRIGHT$
|
||||
*
|
||||
* Additional copyrights may follow
|
||||
*
|
||||
* $HEADER$
|
||||
*/
|
||||
|
||||
#include "btl_openib.h"
|
||||
#include "btl_openib_endpoint.h"
|
||||
#include "btl_openib_xrc.h"
|
||||
|
||||
#if HAVE_DECL_IBV_ATOMIC_HCA
|
||||
|
||||
/**
 * Build and schedule an 8-byte atomic work request (shared by fetch-and-add
 * and compare-and-swap).
 *
 * @param btl            (IN)  BTL module, forwarded to mca_btl_openib_get_internal()
 * @param endpoint       (IN)  BTL endpoint the operation targets
 * @param local_address  (OUT) Local buffer used as the scatter-gather entry
 * @param remote_address (IN)  Remote address stored in wr.atomic.remote_addr
 * @param local_handle   (IN)  Local registration handle (supplies the lkey)
 * @param remote_handle  (IN)  Remote registration handle (supplies the rkey)
 * @param opcode         (IN)  Verbs opcode stored in the send work request
 * @param operand        (IN)  Value stored in wr.atomic.compare_add
 * @param operand2       (IN)  Value stored in wr.atomic.swap; kept 64 bits wide
 *                             so a compare-and-swap value is not truncated
 * @param flags          (IN)  Not referenced by this function
 * @param order          (IN)  Queue pair index, or MCA_BTL_NO_ORDER to use
 *                             the component's rdma_qp
 * @param cbfunc         (IN)  Completion callback stored in the fragment
 * @param cbcontext      (IN)  Context stored for the callback
 * @param cbdata         (IN)  Data stored for the callback
 *
 * @return OPAL_SUCCESS if the operation was scheduled or queued on
 *         endpoint->pending_get_frags
 * @return OPAL_ERR_OUT_OF_RESOURCE if no fragment could be allocated
 * @return any other error from check_endpoint_state() or
 *         mca_btl_openib_get_internal(); the fragment is returned in that case
 */
static int mca_btl_openib_atomic_internal (struct mca_btl_base_module_t *btl, struct mca_btl_base_endpoint_t *endpoint,
                                           void *local_address, uint64_t remote_address, mca_btl_base_registration_handle_t *local_handle,
                                           mca_btl_base_registration_handle_t *remote_handle, enum ibv_wr_opcode opcode,
                                           int64_t operand, int64_t operand2, int flags, int order, mca_btl_base_rdma_completion_fn_t cbfunc,
                                           void *cbcontext, void *cbdata)
{
    mca_btl_openib_get_frag_t* frag = NULL;
    int qp = order;
    int rc;

    frag = to_get_frag(alloc_recv_user_frag());
    if (OPAL_UNLIKELY(NULL == frag)) {
        return OPAL_ERR_OUT_OF_RESOURCE;
    }

    if (MCA_BTL_NO_ORDER == qp) {
        qp = mca_btl_openib_component.rdma_qp;
    }

    /* set base descriptor flags */
    to_base_frag(frag)->base.order = qp;
    /* free this descriptor when the operation is complete */
    to_base_frag(frag)->base.des_flags = MCA_BTL_DES_FLAGS_BTL_OWNERSHIP;

    /* set up scatter-gather entry: the result is always 8 bytes */
    to_com_frag(frag)->sg_entry.length = 8;
    to_com_frag(frag)->sg_entry.lkey = local_handle->lkey;
    to_com_frag(frag)->sg_entry.addr = (uint64_t)(uintptr_t) local_address;
    to_com_frag(frag)->endpoint = endpoint;

    /* set up rdma callback */
    frag->cb.func = cbfunc;
    frag->cb.context = cbcontext;
    frag->cb.data = cbdata;
    frag->cb.local_handle = local_handle;

    /* set up descriptor */
    frag->sr_desc.wr.atomic.remote_addr = remote_address;
    frag->sr_desc.opcode = opcode;
    frag->sr_desc.wr.atomic.compare_add = operand;
    frag->sr_desc.wr.atomic.swap = operand2;

#if OPAL_ENABLE_HETEROGENEOUS_SUPPORT
    /* peer has the opposite endianness: byte-swap the remote key */
    if ((endpoint->endpoint_proc->proc_opal->proc_arch & OPAL_ARCH_ISBIGENDIAN)
        != (opal_proc_local_get()->proc_arch & OPAL_ARCH_ISBIGENDIAN)) {
        frag->sr_desc.wr.atomic.rkey = opal_swap_bytes4 (remote_handle->rkey);
    } else
#endif
    {
        frag->sr_desc.wr.atomic.rkey = remote_handle->rkey;
    }

#if HAVE_XRC
    if (MCA_BTL_XRC_ENABLED && BTL_OPENIB_QP_TYPE_XRC(qp)) {
        frag->sr_desc.xrc_remote_srq_num=endpoint->rem_info.rem_srqs[qp].rem_srq_num;
    }
#endif

    if (endpoint->endpoint_state != MCA_BTL_IB_CONNECTED) {
        OPAL_THREAD_LOCK(&endpoint->endpoint_lock);
        rc = check_endpoint_state(endpoint, &to_base_frag(frag)->base, &endpoint->pending_get_frags);
        OPAL_THREAD_UNLOCK(&endpoint->endpoint_lock);
        /* RESOURCE_BUSY is reported to the caller as success */
        if (OPAL_ERR_RESOURCE_BUSY == rc) {
            return OPAL_SUCCESS;
        }

        if (OPAL_SUCCESS != rc) {
            MCA_BTL_IB_FRAG_RETURN (frag);
            return rc;
        }
    }

    rc = mca_btl_openib_get_internal (btl, endpoint, frag);
    if (OPAL_UNLIKELY(OPAL_SUCCESS != rc)) {
        if (OPAL_LIKELY(OPAL_ERR_OUT_OF_RESOURCE == rc)) {
            /* out of resources is not fatal: park the fragment for a retry */
            rc = OPAL_SUCCESS;

            OPAL_THREAD_SCOPED_LOCK(&endpoint->endpoint_lock,
                                    opal_list_append(&endpoint->pending_get_frags, (opal_list_item_t*)frag));
        } else {
            MCA_BTL_IB_FRAG_RETURN (frag);
        }
    }

    return rc;
}
|
||||
|
||||
/**
 * Fetching atomic operation entry point.
 *
 * Only MCA_BTL_ATOMIC_ADD is handled; it is issued as an
 * IBV_WR_ATOMIC_FETCH_AND_ADD work request with {operand} as the addend.
 * Any other {op} is rejected with OPAL_ERR_NOT_SUPPORTED.
 *
 * @return OPAL_ERR_NOT_SUPPORTED for any op other than MCA_BTL_ATOMIC_ADD,
 *         otherwise the result of mca_btl_openib_atomic_internal()
 */
int mca_btl_openib_atomic_fop (struct mca_btl_base_module_t *btl, struct mca_btl_base_endpoint_t *endpoint,
                               void *local_address, uint64_t remote_address,
                               struct mca_btl_base_registration_handle_t *local_handle,
                               struct mca_btl_base_registration_handle_t *remote_handle, mca_btl_base_atomic_op_t op,
                               int64_t operand, int flags, int order, mca_btl_base_rdma_completion_fn_t cbfunc,
                               void *cbcontext, void *cbdata)
{
    int status = OPAL_ERR_NOT_SUPPORTED;

    if (OPAL_LIKELY(MCA_BTL_ATOMIC_ADD == op)) {
        /* the second operand is unused for fetch-and-add, pass 0 */
        status = mca_btl_openib_atomic_internal (btl, endpoint, local_address,
                                                 remote_address, local_handle,
                                                 remote_handle,
                                                 IBV_WR_ATOMIC_FETCH_AND_ADD,
                                                 operand, 0, flags, order,
                                                 cbfunc, cbcontext, cbdata);
    }

    return status;
}
|
||||
|
||||
/**
 * Compare-and-swap entry point.
 *
 * Issues an IBV_WR_ATOMIC_CMP_AND_SWP work request through
 * mca_btl_openib_atomic_internal(), passing {compare} as the first operand
 * and {value} as the second.
 *
 * @return the result of mca_btl_openib_atomic_internal()
 */
int mca_btl_openib_atomic_cswap (struct mca_btl_base_module_t *btl, struct mca_btl_base_endpoint_t *endpoint,
                                 void *local_address, uint64_t remote_address,
                                 struct mca_btl_base_registration_handle_t *local_handle,
                                 struct mca_btl_base_registration_handle_t *remote_handle, int64_t compare,
                                 int64_t value, int flags, int order, mca_btl_base_rdma_completion_fn_t cbfunc,
                                 void *cbcontext, void *cbdata)
{
    int status;

    status = mca_btl_openib_atomic_internal (btl, endpoint, local_address,
                                             remote_address, local_handle,
                                             remote_handle,
                                             IBV_WR_ATOMIC_CMP_AND_SWP,
                                             compare, value, flags, order,
                                             cbfunc, cbcontext, cbdata);
    return status;
}
|
||||
|
||||
#endif
|
@ -471,7 +471,7 @@ static void btl_openib_control(mca_btl_base_module_t* btl,
|
||||
mca_btl_openib_header_coalesced_t *clsc_hdr =
|
||||
(mca_btl_openib_header_coalesced_t*)(ctl_hdr + 1);
|
||||
mca_btl_active_message_callback_t* reg;
|
||||
size_t len = des->des_segments->seg_len - sizeof(*ctl_hdr);
|
||||
size_t len = des->des_local->seg_len - sizeof(*ctl_hdr);
|
||||
|
||||
switch (ctl_hdr->type) {
|
||||
case MCA_BTL_OPENIB_CONTROL_CREDITS:
|
||||
@ -522,8 +522,8 @@ static void btl_openib_control(mca_btl_base_module_t* btl,
|
||||
|
||||
skip = (sizeof(*clsc_hdr) + clsc_hdr->alloc_size - pad);
|
||||
|
||||
tmp_des.des_segments = &tmp_seg;
|
||||
tmp_des.des_segment_count = 1;
|
||||
tmp_des.des_local = &tmp_seg;
|
||||
tmp_des.des_local_count = 1;
|
||||
tmp_seg.seg_addr.pval = clsc_hdr + 1;
|
||||
tmp_seg.seg_len = clsc_hdr->size;
|
||||
|
||||
@ -583,10 +583,6 @@ static int openib_reg_mr(void *reg_data, void *base, size_t size,
|
||||
enum ibv_access_flags access_flag = (enum ibv_access_flags) (IBV_ACCESS_LOCAL_WRITE |
|
||||
IBV_ACCESS_REMOTE_WRITE | IBV_ACCESS_REMOTE_READ);
|
||||
|
||||
#if HAVE_DECL_IBV_ATOMIC_HCA
|
||||
access_flag |= IBV_ACCESS_REMOTE_ATOMIC;
|
||||
#endif
|
||||
|
||||
if (device->mem_reg_max &&
|
||||
device->mem_reg_max < (device->mem_reg_active + size)) {
|
||||
return OPAL_ERR_OUT_OF_RESOURCE;
|
||||
@ -609,9 +605,6 @@ static int openib_reg_mr(void *reg_data, void *base, size_t size,
|
||||
return OPAL_ERR_OUT_OF_RESOURCE;
|
||||
}
|
||||
|
||||
openib_reg->btl_handle.lkey = openib_reg->mr->lkey;
|
||||
openib_reg->btl_handle.rkey = openib_reg->mr->rkey;
|
||||
|
||||
OPAL_OUTPUT_VERBOSE((30, mca_btl_openib_component.memory_registration_verbose,
|
||||
"openib_reg_mr: base=%p, bound=%p, size=%d, flags=0x%x", reg->base, reg->bound,
|
||||
(int) (reg->bound - reg->base + 1), reg->flags));
|
||||
@ -811,30 +804,7 @@ static int init_one_port(opal_list_t *btl_list, mca_btl_openib_device_t *device,
|
||||
mca_btl_base_active_message_trigger[MCA_BTL_TAG_IB].cbfunc = btl_openib_control;
|
||||
mca_btl_base_active_message_trigger[MCA_BTL_TAG_IB].cbdata = NULL;
|
||||
|
||||
if (openib_btl->super.btl_get_limit > openib_btl->ib_port_attr.max_msg_sz) {
|
||||
openib_btl->super.btl_get_limit = openib_btl->ib_port_attr.max_msg_sz;
|
||||
}
|
||||
|
||||
openib_btl->super.btl_get_alignment = 0;
|
||||
|
||||
if (openib_btl->super.btl_put_limit > openib_btl->ib_port_attr.max_msg_sz) {
|
||||
openib_btl->super.btl_put_limit = openib_btl->ib_port_attr.max_msg_sz;
|
||||
}
|
||||
|
||||
#if HAVE_DECL_IBV_ATOMIC_HCA
|
||||
if (openib_btl->device->ib_dev_attr.atomic_cap == IBV_ATOMIC_NONE) {
|
||||
openib_btl->super.btl_flags &= ~MCA_BTL_FLAGS_ATOMIC_FOPS;
|
||||
openib_btl->super.btl_atomic_flags = 0;
|
||||
openib_btl->super.btl_atomic_fop = NULL;
|
||||
openib_btl->super.btl_atomic_cswap = NULL;
|
||||
} else if (IBV_ATOMIC_GLOB == openib_btl->device->ib_dev_attr.atomic_cap) {
|
||||
openib_btl->super.btl_flags |= MCA_BTL_ATOMIC_SUPPORTS_GLOB;
|
||||
}
|
||||
#endif
|
||||
|
||||
openib_btl->super.btl_put_alignment = 0;
|
||||
|
||||
openib_btl->super.btl_registration_handle_size = sizeof (mca_btl_base_registration_handle_t);
|
||||
openib_btl->super.btl_seg_size = sizeof (mca_btl_openib_segment_t);
|
||||
|
||||
/* Check bandwidth configured for this device */
|
||||
sprintf(param, "bandwidth_%s", ibv_get_device_name(device->ib_dev));
|
||||
@ -1990,7 +1960,9 @@ static int init_one_device(opal_list_t *btl_list, struct ibv_device* ib_dev)
|
||||
}
|
||||
|
||||
/* If the MCA param was specified, skip all the checks */
|
||||
if (MCA_BASE_VAR_SOURCE_DEFAULT != mca_btl_openib_component.receive_queues_source) {
|
||||
if ( MCA_BASE_VAR_SOURCE_COMMAND_LINE ||
|
||||
MCA_BASE_VAR_SOURCE_ENV ==
|
||||
mca_btl_openib_component.receive_queues_source) {
|
||||
goto good;
|
||||
}
|
||||
|
||||
@ -2008,7 +1980,7 @@ static int init_one_device(opal_list_t *btl_list, struct ibv_device* ib_dev)
|
||||
mca_btl_openib_component.receive_queues =
|
||||
strdup(values.receive_queues);
|
||||
mca_btl_openib_component.receive_queues_source =
|
||||
BTL_OPENIB_RQ_SOURCE_DEVICE_INI;
|
||||
MCA_BASE_VAR_SOURCE_FILE;
|
||||
}
|
||||
}
|
||||
|
||||
@ -2909,20 +2881,17 @@ void mca_btl_openib_frag_progress_pending_put_get(mca_btl_base_endpoint_t *ep,
|
||||
size_t i, len = opal_list_get_size(&ep->pending_get_frags);
|
||||
int rc;
|
||||
|
||||
for(i = 0; i < len && ep->qps[qp].qp->sd_wqe > 0 && ep->get_tokens > 0; i++) {
|
||||
for(i = 0; i < len && ep->qps[qp].qp->sd_wqe > 0 && ep->get_tokens > 0; i++)
|
||||
{
|
||||
OPAL_THREAD_LOCK(&ep->endpoint_lock);
|
||||
frag = opal_list_remove_first(&(ep->pending_get_frags));
|
||||
OPAL_THREAD_UNLOCK(&ep->endpoint_lock);
|
||||
if (NULL == frag)
|
||||
if(NULL == frag)
|
||||
break;
|
||||
rc = mca_btl_openib_get_internal ((mca_btl_base_module_t *)openib_btl, ep,
|
||||
to_get_frag(frag));
|
||||
if (OPAL_ERR_OUT_OF_RESOURCE == rc) {
|
||||
OPAL_THREAD_LOCK(&ep->endpoint_lock);
|
||||
opal_list_prepend (&ep->pending_get_frags, frag);
|
||||
OPAL_THREAD_UNLOCK(&ep->endpoint_lock);
|
||||
rc = mca_btl_openib_get((mca_btl_base_module_t *)openib_btl, ep,
|
||||
&to_base_frag(frag)->base);
|
||||
if(OPAL_ERR_OUT_OF_RESOURCE == rc)
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
len = opal_list_get_size(&ep->pending_put_frags);
|
||||
@ -2930,16 +2899,12 @@ void mca_btl_openib_frag_progress_pending_put_get(mca_btl_base_endpoint_t *ep,
|
||||
OPAL_THREAD_LOCK(&ep->endpoint_lock);
|
||||
frag = opal_list_remove_first(&(ep->pending_put_frags));
|
||||
OPAL_THREAD_UNLOCK(&ep->endpoint_lock);
|
||||
if (NULL == frag)
|
||||
if(NULL == frag)
|
||||
break;
|
||||
rc = mca_btl_openib_put_internal ((mca_btl_base_module_t*)openib_btl, ep,
|
||||
to_put_frag(frag));
|
||||
if (OPAL_ERR_OUT_OF_RESOURCE == rc) {
|
||||
OPAL_THREAD_LOCK(&ep->endpoint_lock);
|
||||
opal_list_prepend (&ep->pending_put_frags, frag);
|
||||
OPAL_THREAD_UNLOCK(&ep->endpoint_lock);
|
||||
rc = mca_btl_openib_put((mca_btl_base_module_t*)openib_btl, ep,
|
||||
&to_base_frag(frag)->base);
|
||||
if(OPAL_ERR_OUT_OF_RESOURCE == rc)
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@ -2960,7 +2925,7 @@ static int btl_openib_handle_incoming(mca_btl_openib_module_t *openib_btl,
|
||||
|
||||
/* advance the segment address past the header and subtract from the
|
||||
* length.*/
|
||||
des->des_segments->seg_len = byte_len - sizeof(mca_btl_openib_header_t);
|
||||
des->des_local->seg_len = byte_len - sizeof(mca_btl_openib_header_t);
|
||||
|
||||
if(OPAL_LIKELY(!(is_credit_msg = is_credit_message(frag)))) {
|
||||
/* call registered callback */
|
||||
@ -2995,7 +2960,7 @@ static int btl_openib_handle_incoming(mca_btl_openib_module_t *openib_btl,
|
||||
}
|
||||
} else {
|
||||
mca_btl_openib_rdma_credits_header_t *chdr =
|
||||
(mca_btl_openib_rdma_credits_header_t *) des->des_segments->seg_addr.pval;
|
||||
(mca_btl_openib_rdma_credits_header_t *) des->des_local->seg_addr.pval;
|
||||
if(ep->nbo) {
|
||||
BTL_OPENIB_RDMA_CREDITS_HEADER_NTOH(*chdr);
|
||||
}
|
||||
@ -3301,27 +3266,11 @@ static void handle_wc(mca_btl_openib_device_t* device, const uint32_t cq,
|
||||
/* Handle work completions */
|
||||
switch(wc->opcode) {
|
||||
case IBV_WC_RDMA_READ:
|
||||
case IBV_WC_COMP_SWAP:
|
||||
case IBV_WC_FETCH_ADD:
|
||||
OPAL_OUTPUT((-1, "Got WC: RDMA_READ or RDMA_WRITE"));
|
||||
|
||||
OPAL_OUTPUT((-1, "Got WC: RDMA_READ"));
|
||||
OPAL_THREAD_ADD32(&endpoint->get_tokens, 1);
|
||||
|
||||
mca_btl_openib_get_frag_t *get_frag = to_get_frag(des);
|
||||
|
||||
get_frag->cb.func (&openib_btl->super, endpoint, (void *)(intptr_t) frag->sg_entry.addr,
|
||||
get_frag->cb.local_handle, get_frag->cb.context, get_frag->cb.data,
|
||||
OPAL_SUCCESS);
|
||||
case IBV_WC_RDMA_WRITE:
|
||||
if (MCA_BTL_OPENIB_FRAG_SEND_USER == openib_frag_type(des)) {
|
||||
mca_btl_openib_put_frag_t *put_frag = to_put_frag(des);
|
||||
|
||||
put_frag->cb.func (&openib_btl->super, endpoint, (void *)(intptr_t) frag->sg_entry.addr,
|
||||
put_frag->cb.local_handle, put_frag->cb.context, put_frag->cb.data,
|
||||
OPAL_SUCCESS);
|
||||
put_frag->cb.func = NULL;
|
||||
}
|
||||
/* fall through */
|
||||
|
||||
case IBV_WC_RDMA_WRITE:
|
||||
case IBV_WC_SEND:
|
||||
OPAL_OUTPUT((-1, "Got WC: RDMA_WRITE or SEND"));
|
||||
if(openib_frag_type(des) == MCA_BTL_OPENIB_FRAG_SEND) {
|
||||
@ -3350,7 +3299,7 @@ static void handle_wc(mca_btl_openib_device_t* device, const uint32_t cq,
|
||||
/* Process a completed send/put/get */
|
||||
btl_ownership = (des->des_flags & MCA_BTL_DES_FLAGS_BTL_OWNERSHIP);
|
||||
if (des->des_flags & MCA_BTL_DES_SEND_ALWAYS_CALLBACK) {
|
||||
des->des_cbfunc(&openib_btl->super, endpoint, des, OPAL_SUCCESS);
|
||||
des->des_cbfunc(&openib_btl->super, endpoint, des,OPAL_SUCCESS);
|
||||
}
|
||||
if( btl_ownership ) {
|
||||
mca_btl_openib_free(&openib_btl->super, des);
|
||||
|
@ -1,4 +1,4 @@
|
||||
/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */
|
||||
/* -*- Mode: C; c-basic-offset:4 ; -*- */
|
||||
/*
|
||||
* Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
|
||||
* University Research and Technology
|
||||
@ -11,7 +11,7 @@
|
||||
* Copyright (c) 2004-2005 The Regents of the University of California.
|
||||
* All rights reserved.
|
||||
* Copyright (c) 2006-2013 Cisco Systems, Inc. All rights reserved.
|
||||
* Copyright (c) 2006-2014 Los Alamos National Security, LLC. All rights
|
||||
* Copyright (c) 2006-2012 Los Alamos National Security, LLC. All rights
|
||||
* reserved.
|
||||
* Copyright (c) 2006-2007 Voltaire All rights reserved.
|
||||
* Copyright (c) 2006-2009 Mellanox Technologies, Inc. All rights reserved.
|
||||
@ -51,7 +51,7 @@
|
||||
static void mca_btl_openib_endpoint_construct(mca_btl_base_endpoint_t* endpoint);
|
||||
static void mca_btl_openib_endpoint_destruct(mca_btl_base_endpoint_t* endpoint);
|
||||
|
||||
static inline int acquire_wqe(mca_btl_openib_endpoint_t *ep,
|
||||
static inline int acruire_wqe(mca_btl_openib_endpoint_t *ep,
|
||||
mca_btl_openib_send_frag_t *frag)
|
||||
{
|
||||
int qp = to_base_frag(frag)->base.order;
|
||||
@ -67,34 +67,91 @@ static inline int acquire_wqe(mca_btl_openib_endpoint_t *ep,
|
||||
return OPAL_SUCCESS;
|
||||
}
|
||||
|
||||
static int acquire_send_credit(mca_btl_openib_endpoint_t *endpoint,
|
||||
mca_btl_openib_send_frag_t *frag)
|
||||
{
|
||||
mca_btl_openib_module_t *openib_btl = endpoint->endpoint_btl;
|
||||
int qp = to_base_frag(frag)->base.order;
|
||||
int prio = !(to_base_frag(frag)->base.des_flags & MCA_BTL_DES_FLAGS_PRIORITY);
|
||||
|
||||
if(BTL_OPENIB_QP_TYPE_PP(qp)) {
|
||||
if(OPAL_THREAD_ADD32(&endpoint->qps[qp].u.pp_qp.sd_credits, -1) < 0) {
|
||||
OPAL_THREAD_ADD32(&endpoint->qps[qp].u.pp_qp.sd_credits, 1);
|
||||
opal_list_append(&endpoint->qps[qp].no_credits_pending_frags[prio],
|
||||
(opal_list_item_t *)frag);
|
||||
return OPAL_ERR_OUT_OF_RESOURCE;
|
||||
}
|
||||
} else {
|
||||
if(OPAL_THREAD_ADD32(&openib_btl->qps[qp].u.srq_qp.sd_credits, -1) < 0)
|
||||
{
|
||||
OPAL_THREAD_ADD32(&openib_btl->qps[qp].u.srq_qp.sd_credits, 1);
|
||||
OPAL_THREAD_LOCK(&openib_btl->ib_lock);
|
||||
opal_list_append(&openib_btl->qps[qp].u.srq_qp.pending_frags[prio],
|
||||
(opal_list_item_t *)frag);
|
||||
OPAL_THREAD_UNLOCK(&openib_btl->ib_lock);
|
||||
return OPAL_ERR_OUT_OF_RESOURCE;
|
||||
}
|
||||
}
|
||||
|
||||
return OPAL_SUCCESS;
|
||||
}
|
||||
|
||||
/* this function is called with endpoint->endpoint_lock held */
|
||||
int mca_btl_openib_endpoint_post_send(mca_btl_openib_endpoint_t *endpoint,
|
||||
mca_btl_openib_send_frag_t *frag)
|
||||
{
|
||||
int prio = !(to_base_frag(frag)->base.des_flags & MCA_BTL_DES_FLAGS_PRIORITY);
|
||||
mca_btl_openib_header_t *hdr = frag->hdr;
|
||||
mca_btl_base_descriptor_t *des = &to_base_frag(frag)->base;
|
||||
int qp, ib_rc, rc;
|
||||
int qp, ib_rc;
|
||||
int32_t cm_return;
|
||||
bool do_rdma = false;
|
||||
size_t size;
|
||||
size_t eager_limit;
|
||||
|
||||
if(OPAL_LIKELY(des->order == MCA_BTL_NO_ORDER))
|
||||
des->order = frag->qp_idx;
|
||||
|
||||
qp = des->order;
|
||||
|
||||
if(acquire_wqe(endpoint, frag) != OPAL_SUCCESS)
|
||||
if(acruire_wqe(endpoint, frag) != OPAL_SUCCESS)
|
||||
return OPAL_ERR_RESOURCE_BUSY;
|
||||
|
||||
size = des->des_segments->seg_len + frag->coalesced_length;
|
||||
eager_limit = mca_btl_openib_component.eager_limit +
|
||||
sizeof(mca_btl_openib_header_coalesced_t) +
|
||||
sizeof(mca_btl_openib_control_header_t);
|
||||
if(des->des_local->seg_len + frag->coalesced_length <= eager_limit &&
|
||||
(des->des_flags & MCA_BTL_DES_FLAGS_PRIORITY)) {
|
||||
/* High priority frag. Try to send over eager RDMA */
|
||||
if(acquire_eager_rdma_send_credit(endpoint) == OPAL_SUCCESS)
|
||||
do_rdma = true;
|
||||
}
|
||||
|
||||
rc = mca_btl_openib_endpoint_credit_acquire (endpoint, qp, prio, size,
|
||||
&do_rdma, frag, true);
|
||||
if (OPAL_UNLIKELY(OPAL_SUCCESS != rc)) {
|
||||
if(!do_rdma && acquire_send_credit(endpoint, frag) != OPAL_SUCCESS) {
|
||||
qp_put_wqe(endpoint, qp);
|
||||
return OPAL_ERR_RESOURCE_BUSY;
|
||||
}
|
||||
|
||||
BTL_OPENIB_GET_CREDITS(endpoint->eager_rdma_local.credits, hdr->credits);
|
||||
if(hdr->credits)
|
||||
hdr->credits |= BTL_OPENIB_RDMA_CREDITS_FLAG;
|
||||
|
||||
if(!do_rdma) {
|
||||
if(BTL_OPENIB_QP_TYPE_PP(qp) && 0 == hdr->credits) {
|
||||
BTL_OPENIB_GET_CREDITS(endpoint->qps[qp].u.pp_qp.rd_credits, hdr->credits);
|
||||
}
|
||||
} else {
|
||||
hdr->credits |= (qp << 11);
|
||||
}
|
||||
|
||||
BTL_OPENIB_GET_CREDITS(endpoint->qps[qp].u.pp_qp.cm_return, cm_return);
|
||||
/* cm_seen is only 8 bits, but cm_return is 32 bits */
|
||||
if(cm_return > 255) {
|
||||
hdr->cm_seen = 255;
|
||||
cm_return -= 255;
|
||||
OPAL_THREAD_ADD32(&endpoint->qps[qp].u.pp_qp.cm_return, cm_return);
|
||||
} else {
|
||||
hdr->cm_seen = cm_return;
|
||||
}
|
||||
|
||||
qp_reset_signal_count(endpoint, qp);
|
||||
ib_rc = post_send(endpoint, frag, do_rdma, 1);
|
||||
|
||||
@ -104,12 +161,27 @@ int mca_btl_openib_endpoint_post_send(mca_btl_openib_endpoint_t *endpoint,
|
||||
if(endpoint->nbo)
|
||||
BTL_OPENIB_HEADER_NTOH(*hdr);
|
||||
|
||||
mca_btl_openib_endpoint_credit_release (endpoint, qp, do_rdma, frag);
|
||||
if(BTL_OPENIB_IS_RDMA_CREDITS(hdr->credits)) {
|
||||
OPAL_THREAD_ADD32(&endpoint->eager_rdma_local.credits,
|
||||
BTL_OPENIB_CREDITS(hdr->credits));
|
||||
}
|
||||
|
||||
qp_put_wqe(endpoint, qp);
|
||||
|
||||
BTL_ERROR(("error posting send request error %d: %s. size = %lu\n",
|
||||
ib_rc, strerror(ib_rc), size));
|
||||
if(do_rdma) {
|
||||
OPAL_THREAD_ADD32(&endpoint->eager_rdma_remote.tokens, 1);
|
||||
} else {
|
||||
if(BTL_OPENIB_QP_TYPE_PP(qp)) {
|
||||
OPAL_THREAD_ADD32(&endpoint->qps[qp].u.pp_qp.rd_credits,
|
||||
hdr->credits);
|
||||
OPAL_THREAD_ADD32(&endpoint->qps[qp].u.pp_qp.sd_credits, 1);
|
||||
} else if BTL_OPENIB_QP_TYPE_SRQ(qp){
|
||||
mca_btl_openib_module_t *openib_btl = endpoint->endpoint_btl;
|
||||
OPAL_THREAD_ADD32(&openib_btl->qps[qp].u.srq_qp.sd_credits, 1);
|
||||
}
|
||||
}
|
||||
BTL_ERROR(("error posting send request error %d: %s\n",
|
||||
ib_rc, strerror(ib_rc)));
|
||||
return OPAL_ERROR;
|
||||
}
|
||||
|
||||
@ -618,8 +690,8 @@ void mca_btl_openib_endpoint_connected(mca_btl_openib_endpoint_t *endpoint)
|
||||
/* We need to post this one */
|
||||
|
||||
if (OPAL_ERROR == mca_btl_openib_endpoint_post_send(endpoint, frag)) {
|
||||
BTL_ERROR(("Error posting send"));
|
||||
}
|
||||
BTL_ERROR(("Error posting send"));
|
||||
}
|
||||
}
|
||||
OPAL_THREAD_UNLOCK(&endpoint->endpoint_lock);
|
||||
|
||||
|
@ -1,4 +1,3 @@
|
||||
/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */
|
||||
/*
|
||||
* Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
|
||||
* University Research and Technology
|
||||
@ -11,7 +10,7 @@
|
||||
* Copyright (c) 2004-2005 The Regents of the University of California.
|
||||
* All rights reserved.
|
||||
* Copyright (c) 2007-2009 Cisco Systems, Inc. All rights reserved.
|
||||
* Copyright (c) 2006-2014 Los Alamos National Security, LLC. All rights
|
||||
* Copyright (c) 2006-2007 Los Alamos National Security, LLC. All rights
|
||||
* reserved.
|
||||
* Copyright (c) 2006-2007 Voltaire All rights reserved.
|
||||
* Copyright (c) 2007-2009 Mellanox Technologies. All rights reserved.
|
||||
@ -611,101 +610,6 @@ static inline int post_send(mca_btl_openib_endpoint_t *ep,
|
||||
return ibv_post_send(ep->qps[qp].qp->lcl_qp, sr_desc, &bad_wr);
|
||||
}
|
||||
|
||||
/* called with the endpoint lock held */
|
||||
static inline int mca_btl_openib_endpoint_credit_acquire (struct mca_btl_base_endpoint_t *endpoint, int qp,
|
||||
int prio, size_t size, bool *do_rdma,
|
||||
mca_btl_openib_send_frag_t *frag, bool queue_frag)
|
||||
{
|
||||
mca_btl_openib_module_t *openib_btl = endpoint->endpoint_btl;
|
||||
mca_btl_openib_header_t *hdr = frag->hdr;
|
||||
size_t eager_limit;
|
||||
int32_t cm_return;
|
||||
|
||||
eager_limit = mca_btl_openib_component.eager_limit +
|
||||
sizeof(mca_btl_openib_header_coalesced_t) +
|
||||
sizeof(mca_btl_openib_control_header_t);
|
||||
|
||||
if (!(prio && size < eager_limit && acquire_eager_rdma_send_credit(endpoint) == OPAL_SUCCESS)) {
|
||||
*do_rdma = false;
|
||||
|
||||
if (BTL_OPENIB_QP_TYPE_PP(qp)) {
|
||||
if (OPAL_THREAD_ADD32(&endpoint->qps[qp].u.pp_qp.sd_credits, -1) < 0) {
|
||||
OPAL_THREAD_ADD32(&endpoint->qps[qp].u.pp_qp.sd_credits, 1);
|
||||
if (queue_frag) {
|
||||
opal_list_append(&endpoint->qps[qp].no_credits_pending_frags[prio],
|
||||
(opal_list_item_t *)frag);
|
||||
}
|
||||
|
||||
return OPAL_ERR_OUT_OF_RESOURCE;
|
||||
}
|
||||
} else {
|
||||
if(OPAL_THREAD_ADD32(&openib_btl->qps[qp].u.srq_qp.sd_credits, -1) < 0) {
|
||||
OPAL_THREAD_ADD32(&openib_btl->qps[qp].u.srq_qp.sd_credits, 1);
|
||||
if (queue_frag) {
|
||||
OPAL_THREAD_LOCK(&openib_btl->ib_lock);
|
||||
opal_list_append(&openib_btl->qps[qp].u.srq_qp.pending_frags[prio],
|
||||
(opal_list_item_t *)frag);
|
||||
OPAL_THREAD_UNLOCK(&openib_btl->ib_lock);
|
||||
}
|
||||
|
||||
return OPAL_ERR_OUT_OF_RESOURCE;
|
||||
}
|
||||
}
|
||||
} else {
|
||||
/* High priority frag. Try to send over eager RDMA */
|
||||
*do_rdma = true;
|
||||
}
|
||||
|
||||
/* Set all credits */
|
||||
BTL_OPENIB_GET_CREDITS(endpoint->eager_rdma_local.credits, hdr->credits);
|
||||
if (hdr->credits) {
|
||||
hdr->credits |= BTL_OPENIB_RDMA_CREDITS_FLAG;
|
||||
}
|
||||
|
||||
if (!*do_rdma) {
|
||||
if (BTL_OPENIB_QP_TYPE_PP(qp) && 0 == hdr->credits) {
|
||||
BTL_OPENIB_GET_CREDITS(endpoint->qps[qp].u.pp_qp.rd_credits, hdr->credits);
|
||||
}
|
||||
} else {
|
||||
hdr->credits |= (qp << 11);
|
||||
}
|
||||
|
||||
BTL_OPENIB_GET_CREDITS(endpoint->qps[qp].u.pp_qp.cm_return, cm_return);
|
||||
/* cm_seen is only 8 bytes, but cm_return is 32 bytes */
|
||||
if(cm_return > 255) {
|
||||
hdr->cm_seen = 255;
|
||||
cm_return -= 255;
|
||||
OPAL_THREAD_ADD32(&endpoint->qps[qp].u.pp_qp.cm_return, cm_return);
|
||||
} else {
|
||||
hdr->cm_seen = cm_return;
|
||||
}
|
||||
|
||||
return OPAL_SUCCESS;
|
||||
}
|
||||
|
||||
/* called with the endpoint lock held. */
|
||||
static inline void mca_btl_openib_endpoint_credit_release (struct mca_btl_base_endpoint_t *endpoint, int qp,
|
||||
bool do_rdma, mca_btl_openib_send_frag_t *frag)
|
||||
{
|
||||
mca_btl_openib_header_t *hdr = frag->hdr;
|
||||
|
||||
if (BTL_OPENIB_IS_RDMA_CREDITS(hdr->credits)) {
|
||||
OPAL_THREAD_ADD32(&endpoint->eager_rdma_local.credits, BTL_OPENIB_CREDITS(hdr->credits));
|
||||
}
|
||||
|
||||
if (do_rdma) {
|
||||
OPAL_THREAD_ADD32(&endpoint->eager_rdma_remote.tokens, 1);
|
||||
} else {
|
||||
if(BTL_OPENIB_QP_TYPE_PP(qp)) {
|
||||
OPAL_THREAD_ADD32 (&endpoint->qps[qp].u.pp_qp.rd_credits, hdr->credits);
|
||||
OPAL_THREAD_ADD32(&endpoint->qps[qp].u.pp_qp.sd_credits, 1);
|
||||
} else if BTL_OPENIB_QP_TYPE_SRQ(qp){
|
||||
mca_btl_openib_module_t *openib_btl = endpoint->endpoint_btl;
|
||||
OPAL_THREAD_ADD32(&openib_btl->qps[qp].u.srq_qp.sd_credits, 1);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
END_C_DECLS
|
||||
|
||||
#endif
|
||||
|
@ -153,8 +153,8 @@ void mca_btl_openib_handle_endpoint_error(mca_btl_openib_module_t *openib_btl,
|
||||
if (NULL != btlname) free(btlname);
|
||||
|
||||
/* Since we believe we have done a send, read or write, then the
|
||||
* des_segments fields should have valid data. */
|
||||
assert(des->des_segments != NULL);
|
||||
* des_local fields should have valid data. */
|
||||
assert(des->des_local != NULL);
|
||||
|
||||
/* If the endpoint is not yet in the MCA_BTL_IB_CLOSED state, then
|
||||
* change the status. Since this connection was mapped out in the
|
||||
|
@ -68,8 +68,8 @@ static void out_constructor(mca_btl_openib_out_frag_t *frag)
|
||||
{
|
||||
mca_btl_openib_frag_t *base_frag = to_base_frag(frag);
|
||||
|
||||
base_frag->base.des_segments = &base_frag->segment.base;
|
||||
base_frag->base.des_segment_count = 1;
|
||||
base_frag->base.des_local = &base_frag->segment.base;
|
||||
base_frag->base.des_local_count = 1;
|
||||
|
||||
frag->sr_desc.wr_id = (uint64_t)(uintptr_t)frag;
|
||||
frag->sr_desc.sg_list = &to_com_frag(frag)->sg_entry;
|
||||
@ -83,8 +83,8 @@ static void in_constructor(mca_btl_openib_in_frag_t *frag)
|
||||
{
|
||||
mca_btl_openib_frag_t *base_frag = to_base_frag(frag);
|
||||
|
||||
base_frag->base.des_segments = &base_frag->segment.base;
|
||||
base_frag->base.des_segment_count = 1;
|
||||
base_frag->base.des_local = &base_frag->segment.base;
|
||||
base_frag->base.des_local_count = 1;
|
||||
}
|
||||
|
||||
static void send_constructor(mca_btl_openib_send_frag_t *frag)
|
||||
@ -134,7 +134,6 @@ static void put_constructor(mca_btl_openib_put_frag_t *frag)
|
||||
{
|
||||
to_base_frag(frag)->type = MCA_BTL_OPENIB_FRAG_SEND_USER;
|
||||
to_out_frag(frag)->sr_desc.opcode = IBV_WR_RDMA_WRITE;
|
||||
frag->cb.func = NULL;
|
||||
}
|
||||
|
||||
static void get_constructor(mca_btl_openib_get_frag_t *frag)
|
||||
@ -155,8 +154,8 @@ static void coalesced_constructor(mca_btl_openib_coalesced_frag_t *frag)
|
||||
|
||||
base_frag->type = MCA_BTL_OPENIB_FRAG_COALESCED;
|
||||
|
||||
base_frag->base.des_segments = &base_frag->segment.base;
|
||||
base_frag->base.des_segment_count = 1;
|
||||
base_frag->base.des_local = &base_frag->segment.base;
|
||||
base_frag->base.des_local_count = 1;
|
||||
}
|
||||
|
||||
OBJ_CLASS_INSTANCE(
|
||||
|
@ -349,15 +349,7 @@ OBJ_CLASS_DECLARATION(mca_btl_openib_recv_frag_t);
|
||||
|
||||
#define to_recv_frag(f) ((mca_btl_openib_recv_frag_t*)(f))
|
||||
|
||||
typedef struct mca_btl_openib_put_frag_t {
|
||||
mca_btl_openib_out_frag_t super;
|
||||
struct {
|
||||
mca_btl_base_rdma_completion_fn_t func;
|
||||
mca_btl_base_registration_handle_t *local_handle;
|
||||
void *context;
|
||||
void *data;
|
||||
} cb;
|
||||
} mca_btl_openib_put_frag_t;
|
||||
typedef struct mca_btl_openib_out_frag_t mca_btl_openib_put_frag_t;
|
||||
OBJ_CLASS_DECLARATION(mca_btl_openib_put_frag_t);
|
||||
|
||||
#define to_put_frag(f) ((mca_btl_openib_put_frag_t*)(f))
|
||||
@ -365,12 +357,6 @@ OBJ_CLASS_DECLARATION(mca_btl_openib_put_frag_t);
|
||||
typedef struct mca_btl_openib_get_frag_t {
|
||||
mca_btl_openib_in_frag_t super;
|
||||
struct ibv_send_wr sr_desc;
|
||||
struct {
|
||||
mca_btl_base_rdma_completion_fn_t func;
|
||||
mca_btl_base_registration_handle_t *local_handle;
|
||||
void *context;
|
||||
void *data;
|
||||
} cb;
|
||||
} mca_btl_openib_get_frag_t;
|
||||
OBJ_CLASS_DECLARATION(mca_btl_openib_get_frag_t);
|
||||
|
||||
@ -385,7 +371,6 @@ typedef struct mca_btl_openib_coalesced_frag_t {
|
||||
mca_btl_openib_frag_t super;
|
||||
mca_btl_openib_send_frag_t *send_frag;
|
||||
mca_btl_openib_header_coalesced_t *hdr;
|
||||
bool sent;
|
||||
} mca_btl_openib_coalesced_frag_t;
|
||||
OBJ_CLASS_DECLARATION(mca_btl_openib_coalesced_frag_t);
|
||||
|
||||
|
@ -1,159 +0,0 @@
|
||||
/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */
|
||||
/*
|
||||
* Copyright (c) 2004-2010 The Trustees of Indiana University and Indiana
|
||||
* University Research and Technology
|
||||
* Corporation. All rights reserved.
|
||||
* Copyright (c) 2004-2013 The University of Tennessee and The University
|
||||
* of Tennessee Research Foundation. All rights
|
||||
* reserved.
|
||||
* Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
|
||||
* University of Stuttgart. All rights reserved.
|
||||
* Copyright (c) 2004-2005 The Regents of the University of California.
|
||||
* All rights reserved.
|
||||
* Copyright (c) 2007-2013 Cisco Systems, Inc. All rights reserved.
|
||||
* Copyright (c) 2006-2009 Mellanox Technologies. All rights reserved.
|
||||
* Copyright (c) 2006-2014 Los Alamos National Security, LLC. All rights
|
||||
* reserved.
|
||||
* Copyright (c) 2006-2007 Voltaire All rights reserved.
|
||||
* Copyright (c) 2008-2012 Oracle and/or its affiliates. All rights reserved.
|
||||
* Copyright (c) 2009 IBM Corporation. All rights reserved.
|
||||
* Copyright (c) 2013-2014 Intel, Inc. All rights reserved
|
||||
* Copyright (c) 2013 NVIDIA Corporation. All rights reserved.
|
||||
* Copyright (c) 2014 Research Organization for Information Science
|
||||
* and Technology (RIST). All rights reserved.
|
||||
* $COPYRIGHT$
|
||||
*
|
||||
* Additional copyrights may follow
|
||||
*
|
||||
* $HEADER$
|
||||
*/
|
||||
|
||||
#include "btl_openib.h"
|
||||
#include "btl_openib_frag.h"
|
||||
#include "btl_openib_endpoint.h"
|
||||
#include "btl_openib_xrc.h"
|
||||
|
||||
/*
|
||||
* RDMA READ remote buffer to local buffer address.
|
||||
*/
|
||||
|
||||
int mca_btl_openib_get (mca_btl_base_module_t *btl, struct mca_btl_base_endpoint_t *ep, void *local_address,
|
||||
uint64_t remote_address, mca_btl_base_registration_handle_t *local_handle,
|
||||
mca_btl_base_registration_handle_t *remote_handle, size_t size, int flags,
|
||||
int order, mca_btl_base_rdma_completion_fn_t cbfunc, void *cbcontext, void *cbdata)
|
||||
{
|
||||
mca_btl_openib_get_frag_t* frag = NULL;
|
||||
int qp = order;
|
||||
int rc;
|
||||
|
||||
if (OPAL_UNLIKELY(size > btl->btl_get_limit)) {
|
||||
return OPAL_ERR_BAD_PARAM;
|
||||
}
|
||||
|
||||
frag = to_get_frag(alloc_recv_user_frag());
|
||||
if (OPAL_UNLIKELY(NULL == frag)) {
|
||||
return OPAL_ERR_OUT_OF_RESOURCE;
|
||||
}
|
||||
|
||||
if (MCA_BTL_NO_ORDER == qp) {
|
||||
qp = mca_btl_openib_component.rdma_qp;
|
||||
}
|
||||
|
||||
/* set base descriptor flags */
|
||||
to_base_frag(frag)->base.order = qp;
|
||||
/* free this descriptor when the operation is complete */
|
||||
to_base_frag(frag)->base.des_flags = MCA_BTL_DES_FLAGS_BTL_OWNERSHIP;
|
||||
|
||||
/* set up scatter-gather entry */
|
||||
to_com_frag(frag)->sg_entry.length = size;
|
||||
to_com_frag(frag)->sg_entry.lkey = local_handle->lkey;
|
||||
to_com_frag(frag)->sg_entry.addr = (uint64_t)(uintptr_t) local_address;
|
||||
to_com_frag(frag)->endpoint = ep;
|
||||
|
||||
/* set up rdma callback */
|
||||
frag->cb.func = cbfunc;
|
||||
frag->cb.context = cbcontext;
|
||||
frag->cb.data = cbdata;
|
||||
frag->cb.local_handle = local_handle;
|
||||
|
||||
/* set up descriptor */
|
||||
frag->sr_desc.wr.rdma.remote_addr = remote_address;
|
||||
/* the opcode may have been changed by an atomic operation */
|
||||
frag->sr_desc.opcode = IBV_WR_RDMA_READ;
|
||||
|
||||
#if OPAL_ENABLE_HETEROGENEOUS_SUPPORT
|
||||
if((ep->endpoint_proc->proc_opal->proc_arch & OPAL_ARCH_ISBIGENDIAN)
|
||||
!= (opal_proc_local_get()->proc_arch & OPAL_ARCH_ISBIGENDIAN)) {
|
||||
frag->sr_desc.wr.rdma.rkey = opal_swap_bytes4 (remote_handle->rkey);
|
||||
} else
|
||||
#endif
|
||||
{
|
||||
frag->sr_desc.wr.rdma.rkey = remote_handle->rkey;
|
||||
}
|
||||
|
||||
#if HAVE_XRC
|
||||
if (MCA_BTL_XRC_ENABLED && BTL_OPENIB_QP_TYPE_XRC(qp)) {
|
||||
frag->sr_desc.xrc_remote_srq_num=ep->rem_info.rem_srqs[qp].rem_srq_num;
|
||||
}
|
||||
#endif
|
||||
|
||||
if (ep->endpoint_state != MCA_BTL_IB_CONNECTED) {
|
||||
OPAL_THREAD_LOCK(&ep->endpoint_lock);
|
||||
rc = check_endpoint_state(ep, &to_base_frag(frag)->base, &ep->pending_get_frags);
|
||||
OPAL_THREAD_UNLOCK(&ep->endpoint_lock);
|
||||
if (OPAL_ERR_RESOURCE_BUSY == rc) {
|
||||
return OPAL_SUCCESS;
|
||||
}
|
||||
|
||||
if (OPAL_SUCCESS != rc) {
|
||||
MCA_BTL_IB_FRAG_RETURN (frag);
|
||||
return rc;
|
||||
}
|
||||
}
|
||||
|
||||
rc = mca_btl_openib_get_internal (btl, ep, frag);
|
||||
if (OPAL_UNLIKELY(OPAL_SUCCESS != rc)) {
|
||||
if (OPAL_LIKELY(OPAL_ERR_OUT_OF_RESOURCE == rc)) {
|
||||
rc = OPAL_SUCCESS;
|
||||
|
||||
OPAL_THREAD_LOCK(&ep->endpoint_lock);
|
||||
opal_list_append(&ep->pending_get_frags, (opal_list_item_t*)frag);
|
||||
OPAL_THREAD_UNLOCK(&ep->endpoint_lock);
|
||||
} else {
|
||||
MCA_BTL_IB_FRAG_RETURN (frag);
|
||||
}
|
||||
}
|
||||
|
||||
return rc;
|
||||
}
|
||||
|
||||
int mca_btl_openib_get_internal (mca_btl_base_module_t *btl, struct mca_btl_base_endpoint_t *ep,
|
||||
mca_btl_openib_get_frag_t *frag)
|
||||
{
|
||||
int qp = to_base_frag(frag)->base.order;
|
||||
struct ibv_send_wr *bad_wr;
|
||||
|
||||
/* check for a send wqe */
|
||||
if (qp_get_wqe(ep, qp) < 0) {
|
||||
qp_put_wqe(ep, qp);
|
||||
return OPAL_ERR_OUT_OF_RESOURCE;
|
||||
}
|
||||
|
||||
/* check for a get token */
|
||||
if (OPAL_THREAD_ADD32(&ep->get_tokens,-1) < 0) {
|
||||
qp_put_wqe(ep, qp);
|
||||
OPAL_THREAD_ADD32(&ep->get_tokens,1);
|
||||
return OPAL_ERR_OUT_OF_RESOURCE;
|
||||
}
|
||||
|
||||
qp_inflight_wqe_to_frag(ep, qp, to_com_frag(frag));
|
||||
qp_reset_signal_count(ep, qp);
|
||||
|
||||
if (ibv_post_send(ep->qps[qp].qp->lcl_qp, &frag->sr_desc, &bad_wr)) {
|
||||
qp_put_wqe(ep, qp);
|
||||
OPAL_THREAD_ADD32(&ep->get_tokens,1);
|
||||
return OPAL_ERROR;
|
||||
}
|
||||
|
||||
return OPAL_SUCCESS;
|
||||
}
|
@ -1,4 +1,3 @@
|
||||
/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */
|
||||
/*
|
||||
* Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
|
||||
* University Research and Technology
|
||||
@ -12,7 +11,7 @@
|
||||
* All rights reserved.
|
||||
* Copyright (c) 2006-2013 Cisco Systems, Inc. All rights reserved.
|
||||
* Copyright (c) 2006-2009 Mellanox Technologies. All rights reserved.
|
||||
* Copyright (c) 2006-2014 Los Alamos National Security, LLC. All rights
|
||||
* Copyright (c) 2006-2007 Los Alamos National Security, LLC. All rights
|
||||
* reserved.
|
||||
* Copyright (c) 2006-2007 Voltaire All rights reserved.
|
||||
* Copyright (c) 2009-2010 Oracle and/or its affiliates. All rights reserved.
|
||||
@ -568,16 +567,10 @@ int btl_openib_register_mca_params(void)
|
||||
mca_btl_openib_module.super.btl_rdma_pipeline_frag_size = 1024 * 1024;
|
||||
mca_btl_openib_module.super.btl_min_rdma_pipeline_size = 256 * 1024;
|
||||
mca_btl_openib_module.super.btl_flags = MCA_BTL_FLAGS_RDMA |
|
||||
MCA_BTL_FLAGS_NEED_ACK | MCA_BTL_FLAGS_NEED_CSUM | MCA_BTL_FLAGS_HETEROGENEOUS_RDMA;
|
||||
MCA_BTL_FLAGS_NEED_ACK | MCA_BTL_FLAGS_NEED_CSUM | MCA_BTL_FLAGS_HETEROGENEOUS_RDMA;
|
||||
#if BTL_OPENIB_FAILOVER_ENABLED
|
||||
mca_btl_openib_module.super.btl_flags |= MCA_BTL_FLAGS_FAILOVER_SUPPORT;
|
||||
#endif
|
||||
|
||||
#if HAVE_DECL_IBV_ATOMIC_HCA
|
||||
mca_btl_openib_module.super.btl_flags |= MCA_BTL_FLAGS_ATOMIC_FOPS;
|
||||
mca_btl_openib_module.super.btl_atomic_flags = MCA_BTL_ATOMIC_SUPPORTS_ADD | MCA_BTL_ATOMIC_SUPPORTS_CSWAP;
|
||||
#endif
|
||||
|
||||
/* Default to bandwidth auto-detection */
|
||||
mca_btl_openib_module.super.btl_bandwidth = 0;
|
||||
mca_btl_openib_module.super.btl_latency = 4;
|
||||
|
@ -1,152 +0,0 @@
|
||||
/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */
|
||||
/*
|
||||
* Copyright (c) 2004-2010 The Trustees of Indiana University and Indiana
|
||||
* University Research and Technology
|
||||
* Corporation. All rights reserved.
|
||||
* Copyright (c) 2004-2013 The University of Tennessee and The University
|
||||
* of Tennessee Research Foundation. All rights
|
||||
* reserved.
|
||||
* Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
|
||||
* University of Stuttgart. All rights reserved.
|
||||
* Copyright (c) 2004-2005 The Regents of the University of California.
|
||||
* All rights reserved.
|
||||
* Copyright (c) 2007-2013 Cisco Systems, Inc. All rights reserved.
|
||||
* Copyright (c) 2006-2009 Mellanox Technologies. All rights reserved.
|
||||
* Copyright (c) 2006-2014 Los Alamos National Security, LLC. All rights
|
||||
* reserved.
|
||||
* Copyright (c) 2006-2007 Voltaire All rights reserved.
|
||||
* Copyright (c) 2008-2012 Oracle and/or its affiliates. All rights reserved.
|
||||
* Copyright (c) 2009 IBM Corporation. All rights reserved.
|
||||
* Copyright (c) 2013-2014 Intel, Inc. All rights reserved
|
||||
* Copyright (c) 2013 NVIDIA Corporation. All rights reserved.
|
||||
* Copyright (c) 2014 Research Organization for Information Science
|
||||
* and Technology (RIST). All rights reserved.
|
||||
* $COPYRIGHT$
|
||||
*
|
||||
* Additional copyrights may follow
|
||||
*
|
||||
* $HEADER$
|
||||
*/
|
||||
|
||||
#include "btl_openib.h"
|
||||
#include "btl_openib_frag.h"
|
||||
#include "btl_openib_endpoint.h"
|
||||
#include "btl_openib_xrc.h"
|
||||
|
||||
/*
|
||||
* RDMA WRITE local buffer to remote buffer address.
|
||||
*/
|
||||
|
||||
int mca_btl_openib_put (mca_btl_base_module_t *btl, struct mca_btl_base_endpoint_t *ep, void *local_address,
|
||||
uint64_t remote_address, mca_btl_base_registration_handle_t *local_handle,
|
||||
mca_btl_base_registration_handle_t *remote_handle, size_t size, int flags,
|
||||
int order, mca_btl_base_rdma_completion_fn_t cbfunc, void *cbcontext, void *cbdata)
|
||||
{
|
||||
mca_btl_openib_put_frag_t *frag = NULL;
|
||||
int rc, qp = order;
|
||||
|
||||
if (OPAL_UNLIKELY(size > btl->btl_put_limit)) {
|
||||
return OPAL_ERR_BAD_PARAM;
|
||||
}
|
||||
|
||||
frag = to_put_frag(alloc_send_user_frag ());
|
||||
if (OPAL_UNLIKELY(NULL == frag)) {
|
||||
return OPAL_ERR_OUT_OF_RESOURCE;
|
||||
}
|
||||
|
||||
if (MCA_BTL_NO_ORDER == qp) {
|
||||
qp = mca_btl_openib_component.rdma_qp;
|
||||
}
|
||||
|
||||
/* set base descriptor flags */
|
||||
to_base_frag(frag)->base.order = qp;
|
||||
/* free this descriptor when the operation is complete */
|
||||
to_base_frag(frag)->base.des_flags = MCA_BTL_DES_FLAGS_BTL_OWNERSHIP;
|
||||
|
||||
/* set up scatter-gather entry */
|
||||
to_com_frag(frag)->sg_entry.length = size;
|
||||
to_com_frag(frag)->sg_entry.lkey = local_handle->lkey;
|
||||
to_com_frag(frag)->sg_entry.addr = (uint64_t)(intptr_t) local_address;
|
||||
to_com_frag(frag)->endpoint = ep;
|
||||
|
||||
/* set up rdma callback */
|
||||
frag->cb.func = cbfunc;
|
||||
frag->cb.context = cbcontext;
|
||||
frag->cb.data = cbdata;
|
||||
frag->cb.local_handle = local_handle;
|
||||
|
||||
/* post descriptor */
|
||||
to_out_frag(frag)->sr_desc.opcode = IBV_WR_RDMA_WRITE;
|
||||
to_out_frag(frag)->sr_desc.send_flags = ib_send_flags(size, &(ep->qps[qp]), 1);
|
||||
to_out_frag(frag)->sr_desc.wr.rdma.remote_addr = remote_address;
|
||||
|
||||
#if OPAL_ENABLE_HETEROGENEOUS_SUPPORT
|
||||
if ((ep->endpoint_proc->proc_opal->proc_arch & OPAL_ARCH_ISBIGENDIAN)
|
||||
!= (opal_proc_local_get()->proc_arch & OPAL_ARCH_ISBIGENDIAN)) {
|
||||
to_out_frag(frag)->sr_desc.wr.rdma.rkey = opal_swap_bytes4(remote_handle->rkey);
|
||||
} else
|
||||
#endif
|
||||
{
|
||||
to_out_frag(frag)->sr_desc.wr.rdma.rkey = remote_handle->rkey;
|
||||
}
|
||||
|
||||
#if HAVE_XRC
|
||||
if (MCA_BTL_XRC_ENABLED && BTL_OPENIB_QP_TYPE_XRC(qp))
|
||||
to_out_frag(frag)->sr_desc.xrc_remote_srq_num = ep->rem_info.rem_srqs[qp].rem_srq_num;
|
||||
#endif
|
||||
|
||||
if (ep->endpoint_state != MCA_BTL_IB_CONNECTED) {
|
||||
OPAL_THREAD_LOCK(&ep->endpoint_lock);
|
||||
rc = check_endpoint_state(ep, &to_base_frag(frag)->base, &ep->pending_put_frags);
|
||||
OPAL_THREAD_UNLOCK(&ep->endpoint_lock);
|
||||
if (OPAL_ERR_RESOURCE_BUSY == rc) {
|
||||
/* descriptor was queued pending connection */
|
||||
return OPAL_SUCCESS;
|
||||
}
|
||||
|
||||
if (OPAL_UNLIKELY(OPAL_SUCCESS != rc)) {
|
||||
MCA_BTL_IB_FRAG_RETURN (frag);
|
||||
return rc;
|
||||
}
|
||||
}
|
||||
|
||||
rc = mca_btl_openib_put_internal (btl, ep, frag);
|
||||
if (OPAL_UNLIKELY(OPAL_SUCCESS != rc)) {
|
||||
if (OPAL_LIKELY(OPAL_ERR_OUT_OF_RESOURCE == rc)) {
|
||||
rc = OPAL_SUCCESS;
|
||||
|
||||
/* queue the fragment for when resources are available */
|
||||
OPAL_THREAD_LOCK(&ep->endpoint_lock);
|
||||
opal_list_append(&ep->pending_put_frags, (opal_list_item_t*)frag);
|
||||
OPAL_THREAD_UNLOCK(&ep->endpoint_lock);
|
||||
} else {
|
||||
MCA_BTL_IB_FRAG_RETURN (frag);
|
||||
}
|
||||
}
|
||||
|
||||
return rc;
|
||||
}
|
||||
|
||||
int mca_btl_openib_put_internal (mca_btl_base_module_t *btl, struct mca_btl_base_endpoint_t *ep,
|
||||
mca_btl_openib_put_frag_t *frag)
|
||||
{
|
||||
int qp = to_base_frag(frag)->base.order;
|
||||
struct ibv_send_wr *bad_wr;
|
||||
int rc;
|
||||
|
||||
/* check for a send wqe */
|
||||
if (qp_get_wqe(ep, qp) < 0) {
|
||||
qp_put_wqe(ep, qp);
|
||||
return OPAL_ERR_OUT_OF_RESOURCE;
|
||||
}
|
||||
|
||||
qp_inflight_wqe_to_frag(ep, qp, to_com_frag(frag));
|
||||
qp_reset_signal_count(ep, qp);
|
||||
|
||||
if (0 != (rc = ibv_post_send(ep->qps[qp].qp->lcl_qp, &to_out_frag(frag)->sr_desc, &bad_wr))) {
|
||||
qp_put_wqe(ep, qp);
|
||||
return OPAL_ERROR;;
|
||||
}
|
||||
|
||||
return OPAL_SUCCESS;
|
||||
}
|
@ -126,6 +126,7 @@ AC_DEFUN([MCA_opal_btl_openib_CONFIG],[
|
||||
[enable openib BTL failover])
|
||||
AM_CONDITIONAL([MCA_btl_openib_enable_failover], [test "x$btl_openib_failover_enabled" = "x1"])
|
||||
|
||||
|
||||
# Check for __malloc_hook availability
|
||||
AC_ARG_ENABLE(btl-openib-malloc-alignment,
|
||||
AC_HELP_STRING([--enable-btl-openib-malloc-alignment], [Enable support for allocated memory alignment. Default: enabled if supported, disabled otherwise.]))
|
||||
|
@ -321,17 +321,16 @@ static int udcm_send_request (mca_btl_base_endpoint_t *lcl_ep,
|
||||
|
||||
static void udcm_send_timeout (evutil_socket_t fd, short event, void *arg);
|
||||
static int udcm_finish_connection (mca_btl_openib_endpoint_t *lcl_ep);
|
||||
static int udcm_rc_qps_to_rts(mca_btl_openib_endpoint_t *lcl_ep);
|
||||
|
||||
/* XRC support */
|
||||
#if HAVE_XRC
|
||||
static int udcm_xrc_start_connect (opal_btl_openib_connect_base_module_t *cpc,
|
||||
mca_btl_base_endpoint_t *lcl_ep);
|
||||
static int udcm_xrc_restart_connect (mca_btl_base_endpoint_t *lcl_ep);
|
||||
static int udcm_xrc_send_qp_connect (mca_btl_openib_endpoint_t *lcl_ep, uint32_t rem_qp_num, uint32_t rem_psn);
|
||||
static int udcm_xrc_send_qp_connect (mca_btl_openib_endpoint_t *lcl_ep, udcm_msg_hdr_t *msg_hdr);
|
||||
static int udcm_xrc_send_qp_create (mca_btl_base_endpoint_t *lcl_ep);
|
||||
static int udcm_xrc_recv_qp_connect (mca_btl_openib_endpoint_t *lcl_ep);
|
||||
static int udcm_xrc_recv_qp_create (mca_btl_openib_endpoint_t *lcl_ep, uint32_t rem_qp_num, uint32_t rem_psn);
|
||||
static int udcm_xrc_recv_qp_create (mca_btl_openib_endpoint_t *lcl_ep, udcm_msg_hdr_t *msg_hdr);
|
||||
static int udcm_xrc_send_request (mca_btl_base_endpoint_t *lcl_ep, mca_btl_base_endpoint_t *rem_ep,
|
||||
uint8_t msg_type);
|
||||
static int udcm_xrc_send_xresponse (mca_btl_base_endpoint_t *lcl_ep, mca_btl_base_endpoint_t *rem_ep,
|
||||
@ -508,93 +507,6 @@ static int udcm_component_finalize(void)
|
||||
|
||||
/* mark: udcm module */
|
||||
|
||||
#if HAVE_XRC
|
||||
static int udcm_endpoint_init_self_xrc (struct mca_btl_base_endpoint_t *lcl_ep)
|
||||
{
|
||||
udcm_endpoint_t *udep = UDCM_ENDPOINT_DATA(lcl_ep);
|
||||
int rc;
|
||||
|
||||
opal_mutex_lock (&udep->udep_lock);
|
||||
do {
|
||||
rc = udcm_xrc_recv_qp_connect (lcl_ep);
|
||||
if (OPAL_SUCCESS != rc) {
|
||||
BTL_VERBOSE(("error connecting loopback XRC receive queue pair"));
|
||||
break;
|
||||
}
|
||||
|
||||
lcl_ep->xrc_recv_qp_num = lcl_ep->qps[0].qp->lcl_qp->qp_num;
|
||||
|
||||
rc = mca_btl_openib_endpoint_post_recvs (lcl_ep);
|
||||
if (OPAL_SUCCESS != rc) {
|
||||
BTL_VERBOSE(("error posting receives for loopback queue pair"));
|
||||
break;
|
||||
}
|
||||
|
||||
rc = udcm_xrc_recv_qp_create (lcl_ep, lcl_ep->qps[0].qp->lcl_qp->qp_num,
|
||||
lcl_ep->qps[0].qp->lcl_psn);
|
||||
if (OPAL_SUCCESS != rc) {
|
||||
BTL_VERBOSE(("error creating loopback XRC receive queue pair"));
|
||||
break;
|
||||
}
|
||||
|
||||
rc = udcm_xrc_send_qp_connect (lcl_ep, lcl_ep->qps[0].qp->lcl_qp->qp_num,
|
||||
lcl_ep->qps[0].qp->lcl_psn);
|
||||
if (OPAL_SUCCESS != rc) {
|
||||
BTL_VERBOSE(("error creating loopback XRC send queue pair"));
|
||||
break;
|
||||
}
|
||||
|
||||
lcl_ep->endpoint_state = MCA_BTL_IB_CONNECTED;
|
||||
|
||||
rc = udcm_finish_connection (lcl_ep);
|
||||
} while (0);
|
||||
opal_mutex_unlock (&udep->udep_lock);
|
||||
|
||||
return rc;
|
||||
}
|
||||
#endif
|
||||
|
||||
static int udcm_endpoint_init_self (struct mca_btl_base_endpoint_t *lcl_ep)
|
||||
{
|
||||
udcm_endpoint_t *udep = UDCM_ENDPOINT_DATA(lcl_ep);
|
||||
int rc;
|
||||
|
||||
opal_mutex_lock (&udep->udep_lock);
|
||||
do {
|
||||
if (OPAL_SUCCESS != (rc = udcm_endpoint_init_data (lcl_ep))) {
|
||||
BTL_VERBOSE(("error initializing loopback endpoint cpc data"));
|
||||
break;
|
||||
}
|
||||
|
||||
if (OPAL_SUCCESS != (rc = udcm_rc_qp_create_all (lcl_ep))) {
|
||||
BTL_VERBOSE(("error initializing loopback endpoint qps"));
|
||||
break;
|
||||
}
|
||||
|
||||
/* save queue pair info */
|
||||
lcl_ep->rem_info.rem_index = lcl_ep->index;
|
||||
|
||||
for (int i = 0 ; i < mca_btl_openib_component.num_qps ; ++i) {
|
||||
lcl_ep->rem_info.rem_qps[i].rem_psn = lcl_ep->qps[i].qp->lcl_psn;
|
||||
lcl_ep->rem_info.rem_qps[i].rem_qp_num = lcl_ep->qps[i].qp->lcl_qp->qp_num;
|
||||
}
|
||||
|
||||
if (OPAL_SUCCESS != (rc = udcm_rc_qps_to_rts (lcl_ep))) {
|
||||
BTL_VERBOSE(("error moving loopback endpoint qps to RTS"));
|
||||
break;
|
||||
}
|
||||
|
||||
lcl_ep->endpoint_state = MCA_BTL_IB_CONNECTED;
|
||||
|
||||
rc = udcm_finish_connection (lcl_ep);
|
||||
|
||||
return OPAL_SUCCESS;
|
||||
} while (0);
|
||||
opal_mutex_unlock (&udep->udep_lock);
|
||||
|
||||
return rc;
|
||||
}
|
||||
|
||||
static int udcm_endpoint_init (struct mca_btl_base_endpoint_t *lcl_ep)
|
||||
{
|
||||
udcm_endpoint_t *udep = lcl_ep->endpoint_local_cpc_data =
|
||||
@ -606,16 +518,6 @@ static int udcm_endpoint_init (struct mca_btl_base_endpoint_t *lcl_ep)
|
||||
|
||||
OBJ_CONSTRUCT(&udep->udep_lock, opal_mutex_t);
|
||||
|
||||
if (lcl_ep->endpoint_proc->proc_opal == opal_proc_local_get ()) {
|
||||
/* go ahead and try to create a loopback queue pair */
|
||||
#if HAVE_XRC
|
||||
if (mca_btl_openib_component.num_xrc_qps > 0) {
|
||||
return udcm_endpoint_init_self_xrc (lcl_ep);
|
||||
} else
|
||||
#endif
|
||||
return udcm_endpoint_init_self (lcl_ep);
|
||||
}
|
||||
|
||||
return OPAL_SUCCESS;
|
||||
}
|
||||
|
||||
@ -1167,9 +1069,6 @@ static inline int udcm_rc_qp_to_init (struct ibv_qp *qp,
|
||||
attr.pkey_index = btl->pkey_index;
|
||||
attr.port_num = btl->port_num;
|
||||
attr.qp_access_flags = IBV_ACCESS_REMOTE_WRITE | IBV_ACCESS_REMOTE_READ;
|
||||
#if HAVE_DECL_IBV_ATOMIC_HCA
|
||||
attr.qp_access_flags |= IBV_ACCESS_REMOTE_ATOMIC;
|
||||
#endif
|
||||
attr_mask = IBV_QP_STATE | IBV_QP_PKEY_INDEX | IBV_QP_PORT |
|
||||
IBV_QP_ACCESS_FLAGS;
|
||||
|
||||
@ -2408,7 +2307,7 @@ static int udcm_xrc_restart_connect (mca_btl_base_endpoint_t *lcl_ep)
|
||||
/* mark: xrc send qp */
|
||||
|
||||
/* Send qp connect */
|
||||
static int udcm_xrc_send_qp_connect (mca_btl_openib_endpoint_t *lcl_ep, uint32_t rem_qp_num, uint32_t rem_psn)
|
||||
static int udcm_xrc_send_qp_connect (mca_btl_openib_endpoint_t *lcl_ep, udcm_msg_hdr_t *msg_hdr)
|
||||
{
|
||||
mca_btl_openib_module_t *openib_btl = lcl_ep->endpoint_btl;
|
||||
struct ibv_qp_attr attr;
|
||||
@ -2417,7 +2316,7 @@ static int udcm_xrc_send_qp_connect (mca_btl_openib_endpoint_t *lcl_ep, uint32_t
|
||||
int ret;
|
||||
|
||||
BTL_VERBOSE(("Connecting send qp: %p, remote qp: %d", (void *)lcl_ep->qps[0].qp->lcl_qp,
|
||||
rem_qp_num));
|
||||
msg_hdr->data.xres.rem_qp_num));
|
||||
assert(NULL != lcl_ep->qps);
|
||||
qp = lcl_ep->qps[0].qp->lcl_qp;
|
||||
psn = lcl_ep->qps[0].qp->lcl_psn;
|
||||
@ -2427,8 +2326,8 @@ static int udcm_xrc_send_qp_connect (mca_btl_openib_endpoint_t *lcl_ep, uint32_t
|
||||
attr.qp_state = IBV_QPS_RTR;
|
||||
attr.path_mtu = (openib_btl->device->mtu < lcl_ep->rem_info.rem_mtu) ?
|
||||
openib_btl->device->mtu : lcl_ep->rem_info.rem_mtu;
|
||||
attr.dest_qp_num = rem_qp_num;
|
||||
attr.rq_psn = rem_psn;
|
||||
attr.dest_qp_num = msg_hdr->data.xres.rem_qp_num;
|
||||
attr.rq_psn = msg_hdr->data.xres.rem_psn;
|
||||
attr.max_dest_rd_atomic = mca_btl_openib_component.ib_max_rdma_dst_ops;
|
||||
attr.min_rnr_timer = mca_btl_openib_component.ib_min_rnr_timer;
|
||||
attr.ah_attr.is_global = 0;
|
||||
@ -2561,9 +2460,6 @@ static int udcm_xrc_send_qp_create (mca_btl_base_endpoint_t *lcl_ep)
|
||||
attr.pkey_index = openib_btl->pkey_index;
|
||||
attr.port_num = openib_btl->port_num;
|
||||
attr.qp_access_flags = IBV_ACCESS_REMOTE_WRITE | IBV_ACCESS_REMOTE_READ;
|
||||
#if HAVE_DECL_IBV_ATOMIC_HCA
|
||||
attr.qp_access_flags |= IBV_ACCESS_REMOTE_ATOMIC;
|
||||
#endif
|
||||
ret = ibv_modify_qp(*qp, &attr,
|
||||
IBV_QP_STATE |
|
||||
IBV_QP_PKEY_INDEX |
|
||||
@ -2605,7 +2501,7 @@ static int udcm_xrc_recv_qp_connect (mca_btl_openib_endpoint_t *lcl_ep)
|
||||
}
|
||||
|
||||
/* Recv qp create */
|
||||
static int udcm_xrc_recv_qp_create (mca_btl_openib_endpoint_t *lcl_ep, uint32_t rem_qp_num, uint32_t rem_psn)
|
||||
static int udcm_xrc_recv_qp_create (mca_btl_openib_endpoint_t *lcl_ep, udcm_msg_hdr_t *msg_hdr)
|
||||
{
|
||||
mca_btl_openib_module_t* openib_btl = lcl_ep->endpoint_btl;
|
||||
struct ibv_qp_init_attr qp_init_attr;
|
||||
@ -2629,11 +2525,6 @@ static int udcm_xrc_recv_qp_create (mca_btl_openib_endpoint_t *lcl_ep, uint32_t
|
||||
attr.pkey_index = openib_btl->pkey_index;
|
||||
attr.port_num = openib_btl->port_num;
|
||||
attr.qp_access_flags = IBV_ACCESS_REMOTE_WRITE | IBV_ACCESS_REMOTE_READ;
|
||||
|
||||
#if HAVE_DECL_IBV_ATOMIC_HCA
|
||||
attr.qp_access_flags |= IBV_ACCESS_REMOTE_ATOMIC;
|
||||
#endif
|
||||
|
||||
ret = ibv_modify_xrc_rcv_qp(openib_btl->device->xrc_domain,
|
||||
lcl_ep->xrc_recv_qp_num, &attr,
|
||||
IBV_QP_STATE | IBV_QP_PKEY_INDEX |
|
||||
@ -2649,8 +2540,8 @@ static int udcm_xrc_recv_qp_create (mca_btl_openib_endpoint_t *lcl_ep, uint32_t
|
||||
attr.qp_state = IBV_QPS_RTR;
|
||||
attr.path_mtu = (openib_btl->device->mtu < lcl_ep->rem_info.rem_mtu) ?
|
||||
openib_btl->device->mtu : lcl_ep->rem_info.rem_mtu;
|
||||
attr.dest_qp_num = rem_qp_num;
|
||||
attr.rq_psn = rem_psn;
|
||||
attr.dest_qp_num = msg_hdr->data.xreq.rem_qp_num;
|
||||
attr.rq_psn = msg_hdr->data.xreq.rem_psn;
|
||||
attr.max_dest_rd_atomic = mca_btl_openib_component.ib_max_rdma_dst_ops;
|
||||
attr.min_rnr_timer = mca_btl_openib_component.ib_min_rnr_timer;
|
||||
attr.ah_attr.is_global = 0;
|
||||
@ -2824,7 +2715,7 @@ static int udcm_xrc_handle_xconnect (mca_btl_openib_endpoint_t *lcl_ep, udcm_msg
|
||||
|
||||
response_type = UDCM_MESSAGE_XRESPONSE;
|
||||
|
||||
rc = udcm_xrc_recv_qp_create (lcl_ep, msg_hdr->data.xreq.rem_qp_num, msg_hdr->data.xreq.rem_psn);
|
||||
rc = udcm_xrc_recv_qp_create (lcl_ep, msg_hdr);
|
||||
if (OPAL_SUCCESS != rc) {
|
||||
break;
|
||||
}
|
||||
@ -2870,7 +2761,7 @@ static int udcm_xrc_handle_xresponse (mca_btl_openib_endpoint_t *lcl_ep, udcm_ms
|
||||
|
||||
udep->recv_resp = true;
|
||||
|
||||
rc = udcm_xrc_send_qp_connect (lcl_ep, msg_hdr->data.xres.rem_qp_num, msg_hdr->data.xres.rem_psn);
|
||||
rc = udcm_xrc_send_qp_connect (lcl_ep, msg_hdr);
|
||||
if (OPAL_SUCCESS != rc) {
|
||||
mca_btl_openib_endpoint_invoke_error (lcl_ep);
|
||||
}
|
||||
|
@ -183,7 +183,7 @@ mca_btl_portals4_alloc(struct mca_btl_base_module_t* btl_base,
|
||||
}
|
||||
|
||||
frag->md_h = PTL_INVALID_HANDLE;
|
||||
frag->base.des_segment_count = 1;
|
||||
frag->base.des_local_count = 1;
|
||||
frag->base.des_flags = flags | MCA_BTL_DES_SEND_ALWAYS_CALLBACK;
|
||||
frag->base.order = MCA_BTL_NO_ORDER;
|
||||
|
||||
@ -272,7 +272,7 @@ mca_btl_portals4_prepare_src(struct mca_btl_base_module_t* btl_base,
|
||||
}
|
||||
|
||||
frag->segments[0].base.seg_len = max_data + reserve;
|
||||
frag->base.des_segment_count = 1;
|
||||
frag->base.des_local_count = 1;
|
||||
|
||||
} else {
|
||||
/* no need to pack - rdma operation out of user's buffer */
|
||||
@ -302,7 +302,7 @@ mca_btl_portals4_prepare_src(struct mca_btl_base_module_t* btl_base,
|
||||
frag->segments[0].base.seg_len = max_data;
|
||||
frag->segments[0].base.seg_addr.pval = iov.iov_base;
|
||||
frag->segments[0].key = OPAL_THREAD_ADD64(&(portals4_btl->portals_rdma_key), 1);
|
||||
frag->base.des_segment_count = 1;
|
||||
frag->base.des_local_count = 1;
|
||||
|
||||
/* either a put or get. figure out which later */
|
||||
OPAL_OUTPUT_VERBOSE((90, opal_btl_base_framework.framework_output,
|
||||
@ -348,7 +348,7 @@ mca_btl_portals4_prepare_src(struct mca_btl_base_module_t* btl_base,
|
||||
(void *)frag, frag->me_h, me.start, me.length,
|
||||
me.match_id.phys.nid, me.match_id.phys.pid, me.match_bits));
|
||||
}
|
||||
frag->base.des_segments = &frag->segments[0].base;
|
||||
frag->base.des_local = &frag->segments[0].base;
|
||||
frag->base.des_remote = NULL;
|
||||
frag->base.des_remote_count = 0;
|
||||
frag->base.des_flags = flags | MCA_BTL_DES_SEND_ALWAYS_CALLBACK;
|
||||
@ -390,8 +390,8 @@ mca_btl_portals4_prepare_dst(struct mca_btl_base_module_t* btl_base,
|
||||
frag->segments[0].key = OPAL_THREAD_ADD64(&(portals4_btl->portals_rdma_key), 1);
|
||||
frag->base.des_remote = NULL;
|
||||
frag->base.des_remote_count = 0;
|
||||
frag->base.des_segments = &frag->segments[0].base;
|
||||
frag->base.des_segment_count = 1;
|
||||
frag->base.des_local = &frag->segments[0].base;
|
||||
frag->base.des_local_count = 1;
|
||||
frag->base.des_flags = flags | MCA_BTL_DES_SEND_ALWAYS_CALLBACK;
|
||||
frag->base.order = MCA_BTL_NO_ORDER;
|
||||
frag->md_h = PTL_INVALID_HANDLE;
|
||||
|
@ -725,11 +725,11 @@ mca_btl_portals4_component_progress(void)
|
||||
frag = ev.user_ptr;
|
||||
tag = (unsigned char) (ev.hdr_data);
|
||||
|
||||
frag->base.des_segments = seg;
|
||||
frag->base.des_local = seg;
|
||||
seg[0].seg_addr.pval = ev.start;
|
||||
seg[0].seg_len = ev.mlength;
|
||||
|
||||
frag->base.des_segment_count = 1;
|
||||
frag->base.des_local_count = 1;
|
||||
|
||||
reg = mca_btl_base_active_message_trigger + tag;
|
||||
OPAL_OUTPUT_VERBOSE((50, opal_btl_base_framework.framework_output,
|
||||
|
@ -26,8 +26,8 @@ static void
|
||||
mca_btl_portals4_frag_common_send_constructor(mca_btl_portals4_frag_t* frag)
|
||||
{
|
||||
frag->base.des_flags = 0;
|
||||
frag->base.des_segments = &frag->segments[0].base;
|
||||
frag->base.des_segment_count = 2;
|
||||
frag->base.des_local = &frag->segments[0].base;
|
||||
frag->base.des_local_count = 2;
|
||||
|
||||
frag->segments[0].base.seg_addr.pval = frag + 1;
|
||||
frag->segments[0].base.seg_len = frag->size;
|
||||
|
@ -1,6 +1,6 @@
|
||||
/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */
|
||||
/*
|
||||
* Copyright (c) 2013-2014 Los Alamos National Security, LLC. All rights
|
||||
* Copyright (c) 2013 Los Alamos National Security, LLC. All rights
|
||||
* reserved.
|
||||
* Copyright (c) 2014 Research Organization for Information Science
|
||||
* and Technology (RIST). All rights reserved.
|
||||
@ -197,21 +197,29 @@ int mca_btl_scif_sendi (struct mca_btl_base_module_t *btl,
|
||||
* Initiate a get operation.
|
||||
*
|
||||
* location: btl_scif_get.c
|
||||
*
|
||||
* @param btl (IN) BTL module
|
||||
* @param endpoint (IN) BTL addressing information
|
||||
* @param descriptor (IN) Description of the data to be transferred
|
||||
*/
|
||||
int mca_btl_scif_get (mca_btl_base_module_t *btl, struct mca_btl_base_endpoint_t *endpoint, void *local_address,
|
||||
uint64_t remote_address, mca_btl_base_registration_handle_t *local_handle,
|
||||
mca_btl_base_registration_handle_t *remote_handle, size_t size, int flags,
|
||||
int order, mca_btl_base_rdma_completion_fn_t cbfunc, void *cbcontext, void *cbdata);
|
||||
int
|
||||
mca_btl_scif_get (struct mca_btl_base_module_t *btl,
|
||||
struct mca_btl_base_endpoint_t *endpoint,
|
||||
struct mca_btl_base_descriptor_t *des);
|
||||
|
||||
/**
|
||||
* Initiate a put operation.
|
||||
*
|
||||
* location: btl_scif_put.c
|
||||
*
|
||||
* @param btl (IN) BTL module
|
||||
* @param endpoint (IN) BTL addressing information
|
||||
* @param descriptor (IN) Description of the data to be transferred
|
||||
*/
|
||||
int mca_btl_scif_put (mca_btl_base_module_t *btl, struct mca_btl_base_endpoint_t *endpoint, void *local_address,
|
||||
uint64_t remote_address, mca_btl_base_registration_handle_t *local_handle,
|
||||
mca_btl_base_registration_handle_t *remote_handle, size_t size, int flags,
|
||||
int order, mca_btl_base_rdma_completion_fn_t cbfunc, void *cbcontext, void *cbdata);
|
||||
int
|
||||
mca_btl_scif_put (struct mca_btl_base_module_t *btl,
|
||||
struct mca_btl_base_endpoint_t *endpoint,
|
||||
struct mca_btl_base_descriptor_t *des);
|
||||
|
||||
mca_btl_base_descriptor_t *
|
||||
mca_btl_scif_alloc(struct mca_btl_base_module_t *btl,
|
||||
@ -220,25 +228,9 @@ mca_btl_scif_alloc(struct mca_btl_base_module_t *btl,
|
||||
|
||||
int mca_btl_scif_progress_send_wait_list (struct mca_btl_base_endpoint_t *endpoint);
|
||||
|
||||
struct mca_btl_scif_reg_t;
|
||||
|
||||
struct mca_btl_base_registration_handle_t {
|
||||
/** scif offset */
|
||||
off_t scif_offset;
|
||||
/** base address of this scif region */
|
||||
uintptr_t scif_base;
|
||||
};
|
||||
|
||||
struct mca_btl_scif_registration_handle_t {
|
||||
mca_btl_base_registration_handle_t btl_handle;
|
||||
struct mca_btl_scif_reg_t *reg;
|
||||
};
|
||||
typedef struct mca_btl_scif_registration_handle_t mca_btl_scif_registration_handle_t;
|
||||
|
||||
typedef struct mca_btl_scif_reg_t {
|
||||
mca_mpool_base_registration_t base;
|
||||
/** per-endpoint btl handles for this registration */
|
||||
mca_btl_scif_registration_handle_t *handles;
|
||||
off_t *registrations;
|
||||
} mca_btl_scif_reg_t;
|
||||
|
||||
/* Global structures */
|
||||
|
@ -165,14 +165,14 @@ static int scif_dereg_mem (void *reg_data, mca_mpool_base_registration_t *reg)
|
||||
|
||||
/* register the fragment with all connected endpoints */
|
||||
for (i = 0 ; i < (int) mca_btl_scif_module.endpoint_count ; ++i) {
|
||||
if ((off_t)-1 != scif_reg->handles[i].btl_handle.scif_offset &&
|
||||
if ((off_t)-1 != scif_reg->registrations[i] &&
|
||||
MCA_BTL_SCIF_EP_STATE_CONNECTED == mca_btl_scif_module.endpoints[i].state) {
|
||||
(void) scif_unregister(mca_btl_scif_module.endpoints[i].scif_epd,
|
||||
scif_reg->handles[i].btl_handle.scif_offset, size);
|
||||
scif_reg->registrations[i], size);
|
||||
}
|
||||
}
|
||||
|
||||
free (scif_reg->handles);
|
||||
free (scif_reg->registrations);
|
||||
|
||||
return OPAL_SUCCESS;
|
||||
}
|
||||
@ -184,22 +184,17 @@ static int scif_reg_mem (void *reg_data, void *base, size_t size,
|
||||
int rc = OPAL_SUCCESS;
|
||||
unsigned int i;
|
||||
|
||||
scif_reg->handles = calloc (mca_btl_scif_module.endpoint_count, sizeof (scif_reg->handles[0]));
|
||||
|
||||
/* intialize all scif offsets to -1 and initialize the pointer back to the mpool registration */
|
||||
for (i = 0 ; i < mca_btl_scif_module.endpoint_count ; ++i) {
|
||||
scif_reg->handles[i].btl_handle.scif_offset = -1;
|
||||
scif_reg->handles[i].btl_handle.scif_base = (intptr_t) base;
|
||||
scif_reg->handles[i].reg = scif_reg;
|
||||
}
|
||||
scif_reg->registrations = calloc (mca_btl_scif_module.endpoint_count,
|
||||
sizeof (off_t));
|
||||
memset (scif_reg->registrations, -1, mca_btl_scif_module.endpoint_count * sizeof (off_t));
|
||||
|
||||
/* register the pointer with all connected endpoints */
|
||||
for (i = 0 ; i < mca_btl_scif_module.endpoint_count ; ++i) {
|
||||
if (MCA_BTL_SCIF_EP_STATE_CONNECTED == mca_btl_scif_module.endpoints[i].state) {
|
||||
scif_reg->handles[i].btl_handle.scif_offset = scif_register (mca_btl_scif_module.endpoints[i].scif_epd,
|
||||
base, size, 0, SCIF_PROT_READ |
|
||||
SCIF_PROT_WRITE, 0);
|
||||
if (SCIF_REGISTER_FAILED == scif_reg->handles[i].btl_handle.scif_offset) {
|
||||
scif_reg->registrations[i] = scif_register(mca_btl_scif_module.endpoints[i].scif_epd,
|
||||
base, size, 0, SCIF_PROT_READ |
|
||||
SCIF_PROT_WRITE, 0);
|
||||
if (SCIF_REGISTER_FAILED == scif_reg->registrations[i]) {
|
||||
/* cleanup */
|
||||
scif_dereg_mem (reg_data, reg);
|
||||
rc = OPAL_ERR_OUT_OF_RESOURCE;
|
||||
|
@ -1,6 +1,6 @@
|
||||
/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */
|
||||
/*
|
||||
* Copyright (c) 2013-2014 Los Alamos National Security, LLC. All rights
|
||||
* Copyright (c) 2013 Los Alamos National Security, LLC. All rights
|
||||
* reserved.
|
||||
* Copyright (c) 2014 Research Organization for Information Science
|
||||
* and Technology (RIST). All rights reserved.
|
||||
@ -171,7 +171,7 @@ static int btl_scif_component_register(void)
|
||||
mca_btl_scif_module.super.btl_flags = MCA_BTL_FLAGS_SEND |
|
||||
MCA_BTL_FLAGS_RDMA | MCA_BTL_FLAGS_SEND_INPLACE;
|
||||
|
||||
mca_btl_scif_module.super.btl_registration_handle_size = sizeof (mca_btl_base_registration_handle_t);
|
||||
mca_btl_scif_module.super.btl_seg_size = sizeof (mca_btl_scif_segment_t);
|
||||
|
||||
mca_btl_scif_module.super.btl_bandwidth = 50000; /* Mbs */
|
||||
mca_btl_scif_module.super.btl_latency = 2; /* Microsecs */
|
||||
@ -329,11 +329,11 @@ static int mca_btl_scif_progress_recvs (mca_btl_base_endpoint_t *ep)
|
||||
* the fragment without introducing another copy here. this
|
||||
* limitation has not appeared to cause any performance
|
||||
* problems. */
|
||||
frag.base.des_segment_count = 1;
|
||||
frag.segments[0].seg_len = hdr->size;
|
||||
frag.segments[0].seg_addr.pval = (void *) (hdr + 1);
|
||||
frag.base.des_local_count = 1;
|
||||
frag.segments[0].base.seg_len = hdr->size;
|
||||
frag.segments[0].base.seg_addr.pval = (void *) (hdr + 1);
|
||||
|
||||
frag.base.des_segments = frag.segments;
|
||||
frag.base.des_local = &frag.segments[0].base;
|
||||
|
||||
/* call the registered callback function */
|
||||
reg->cbfunc(&mca_btl_scif_module.super, hdr->tag, &frag.base, reg->cbdata);
|
||||
|
@ -15,13 +15,13 @@
|
||||
static inline void mca_btl_scif_base_frag_constructor (mca_btl_scif_base_frag_t *frag)
|
||||
{
|
||||
memset ((char *) frag + sizeof (frag->base), 0, sizeof (*frag) - sizeof (frag->base));
|
||||
frag->segments[0].seg_addr.pval = frag->base.super.ptr;
|
||||
frag->segments[0].base.seg_addr.pval = frag->base.super.ptr;
|
||||
}
|
||||
|
||||
static inline void mca_btl_scif_eager_frag_constructor (mca_btl_scif_base_frag_t *frag)
|
||||
{
|
||||
memset ((char *) frag + sizeof (frag->base), 0, sizeof (*frag) - sizeof (frag->base));
|
||||
frag->segments[0].seg_addr.pval = frag->base.super.ptr;
|
||||
frag->segments[0].base.seg_addr.pval = frag->base.super.ptr;
|
||||
}
|
||||
|
||||
OBJ_CLASS_INSTANCE(mca_btl_scif_eager_frag_t, mca_btl_base_descriptor_t,
|
||||
|
@ -15,6 +15,16 @@
|
||||
#include "btl_scif.h"
|
||||
#include "btl_scif_endpoint.h"
|
||||
|
||||
typedef struct mca_btl_scif_segment_t {
|
||||
mca_btl_base_segment_t base;
|
||||
|
||||
/* scif offset */
|
||||
off_t scif_offset;
|
||||
|
||||
/* original pointer */
|
||||
uint64_t orig_ptr;
|
||||
} mca_btl_scif_segment_t;
|
||||
|
||||
typedef struct mca_btl_scif_frag_hdr_t {
|
||||
#if defined(SCIF_USE_SEQ)
|
||||
uint32_t seq;
|
||||
@ -31,7 +41,7 @@ typedef void (*frag_cb_t) (struct mca_btl_scif_base_frag_t *, int);
|
||||
typedef struct mca_btl_scif_base_frag_t {
|
||||
mca_btl_base_descriptor_t base;
|
||||
mca_btl_scif_frag_hdr_t hdr;
|
||||
mca_btl_base_segment_t segments[2];
|
||||
mca_btl_scif_segment_t segments[2];
|
||||
mca_btl_base_endpoint_t *endpoint;
|
||||
mca_btl_scif_reg_t *registration;
|
||||
ompi_free_list_t *my_list;
|
||||
@ -68,9 +78,9 @@ static inline int mca_btl_scif_frag_return (mca_btl_scif_base_frag_t *frag)
|
||||
frag->registration = NULL;
|
||||
}
|
||||
|
||||
frag->segments[0].seg_addr.pval = frag->base.super.ptr;
|
||||
frag->segments[0].seg_len = 0;
|
||||
frag->segments[1].seg_len = 0;
|
||||
frag->segments[0].base.seg_addr.pval = frag->base.super.ptr;
|
||||
frag->segments[0].base.seg_len = 0;
|
||||
frag->segments[1].base.seg_len = 0;
|
||||
|
||||
OMPI_FREE_LIST_RETURN_MT(frag->my_list, (ompi_free_list_item_t *) frag);
|
||||
|
||||
|
@ -1,6 +1,6 @@
|
||||
/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */
|
||||
/*
|
||||
* Copyright (c) 2013-2014 Los Alamos National Security, LLC. All rights
|
||||
* Copyright (c) 2013 Los Alamos National Security, LLC. All rights
|
||||
* reserved.
|
||||
* Copyright (c) 2014 Research Organization for Information Science
|
||||
* and Technology (RIST). All rights reserved.
|
||||
@ -20,13 +20,18 @@
|
||||
|
||||
/**
|
||||
* Initiate a get operation.
|
||||
*
|
||||
* @param btl (IN) BTL module
|
||||
* @param endpoint (IN) BTL addressing information
|
||||
* @param descriptor (IN) Description of the data to be transferred
|
||||
*/
|
||||
int mca_btl_scif_get (mca_btl_base_module_t *btl, struct mca_btl_base_endpoint_t *endpoint, void *local_address,
|
||||
uint64_t remote_address, mca_btl_base_registration_handle_t *local_handle,
|
||||
mca_btl_base_registration_handle_t *remote_handle, size_t size, int flags,
|
||||
int order, mca_btl_base_rdma_completion_fn_t cbfunc, void *cbcontext, void *cbdata)
|
||||
{
|
||||
int rc, mark, scif_flags = 0;
|
||||
int mca_btl_scif_get (struct mca_btl_base_module_t *btl,
|
||||
struct mca_btl_base_endpoint_t *endpoint,
|
||||
struct mca_btl_base_descriptor_t *des) {
|
||||
mca_btl_scif_segment_t *src = (mca_btl_scif_segment_t *) des->des_remote;
|
||||
mca_btl_scif_segment_t *dst = (mca_btl_scif_segment_t *) des->des_local;
|
||||
size_t len = lmin (src->base.seg_len, dst->base.seg_len);
|
||||
int rc, mark, flags = 0;
|
||||
off_t roffset, loffset;
|
||||
#if defined(SCIF_TIMING)
|
||||
struct timespec ts;
|
||||
@ -36,27 +41,30 @@ int mca_btl_scif_get (mca_btl_base_module_t *btl, struct mca_btl_base_endpoint_t
|
||||
mca_btl_scif_component.get_count++;
|
||||
#endif
|
||||
|
||||
BTL_VERBOSE(("Using DMA Get from remote address %" PRIx64 " to local address %p",
|
||||
remote_address, local_address));
|
||||
BTL_VERBOSE(("Using DMA Get for frag %p from offset %lu", (void *) des,
|
||||
(unsigned long) src->scif_offset));
|
||||
|
||||
roffset = remote_handle->scif_offset + (off_t)(remote_address - remote_handle->scif_base);
|
||||
loffset = local_handle->scif_offset + (off_t)((intptr_t)local_address - local_handle->scif_base);
|
||||
roffset = src->scif_offset + (off_t)(src->orig_ptr - src->base.seg_addr.lval);
|
||||
loffset = dst->scif_offset + (off_t)(dst->orig_ptr - dst->base.seg_addr.lval);
|
||||
|
||||
if (mca_btl_scif_component.rma_use_cpu) {
|
||||
scif_flags = SCIF_RMA_USECPU;
|
||||
flags = SCIF_RMA_USECPU;
|
||||
}
|
||||
|
||||
if (mca_btl_scif_component.rma_sync) {
|
||||
scif_flags |= SCIF_RMA_SYNC;
|
||||
flags |= SCIF_RMA_SYNC;
|
||||
}
|
||||
|
||||
/* start the read */
|
||||
rc = scif_readfrom (endpoint->scif_epd, loffset, size, roffset, scif_flags);
|
||||
rc = scif_readfrom (endpoint->scif_epd, loffset, len, roffset, flags);
|
||||
if (OPAL_UNLIKELY(-1 == rc)) {
|
||||
return OPAL_ERROR;
|
||||
}
|
||||
|
||||
if (!(scif_flags & SCIF_RMA_SYNC)) {
|
||||
/* always call the callback function */
|
||||
des->des_flags |= MCA_BTL_DES_SEND_ALWAYS_CALLBACK;
|
||||
|
||||
if (!(flags & SCIF_RMA_SYNC)) {
|
||||
/* according to the scif documentation is is better to use a fence rather
|
||||
* than using the SCIF_RMA_SYNC flag with scif_readfrom */
|
||||
scif_fence_mark (endpoint->scif_epd, SCIF_FENCE_INIT_SELF, &mark);
|
||||
@ -68,8 +76,8 @@ int mca_btl_scif_get (mca_btl_base_module_t *btl, struct mca_btl_base_endpoint_t
|
||||
mca_btl_scif_component.get_time_max, ts);
|
||||
#endif
|
||||
|
||||
/* always call the callback function */
|
||||
cbfunc (btl, endpoint, local_address, local_handle, cbcontext, cbdata, OPAL_SUCCESS);
|
||||
/* since we completed the fence the RMA operation is complete */
|
||||
mca_btl_scif_frag_complete ((mca_btl_scif_base_frag_t *) des, OPAL_SUCCESS);
|
||||
|
||||
return OPAL_SUCCESS;
|
||||
}
|
||||
|
@ -24,14 +24,17 @@ mca_btl_scif_free (struct mca_btl_base_module_t *btl,
|
||||
static int
|
||||
mca_btl_scif_module_finalize (struct mca_btl_base_module_t* btl);
|
||||
|
||||
static mca_btl_base_registration_handle_t *mca_btl_scif_register_mem (struct mca_btl_base_module_t *btl,
|
||||
mca_btl_base_endpoint_t *endpoint,
|
||||
void *base, size_t size, uint32_t flags);
|
||||
static int mca_btl_scif_deregister_mem (struct mca_btl_base_module_t *btl, mca_btl_base_registration_handle_t *handle);
|
||||
static mca_btl_base_descriptor_t *
|
||||
mca_btl_scif_prepare_dst (mca_btl_base_module_t *btl,
|
||||
mca_btl_base_endpoint_t *endpoint,
|
||||
mca_mpool_base_registration_t *registration,
|
||||
opal_convertor_t *convertor, uint8_t order,
|
||||
size_t reserve, size_t *size, uint32_t flags);
|
||||
|
||||
static struct mca_btl_base_descriptor_t *
|
||||
mca_btl_scif_prepare_src (struct mca_btl_base_module_t *btl,
|
||||
struct mca_btl_base_endpoint_t *endpoint,
|
||||
mca_mpool_base_registration_t *registration,
|
||||
struct opal_convertor_t *convertor,
|
||||
uint8_t order, size_t reserve, size_t *size,
|
||||
uint32_t flags);
|
||||
@ -45,12 +48,11 @@ mca_btl_scif_module_t mca_btl_scif_module = {
|
||||
.btl_alloc = mca_btl_scif_alloc,
|
||||
.btl_free = mca_btl_scif_free,
|
||||
.btl_prepare_src = mca_btl_scif_prepare_src,
|
||||
.btl_prepare_dst = mca_btl_scif_prepare_dst,
|
||||
.btl_send = mca_btl_scif_send,
|
||||
.btl_sendi = mca_btl_scif_sendi,
|
||||
.btl_put = mca_btl_scif_put,
|
||||
.btl_get = mca_btl_scif_get,
|
||||
.btl_register_mem = mca_btl_scif_register_mem,
|
||||
.btl_deregister_mem = mca_btl_scif_deregister_mem,
|
||||
}
|
||||
};
|
||||
|
||||
@ -161,10 +163,10 @@ mca_btl_scif_alloc(struct mca_btl_base_module_t *btl,
|
||||
|
||||
frag->base.des_flags = flags;
|
||||
frag->base.order = order;
|
||||
frag->base.des_segments = frag->segments;
|
||||
frag->base.des_segment_count = 1;
|
||||
frag->base.des_local = &frag->segments[0].base;
|
||||
frag->base.des_local_count = 1;
|
||||
|
||||
frag->segments[0].seg_len = size;
|
||||
frag->segments[0].base.seg_len = size;
|
||||
|
||||
return &frag->base;
|
||||
}
|
||||
@ -176,19 +178,16 @@ mca_btl_scif_free (struct mca_btl_base_module_t *btl,
|
||||
return mca_btl_scif_frag_return ((mca_btl_scif_base_frag_t *) des);
|
||||
}
|
||||
|
||||
static mca_btl_base_registration_handle_t *mca_btl_scif_register_mem (struct mca_btl_base_module_t *btl,
|
||||
mca_btl_base_endpoint_t *endpoint,
|
||||
void *base, size_t size, uint32_t flags)
|
||||
static inline mca_btl_base_descriptor_t *mca_btl_scif_prepare_dma (struct mca_btl_base_module_t *btl,
|
||||
mca_btl_base_endpoint_t *endpoint,
|
||||
void *data_ptr, size_t size,
|
||||
mca_mpool_base_registration_t *registration,
|
||||
uint8_t order, uint32_t flags)
|
||||
{
|
||||
mca_btl_scif_base_frag_t *frag;
|
||||
mca_btl_scif_reg_t *scif_reg;
|
||||
int rc;
|
||||
|
||||
if (MCA_BTL_ENDPOINT_ANY == endpoint) {
|
||||
/* it probably isn't possible to support registering memory to use with any endpoint so
|
||||
* return NULL */
|
||||
return NULL;
|
||||
}
|
||||
|
||||
if (OPAL_LIKELY(MCA_BTL_SCIF_EP_STATE_CONNECTED != endpoint->state)) {
|
||||
/* the endpoint needs to be connected before the fragment can be
|
||||
* registered. */
|
||||
@ -199,36 +198,67 @@ static mca_btl_base_registration_handle_t *mca_btl_scif_register_mem (struct mca
|
||||
}
|
||||
}
|
||||
|
||||
rc = btl->btl_mpool->mpool_register(btl->btl_mpool, base, size, 0,
|
||||
(mca_mpool_base_registration_t **) &scif_reg);
|
||||
if (OPAL_UNLIKELY(OPAL_SUCCESS != rc)) {
|
||||
(void) MCA_BTL_SCIF_FRAG_ALLOC_DMA(endpoint, frag);
|
||||
if (OPAL_UNLIKELY(NULL == frag)) {
|
||||
return NULL;
|
||||
}
|
||||
|
||||
/* register the memory location with this peer if it isn't already */
|
||||
if ((off_t) -1 == scif_reg->handles[endpoint->id].btl_handle.scif_offset) {
|
||||
size_t seg_size = (size_t)((uintptr_t) scif_reg->base.bound - (uintptr_t) scif_reg->base.base) + 1;
|
||||
if (NULL == registration) {
|
||||
rc = btl->btl_mpool->mpool_register(btl->btl_mpool, data_ptr, size, 0,
|
||||
(mca_mpool_base_registration_t **) ®istration);
|
||||
if (OPAL_UNLIKELY(OPAL_SUCCESS != rc)) {
|
||||
mca_btl_scif_frag_return (frag);
|
||||
return NULL;
|
||||
}
|
||||
|
||||
/* NTH: until we determine a way to pass permissions to the mpool just make all segments
|
||||
* read/write */
|
||||
scif_reg->handles[endpoint->id].btl_handle.scif_offset =
|
||||
scif_register (endpoint->scif_epd, scif_reg->base.base, seg_size, 0, SCIF_PROT_READ |
|
||||
SCIF_PROT_WRITE, 0);
|
||||
frag->registration = (mca_btl_scif_reg_t *) registration;
|
||||
}
|
||||
|
||||
scif_reg = (mca_btl_scif_reg_t *) registration;
|
||||
|
||||
/* register the memory location with this peer if it isn't already */
|
||||
if ((off_t) -1 == scif_reg->registrations[endpoint->id]) {
|
||||
size_t seg_size = (size_t)((uintptr_t) registration->bound - (uintptr_t) registration->base) + 1;
|
||||
scif_reg->registrations[endpoint->id] = scif_register (endpoint->scif_epd, registration->base,
|
||||
seg_size, 0, SCIF_PROT_READ |
|
||||
SCIF_PROT_WRITE, 0);
|
||||
BTL_VERBOSE(("registered fragment for scif DMA transaction. offset = %lu",
|
||||
(unsigned long) scif_reg->handles[endpoint->id].btl_handle.scif_offset));
|
||||
(unsigned long) scif_reg->registrations[endpoint->id]));
|
||||
}
|
||||
|
||||
return &scif_reg->handles[endpoint->id].btl_handle;
|
||||
if (OPAL_UNLIKELY((off_t) -1 == scif_reg->registrations[endpoint->id])) {
|
||||
mca_btl_scif_frag_return (frag);
|
||||
return NULL;
|
||||
}
|
||||
|
||||
frag->segments[0].base.seg_addr.lval = (uint64_t)(uintptr_t) data_ptr;
|
||||
frag->segments[0].base.seg_len = size;
|
||||
frag->segments[0].scif_offset = scif_reg->registrations[endpoint->id] +
|
||||
(off_t) ((ptrdiff_t) data_ptr - (ptrdiff_t) registration->base);
|
||||
/* save the original pointer so the offset can be adjusted if needed (this is
|
||||
* required for osc/rdma) */
|
||||
frag->segments[0].orig_ptr = (uint64_t)(uintptr_t) data_ptr;
|
||||
frag->base.order = order;
|
||||
frag->base.des_flags = flags;
|
||||
|
||||
frag->base.des_local = &frag->segments->base;
|
||||
frag->base.des_local_count = 1;
|
||||
|
||||
return &frag->base;
|
||||
}
|
||||
|
||||
static int mca_btl_scif_deregister_mem (struct mca_btl_base_module_t *btl, mca_btl_base_registration_handle_t *handle)
|
||||
static inline mca_btl_base_descriptor_t *mca_btl_scif_prepare_dma_conv (struct mca_btl_base_module_t *btl,
|
||||
mca_btl_base_endpoint_t *endpoint,
|
||||
mca_mpool_base_registration_t *registration,
|
||||
struct opal_convertor_t *convertor,
|
||||
uint8_t order, size_t *size,
|
||||
uint32_t flags)
|
||||
{
|
||||
mca_btl_scif_registration_handle_t *scif_handle = (mca_btl_scif_registration_handle_t *) handle;
|
||||
mca_btl_scif_reg_t *scif_reg = scif_handle->reg;
|
||||
void *data_ptr;
|
||||
|
||||
btl->btl_mpool->mpool_deregister (btl->btl_mpool, &scif_reg->base);
|
||||
opal_convertor_get_current_pointer (convertor, &data_ptr);
|
||||
|
||||
return OPAL_SUCCESS;
|
||||
return mca_btl_scif_prepare_dma (btl, endpoint, data_ptr, *size, registration, order, flags);
|
||||
}
|
||||
|
||||
static inline struct mca_btl_base_descriptor_t *
|
||||
@ -256,10 +286,10 @@ mca_btl_scif_prepare_src_send (struct mca_btl_base_module_t *btl,
|
||||
return NULL;
|
||||
}
|
||||
|
||||
frag->segments[0].seg_len = reserve;
|
||||
frag->segments[1].seg_addr.pval = data_ptr;
|
||||
frag->segments[1].seg_len = *size;
|
||||
frag->base.des_segment_count = 2;
|
||||
frag->segments[0].base.seg_len = reserve;
|
||||
frag->segments[1].base.seg_addr.pval = data_ptr;
|
||||
frag->segments[1].base.seg_len = *size;
|
||||
frag->base.des_local_count = 2;
|
||||
} else {
|
||||
/* buffered send */
|
||||
(void) MCA_BTL_SCIF_FRAG_ALLOC_EAGER(endpoint, frag);
|
||||
@ -269,7 +299,7 @@ mca_btl_scif_prepare_src_send (struct mca_btl_base_module_t *btl,
|
||||
|
||||
if (*size) {
|
||||
iov.iov_len = *size;
|
||||
iov.iov_base = (IOVBASE_TYPE *) ((uintptr_t) frag->segments[0].seg_addr.pval + reserve);
|
||||
iov.iov_base = (IOVBASE_TYPE *) ((uintptr_t) frag->segments[0].base.seg_addr.pval + reserve);
|
||||
|
||||
rc = opal_convertor_pack (convertor, &iov, &iov_count, &max_size);
|
||||
if (OPAL_UNLIKELY(rc < 0)) {
|
||||
@ -279,22 +309,37 @@ mca_btl_scif_prepare_src_send (struct mca_btl_base_module_t *btl,
|
||||
*size = max_size;
|
||||
}
|
||||
|
||||
frag->segments[0].seg_len = reserve + *size;
|
||||
frag->base.des_segment_count = 1;
|
||||
frag->segments[0].base.seg_len = reserve + *size;
|
||||
frag->base.des_local_count = 1;
|
||||
}
|
||||
|
||||
frag->base.des_segments = frag->segments;
|
||||
frag->base.order = order;
|
||||
frag->base.des_flags = flags;
|
||||
frag->base.des_local = &frag->segments->base;
|
||||
frag->base.order = order;
|
||||
frag->base.des_flags = flags;
|
||||
|
||||
return &frag->base;
|
||||
}
|
||||
|
||||
static mca_btl_base_descriptor_t *mca_btl_scif_prepare_src (struct mca_btl_base_module_t *btl,
|
||||
mca_btl_base_endpoint_t *endpoint,
|
||||
mca_mpool_base_registration_t *registration,
|
||||
struct opal_convertor_t *convertor,
|
||||
uint8_t order, size_t reserve, size_t *size,
|
||||
uint32_t flags)
|
||||
{
|
||||
return mca_btl_scif_prepare_src_send (btl, endpoint, convertor, order, reserve, size, flags);
|
||||
if (OPAL_LIKELY(reserve)) {
|
||||
return mca_btl_scif_prepare_src_send (btl, endpoint, convertor,
|
||||
order, reserve, size, flags);
|
||||
} else {
|
||||
return mca_btl_scif_prepare_dma_conv (btl, endpoint, registration, convertor, order, size, flags);
|
||||
}
|
||||
}
|
||||
|
||||
static mca_btl_base_descriptor_t *mca_btl_scif_prepare_dst (mca_btl_base_module_t *btl,
|
||||
mca_btl_base_endpoint_t *endpoint,
|
||||
mca_mpool_base_registration_t *registration,
|
||||
opal_convertor_t *convertor, uint8_t order,
|
||||
size_t reserve, size_t *size, uint32_t flags)
|
||||
{
|
||||
return mca_btl_scif_prepare_dma_conv (btl, endpoint, registration, convertor, order, size, flags);
|
||||
}
|
||||
|
@ -1,6 +1,6 @@
|
||||
/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */
|
||||
/*
|
||||
* Copyright (c) 2013-2014 Los Alamos National Security, LLC. All rights
|
||||
* Copyright (c) 2013 Los Alamos National Security, LLC. All rights
|
||||
* reserved.
|
||||
* $COPYRIGHT$
|
||||
*
|
||||
@ -16,57 +16,63 @@
|
||||
|
||||
/**
|
||||
* Initiate a put operation.
|
||||
*
|
||||
* @param btl (IN) BTL module
|
||||
* @param endpoint (IN) BTL addressing information
|
||||
* @param descriptor (IN) Description of the data to be transferred
|
||||
*/
|
||||
int mca_btl_scif_put (mca_btl_base_module_t *btl, struct mca_btl_base_endpoint_t *endpoint, void *local_address,
|
||||
uint64_t remote_address, mca_btl_base_registration_handle_t *local_handle,
|
||||
mca_btl_base_registration_handle_t *remote_handle, size_t size, int flags,
|
||||
int order, mca_btl_base_rdma_completion_fn_t cbfunc, void *cbcontext, void *cbdata)
|
||||
{
|
||||
int rc, mark, scif_flags = 0;
|
||||
int mca_btl_scif_put (struct mca_btl_base_module_t *btl,
|
||||
struct mca_btl_base_endpoint_t *endpoint,
|
||||
struct mca_btl_base_descriptor_t *des) {
|
||||
mca_btl_scif_segment_t *src = (mca_btl_scif_segment_t *) des->des_local;
|
||||
mca_btl_scif_segment_t *dst = (mca_btl_scif_segment_t *) des->des_remote;
|
||||
size_t len = lmin (src->base.seg_len, dst->base.seg_len);
|
||||
int rc, mark, flags = 0;
|
||||
off_t roffset, loffset;
|
||||
#if defined(SCIF_TIMING)
|
||||
struct timespec ts;
|
||||
|
||||
clock_gettime(CLOCK_PROCESS_CPUTIME_ID, &ts);
|
||||
|
||||
mca_btl_scif_component.get_count++;
|
||||
mca_btl_scif_component.put_count++;
|
||||
#endif
|
||||
|
||||
BTL_VERBOSE(("Using DMA Put from local address %p to remote address %" PRIx64,
|
||||
local_address, remote_address));
|
||||
BTL_VERBOSE(("Using DMA Put for frag %p", (void *) des));
|
||||
|
||||
roffset = remote_handle->scif_offset + (off_t)(remote_address - remote_handle->scif_base);
|
||||
loffset = local_handle->scif_offset + (off_t)((intptr_t) local_address - local_handle->scif_base);
|
||||
roffset = dst->scif_offset + (off_t)(dst->orig_ptr - dst->base.seg_addr.lval);
|
||||
loffset = src->scif_offset + (off_t)(src->orig_ptr - src->base.seg_addr.lval);
|
||||
|
||||
if (mca_btl_scif_component.rma_use_cpu) {
|
||||
scif_flags = SCIF_RMA_USECPU;
|
||||
flags = SCIF_RMA_USECPU;
|
||||
}
|
||||
|
||||
if (mca_btl_scif_component.rma_sync) {
|
||||
scif_flags |= SCIF_RMA_SYNC;
|
||||
flags |= SCIF_RMA_SYNC;
|
||||
}
|
||||
|
||||
/* start the write */
|
||||
rc = scif_writeto (endpoint->scif_epd, loffset, size, roffset, scif_flags);
|
||||
rc = scif_readfrom (endpoint->scif_epd, loffset, size, roffset, scif_flags);
|
||||
rc = scif_writeto (endpoint->scif_epd, loffset, len, roffset, flags);
|
||||
if (OPAL_UNLIKELY(-1 == rc)) {
|
||||
return OPAL_ERROR;
|
||||
}
|
||||
|
||||
if (!(scif_flags & SCIF_RMA_SYNC)) {
|
||||
/* according to the scif documentation is is better to use a fence rather
|
||||
* than using the SCIF_RMA_SYNC flag with scif_readfrom */
|
||||
/* always call the callback function */
|
||||
des->des_flags |= MCA_BTL_DES_SEND_ALWAYS_CALLBACK;
|
||||
|
||||
/* according to the scif documentation is is better to use a fence rather
|
||||
* than using the SCIF_RMA_SYNC flag with scif_writeto */
|
||||
if (!(flags & SCIF_RMA_SYNC)) {
|
||||
scif_fence_mark (endpoint->scif_epd, SCIF_FENCE_INIT_SELF, &mark);
|
||||
scif_fence_wait (endpoint->scif_epd, mark);
|
||||
}
|
||||
|
||||
#if defined(SCIF_TIMING)
|
||||
SCIF_UPDATE_TIMER(mca_btl_scif_component.get_time,
|
||||
mca_btl_scif_component.get_time_max, ts);
|
||||
SCIF_UPDATE_TIMER(mca_btl_scif_component.put_time,
|
||||
mca_btl_scif_component.put_time_max, ts);
|
||||
#endif
|
||||
|
||||
/* always call the callback function */
|
||||
cbfunc (btl, endpoint, local_address, local_handle, cbcontext, cbdata, OPAL_SUCCESS);
|
||||
/* since we completed the fence the RMA operation is complete */
|
||||
mca_btl_scif_frag_complete ((mca_btl_scif_base_frag_t *) des, OPAL_SUCCESS);
|
||||
|
||||
return OPAL_SUCCESS;
|
||||
}
|
||||
|
@ -118,22 +118,22 @@ static int mca_btl_scif_send_frag (struct mca_btl_base_endpoint_t *endpoint,
|
||||
unsigned char * restrict dst;
|
||||
|
||||
BTL_VERBOSE(("btl/scif sending descriptor %p from %d -> %d. length = %" PRIu64, (void *) frag,
|
||||
opal_process_name_vpid(OPAL_PROC_MY_NAME), opal_process_name_vpid(endpoint->peer_proc->proc_name), frag->segments[0].seg_len));
|
||||
OPAL_PROC_MY_NAME.vpid, endpoint->peer_proc->proc_name.vpid, frag->segments[0].base.seg_len));
|
||||
|
||||
if (OPAL_LIKELY(OPAL_SUCCESS == mca_btl_scif_send_get_buffer (endpoint, size, &dst))) {
|
||||
unsigned char * restrict data = (unsigned char * restrict) frag->segments[0].seg_addr.pval;
|
||||
unsigned char * restrict data = (unsigned char * restrict) frag->segments[0].base.seg_addr.pval;
|
||||
#if defined(SCIF_TIMING)
|
||||
struct timespec ts;
|
||||
|
||||
clock_gettime(CLOCK_PROCESS_CPUTIME_ID, &ts);
|
||||
#endif
|
||||
|
||||
memcpy (dst + sizeof (frag->hdr), data, frag->segments[0].seg_len);
|
||||
memcpy (dst + sizeof (frag->hdr), data, frag->segments[0].base.seg_len);
|
||||
|
||||
if (frag->segments[1].seg_len) {
|
||||
memcpy (dst + sizeof (frag->hdr) + frag->segments[0].seg_len,
|
||||
frag->segments[1].seg_addr.pval,
|
||||
frag->segments[1].seg_len);
|
||||
if (frag->segments[1].base.seg_len) {
|
||||
memcpy (dst + sizeof (frag->hdr) + frag->segments[0].base.seg_len,
|
||||
frag->segments[1].base.seg_addr.pval,
|
||||
frag->segments[1].base.seg_len);
|
||||
}
|
||||
|
||||
#if defined(SCIF_USE_SEQ)
|
||||
@ -165,7 +165,7 @@ int mca_btl_scif_send (struct mca_btl_base_module_t *btl,
|
||||
mca_btl_base_tag_t tag)
|
||||
{
|
||||
mca_btl_scif_base_frag_t *frag = (mca_btl_scif_base_frag_t *) descriptor;
|
||||
size_t size = frag->segments[0].seg_len + frag->segments[1].seg_len;
|
||||
size_t size = frag->segments[0].base.seg_len + frag->segments[1].base.seg_len;
|
||||
int rc;
|
||||
|
||||
frag->hdr.tag = tag;
|
||||
@ -223,9 +223,7 @@ int mca_btl_scif_sendi (struct mca_btl_base_module_t *btl,
|
||||
|
||||
rc = mca_btl_scif_send_get_buffer (endpoint, length, &base);
|
||||
if (OPAL_UNLIKELY(OPAL_SUCCESS != rc)) {
|
||||
if (NULL != descriptor) {
|
||||
*descriptor = NULL;
|
||||
}
|
||||
*descriptor = NULL;
|
||||
return OPAL_ERR_OUT_OF_RESOURCE;
|
||||
}
|
||||
|
||||
|
@ -38,15 +38,13 @@
|
||||
#include "btl_self_frag.h"
|
||||
#include "opal/util/proc.h"
|
||||
|
||||
static int mca_btl_self_put (mca_btl_base_module_t *btl, struct mca_btl_base_endpoint_t *endpoint, void *local_address,
|
||||
uint64_t remote_address, mca_btl_base_registration_handle_t *local_handle,
|
||||
mca_btl_base_registration_handle_t *remote_handle, size_t size, int flags,
|
||||
int order, mca_btl_base_rdma_completion_fn_t cbfunc, void *cbcontext, void *cbdata);
|
||||
static int mca_btl_self_put (struct mca_btl_base_module_t* btl,
|
||||
struct mca_btl_base_endpoint_t* endpoint,
|
||||
struct mca_btl_base_descriptor_t* des);
|
||||
|
||||
static int mca_btl_self_get (mca_btl_base_module_t *btl, struct mca_btl_base_endpoint_t *endpoint, void *local_address,
|
||||
uint64_t remote_address, mca_btl_base_registration_handle_t *local_handle,
|
||||
mca_btl_base_registration_handle_t *remote_handle, size_t size, int flags,
|
||||
int order, mca_btl_base_rdma_completion_fn_t cbfunc, void *cbcontext, void *cbdata);
|
||||
static int mca_btl_self_get (struct mca_btl_base_module_t* btl,
|
||||
struct mca_btl_base_endpoint_t* endpoint,
|
||||
struct mca_btl_base_descriptor_t* des);
|
||||
|
||||
mca_btl_base_module_t mca_btl_self = {
|
||||
.btl_component = &mca_btl_self_component.super,
|
||||
@ -56,6 +54,7 @@ mca_btl_base_module_t mca_btl_self = {
|
||||
.btl_alloc = mca_btl_self_alloc,
|
||||
.btl_free = mca_btl_self_free,
|
||||
.btl_prepare_src = mca_btl_self_prepare_src,
|
||||
.btl_prepare_dst = mca_btl_self_prepare_dst,
|
||||
.btl_send = mca_btl_self_send,
|
||||
.btl_put = mca_btl_self_put,
|
||||
.btl_get = mca_btl_self_get,
|
||||
@ -136,8 +135,8 @@ mca_btl_base_descriptor_t* mca_btl_self_alloc(
|
||||
|
||||
frag->segment.seg_len = size;
|
||||
frag->base.des_flags = flags;
|
||||
frag->base.des_segments = &(frag->segment);
|
||||
frag->base.des_segment_count = 1;
|
||||
frag->base.des_local = &(frag->segment);
|
||||
frag->base.des_local_count = 1;
|
||||
return (mca_btl_base_descriptor_t*)frag;
|
||||
}
|
||||
|
||||
@ -152,8 +151,10 @@ int mca_btl_self_free( struct mca_btl_base_module_t* btl,
|
||||
{
|
||||
mca_btl_self_frag_t* frag = (mca_btl_self_frag_t*)des;
|
||||
|
||||
frag->base.des_segments = NULL;
|
||||
frag->base.des_segment_count = 0;
|
||||
frag->base.des_local = NULL;
|
||||
frag->base.des_local_count = 0;
|
||||
frag->base.des_remote = NULL;
|
||||
frag->base.des_remote_count = 0;
|
||||
|
||||
if(frag->size == mca_btl_self.btl_eager_limit) {
|
||||
MCA_BTL_SELF_FRAG_RETURN_EAGER(frag);
|
||||
@ -174,6 +175,7 @@ int mca_btl_self_free( struct mca_btl_base_module_t* btl,
|
||||
struct mca_btl_base_descriptor_t*
|
||||
mca_btl_self_prepare_src( struct mca_btl_base_module_t* btl,
|
||||
struct mca_btl_base_endpoint_t* endpoint,
|
||||
mca_mpool_base_registration_t* registration,
|
||||
struct opal_convertor_t* convertor,
|
||||
uint8_t order,
|
||||
size_t reserve,
|
||||
@ -229,11 +231,44 @@ mca_btl_self_prepare_src( struct mca_btl_base_module_t* btl,
|
||||
*size = max_data;
|
||||
}
|
||||
frag->base.des_flags = flags;
|
||||
frag->base.des_segments = &frag->segment;
|
||||
frag->base.des_segment_count = 1;
|
||||
frag->base.des_local = &frag->segment;
|
||||
frag->base.des_local_count = 1;
|
||||
|
||||
return &frag->base;
|
||||
}
|
||||
|
||||
/**
|
||||
* Prepare data for receive.
|
||||
*/
|
||||
struct mca_btl_base_descriptor_t*
|
||||
mca_btl_self_prepare_dst( struct mca_btl_base_module_t* btl,
|
||||
struct mca_btl_base_endpoint_t* endpoint,
|
||||
mca_mpool_base_registration_t* registration,
|
||||
struct opal_convertor_t* convertor,
|
||||
uint8_t order,
|
||||
size_t reserve,
|
||||
size_t* size,
|
||||
uint32_t flags )
|
||||
{
|
||||
mca_btl_self_frag_t* frag;
|
||||
size_t max_data = *size;
|
||||
void *ptr;
|
||||
|
||||
MCA_BTL_SELF_FRAG_ALLOC_RDMA(frag);
|
||||
if(OPAL_UNLIKELY(NULL == frag)) {
|
||||
return NULL;
|
||||
}
|
||||
|
||||
/* setup descriptor to point directly to user buffer */
|
||||
opal_convertor_get_current_pointer( convertor, &ptr );
|
||||
frag->segment.seg_addr.lval = (uint64_t)(uintptr_t) ptr;
|
||||
|
||||
frag->segment.seg_len = reserve + max_data;
|
||||
frag->base.des_local = &frag->segment;
|
||||
frag->base.des_local_count = 1;
|
||||
frag->base.des_flags = flags;
|
||||
return &frag->base;
|
||||
}
|
||||
|
||||
/**
|
||||
* Initiate a send to the peer.
|
||||
@ -250,6 +285,12 @@ int mca_btl_self_send( struct mca_btl_base_module_t* btl,
|
||||
mca_btl_active_message_callback_t* reg;
|
||||
int btl_ownership = (des->des_flags & MCA_BTL_DES_FLAGS_BTL_OWNERSHIP);
|
||||
|
||||
/**
|
||||
* We have to set the dst before the call to the function and reset them
|
||||
* after.
|
||||
*/
|
||||
des->des_remote = des->des_local;
|
||||
des->des_remote_count = des->des_local_count;
|
||||
/* upcall */
|
||||
reg = mca_btl_base_active_message_trigger + tag;
|
||||
reg->cbfunc( btl, tag, des, reg->cbdata );
|
||||
@ -264,29 +305,100 @@ int mca_btl_self_send( struct mca_btl_base_module_t* btl,
|
||||
return 1;
|
||||
}
|
||||
|
||||
/**
|
||||
* Initiate a put to the peer.
|
||||
*
|
||||
* @param btl (IN) BTL module
|
||||
* @param peer (IN) BTL peer addressing
|
||||
*/
|
||||
|
||||
static int mca_btl_self_put (mca_btl_base_module_t *btl, struct mca_btl_base_endpoint_t *endpoint, void *local_address,
|
||||
uint64_t remote_address, mca_btl_base_registration_handle_t *local_handle,
|
||||
mca_btl_base_registration_handle_t *remote_handle, size_t size, int flags,
|
||||
int order, mca_btl_base_rdma_completion_fn_t cbfunc, void *cbcontext, void *cbdata)
|
||||
static int mca_btl_self_rdma( struct mca_btl_base_module_t* btl,
|
||||
struct mca_btl_base_endpoint_t* endpoint,
|
||||
struct mca_btl_base_descriptor_t* des,
|
||||
mca_btl_base_segment_t* src, size_t src_cnt,
|
||||
mca_btl_base_segment_t* dst, size_t dst_cnt)
|
||||
{
|
||||
memcpy ((void *)(intptr_t) remote_address, local_address, size);
|
||||
unsigned char* src_addr = (unsigned char *)(uintptr_t) src->seg_addr.lval;
|
||||
size_t src_len = src->seg_len;
|
||||
unsigned char* dst_addr = (unsigned char *)(uintptr_t) dst->seg_addr.lval;
|
||||
size_t dst_len = dst->seg_len;
|
||||
int btl_ownership = (des->des_flags & MCA_BTL_DES_FLAGS_BTL_OWNERSHIP);
|
||||
|
||||
cbfunc (btl, endpoint, local_address, NULL, cbcontext, cbdata, OPAL_SUCCESS);
|
||||
while(src_len && dst_len) {
|
||||
|
||||
if(src_len == dst_len) {
|
||||
memcpy(dst_addr, src_addr, src_len);
|
||||
|
||||
/* advance src */
|
||||
if(--src_cnt != 0) {
|
||||
src++;
|
||||
src_addr = (unsigned char*)src->seg_addr.pval;
|
||||
src_len = src->seg_len;
|
||||
} else {
|
||||
src_len = 0;
|
||||
}
|
||||
|
||||
/* advance dst */
|
||||
if(--dst_cnt != 0) {
|
||||
dst++;
|
||||
dst_addr = (unsigned char*)dst->seg_addr.pval;
|
||||
dst_len = dst->seg_len;
|
||||
} else {
|
||||
dst_len = 0;
|
||||
}
|
||||
|
||||
} else {
|
||||
size_t bytes = src_len < dst_len ? src_len : dst_len;
|
||||
memcpy(dst_addr, src_addr, bytes);
|
||||
|
||||
/* advance src */
|
||||
src_len -= bytes;
|
||||
if(src_len == 0) {
|
||||
if(--src_cnt != 0) {
|
||||
src++;
|
||||
src_addr = (unsigned char*)src->seg_addr.pval;
|
||||
src_len = src->seg_len;
|
||||
}
|
||||
} else {
|
||||
src_addr += bytes;
|
||||
}
|
||||
|
||||
/* advance dst */
|
||||
dst_len -= bytes;
|
||||
if(dst_len == 0) {
|
||||
if(--dst_cnt != 0) {
|
||||
dst++;
|
||||
dst_addr = (unsigned char*)src->seg_addr.pval;
|
||||
dst_len = src->seg_len;
|
||||
}
|
||||
} else {
|
||||
dst_addr += bytes;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/* rdma completion */
|
||||
des->des_cbfunc( btl, endpoint, des, OPAL_SUCCESS );
|
||||
if( btl_ownership ) {
|
||||
mca_btl_self_free( btl, des );
|
||||
}
|
||||
return OPAL_SUCCESS;
|
||||
}
|
||||
|
||||
static int mca_btl_self_get (mca_btl_base_module_t *btl, struct mca_btl_base_endpoint_t *endpoint, void *local_address,
|
||||
uint64_t remote_address, mca_btl_base_registration_handle_t *local_handle,
|
||||
mca_btl_base_registration_handle_t *remote_handle, size_t size, int flags,
|
||||
int order, mca_btl_base_rdma_completion_fn_t cbfunc, void *cbcontext, void *cbdata)
|
||||
static int mca_btl_self_put (struct mca_btl_base_module_t* btl,
|
||||
struct mca_btl_base_endpoint_t* endpoint,
|
||||
struct mca_btl_base_descriptor_t* des)
|
||||
{
|
||||
memcpy (local_address, (void *)(intptr_t) remote_address, size);
|
||||
return mca_btl_self_rdma (btl, endpoint, des, des->des_local, des->des_local_count,
|
||||
des->des_remote, des->des_remote_count);
|
||||
}
|
||||
|
||||
cbfunc (btl, endpoint, local_address, NULL, cbcontext, cbdata, OPAL_SUCCESS);
|
||||
|
||||
return OPAL_SUCCESS;
|
||||
static int mca_btl_self_get (struct mca_btl_base_module_t *btl,
|
||||
struct mca_btl_base_endpoint_t *endpoint,
|
||||
struct mca_btl_base_descriptor_t *des)
|
||||
{
|
||||
return mca_btl_self_rdma (btl, endpoint, des, des->des_remote, des->des_remote_count,
|
||||
des->des_local, des->des_local_count);
|
||||
}
|
||||
|
||||
int mca_btl_self_ft_event(int state) {
|
||||
|
@ -165,6 +165,24 @@ int mca_btl_self_free(
|
||||
struct mca_btl_base_descriptor_t* mca_btl_self_prepare_src(
|
||||
struct mca_btl_base_module_t* btl,
|
||||
struct mca_btl_base_endpoint_t* endpoint,
|
||||
struct mca_mpool_base_registration_t* registration,
|
||||
struct opal_convertor_t* convertor,
|
||||
uint8_t order,
|
||||
size_t reserve,
|
||||
size_t* size,
|
||||
uint32_t flags
|
||||
);
|
||||
|
||||
/**
|
||||
* Prepare data for RDMA
|
||||
*
|
||||
* @param btl (IN) BTL module
|
||||
* @param peer (IN) BTL peer addressing
|
||||
*/
|
||||
struct mca_btl_base_descriptor_t* mca_btl_self_prepare_dst(
|
||||
struct mca_btl_base_module_t* btl,
|
||||
struct mca_btl_base_endpoint_t* endpoint,
|
||||
struct mca_mpool_base_registration_t* registration,
|
||||
struct opal_convertor_t* convertor,
|
||||
uint8_t order,
|
||||
size_t reserve,
|
||||
|
@ -99,6 +99,7 @@ static int mca_btl_self_component_register(void)
|
||||
mca_btl_self.btl_rdma_pipeline_frag_size = INT_MAX;
|
||||
mca_btl_self.btl_min_rdma_pipeline_size = 0;
|
||||
mca_btl_self.btl_flags = MCA_BTL_FLAGS_PUT | MCA_BTL_FLAGS_SEND_INPLACE;
|
||||
mca_btl_self.btl_seg_size = sizeof (mca_btl_base_segment_t);
|
||||
mca_btl_self.btl_bandwidth = 100;
|
||||
mca_btl_self.btl_latency = 0;
|
||||
mca_btl_base_param_register(&mca_btl_self_component.super.btl_version,
|
||||
|
@ -23,8 +23,8 @@ static inline void mca_btl_self_frag_constructor(mca_btl_self_frag_t* frag)
|
||||
{
|
||||
frag->segment.seg_addr.pval = frag+1;
|
||||
frag->segment.seg_len = (uint32_t)frag->size;
|
||||
frag->base.des_segments = &frag->segment;
|
||||
frag->base.des_segment_count = 1;
|
||||
frag->base.des_local = &frag->segment;
|
||||
frag->base.des_local_count = 1;
|
||||
frag->base.des_flags = 0;
|
||||
}
|
||||
|
||||
|
@ -57,9 +57,6 @@
|
||||
#include "opal/mca/mpool/base/base.h"
|
||||
#include "opal/mca/mpool/sm/mpool_sm.h"
|
||||
|
||||
#include "opal/align.h"
|
||||
#include "opal/util/sys_limits.h"
|
||||
|
||||
#if OPAL_ENABLE_FT_CR == 1
|
||||
#include "opal/util/basename.h"
|
||||
#include "opal/mca/crs/base/base.h"
|
||||
@ -84,6 +81,9 @@ mca_btl_sm_t mca_btl_sm = {
|
||||
.btl_alloc = mca_btl_sm_alloc,
|
||||
.btl_free = mca_btl_sm_free,
|
||||
.btl_prepare_src = mca_btl_sm_prepare_src,
|
||||
#if OPAL_BTL_SM_HAVE_KNEM || OPAL_BTL_SM_HAVE_CMA
|
||||
.btl_prepare_dst = mca_btl_sm_prepare_dst,
|
||||
#endif /* OPAL_BTL_SM_HAVE_KNEM || OPAL_BTL_SM_HAVE_CMA */
|
||||
.btl_send = mca_btl_sm_send,
|
||||
.btl_sendi = mca_btl_sm_sendi,
|
||||
.btl_dump = mca_btl_sm_dump,
|
||||
@ -743,6 +743,7 @@ extern int mca_btl_sm_free(
|
||||
struct mca_btl_base_descriptor_t* mca_btl_sm_prepare_src(
|
||||
struct mca_btl_base_module_t* btl,
|
||||
struct mca_btl_base_endpoint_t* endpoint,
|
||||
mca_mpool_base_registration_t* registration,
|
||||
struct opal_convertor_t* convertor,
|
||||
uint8_t order,
|
||||
size_t reserve,
|
||||
@ -827,9 +828,11 @@ struct mca_btl_base_descriptor_t* mca_btl_sm_prepare_src(
|
||||
}
|
||||
#endif /* OPAL_BTL_SM_HAVE_KNEM || OPAL_BTL_SM_HAVE_CMA */
|
||||
|
||||
frag->base.des_segments = &(frag->segment.base);
|
||||
frag->base.des_segment_count = 1;
|
||||
frag->base.des_local = &(frag->segment.base);
|
||||
frag->base.des_local_count = 1;
|
||||
frag->base.order = MCA_BTL_NO_ORDER;
|
||||
frag->base.des_remote = NULL;
|
||||
frag->base.des_remote_count = 0;
|
||||
frag->base.des_flags = flags;
|
||||
*size = max_data;
|
||||
return &frag->base;
|
||||
@ -947,12 +950,9 @@ int mca_btl_sm_sendi( struct mca_btl_base_module_t* btl,
|
||||
return OPAL_SUCCESS;
|
||||
}
|
||||
|
||||
if (NULL != descriptor) {
|
||||
/* presumably, this code path will never get executed */
|
||||
*descriptor = mca_btl_sm_alloc( btl, endpoint, order,
|
||||
payload_size + header_size, flags);
|
||||
}
|
||||
|
||||
/* presumably, this code path will never get executed */
|
||||
*descriptor = mca_btl_sm_alloc( btl, endpoint, order,
|
||||
payload_size + header_size, flags);
|
||||
return OPAL_ERR_RESOURCE_BUSY;
|
||||
}
|
||||
|
||||
@ -1001,87 +1001,51 @@ int mca_btl_sm_send( struct mca_btl_base_module_t* btl,
|
||||
}
|
||||
|
||||
#if OPAL_BTL_SM_HAVE_KNEM || OPAL_BTL_SM_HAVE_CMA
|
||||
mca_btl_base_registration_handle_t *mca_btl_sm_register_mem (struct mca_btl_base_module_t* btl,
|
||||
struct mca_btl_base_endpoint_t* endpoint,
|
||||
void *base, size_t size, uint32_t flags)
|
||||
struct mca_btl_base_descriptor_t* mca_btl_sm_prepare_dst(
|
||||
struct mca_btl_base_module_t* btl,
|
||||
struct mca_btl_base_endpoint_t* endpoint,
|
||||
struct mca_mpool_base_registration_t* registration,
|
||||
struct opal_convertor_t* convertor,
|
||||
uint8_t order,
|
||||
size_t reserve,
|
||||
size_t* size,
|
||||
uint32_t flags)
|
||||
{
|
||||
mca_btl_sm_registration_handle_t *handle;
|
||||
mca_btl_sm_t *sm_btl = (mca_btl_sm_t *) btl;
|
||||
ompi_free_list_item_t *item = NULL;
|
||||
void *ptr;
|
||||
mca_btl_sm_frag_t* frag;
|
||||
|
||||
OMPI_FREE_LIST_GET_MT(&mca_btl_sm_component.registration_handles, item);
|
||||
if (OPAL_UNLIKELY(NULL == item)) {
|
||||
MCA_BTL_SM_FRAG_ALLOC_USER(frag);
|
||||
if(OPAL_UNLIKELY(NULL == frag)) {
|
||||
return NULL;
|
||||
}
|
||||
|
||||
handle = (mca_btl_sm_registration_handle_t *) item;
|
||||
|
||||
#if OPAL_BTL_SM_HAVE_KNEM
|
||||
if (OPAL_LIKELY(mca_btl_sm_component.use_knem)) {
|
||||
struct knem_cmd_create_region knem_cr;
|
||||
struct knem_cmd_param_iovec knem_iov;
|
||||
|
||||
knem_iov.base = (uintptr_t)base & ~(opal_getpagesize() - 1);
|
||||
knem_iov.len = OPAL_ALIGN(size + ((intptr_t) base - knem_iov.base), opal_getpagesize(), intptr_t);
|
||||
knem_cr.iovec_array = (uintptr_t)&knem_iov;
|
||||
knem_cr.iovec_nr = 1;
|
||||
knem_cr.flags = 0;
|
||||
knem_cr.protection = 0;
|
||||
|
||||
if (flags & MCA_BTL_REG_FLAG_REMOTE_READ) {
|
||||
knem_cr.protection |= PROT_READ;
|
||||
}
|
||||
if (flags & MCA_BTL_REG_FLAG_REMOTE_WRITE) {
|
||||
knem_cr.protection |= PROT_WRITE;
|
||||
}
|
||||
|
||||
if (OPAL_UNLIKELY(ioctl(sm_btl->knem_fd, KNEM_CMD_CREATE_REGION, &knem_cr) < 0)) {
|
||||
OMPI_FREE_LIST_RETURN_MT(&mca_btl_sm_component.registration_handles, item);
|
||||
return NULL;
|
||||
}
|
||||
|
||||
handle->btl_handle.data.knem.cookie = knem_cr.cookie;
|
||||
handle->btl_handle.data.knem.base_addr = knem_iov.base;
|
||||
} else
|
||||
#endif
|
||||
{
|
||||
/* the pid could be included in a modex but this will work until btl/sm is
|
||||
* deleted */
|
||||
handle->btl_handle.data.pid = getpid ();
|
||||
}
|
||||
|
||||
/* return the public part of the handle */
|
||||
return &handle->btl_handle;
|
||||
frag->segment.base.seg_len = *size;
|
||||
opal_convertor_get_current_pointer( convertor, &ptr );
|
||||
frag->segment.base.seg_addr.lval = (uint64_t)(uintptr_t) ptr;
|
||||
|
||||
frag->base.des_remote = NULL;
|
||||
frag->base.des_remote_count = 0;
|
||||
frag->base.des_local = (mca_btl_base_segment_t*)&frag->segment;
|
||||
frag->base.des_local_count = 1;
|
||||
frag->base.des_flags = flags;
|
||||
return &frag->base;
|
||||
}
|
||||
|
||||
int mca_btl_sm_deregister_mem (struct mca_btl_base_module_t* btl, mca_btl_base_registration_handle_t *handle)
|
||||
{
|
||||
mca_btl_sm_registration_handle_t *sm_handle =
|
||||
(mca_btl_sm_registration_handle_t *)((intptr_t) handle - offsetof (mca_btl_sm_registration_handle_t, btl_handle));
|
||||
mca_btl_sm_t* sm_btl = (mca_btl_sm_t*) btl;
|
||||
|
||||
#if OPAL_BTL_SM_HAVE_KNEM
|
||||
if (OPAL_LIKELY(mca_btl_sm_component.use_knem)) {
|
||||
(void) ioctl(sm_btl->knem_fd, KNEM_CMD_DESTROY_REGION, &handle->data.knem.cookie);
|
||||
}
|
||||
#endif
|
||||
|
||||
OMPI_FREE_LIST_RETURN_MT(&mca_btl_sm_component.registration_handles, &sm_handle->super);
|
||||
|
||||
return OPAL_SUCCESS;
|
||||
}
|
||||
#endif /* OPAL_BTL_SM_HAVE_KNEM */
|
||||
|
||||
#if OPAL_BTL_SM_HAVE_KNEM || OPAL_BTL_SM_HAVE_CMA
|
||||
|
||||
/**
|
||||
* Initiate an synchronous get.
|
||||
*
|
||||
* @param btl (IN) BTL module
|
||||
* @param endpoint (IN) BTL addressing information
|
||||
* @param descriptor (IN) Description of the data to be transferred
|
||||
*/
|
||||
int mca_btl_sm_get_sync (mca_btl_base_module_t *btl, struct mca_btl_base_endpoint_t *endpoint, void *local_address,
|
||||
uint64_t remote_address, mca_btl_base_registration_handle_t *local_handle,
|
||||
mca_btl_base_registration_handle_t *remote_handle, size_t size, int flags,
|
||||
int order, mca_btl_base_rdma_completion_fn_t cbfunc, void *cbcontext, void *cbdata)
|
||||
int mca_btl_sm_get_sync(struct mca_btl_base_module_t* btl,
|
||||
struct mca_btl_base_endpoint_t* endpoint,
|
||||
struct mca_btl_base_descriptor_t* des)
|
||||
{
|
||||
int btl_ownership;
|
||||
mca_btl_sm_frag_t* frag = (mca_btl_sm_frag_t*)des;
|
||||
mca_btl_sm_segment_t *src = (mca_btl_sm_segment_t*)des->des_remote;
|
||||
mca_btl_sm_segment_t *dst = (mca_btl_sm_segment_t*)des->des_local;
|
||||
#if OPAL_BTL_SM_HAVE_KNEM
|
||||
mca_btl_sm_t* sm_btl = (mca_btl_sm_t*) btl;
|
||||
if (OPAL_LIKELY(mca_btl_sm_component.use_knem)) {
|
||||
@ -1090,12 +1054,12 @@ int mca_btl_sm_get_sync (mca_btl_base_module_t *btl, struct mca_btl_base_endpoin
|
||||
|
||||
/* Fill in the ioctl data fields. There's no async completion, so
|
||||
we don't need to worry about getting a slot, etc. */
|
||||
recv_iovec.base = (uintptr_t) local_address;
|
||||
recv_iovec.len = size;
|
||||
recv_iovec.base = (uintptr_t) dst->base.seg_addr.lval;
|
||||
recv_iovec.len = dst->base.seg_len;
|
||||
icopy.local_iovec_array = (uintptr_t)&recv_iovec;
|
||||
icopy.local_iovec_nr = 1;
|
||||
icopy.remote_cookie = remote_handle->data.knem.cookie;
|
||||
icopy.remote_offset = remote_address - remote_handle->data.knem.base_addr;
|
||||
icopy.remote_cookie = src->key;
|
||||
icopy.remote_offset = 0;
|
||||
icopy.write = 0;
|
||||
|
||||
/* Use the DMA flag if knem supports it *and* the segment length
|
||||
@ -1103,7 +1067,7 @@ int mca_btl_sm_get_sync (mca_btl_base_module_t *btl, struct mca_btl_base_endpoin
|
||||
value is 0 (i.e., the MCA param was set to 0), the segment size
|
||||
will never be larger than it, so DMA will never be used. */
|
||||
icopy.flags = 0;
|
||||
if (mca_btl_sm_component.knem_dma_min <= size) {
|
||||
if (mca_btl_sm_component.knem_dma_min <= dst->base.seg_len) {
|
||||
icopy.flags = mca_btl_sm_component.knem_dma_flag;
|
||||
}
|
||||
/* synchronous flags only, no need to specify icopy.async_status_index */
|
||||
@ -1121,19 +1085,27 @@ int mca_btl_sm_get_sync (mca_btl_base_module_t *btl, struct mca_btl_base_endpoin
|
||||
|
||||
#if OPAL_BTL_SM_HAVE_CMA
|
||||
if (OPAL_LIKELY(mca_btl_sm_component.use_cma)) {
|
||||
char *remote_address, *local_address;
|
||||
int remote_length, local_length;
|
||||
struct iovec local, remote;
|
||||
pid_t remote_pid;
|
||||
int val;
|
||||
|
||||
remote_pid = remote_handle->data.pid;
|
||||
remote.iov_base = (void *) (intptr_t) remote_address;
|
||||
remote.iov_len = size;
|
||||
remote_address = (char *)(uintptr_t) src->base.seg_addr.lval;
|
||||
remote_length = src->base.seg_len;
|
||||
|
||||
local_address = (char *)(uintptr_t) dst->base.seg_addr.lval;
|
||||
local_length = dst->base.seg_len;
|
||||
|
||||
remote_pid = src->key;
|
||||
remote.iov_base = remote_address;
|
||||
remote.iov_len = remote_length;
|
||||
local.iov_base = local_address;
|
||||
local.iov_len = size;
|
||||
local.iov_len = local_length;
|
||||
|
||||
val = process_vm_readv(remote_pid, &local, 1, &remote, 1, 0);
|
||||
|
||||
if (val != size) {
|
||||
if (val != local_length) {
|
||||
if (val<0) {
|
||||
opal_output(0, "mca_btl_sm_get_sync: process_vm_readv failed: %i",
|
||||
errno);
|
||||
@ -1147,7 +1119,15 @@ int mca_btl_sm_get_sync (mca_btl_base_module_t *btl, struct mca_btl_base_endpoin
|
||||
}
|
||||
#endif /* OPAL_BTL_SM_HAVE_CMA */
|
||||
|
||||
cbfunc (btl, endpoint, local_address, local_handle, cbcontext, cbdata, OPAL_SUCCESS);
|
||||
btl_ownership = (frag->base.des_flags & MCA_BTL_DES_FLAGS_BTL_OWNERSHIP);
|
||||
if (0 != (MCA_BTL_DES_SEND_ALWAYS_CALLBACK & frag->base.des_flags)) {
|
||||
frag->base.des_cbfunc(&mca_btl_sm.super,
|
||||
frag->endpoint, &frag->base,
|
||||
OPAL_SUCCESS);
|
||||
}
|
||||
if (btl_ownership) {
|
||||
MCA_BTL_SM_FRAG_RETURN(frag);
|
||||
}
|
||||
|
||||
return OPAL_SUCCESS;
|
||||
}
|
||||
@ -1159,42 +1139,34 @@ int mca_btl_sm_get_sync (mca_btl_base_module_t *btl, struct mca_btl_base_endpoin
|
||||
|
||||
/**
|
||||
* Initiate an asynchronous get.
|
||||
*
|
||||
* @param btl (IN) BTL module
|
||||
* @param endpoint (IN) BTL addressing information
|
||||
* @param descriptor (IN) Description of the data to be transferred
|
||||
*/
|
||||
int mca_btl_sm_get_async (mca_btl_base_module_t *btl, struct mca_btl_base_endpoint_t *endpoint, void *local_address,
|
||||
uint64_t remote_address, mca_btl_base_registration_handle_t *local_handle,
|
||||
mca_btl_base_registration_handle_t *remote_handle, size_t size, int flags,
|
||||
int order, mca_btl_base_rdma_completion_fn_t cbfunc, void *cbcontext, void *cbdata)
|
||||
int mca_btl_sm_get_async(struct mca_btl_base_module_t* btl,
|
||||
struct mca_btl_base_endpoint_t* endpoint,
|
||||
struct mca_btl_base_descriptor_t* des)
|
||||
{
|
||||
int btl_ownership;
|
||||
mca_btl_sm_t* sm_btl = (mca_btl_sm_t*) btl;
|
||||
mca_btl_sm_frag_t* frag;
|
||||
mca_btl_sm_frag_t* frag = (mca_btl_sm_frag_t*)des;
|
||||
mca_btl_sm_segment_t *src = (mca_btl_sm_segment_t*)des->des_remote;
|
||||
mca_btl_sm_segment_t *dst = (mca_btl_sm_segment_t*)des->des_local;
|
||||
struct knem_cmd_inline_copy icopy;
|
||||
struct knem_cmd_param_iovec recv_iovec;
|
||||
|
||||
/* If we have no knem slots available, fall back to synchronous */
|
||||
/* If we have no knem slots available, return
|
||||
TEMP_OUT_OF_RESOURCE */
|
||||
if (sm_btl->knem_status_num_used >=
|
||||
mca_btl_sm_component.knem_max_simultaneous) {
|
||||
return mca_btl_sm_get_sync (btl, endpoint, local_address, remote_address, local_handle,
|
||||
remote_handle, size, flags, order, cbfunc, cbcontext, cbdata);
|
||||
return OPAL_ERR_TEMP_OUT_OF_RESOURCE;
|
||||
}
|
||||
|
||||
/* allocate a fragment to keep track of this transaction */
|
||||
MCA_BTL_SM_FRAG_ALLOC_USER(frag);
|
||||
if (OPAL_UNLIKELY(NULL == frag)) {
|
||||
return mca_btl_sm_get_sync (btl, endpoint, local_address, remote_address, local_handle,
|
||||
remote_handle, size, flags, order, cbfunc, cbcontext, cbdata);
|
||||
}
|
||||
|
||||
/* fill in callback data */
|
||||
frag->cb.func = cbfunc;
|
||||
frag->cb.context = cbcontext;
|
||||
frag->cb.data = cbdata;
|
||||
frag->cb.local_address = local_address;
|
||||
frag->cb.local_handle = local_handle;
|
||||
|
||||
/* We have a slot, so fill in the data fields. Bump the
|
||||
first_avail and num_used counters. */
|
||||
recv_iovec.base = (uintptr_t) local_address;
|
||||
recv_iovec.len = size;
|
||||
recv_iovec.base = (uintptr_t) dst->base.seg_addr.lval;
|
||||
recv_iovec.len = dst->base.seg_len;
|
||||
icopy.local_iovec_array = (uintptr_t)&recv_iovec;
|
||||
icopy.local_iovec_nr = 1;
|
||||
icopy.write = 0;
|
||||
@ -1204,13 +1176,13 @@ int mca_btl_sm_get_async (mca_btl_base_module_t *btl, struct mca_btl_base_endpoi
|
||||
sm_btl->knem_status_first_avail = 0;
|
||||
}
|
||||
++sm_btl->knem_status_num_used;
|
||||
icopy.remote_cookie = remote_handle->data.knem.cookie;
|
||||
icopy.remote_offset = remote_address - remote_handle->data.knem.base_addr;
|
||||
icopy.remote_cookie = src->key;
|
||||
icopy.remote_offset = 0;
|
||||
|
||||
/* Use the DMA flag if knem supports it *and* the segment length
|
||||
is greater than the cutoff */
|
||||
icopy.flags = KNEM_FLAG_ASYNCDMACOMPLETE;
|
||||
if (mca_btl_sm_component.knem_dma_min <= size) {
|
||||
if (mca_btl_sm_component.knem_dma_min <= dst->base.seg_len) {
|
||||
icopy.flags = mca_btl_sm_component.knem_dma_flag;
|
||||
}
|
||||
|
||||
@ -1218,11 +1190,19 @@ int mca_btl_sm_get_async (mca_btl_base_module_t *btl, struct mca_btl_base_endpoi
|
||||
if (OPAL_LIKELY(0 == ioctl(sm_btl->knem_fd,
|
||||
KNEM_CMD_INLINE_COPY, &icopy))) {
|
||||
if (icopy.current_status != KNEM_STATUS_PENDING) {
|
||||
MCA_BTL_SM_FRAG_RETURN(frag);
|
||||
/* request completed synchronously */
|
||||
|
||||
/* FIXME: what if icopy.current_status == KNEM_STATUS_FAILED? */
|
||||
cbfunc (btl, endpoint, local_address, local_handle, cbcontext, cbdata, OPAL_SUCCESS);
|
||||
|
||||
btl_ownership = (frag->base.des_flags & MCA_BTL_DES_FLAGS_BTL_OWNERSHIP);
|
||||
if (0 != (MCA_BTL_DES_SEND_ALWAYS_CALLBACK & frag->base.des_flags)) {
|
||||
frag->base.des_cbfunc(&mca_btl_sm.super,
|
||||
frag->endpoint, &frag->base,
|
||||
OPAL_SUCCESS);
|
||||
}
|
||||
if (btl_ownership) {
|
||||
MCA_BTL_SM_FRAG_RETURN(frag);
|
||||
}
|
||||
|
||||
--sm_btl->knem_status_num_used;
|
||||
++sm_btl->knem_status_first_used;
|
||||
|
@ -1,4 +1,3 @@
|
||||
/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */
|
||||
/*
|
||||
* Copyright (c) 2004-2007 The Trustees of Indiana University and Indiana
|
||||
* University Research and Technology
|
||||
@ -12,7 +11,7 @@
|
||||
* All rights reserved.
|
||||
* Copyright (c) 2006-2007 Voltaire. All rights reserved.
|
||||
* Copyright (c) 2009-2010 Cisco Systems, Inc. All rights reserved.
|
||||
* Copyright (c) 2010-2014 Los Alamos National Security, LLC.
|
||||
* Copyright (c) 2010-2013 Los Alamos National Security, LLC.
|
||||
* All rights reserved.
|
||||
* Copyright (c) 2010-2012 IBM Corporation. All rights reserved.
|
||||
* $COPYRIGHT$
|
||||
@ -183,10 +182,6 @@ struct mca_btl_sm_component_t {
|
||||
#if OPAL_BTL_SM_HAVE_KNEM
|
||||
/* Knem capabilities info */
|
||||
struct knem_cmd_info knem_info;
|
||||
#endif
|
||||
#if OPAL_BTL_SM_HAVE_KNEM || OPAL_BTL_SM_HAVE_CMA
|
||||
/** registration handles to hold knem cookies */
|
||||
ompi_free_list_t registration_handles;
|
||||
#endif /* OPAL_BTL_SM_HAVE_KNEM */
|
||||
|
||||
/** MCA: should we be using knem or not? neg=try but continue if
|
||||
@ -466,6 +461,7 @@ extern int mca_btl_sm_free(
|
||||
struct mca_btl_base_descriptor_t* mca_btl_sm_prepare_src(
|
||||
struct mca_btl_base_module_t* btl,
|
||||
struct mca_btl_base_endpoint_t* endpoint,
|
||||
mca_mpool_base_registration_t* registration,
|
||||
struct opal_convertor_t* convertor,
|
||||
uint8_t order,
|
||||
size_t reserve,
|
||||
@ -508,20 +504,30 @@ extern int mca_btl_sm_send(
|
||||
/*
|
||||
* Synchronous knem/cma get
|
||||
*/
|
||||
int mca_btl_sm_get_sync (mca_btl_base_module_t *btl, struct mca_btl_base_endpoint_t *endpoint, void *local_address,
|
||||
uint64_t remote_address, mca_btl_base_registration_handle_t *local_handle,
|
||||
mca_btl_base_registration_handle_t *remote_handle, size_t size, int flags,
|
||||
int order, mca_btl_base_rdma_completion_fn_t cbfunc, void *cbcontext, void *cbdata);
|
||||
extern int mca_btl_sm_get_sync(
|
||||
struct mca_btl_base_module_t* btl,
|
||||
struct mca_btl_base_endpoint_t* endpoint,
|
||||
struct mca_btl_base_descriptor_t* des );
|
||||
|
||||
extern struct mca_btl_base_descriptor_t* mca_btl_sm_prepare_dst(
|
||||
struct mca_btl_base_module_t* btl,
|
||||
struct mca_btl_base_endpoint_t* endpoint,
|
||||
struct mca_mpool_base_registration_t* registration,
|
||||
struct opal_convertor_t* convertor,
|
||||
uint8_t order,
|
||||
size_t reserve,
|
||||
size_t* size,
|
||||
uint32_t flags);
|
||||
#endif /* OPAL_BTL_SM_HAVE_KNEM || OPAL_BTL_SM_HAVE_CMA */
|
||||
|
||||
#if OPAL_BTL_SM_HAVE_KNEM
|
||||
/*
|
||||
* Asynchronous knem get
|
||||
*/
|
||||
int mca_btl_sm_get_async (mca_btl_base_module_t *btl, struct mca_btl_base_endpoint_t *endpoint, void *local_address,
|
||||
uint64_t remote_address, mca_btl_base_registration_handle_t *local_handle,
|
||||
mca_btl_base_registration_handle_t *remote_handle, size_t size, int flags,
|
||||
int order, mca_btl_base_rdma_completion_fn_t cbfunc, void *cbcontext, void *cbdata);
|
||||
extern int mca_btl_sm_get_async(
|
||||
struct mca_btl_base_module_t* btl,
|
||||
struct mca_btl_base_endpoint_t* endpoint,
|
||||
struct mca_btl_base_descriptor_t* des );
|
||||
|
||||
#endif /* OPAL_BTL_SM_HAVE_KNEM */
|
||||
|
||||
@ -552,31 +558,6 @@ void mca_btl_sm_component_event_thread(opal_object_t*);
|
||||
#define MCA_BTL_SM_SIGNAL_PEER(peer)
|
||||
#endif
|
||||
|
||||
#if OPAL_BTL_SM_HAVE_KNEM | OPAL_BTL_SM_HAVE_CMA
|
||||
struct mca_btl_base_registration_handle_t {
|
||||
union {
|
||||
struct {
|
||||
uint64_t cookie;
|
||||
intptr_t base_addr;
|
||||
} knem;
|
||||
pid_t pid;
|
||||
} data;
|
||||
};
|
||||
|
||||
struct mca_btl_sm_registration_handle_t {
|
||||
ompi_free_list_item_t super;
|
||||
mca_btl_base_registration_handle_t btl_handle;
|
||||
};
|
||||
typedef struct mca_btl_sm_registration_handle_t mca_btl_sm_registration_handle_t;
|
||||
|
||||
mca_btl_base_registration_handle_t *mca_btl_sm_register_mem (struct mca_btl_base_module_t* btl,
|
||||
struct mca_btl_base_endpoint_t* endpoint,
|
||||
void *base, size_t size, uint32_t flags);
|
||||
|
||||
int mca_btl_sm_deregister_mem (struct mca_btl_base_module_t* btl, mca_btl_base_registration_handle_t *handle);
|
||||
|
||||
#endif
|
||||
|
||||
END_C_DECLS
|
||||
|
||||
#endif
|
||||
|
@ -67,10 +67,6 @@
|
||||
#include "opal/mca/common/cuda/common_cuda.h"
|
||||
#endif /* OPAL_CUDA_SUPPORT */
|
||||
|
||||
#if OPAL_BTL_SM_HAVE_KNEM || OPAL_BTL_SM_HAVE_CMA
|
||||
static OBJ_CLASS_INSTANCE(mca_btl_sm_registration_handle_t, ompi_free_list_item_t, NULL, NULL);
|
||||
#endif
|
||||
|
||||
static int mca_btl_sm_component_open(void);
|
||||
static int mca_btl_sm_component_close(void);
|
||||
static int sm_register(void);
|
||||
@ -255,13 +251,10 @@ static int sm_register(void)
|
||||
mca_btl_sm.super.btl_rdma_pipeline_frag_size = 64*1024;
|
||||
mca_btl_sm.super.btl_min_rdma_pipeline_size = 64*1024;
|
||||
mca_btl_sm.super.btl_flags = MCA_BTL_FLAGS_SEND;
|
||||
mca_btl_sm.super.btl_seg_size = sizeof (mca_btl_sm_segment_t);
|
||||
mca_btl_sm.super.btl_bandwidth = 9000; /* Mbs */
|
||||
mca_btl_sm.super.btl_latency = 1; /* Microsecs */
|
||||
|
||||
#if OPAL_BTL_SM_HAVE_KNEM
|
||||
mca_btl_sm.super.btl_registration_handle_size = sizeof (mca_btl_base_registration_handle_t);
|
||||
#endif
|
||||
|
||||
/* Call the BTL based to register its MCA params */
|
||||
mca_btl_base_param_register(&mca_btl_sm_component.super.btl_version,
|
||||
&mca_btl_sm.super);
|
||||
@ -302,11 +295,6 @@ static int mca_btl_sm_component_open(void)
|
||||
OBJ_CONSTRUCT(&mca_btl_sm_component.pending_send_fl, opal_free_list_t);
|
||||
|
||||
mca_btl_sm_component.sm_seg = NULL;
|
||||
|
||||
#if OPAL_BTL_SM_HAVE_KNEM || OPAL_BTL_SM_HAVE_CMA
|
||||
OBJ_CONSTRUCT(&mca_btl_sm_component.registration_handles, ompi_free_list_t);
|
||||
#endif
|
||||
|
||||
#if OPAL_BTL_SM_HAVE_KNEM
|
||||
mca_btl_sm.knem_fd = -1;
|
||||
mca_btl_sm.knem_status_array = NULL;
|
||||
@ -344,10 +332,6 @@ static int mca_btl_sm_component_close(void)
|
||||
}
|
||||
#endif /* OPAL_BTL_SM_HAVE_KNEM */
|
||||
|
||||
#if OPAL_BTL_SM_HAVE_KNEM || OPAL_BTL_SM_HAVE_CMA
|
||||
OBJ_DESTRUCT(&mca_btl_sm_component.registration_handles);
|
||||
#endif
|
||||
|
||||
OBJ_DESTRUCT(&mca_btl_sm_component.sm_lock);
|
||||
/**
|
||||
* We don't have to destroy the fragment lists. They are allocated
|
||||
@ -920,9 +904,6 @@ mca_btl_sm_component_init(int *num_btls,
|
||||
} else {
|
||||
mca_btl_sm.super.btl_get = mca_btl_sm_get_sync;
|
||||
}
|
||||
|
||||
mca_btl_sm.super.btl_register_mem = mca_btl_sm_register_mem;
|
||||
mca_btl_sm.super.btl_deregister_mem = mca_btl_sm_deregister_mem;
|
||||
}
|
||||
#else
|
||||
/* If the user explicitly asked for knem and we can't provide it,
|
||||
@ -937,8 +918,6 @@ mca_btl_sm_component_init(int *num_btls,
|
||||
/* Will only ever have either cma or knem enabled at runtime
|
||||
so no problems with accidentally overwriting this set earlier */
|
||||
mca_btl_sm.super.btl_get = mca_btl_sm_get_sync;
|
||||
mca_btl_sm.super.btl_register_mem = mca_btl_sm_register_mem;
|
||||
mca_btl_sm.super.btl_deregister_mem = mca_btl_sm_deregister_mem;
|
||||
}
|
||||
#else
|
||||
/* If the user explicitly asked for CMA and we can't provide itm
|
||||
@ -952,21 +931,6 @@ mca_btl_sm_component_init(int *num_btls,
|
||||
}
|
||||
#endif /* OPAL_BTL_SM_HAVE_CMA */
|
||||
|
||||
#if OPAL_BTL_SM_HAVE_KNEM | OPAL_BTL_SM_HAVE_CMA
|
||||
if (mca_btl_sm_component.use_cma || mca_btl_sm_component.use_knem) {
|
||||
rc = ompi_free_list_init_new (&mca_btl_sm_component.registration_handles,
|
||||
sizeof (mca_btl_sm_registration_handle_t),
|
||||
8, OBJ_CLASS(mca_btl_sm_registration_handle_t),
|
||||
0, 0, mca_btl_sm_component.sm_free_list_num,
|
||||
mca_btl_sm_component.sm_free_list_max,
|
||||
mca_btl_sm_component.sm_free_list_inc, NULL);
|
||||
if (OPAL_SUCCESS != rc) {
|
||||
free (btls);
|
||||
return NULL;
|
||||
}
|
||||
}
|
||||
#endif
|
||||
|
||||
return btls;
|
||||
|
||||
no_knem:
|
||||
@ -999,7 +963,6 @@ mca_btl_sm_component_init(int *num_btls,
|
||||
/* disable get when not using knem or cma */
|
||||
mca_btl_sm.super.btl_get = NULL;
|
||||
mca_btl_sm.super.btl_flags &= ~MCA_BTL_FLAGS_GET;
|
||||
mca_btl_sm_component.use_knem = 0;
|
||||
}
|
||||
|
||||
/* Otherwise, use_knem was 0 (and we didn't get here) or use_knem
|
||||
@ -1127,8 +1090,8 @@ int mca_btl_sm_component_progress(void)
|
||||
reg = mca_btl_base_active_message_trigger + hdr->tag;
|
||||
seg.seg_addr.pval = ((char *)hdr) + sizeof(mca_btl_sm_hdr_t);
|
||||
seg.seg_len = hdr->len;
|
||||
Frag.base.des_segment_count = 1;
|
||||
Frag.base.des_segments = &seg;
|
||||
Frag.base.des_local_count = 1;
|
||||
Frag.base.des_local = &seg;
|
||||
reg->cbfunc(&mca_btl_sm.super, hdr->tag, &(Frag.base),
|
||||
reg->cbdata);
|
||||
/* return the fragment */
|
||||
@ -1213,14 +1176,22 @@ int mca_btl_sm_component_progress(void)
|
||||
mca_btl_sm.knem_status_array[mca_btl_sm.knem_status_first_used]) {
|
||||
if (KNEM_STATUS_SUCCESS ==
|
||||
mca_btl_sm.knem_status_array[mca_btl_sm.knem_status_first_used]) {
|
||||
int btl_ownership;
|
||||
|
||||
/* Handle the completed fragment */
|
||||
frag =
|
||||
mca_btl_sm.knem_frag_array[mca_btl_sm.knem_status_first_used];
|
||||
frag->cb.func (&mca_btl_sm.super, frag->endpoint,
|
||||
frag->cb.local_address, frag->cb.local_handle,
|
||||
frag->cb.context, frag->cb.data, OPAL_SUCCESS);
|
||||
MCA_BTL_SM_FRAG_RETURN(frag);
|
||||
btl_ownership = (frag->base.des_flags &
|
||||
MCA_BTL_DES_FLAGS_BTL_OWNERSHIP);
|
||||
if (0 != (MCA_BTL_DES_SEND_ALWAYS_CALLBACK &
|
||||
frag->base.des_flags)) {
|
||||
frag->base.des_cbfunc(&mca_btl_sm.super,
|
||||
frag->endpoint, &frag->base,
|
||||
OPAL_SUCCESS);
|
||||
}
|
||||
if (btl_ownership) {
|
||||
MCA_BTL_SM_FRAG_RETURN(frag);
|
||||
}
|
||||
|
||||
/* Bump counters, loop around the circular buffer if
|
||||
necessary */
|
||||
|
@ -10,8 +10,6 @@
|
||||
* Copyright (c) 2004-2005 The Regents of the University of California.
|
||||
* All rights reserved.
|
||||
* Copyright (c) 2006-2007 Voltaire. All rights reserved.
|
||||
* Copyright (c) 2014 Los Alamos National Security, LLC. All rights
|
||||
* reserved.
|
||||
* $COPYRIGHT$
|
||||
*
|
||||
* Additional copyrights may follow
|
||||
|
@ -31,8 +31,8 @@ static inline void mca_btl_sm_frag_common_constructor(mca_btl_sm_frag_t* frag)
|
||||
frag->hdr->my_smp_rank = mca_btl_sm_component.my_smp_rank;
|
||||
}
|
||||
frag->segment.base.seg_len = frag->size;
|
||||
frag->base.des_segments = &frag->segment.base;
|
||||
frag->base.des_segment_count = 1;
|
||||
frag->base.des_local = &frag->segment.base;
|
||||
frag->base.des_local_count = 1;
|
||||
frag->base.des_flags = 0;
|
||||
}
|
||||
|
||||
|
@ -1,4 +1,3 @@
|
||||
/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */
|
||||
/*
|
||||
* Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
|
||||
* University Research and Technology
|
||||
@ -12,8 +11,6 @@
|
||||
* All rights reserved.
|
||||
* Copyright (c) 2008 Sun Microsystems, Inc. All rights reserved.
|
||||
* Copyright (c) 2009 Cisco Systems, Inc. All rights reserved.
|
||||
* Copyright (c) 2014 Los Alamos National Security, LLC. All rights
|
||||
* reserved.
|
||||
* $COPYRIGHT$
|
||||
*
|
||||
* Additional copyrights may follow
|
||||
@ -67,16 +64,6 @@ struct mca_btl_sm_frag_t {
|
||||
/* pointer written to the FIFO, this is the base of the shared memory region */
|
||||
mca_btl_sm_hdr_t *hdr;
|
||||
ompi_free_list_t* my_list;
|
||||
#if OPAL_BTL_SM_HAVE_KNEM
|
||||
/* rdma callback data. required for async get */
|
||||
struct {
|
||||
mca_btl_base_rdma_completion_fn_t func;
|
||||
void *local_address;
|
||||
struct mca_btl_base_registration_handle_t *local_handle;
|
||||
void *context;
|
||||
void *data;
|
||||
} cb;
|
||||
#endif
|
||||
};
|
||||
typedef struct mca_btl_sm_frag_t mca_btl_sm_frag_t;
|
||||
typedef struct mca_btl_sm_frag_t mca_btl_sm_frag1_t;
|
||||
|
@ -832,8 +832,8 @@ struct mca_btl_base_descriptor_t* mca_btl_smcuda_prepare_src(
|
||||
|
||||
}
|
||||
#endif /* OPAL_CUDA_SUPPORT */
|
||||
frag->base.des_segments = &(frag->segment.base);
|
||||
frag->base.des_segment_count = 1;
|
||||
frag->base.des_local = &(frag->segment.base);
|
||||
frag->base.des_local_count = 1;
|
||||
frag->base.order = MCA_BTL_NO_ORDER;
|
||||
frag->base.des_remote = NULL;
|
||||
frag->base.des_remote_count = 0;
|
||||
@ -1045,8 +1045,8 @@ struct mca_btl_base_descriptor_t* mca_btl_smcuda_prepare_dst(
|
||||
|
||||
frag->base.des_remote = NULL;
|
||||
frag->base.des_remote_count = 0;
|
||||
frag->base.des_segments = &frag->segment.base;
|
||||
frag->base.des_segment_count = 1;
|
||||
frag->base.des_local = &frag->segment.base;
|
||||
frag->base.des_local_count = 1;
|
||||
frag->base.des_flags = flags;
|
||||
return &frag->base;
|
||||
}
|
||||
@ -1059,7 +1059,7 @@ int mca_btl_smcuda_get_cuda(struct mca_btl_base_module_t* btl,
|
||||
struct mca_btl_base_descriptor_t* descriptor)
|
||||
{
|
||||
mca_btl_smcuda_segment_t *src_seg = (mca_btl_smcuda_segment_t *) descriptor->des_remote;
|
||||
mca_btl_smcuda_segment_t *dst_seg = (mca_btl_smcuda_segment_t *) descriptor->des_segments;
|
||||
mca_btl_smcuda_segment_t *dst_seg = (mca_btl_smcuda_segment_t *) descriptor->des_local;
|
||||
mca_mpool_common_cuda_reg_t rget_reg;
|
||||
mca_mpool_common_cuda_reg_t *reg_ptr = &rget_reg;
|
||||
int btl_ownership;
|
||||
|
@ -691,7 +691,7 @@ static void btl_smcuda_control(mca_btl_base_module_t* btl,
|
||||
struct mca_btl_base_endpoint_t *endpoint;
|
||||
mca_btl_smcuda_t *smcuda_btl = (mca_btl_smcuda_t *)btl;
|
||||
mca_btl_smcuda_frag_t *frag = (mca_btl_smcuda_frag_t *)des;
|
||||
mca_btl_base_segment_t* segments = des->des_segments;
|
||||
mca_btl_base_segment_t* segments = des->des_local;
|
||||
|
||||
/* Use the rank of the peer that sent the data to get to the endpoint
|
||||
* structure. This is needed for PML callback. */
|
||||
@ -1065,8 +1065,8 @@ int mca_btl_smcuda_component_progress(void)
|
||||
reg = mca_btl_base_active_message_trigger + hdr->tag;
|
||||
seg.seg_addr.pval = ((char *)hdr) + sizeof(mca_btl_smcuda_hdr_t);
|
||||
seg.seg_len = hdr->len;
|
||||
Frag.base.des_segment_count = 1;
|
||||
Frag.base.des_segments = &seg;
|
||||
Frag.base.des_local_count = 1;
|
||||
Frag.base.des_local = &seg;
|
||||
#if OPAL_CUDA_SUPPORT
|
||||
Frag.hdr = hdr; /* needed for peer rank in control messages */
|
||||
#endif /* OPAL_CUDA_SUPPORT */
|
||||
|
@ -32,8 +32,8 @@ static inline void mca_btl_smcuda_frag_common_constructor(mca_btl_smcuda_frag_t*
|
||||
frag->hdr->my_smp_rank = mca_btl_smcuda_component.my_smp_rank;
|
||||
}
|
||||
frag->segment.base.seg_len = frag->size;
|
||||
frag->base.des_segments = &frag->segment.base;
|
||||
frag->base.des_segment_count = 1;
|
||||
frag->base.des_local = &frag->segment.base;
|
||||
frag->base.des_local_count = 1;
|
||||
frag->base.des_flags = 0;
|
||||
#if OPAL_CUDA_SUPPORT
|
||||
frag->registration = NULL;
|
||||
|
@ -42,6 +42,7 @@ mca_btl_tcp_module_t mca_btl_tcp_module = {
|
||||
.btl_alloc = mca_btl_tcp_alloc,
|
||||
.btl_free = mca_btl_tcp_free,
|
||||
.btl_prepare_src = mca_btl_tcp_prepare_src,
|
||||
.btl_prepare_dst = mca_btl_tcp_prepare_dst,
|
||||
.btl_send = mca_btl_tcp_send,
|
||||
.btl_put = mca_btl_tcp_put,
|
||||
.btl_dump = mca_btl_base_dump,
|
||||
@ -169,8 +170,8 @@ mca_btl_base_descriptor_t* mca_btl_tcp_alloc(
|
||||
frag->segments[0].seg_len = size;
|
||||
frag->segments[0].seg_addr.pval = frag+1;
|
||||
|
||||
frag->base.des_segments = frag->segments;
|
||||
frag->base.des_segment_count = 1;
|
||||
frag->base.des_local = frag->segments;
|
||||
frag->base.des_local_count = 1;
|
||||
frag->base.des_flags = flags;
|
||||
frag->base.order = MCA_BTL_NO_ORDER;
|
||||
frag->btl = (mca_btl_tcp_module_t*)btl;
|
||||
@ -201,6 +202,7 @@ int mca_btl_tcp_free(
|
||||
mca_btl_base_descriptor_t* mca_btl_tcp_prepare_src(
|
||||
struct mca_btl_base_module_t* btl,
|
||||
struct mca_btl_base_endpoint_t* endpoint,
|
||||
struct mca_mpool_base_registration_t* registration,
|
||||
struct opal_convertor_t* convertor,
|
||||
uint8_t order,
|
||||
size_t reserve,
|
||||
@ -236,7 +238,7 @@ mca_btl_base_descriptor_t* mca_btl_tcp_prepare_src(
|
||||
frag->segments[0].seg_addr.pval = (frag + 1);
|
||||
frag->segments[0].seg_len = reserve;
|
||||
|
||||
frag->base.des_segment_count = 1;
|
||||
frag->base.des_local_count = 1;
|
||||
if(opal_convertor_need_buffers(convertor)) {
|
||||
|
||||
if (max_data + reserve > frag->size) {
|
||||
@ -266,16 +268,66 @@ mca_btl_base_descriptor_t* mca_btl_tcp_prepare_src(
|
||||
|
||||
frag->segments[1].seg_addr.pval = iov.iov_base;
|
||||
frag->segments[1].seg_len = max_data;
|
||||
frag->base.des_segment_count = 2;
|
||||
frag->base.des_local_count = 2;
|
||||
}
|
||||
|
||||
frag->base.des_segments = frag->segments;
|
||||
frag->base.des_local = frag->segments;
|
||||
frag->base.des_remote = NULL;
|
||||
frag->base.des_remote_count = 0;
|
||||
frag->base.des_flags = flags;
|
||||
frag->base.order = MCA_BTL_NO_ORDER;
|
||||
*size = max_data;
|
||||
return &frag->base;
|
||||
}
|
||||
|
||||
|
||||
/**
|
||||
* Prepare a descriptor for send/rdma using the supplied
|
||||
* convertor. If the convertor references data that is contigous,
|
||||
* the descriptor may simply point to the user buffer. Otherwise,
|
||||
* this routine is responsible for allocating buffer space and
|
||||
* packing if required.
|
||||
*
|
||||
* @param btl (IN) BTL module
|
||||
* @param endpoint (IN) BTL peer addressing
|
||||
* @param convertor (IN) Data type convertor
|
||||
* @param reserve (IN) Additional bytes requested by upper layer to precede user data
|
||||
* @param size (IN/OUT) Number of bytes to prepare (IN), number of bytes actually prepared (OUT)
|
||||
*/
|
||||
|
||||
mca_btl_base_descriptor_t* mca_btl_tcp_prepare_dst(
|
||||
struct mca_btl_base_module_t* btl,
|
||||
struct mca_btl_base_endpoint_t* endpoint,
|
||||
struct mca_mpool_base_registration_t* registration,
|
||||
struct opal_convertor_t* convertor,
|
||||
uint8_t order,
|
||||
size_t reserve,
|
||||
size_t* size,
|
||||
uint32_t flags)
|
||||
{
|
||||
mca_btl_tcp_frag_t* frag;
|
||||
|
||||
if( OPAL_UNLIKELY((*size) > UINT32_MAX) ) { /* limit the size to what we support */
|
||||
*size = (size_t)UINT32_MAX;
|
||||
}
|
||||
MCA_BTL_TCP_FRAG_ALLOC_USER(frag);
|
||||
if( OPAL_UNLIKELY(NULL == frag) ) {
|
||||
return NULL;
|
||||
}
|
||||
|
||||
frag->segments->seg_len = *size;
|
||||
opal_convertor_get_current_pointer( convertor, (void**)&(frag->segments->seg_addr.pval) );
|
||||
|
||||
frag->base.des_remote = NULL;
|
||||
frag->base.des_remote_count = 0;
|
||||
frag->base.des_local = frag->segments;
|
||||
frag->base.des_local_count = 1;
|
||||
frag->base.des_flags = flags;
|
||||
frag->base.order = MCA_BTL_NO_ORDER;
|
||||
return &frag->base;
|
||||
}
|
||||
|
||||
|
||||
/**
|
||||
* Initiate an asynchronous send.
|
||||
*
|
||||
@ -303,7 +355,7 @@ int mca_btl_tcp_send( struct mca_btl_base_module_t* btl,
|
||||
frag->iov[0].iov_base = (IOVBASE_TYPE*)&frag->hdr;
|
||||
frag->iov[0].iov_len = sizeof(frag->hdr);
|
||||
frag->hdr.size = 0;
|
||||
for( i = 0; i < (int)frag->base.des_segment_count; i++) {
|
||||
for( i = 0; i < (int)frag->base.des_local_count; i++) {
|
||||
frag->hdr.size += frag->segments[i].seg_len;
|
||||
frag->iov[i+1].iov_len = frag->segments[i].seg_len;
|
||||
frag->iov[i+1].iov_base = (IOVBASE_TYPE*)frag->segments[i].seg_addr.pval;
|
||||
@ -316,55 +368,23 @@ int mca_btl_tcp_send( struct mca_btl_base_module_t* btl,
|
||||
return mca_btl_tcp_endpoint_send(endpoint,frag);
|
||||
}
|
||||
|
||||
static void fake_rdma_complete (mca_btl_base_module_t *btl, mca_btl_base_endpoint_t *endpoint,
|
||||
mca_btl_base_descriptor_t *desc, int rc)
|
||||
{
|
||||
mca_btl_tcp_frag_t *frag = (mca_btl_tcp_frag_t *) desc;
|
||||
|
||||
frag->cb.func (btl, endpoint, frag->segments[0].seg_addr.pval, NULL, frag->cb.context, frag->cb.data,
|
||||
rc);
|
||||
}
|
||||
|
||||
/**
|
||||
* Initiate an asynchronous put.
|
||||
*
|
||||
* @param btl (IN) BTL module
|
||||
* @param endpoint (IN) BTL addressing information
|
||||
* @param descriptor (IN) Description of the data to be transferred
|
||||
*/
|
||||
|
||||
int mca_btl_tcp_put (mca_btl_base_module_t *btl, struct mca_btl_base_endpoint_t *endpoint, void *local_address,
|
||||
uint64_t remote_address, mca_btl_base_registration_handle_t *local_handle,
|
||||
mca_btl_base_registration_handle_t *remote_handle, size_t size, int flags,
|
||||
int order, mca_btl_base_rdma_completion_fn_t cbfunc, void *cbcontext, void *cbdata)
|
||||
int mca_btl_tcp_put( mca_btl_base_module_t* btl,
|
||||
mca_btl_base_endpoint_t* endpoint,
|
||||
mca_btl_base_descriptor_t* descriptor )
|
||||
{
|
||||
mca_btl_tcp_module_t* tcp_btl = (mca_btl_tcp_module_t*) btl;
|
||||
mca_btl_tcp_frag_t *frag = NULL;
|
||||
mca_btl_tcp_frag_t* frag = (mca_btl_tcp_frag_t*)descriptor;
|
||||
int i;
|
||||
|
||||
MCA_BTL_TCP_FRAG_ALLOC_USER(frag);
|
||||
if( OPAL_UNLIKELY(NULL == frag) ) {
|
||||
return OPAL_ERR_OUT_OF_RESOURCE;;
|
||||
}
|
||||
|
||||
frag->endpoint = endpoint;
|
||||
|
||||
frag->segments->seg_len = size;
|
||||
frag->segments->seg_addr.pval = local_address;
|
||||
|
||||
frag->base.des_segments = frag->segments;
|
||||
frag->base.des_segment_count = 1;
|
||||
frag->base.order = MCA_BTL_NO_ORDER;
|
||||
|
||||
frag->segments[0].seg_addr.pval = local_address;
|
||||
frag->segments[0].seg_len = size;
|
||||
|
||||
frag->segments[1].seg_addr.lval = remote_address;
|
||||
frag->segments[1].seg_len = size;
|
||||
|
||||
frag->base.des_flags = MCA_BTL_DES_FLAGS_BTL_OWNERSHIP | MCA_BTL_DES_SEND_ALWAYS_CALLBACK;
|
||||
frag->base.des_cbfunc = fake_rdma_complete;
|
||||
|
||||
frag->cb.func = cbfunc;
|
||||
frag->cb.data = cbdata;
|
||||
frag->cb.context = cbcontext;
|
||||
|
||||
frag->btl = tcp_btl;
|
||||
frag->endpoint = endpoint;
|
||||
frag->rc = 0;
|
||||
@ -374,9 +394,9 @@ int mca_btl_tcp_put (mca_btl_base_module_t *btl, struct mca_btl_base_endpoint_t
|
||||
frag->iov_ptr = frag->iov;
|
||||
frag->iov[0].iov_base = (IOVBASE_TYPE*)&frag->hdr;
|
||||
frag->iov[0].iov_len = sizeof(frag->hdr);
|
||||
frag->iov[1].iov_base = (IOVBASE_TYPE*) (frag->segments + 1);
|
||||
frag->iov[1].iov_len = sizeof(mca_btl_base_segment_t);
|
||||
for( i = 0; i < (int)frag->base.des_segment_count; i++ ) {
|
||||
frag->iov[1].iov_base = (IOVBASE_TYPE*)frag->base.des_remote;
|
||||
frag->iov[1].iov_len = frag->base.des_remote_count * sizeof(mca_btl_base_segment_t);
|
||||
for( i = 0; i < (int)frag->base.des_local_count; i++ ) {
|
||||
frag->hdr.size += frag->segments[i].seg_len;
|
||||
frag->iov[i+2].iov_len = frag->segments[i].seg_len;
|
||||
frag->iov[i+2].iov_base = (IOVBASE_TYPE*)frag->segments[i].seg_addr.pval;
|
||||
@ -384,7 +404,7 @@ int mca_btl_tcp_put (mca_btl_base_module_t *btl, struct mca_btl_base_endpoint_t
|
||||
}
|
||||
frag->hdr.base.tag = MCA_BTL_TAG_BTL;
|
||||
frag->hdr.type = MCA_BTL_TCP_HDR_TYPE_PUT;
|
||||
frag->hdr.count = 1;
|
||||
frag->hdr.count = frag->base.des_remote_count;
|
||||
if (endpoint->endpoint_nbo) MCA_BTL_TCP_HDR_HTON(frag->hdr);
|
||||
return ((i = mca_btl_tcp_endpoint_send(endpoint,frag)) >= 0 ? OPAL_SUCCESS : i);
|
||||
}
|
||||
@ -392,46 +412,22 @@ int mca_btl_tcp_put (mca_btl_base_module_t *btl, struct mca_btl_base_endpoint_t
|
||||
|
||||
/**
|
||||
* Initiate an asynchronous get.
|
||||
*
|
||||
* @param btl (IN) BTL module
|
||||
* @param endpoint (IN) BTL addressing information
|
||||
* @param descriptor (IN) Description of the data to be transferred
|
||||
*
|
||||
*/
|
||||
|
||||
int mca_btl_tcp_get (mca_btl_base_module_t *btl, struct mca_btl_base_endpoint_t *endpoint, void *local_address,
|
||||
uint64_t remote_address, mca_btl_base_registration_handle_t *local_handle,
|
||||
mca_btl_base_registration_handle_t *remote_handle, size_t size, int flags,
|
||||
int order, mca_btl_base_rdma_completion_fn_t cbfunc, void *cbcontext, void *cbdata)
|
||||
int mca_btl_tcp_get(
|
||||
mca_btl_base_module_t* btl,
|
||||
mca_btl_base_endpoint_t* endpoint,
|
||||
mca_btl_base_descriptor_t* descriptor)
|
||||
{
|
||||
mca_btl_tcp_module_t* tcp_btl = (mca_btl_tcp_module_t*) btl;
|
||||
mca_btl_tcp_frag_t* frag = NULL;
|
||||
mca_btl_tcp_frag_t* frag = (mca_btl_tcp_frag_t*)descriptor;
|
||||
int rc;
|
||||
|
||||
MCA_BTL_TCP_FRAG_ALLOC_USER(frag);
|
||||
if( OPAL_UNLIKELY(NULL == frag) ) {
|
||||
return OPAL_ERR_OUT_OF_RESOURCE;;
|
||||
}
|
||||
|
||||
frag->endpoint = endpoint;
|
||||
|
||||
frag->segments->seg_len = size;
|
||||
frag->segments->seg_addr.pval = local_address;
|
||||
|
||||
frag->base.des_segments = frag->segments;
|
||||
frag->base.des_segment_count = 1;
|
||||
frag->base.order = MCA_BTL_NO_ORDER;
|
||||
|
||||
frag->segments[0].seg_addr.pval = local_address;
|
||||
frag->segments[0].seg_len = size;
|
||||
|
||||
frag->segments[1].seg_addr.lval = remote_address;
|
||||
frag->segments[1].seg_len = size;
|
||||
|
||||
/* call the rdma callback through the descriptor callback. this is
|
||||
* tcp so the extra latency is not an issue */
|
||||
frag->base.des_flags = MCA_BTL_DES_FLAGS_BTL_OWNERSHIP | MCA_BTL_DES_SEND_ALWAYS_CALLBACK;
|
||||
frag->base.des_cbfunc = fake_rdma_complete;
|
||||
|
||||
frag->cb.func = cbfunc;
|
||||
frag->cb.data = cbdata;
|
||||
frag->cb.context = cbcontext;
|
||||
|
||||
frag->btl = tcp_btl;
|
||||
frag->endpoint = endpoint;
|
||||
frag->rc = 0;
|
||||
@ -441,11 +437,11 @@ int mca_btl_tcp_get (mca_btl_base_module_t *btl, struct mca_btl_base_endpoint_t
|
||||
frag->iov_ptr = frag->iov;
|
||||
frag->iov[0].iov_base = (IOVBASE_TYPE*)&frag->hdr;
|
||||
frag->iov[0].iov_len = sizeof(frag->hdr);
|
||||
frag->iov[1].iov_base = (IOVBASE_TYPE*) &frag->segments[1];
|
||||
frag->iov[1].iov_len = sizeof(mca_btl_base_segment_t);
|
||||
frag->iov[1].iov_base = (IOVBASE_TYPE*)frag->base.des_remote;
|
||||
frag->iov[1].iov_len = frag->base.des_remote_count * sizeof(mca_btl_base_segment_t);
|
||||
frag->hdr.base.tag = MCA_BTL_TAG_BTL;
|
||||
frag->hdr.type = MCA_BTL_TCP_HDR_TYPE_GET;
|
||||
frag->hdr.count = 1;
|
||||
frag->hdr.count = frag->base.des_remote_count;
|
||||
if (endpoint->endpoint_nbo) MCA_BTL_TCP_HDR_HTON(frag->hdr);
|
||||
return ((rc = mca_btl_tcp_endpoint_send(endpoint,frag)) >= 0 ? OPAL_SUCCESS : rc);
|
||||
}
|
||||
|
@ -217,22 +217,32 @@ extern int mca_btl_tcp_send(
|
||||
|
||||
/**
|
||||
* Initiate an asynchronous put.
|
||||
*
|
||||
* @param btl (IN) BTL module
|
||||
* @param endpoint (IN) BTL addressing information
|
||||
* @param descriptor (IN) Description of the data to be transferred
|
||||
*/
|
||||
|
||||
int mca_btl_tcp_put (mca_btl_base_module_t *btl, struct mca_btl_base_endpoint_t *endpoint, void *local_address,
|
||||
uint64_t remote_address, mca_btl_base_registration_handle_t *local_handle,
|
||||
mca_btl_base_registration_handle_t *remote_handle, size_t size, int flags,
|
||||
int order, mca_btl_base_rdma_completion_fn_t cbfunc, void *cbcontext, void *cbdata);
|
||||
extern int mca_btl_tcp_put(
|
||||
struct mca_btl_base_module_t* btl,
|
||||
struct mca_btl_base_endpoint_t* btl_peer,
|
||||
struct mca_btl_base_descriptor_t* decriptor
|
||||
);
|
||||
|
||||
|
||||
/**
|
||||
* Initiate an asynchronous get.
|
||||
*
|
||||
* @param btl (IN) BTL module
|
||||
* @param endpoint (IN) BTL addressing information
|
||||
* @param descriptor (IN) Description of the data to be transferred
|
||||
*/
|
||||
|
||||
int mca_btl_tcp_get (mca_btl_base_module_t *btl, struct mca_btl_base_endpoint_t *endpoint, void *local_address,
|
||||
uint64_t remote_address, mca_btl_base_registration_handle_t *local_handle,
|
||||
mca_btl_base_registration_handle_t *remote_handle, size_t size, int flags,
|
||||
int order, mca_btl_base_rdma_completion_fn_t cbfunc, void *cbcontext, void *cbdata);
|
||||
extern int mca_btl_tcp_get(
|
||||
struct mca_btl_base_module_t* btl,
|
||||
struct mca_btl_base_endpoint_t* btl_peer,
|
||||
struct mca_btl_base_descriptor_t* decriptor
|
||||
);
|
||||
|
||||
/**
|
||||
* Allocate a descriptor with a segment of the requested size.
|
||||
@ -280,6 +290,7 @@ extern int mca_btl_tcp_free(
|
||||
mca_btl_base_descriptor_t* mca_btl_tcp_prepare_src(
|
||||
struct mca_btl_base_module_t* btl,
|
||||
struct mca_btl_base_endpoint_t* peer,
|
||||
struct mca_mpool_base_registration_t*,
|
||||
struct opal_convertor_t* convertor,
|
||||
uint8_t order,
|
||||
size_t reserve,
|
||||
@ -287,6 +298,16 @@ mca_btl_base_descriptor_t* mca_btl_tcp_prepare_src(
|
||||
uint32_t flags
|
||||
);
|
||||
|
||||
extern mca_btl_base_descriptor_t* mca_btl_tcp_prepare_dst(
|
||||
struct mca_btl_base_module_t* btl,
|
||||
struct mca_btl_base_endpoint_t* peer,
|
||||
struct mca_mpool_base_registration_t*,
|
||||
struct opal_convertor_t* convertor,
|
||||
uint8_t order,
|
||||
size_t reserve,
|
||||
size_t* size,
|
||||
uint32_t flags);
|
||||
|
||||
|
||||
/**
|
||||
* Fault Tolerance Event Notification Function
|
||||
|
@ -287,7 +287,7 @@ static int mca_btl_tcp_component_register(void)
|
||||
MCA_BTL_FLAGS_NEED_CSUM |
|
||||
MCA_BTL_FLAGS_NEED_ACK |
|
||||
MCA_BTL_FLAGS_HETEROGENEOUS_RDMA;
|
||||
|
||||
mca_btl_tcp_module.super.btl_seg_size = sizeof (mca_btl_base_segment_t);
|
||||
mca_btl_tcp_module.super.btl_bandwidth = 100;
|
||||
mca_btl_tcp_module.super.btl_latency = 100;
|
||||
|
||||
|
@ -58,12 +58,6 @@ struct mca_btl_tcp_frag_t {
|
||||
size_t size;
|
||||
int rc;
|
||||
ompi_free_list_t* my_list;
|
||||
/* fake rdma completion */
|
||||
struct {
|
||||
mca_btl_base_rdma_completion_fn_t func;
|
||||
void *data;
|
||||
void *context;
|
||||
} cb;
|
||||
};
|
||||
typedef struct mca_btl_tcp_frag_t mca_btl_tcp_frag_t;
|
||||
OBJ_CLASS_DECLARATION(mca_btl_tcp_frag_t);
|
||||
@ -122,8 +116,10 @@ do { \
|
||||
frag->iov_cnt = 1; \
|
||||
frag->iov_idx = 0; \
|
||||
frag->iov_ptr = frag->iov; \
|
||||
frag->base.des_segments = frag->segments; \
|
||||
frag->base.des_segment_count = 1; \
|
||||
frag->base.des_remote = NULL; \
|
||||
frag->base.des_remote_count = 0; \
|
||||
frag->base.des_local = frag->segments; \
|
||||
frag->base.des_local_count = 1; \
|
||||
} while(0)
|
||||
|
||||
|
||||
|
@ -270,8 +270,8 @@ mca_btl_base_descriptor_t* mca_btl_template_prepare_src(
|
||||
frag->segment.seg_len = max_data + reserve;
|
||||
}
|
||||
|
||||
frag->base.des_segments = &frag->segment;
|
||||
frag->base.des_segment_count = 1;
|
||||
frag->base.des_local = &frag->segment;
|
||||
frag->base.des_local_count = 1;
|
||||
frag->base.des_flags = 0;
|
||||
return &frag->base;
|
||||
}
|
||||
@ -311,8 +311,8 @@ mca_btl_base_descriptor_t* mca_btl_template_prepare_dst(
|
||||
frag->segment.seg_len = *size;
|
||||
opal_convertor_get_current_pointer( convertor, (void**)&(frag->segment.seg_addr.pval) );
|
||||
|
||||
frag->base.des_segments = &frag->segment;
|
||||
frag->base.des_segment_count = 1;
|
||||
frag->base.des_local = &frag->segment;
|
||||
frag->base.des_local_count = 1;
|
||||
frag->base.des_flags = 0;
|
||||
return &frag->base;
|
||||
}
|
||||
|
@ -38,8 +38,7 @@ ugni_SOURCES = \
|
||||
btl_ugni.h \
|
||||
btl_ugni_smsg.h \
|
||||
btl_ugni_smsg.c \
|
||||
btl_ugni_prepare.h \
|
||||
btl_ugni_atomic.c
|
||||
btl_ugni_prepare.h
|
||||
|
||||
mcacomponentdir = $(opallibdir)
|
||||
mcacomponent_LTLIBRARIES = $(component_install)
|
||||
|
@ -33,7 +33,6 @@
|
||||
#include "opal/mca/btl/base/btl_base_error.h"
|
||||
#include "opal/class/opal_hash_table.h"
|
||||
#include "opal/class/ompi_free_list.h"
|
||||
#include "opal/class/opal_free_list.h"
|
||||
#include "opal/mca/common/ugni/common_ugni.h"
|
||||
|
||||
#include <errno.h>
|
||||
@ -81,11 +80,6 @@ typedef struct mca_btl_ugni_module_t {
|
||||
opal_mutex_t eager_get_pending_lock;
|
||||
opal_list_t eager_get_pending;
|
||||
|
||||
opal_mutex_t pending_descriptors_lock;
|
||||
opal_list_t pending_descriptors;
|
||||
|
||||
ompi_free_list_t post_descriptors;
|
||||
|
||||
mca_mpool_base_module_t *smsg_mpool;
|
||||
ompi_free_list_t smsg_mboxes;
|
||||
|
||||
@ -149,6 +143,8 @@ typedef struct mca_btl_ugni_component_t {
|
||||
|
||||
/* After this message size switch to BTE protocols */
|
||||
size_t ugni_fma_limit;
|
||||
/* Switch to put when trying to GET at or above this size */
|
||||
size_t ugni_get_limit;
|
||||
/* Switch to get when sending above this size */
|
||||
size_t ugni_smsg_limit;
|
||||
|
||||
@ -264,31 +260,33 @@ mca_btl_ugni_sendi (struct mca_btl_base_module_t *btl,
|
||||
uint32_t flags, mca_btl_base_tag_t tag,
|
||||
mca_btl_base_descriptor_t **descriptor);
|
||||
|
||||
int mca_btl_ugni_get (mca_btl_base_module_t *btl, struct mca_btl_base_endpoint_t *endpoint, void *local_address,
|
||||
uint64_t remote_address, mca_btl_base_registration_handle_t *local_handle,
|
||||
mca_btl_base_registration_handle_t *remote_handle, size_t size, int flags,
|
||||
int order, mca_btl_base_rdma_completion_fn_t cbfunc, void *cbcontext, void *cbdata);
|
||||
/**
|
||||
* Initiate a get operation.
|
||||
*
|
||||
* location: btl_ugni_get.c
|
||||
*
|
||||
* @param btl (IN) BTL module
|
||||
* @param endpoint (IN) BTL addressing information
|
||||
* @param descriptor (IN) Description of the data to be transferred
|
||||
*/
|
||||
int
|
||||
mca_btl_ugni_get (struct mca_btl_base_module_t *btl,
|
||||
struct mca_btl_base_endpoint_t *endpoint,
|
||||
struct mca_btl_base_descriptor_t *des);
|
||||
|
||||
int mca_btl_ugni_put (mca_btl_base_module_t *btl, struct mca_btl_base_endpoint_t *endpoint, void *local_address,
|
||||
uint64_t remote_address, mca_btl_base_registration_handle_t *local_handle,
|
||||
mca_btl_base_registration_handle_t *remote_handle, size_t size, int flags,
|
||||
int order, mca_btl_base_rdma_completion_fn_t cbfunc, void *cbcontext, void *cbdata);
|
||||
|
||||
int mca_btl_ugni_aop (struct mca_btl_base_module_t *btl, struct mca_btl_base_endpoint_t *endpoint,
|
||||
uint64_t remote_address, mca_btl_base_registration_handle_t *remote_handle,
|
||||
mca_btl_base_atomic_op_t op, uint64_t operand, int flags, int order,
|
||||
mca_btl_base_rdma_completion_fn_t cbfunc, void *cbcontext, void *cbdata);
|
||||
|
||||
int mca_btl_ugni_afop (struct mca_btl_base_module_t *btl, struct mca_btl_base_endpoint_t *endpoint,
|
||||
void *local_address, uint64_t remote_address, mca_btl_base_registration_handle_t *local_handle,
|
||||
mca_btl_base_registration_handle_t *remote_handle, mca_btl_base_atomic_op_t op,
|
||||
uint64_t operand, int flags, int order, mca_btl_base_rdma_completion_fn_t cbfunc,
|
||||
void *cbcontext, void *cbdata);
|
||||
|
||||
int mca_btl_ugni_acswap (struct mca_btl_base_module_t *btl, struct mca_btl_base_endpoint_t *endpoint,
|
||||
void *local_address, uint64_t remote_address, mca_btl_base_registration_handle_t *local_handle,
|
||||
mca_btl_base_registration_handle_t *remote_handle, uint64_t compare, uint64_t value,
|
||||
int flags, int order, mca_btl_base_rdma_completion_fn_t cbfunc, void *cbcontext, void *cbdata);
|
||||
/**
|
||||
* Initiate a put operation.
|
||||
*
|
||||
* location: btl_ugni_put.c
|
||||
*
|
||||
* @param btl (IN) BTL module
|
||||
* @param endpoint (IN) BTL addressing information
|
||||
* @param descriptor (IN) Description of the data to be transferred
|
||||
*/
|
||||
int
|
||||
mca_btl_ugni_put (struct mca_btl_base_module_t *btl,
|
||||
struct mca_btl_base_endpoint_t *endpoint,
|
||||
struct mca_btl_base_descriptor_t *des);
|
||||
|
||||
int mca_btl_ugni_progress_send_wait_list (struct mca_btl_base_endpoint_t *endpoint);
|
||||
|
||||
@ -297,14 +295,9 @@ mca_btl_ugni_alloc(struct mca_btl_base_module_t *btl,
|
||||
struct mca_btl_base_endpoint_t *endpoint,
|
||||
uint8_t order, size_t size, uint32_t flags);
|
||||
|
||||
struct mca_btl_base_registration_handle_t {
|
||||
/** uGNI memory handle */
|
||||
gni_mem_handle_t gni_handle;
|
||||
};
|
||||
|
||||
typedef struct mca_btl_ugni_reg_t {
|
||||
mca_mpool_base_registration_t base;
|
||||
mca_btl_base_registration_handle_t handle;
|
||||
gni_mem_handle_t memory_hdl;
|
||||
} mca_btl_ugni_reg_t;
|
||||
|
||||
/* Global structures */
|
||||
|
@ -34,6 +34,7 @@ int mca_btl_ugni_add_procs(struct mca_btl_base_module_t* btl,
|
||||
struct mca_btl_base_endpoint_t **peers,
|
||||
opal_bitmap_t *reachable) {
|
||||
mca_btl_ugni_module_t *ugni_module = (mca_btl_ugni_module_t *) btl;
|
||||
opal_proc_t *my_proc = opal_proc_local_get();
|
||||
size_t i;
|
||||
int rc;
|
||||
|
||||
@ -65,8 +66,11 @@ int mca_btl_ugni_add_procs(struct mca_btl_base_module_t* btl,
|
||||
if (OPAL_PROC_ON_LOCAL_NODE(ompi_proc->proc_flags)) {
|
||||
ugni_module->nlocal_procs++;
|
||||
|
||||
/* ugni is allowed on local processes to provide support for network
|
||||
* atomic operations */
|
||||
/* Do not use uGNI to communicate with local procs unless we are adding more ranks.
|
||||
* Change this when sm and vader are updated to handle additional add procs. */
|
||||
if (!ugni_module->initialized || my_proc == ompi_proc) {
|
||||
continue;
|
||||
}
|
||||
}
|
||||
|
||||
/* Create and Init endpoints */
|
||||
@ -184,7 +188,7 @@ static int ugni_reg_rdma_mem (void *reg_data, void *base, size_t size,
|
||||
OPAL_THREAD_LOCK(&ugni_module->device->dev_lock);
|
||||
rc = GNI_MemRegister (ugni_module->device->dev_handle, (uint64_t) base,
|
||||
size, NULL, GNI_MEM_READWRITE | GNI_MEM_RELAXED_PI_ORDERING,
|
||||
-1, &(ugni_reg->handle.gni_handle));
|
||||
-1, &(ugni_reg->memory_hdl));
|
||||
OPAL_THREAD_UNLOCK(&ugni_module->device->dev_lock);
|
||||
|
||||
if (OPAL_UNLIKELY(GNI_RC_SUCCESS != rc)) {
|
||||
@ -207,7 +211,7 @@ static int ugni_reg_smsg_mem (void *reg_data, void *base, size_t size,
|
||||
OPAL_THREAD_LOCK(&ugni_module->device->dev_lock);
|
||||
rc = GNI_MemRegister (ugni_module->device->dev_handle, (uint64_t) base,
|
||||
size, ugni_module->smsg_remote_cq, GNI_MEM_READWRITE, -1,
|
||||
&(ugni_reg->handle.gni_handle));
|
||||
&(ugni_reg->memory_hdl));
|
||||
OPAL_THREAD_UNLOCK(&ugni_module->device->dev_lock);
|
||||
return opal_common_rc_ugni_to_opal (rc);
|
||||
}
|
||||
@ -220,7 +224,7 @@ ugni_dereg_mem (void *reg_data, mca_mpool_base_registration_t *reg)
|
||||
gni_return_t rc;
|
||||
|
||||
OPAL_THREAD_LOCK(&ugni_module->device->dev_lock);
|
||||
rc = GNI_MemDeregister (ugni_module->device->dev_handle, &ugni_reg->handle.gni_handle);
|
||||
rc = GNI_MemDeregister (ugni_module->device->dev_handle, &ugni_reg->memory_hdl);
|
||||
OPAL_THREAD_UNLOCK(&ugni_module->device->dev_lock);
|
||||
if (GNI_RC_SUCCESS != rc) {
|
||||
return OPAL_ERROR;
|
||||
@ -397,15 +401,6 @@ mca_btl_ugni_setup_mpools (mca_btl_ugni_module_t *ugni_module)
|
||||
return rc;
|
||||
}
|
||||
|
||||
rc = ompi_free_list_init_new (&ugni_module->post_descriptors,
|
||||
sizeof (mca_btl_ugni_post_descriptor_t),
|
||||
8, OBJ_CLASS(mca_btl_ugni_post_descriptor_t),
|
||||
0, 0, 0, -1, 256, NULL);
|
||||
if (OPAL_UNLIKELY(OPAL_SUCCESS != rc)) {
|
||||
BTL_ERROR(("error creating post descriptor free list"));
|
||||
return rc;
|
||||
}
|
||||
|
||||
return OPAL_SUCCESS;
|
||||
}
|
||||
|
||||
|
@ -1,135 +0,0 @@
|
||||
/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */
|
||||
/*
|
||||
* Copyright (c) 2014 Los Alamos National Security, LLC. All rights
|
||||
* reserved.
|
||||
* $COPYRIGHT$
|
||||
*
|
||||
* Additional copyrights may follow
|
||||
*
|
||||
* $HEADER$
|
||||
*/
|
||||
|
||||
#include "btl_ugni_rdma.h"
|
||||
|
||||
static gni_fma_cmd_type_t famo_cmds[] = {
|
||||
[MCA_BTL_ATOMIC_ADD] = GNI_FMA_ATOMIC_FADD,
|
||||
[MCA_BTL_ATOMIC_AND] = GNI_FMA_ATOMIC_FAND,
|
||||
[MCA_BTL_ATOMIC_OR] = GNI_FMA_ATOMIC_FOR,
|
||||
[MCA_BTL_ATOMIC_XOR] = GNI_FMA_ATOMIC_FXOR,
|
||||
};
|
||||
|
||||
static gni_fma_cmd_type_t amo_cmds[] = {
|
||||
[MCA_BTL_ATOMIC_ADD] = GNI_FMA_ATOMIC_ADD,
|
||||
[MCA_BTL_ATOMIC_AND] = GNI_FMA_ATOMIC_AND,
|
||||
[MCA_BTL_ATOMIC_OR] = GNI_FMA_ATOMIC_OR,
|
||||
[MCA_BTL_ATOMIC_XOR] = GNI_FMA_ATOMIC_XOR,
|
||||
};
|
||||
|
||||
int mca_btl_ugni_aop (struct mca_btl_base_module_t *btl, struct mca_btl_base_endpoint_t *endpoint,
|
||||
uint64_t remote_address, mca_btl_base_registration_handle_t *remote_handle,
|
||||
mca_btl_base_atomic_op_t op, uint64_t operand, int flags, int order,
|
||||
mca_btl_base_rdma_completion_fn_t cbfunc, void *cbcontext, void *cbdata)
|
||||
{
|
||||
gni_mem_handle_t dummy = {0, 0};
|
||||
mca_btl_ugni_post_descriptor_t *post_desc;
|
||||
int rc;
|
||||
|
||||
rc = mca_btl_ugni_check_endpoint_state_rdma (endpoint);
|
||||
if (OPAL_UNLIKELY(OPAL_SUCCESS != rc)) {
|
||||
return rc;
|
||||
}
|
||||
|
||||
mca_btl_ugni_alloc_post_descriptor (endpoint, NULL, cbfunc, cbcontext, cbdata, &post_desc);
|
||||
if (OPAL_UNLIKELY(NULL == post_desc)) {
|
||||
return OPAL_ERR_OUT_OF_RESOURCE;
|
||||
}
|
||||
|
||||
init_gni_post_desc (&post_desc->desc, order, GNI_POST_AMO, 0, dummy, remote_address,
|
||||
remote_handle->gni_handle, 8, 0);
|
||||
post_desc->desc.base.amo_cmd = amo_cmds[op];
|
||||
|
||||
post_desc->desc.base.first_operand = operand;
|
||||
|
||||
OPAL_THREAD_LOCK(&endpoint->btl->device->dev_lock);
|
||||
rc = GNI_PostFma (endpoint->rdma_ep_handle, &post_desc->desc.base);
|
||||
OPAL_THREAD_UNLOCK(&endpoint->btl->device->dev_lock);
|
||||
if (GNI_RC_SUCCESS != rc) {
|
||||
return OPAL_ERR_OUT_OF_RESOURCE;
|
||||
}
|
||||
|
||||
return OPAL_SUCCESS;
|
||||
}
|
||||
|
||||
/**
 * Post an atomic operation that also takes a local address and local
 * registration handle, using the command selected from famo_cmds.
 *
 * Returns OPAL_SUCCESS when GNI_PostFma accepted the request, the code from
 * mca_btl_ugni_check_endpoint_state_rdma when that check fails, and
 * OPAL_ERR_OUT_OF_RESOURCE when no post descriptor is available or the post
 * is rejected (the descriptor is returned in the latter case).
 */
int mca_btl_ugni_afop (struct mca_btl_base_module_t *btl, struct mca_btl_base_endpoint_t *endpoint,
                       void *local_address, uint64_t remote_address, mca_btl_base_registration_handle_t *local_handle,
                       mca_btl_base_registration_handle_t *remote_handle, mca_btl_base_atomic_op_t op,
                       uint64_t operand, int flags, int order, mca_btl_base_rdma_completion_fn_t cbfunc,
                       void *cbcontext, void *cbdata)
{
    mca_btl_ugni_post_descriptor_t *pdesc;
    int status = mca_btl_ugni_check_endpoint_state_rdma (endpoint);

    /* endpoint is not usable for RDMA: hand the checker's code straight back */
    if (OPAL_UNLIKELY(OPAL_SUCCESS != status)) {
        return status;
    }

    mca_btl_ugni_alloc_post_descriptor (endpoint, local_handle, cbfunc, cbcontext, cbdata, &pdesc);
    if (OPAL_UNLIKELY(NULL == pdesc)) {
        return OPAL_ERR_OUT_OF_RESOURCE;
    }

    /* fill in the AMO request: local/remote address pair plus their uGNI memory handles */
    init_gni_post_desc (&pdesc->desc, order, GNI_POST_AMO, (intptr_t) local_address,
                        local_handle->gni_handle, remote_address, remote_handle->gni_handle,
                        8, 0);
    pdesc->desc.base.first_operand = operand;
    pdesc->desc.base.amo_cmd = famo_cmds[op];

    /* the post is issued while holding the device lock */
    OPAL_THREAD_LOCK(&endpoint->btl->device->dev_lock);
    status = GNI_PostFma (endpoint->rdma_ep_handle, &pdesc->desc.base);
    OPAL_THREAD_UNLOCK(&endpoint->btl->device->dev_lock);

    if (GNI_RC_SUCCESS == status) {
        return OPAL_SUCCESS;
    }

    /* post rejected: release the descriptor before reporting the failure */
    mca_btl_ugni_return_post_descriptor (endpoint->btl, pdesc);
    return OPAL_ERR_OUT_OF_RESOURCE;
}
|
||||
|
||||
/**
 * Post a GNI_FMA_ATOMIC_CSWAP request: `compare` is stored as the first
 * operand and `value` as the second.
 *
 * Returns OPAL_SUCCESS when GNI_PostFma accepted the request, the code from
 * mca_btl_ugni_check_endpoint_state_rdma when that check fails, and
 * OPAL_ERR_OUT_OF_RESOURCE when no post descriptor is available or the post
 * is rejected (the descriptor is returned in the latter case).
 */
int mca_btl_ugni_acswap (struct mca_btl_base_module_t *btl, struct mca_btl_base_endpoint_t *endpoint,
                         void *local_address, uint64_t remote_address, mca_btl_base_registration_handle_t *local_handle,
                         mca_btl_base_registration_handle_t *remote_handle, uint64_t compare, uint64_t value, int flags,
                         int order, mca_btl_base_rdma_completion_fn_t cbfunc, void *cbcontext, void *cbdata)
{
    mca_btl_ugni_post_descriptor_t *pdesc;
    int status = mca_btl_ugni_check_endpoint_state_rdma (endpoint);

    /* endpoint is not usable for RDMA: hand the checker's code straight back */
    if (OPAL_UNLIKELY(OPAL_SUCCESS != status)) {
        return status;
    }

    mca_btl_ugni_alloc_post_descriptor (endpoint, local_handle, cbfunc, cbcontext, cbdata, &pdesc);
    if (OPAL_UNLIKELY(NULL == pdesc)) {
        return OPAL_ERR_OUT_OF_RESOURCE;
    }

    /* fill in the AMO request: local/remote address pair plus their uGNI memory handles */
    init_gni_post_desc (&pdesc->desc, order, GNI_POST_AMO, (intptr_t) local_address,
                        local_handle->gni_handle, remote_address, remote_handle->gni_handle,
                        8, 0);
    pdesc->desc.base.second_operand = value;
    pdesc->desc.base.first_operand = compare;
    pdesc->desc.base.amo_cmd = GNI_FMA_ATOMIC_CSWAP;

    /* the post is issued while holding the device lock */
    OPAL_THREAD_LOCK(&endpoint->btl->device->dev_lock);
    status = GNI_PostFma (endpoint->rdma_ep_handle, &pdesc->desc.base);
    OPAL_THREAD_UNLOCK(&endpoint->btl->device->dev_lock);

    if (GNI_RC_SUCCESS == status) {
        return OPAL_SUCCESS;
    }

    /* post rejected: release the descriptor before reporting the failure */
    mca_btl_ugni_return_post_descriptor (endpoint->btl, pdesc);
    return OPAL_ERR_OUT_OF_RESOURCE;
}
|
@ -52,7 +52,6 @@ static int
|
||||
btl_ugni_component_register(void)
|
||||
{
|
||||
mca_base_var_enum_t *new_enum;
|
||||
gni_nic_device_t device_type;
|
||||
int rc;
|
||||
|
||||
(void) mca_base_var_group_component_register(&mca_btl_ugni_component.super.btl_version,
|
||||
@ -140,6 +139,15 @@ btl_ugni_component_register(void)
|
||||
OPAL_INFO_LVL_5, MCA_BASE_VAR_SCOPE_LOCAL,
|
||||
&mca_btl_ugni_component.ugni_fma_limit);
|
||||
|
||||
mca_btl_ugni_component.ugni_get_limit = 1 * 1024 * 1024;
|
||||
(void) mca_base_component_var_register(&mca_btl_ugni_component.super.btl_version,
|
||||
"get_limit", "Maximum size message that "
|
||||
"will be sent using a get protocol "
|
||||
"(default 1M)", MCA_BASE_VAR_TYPE_INT,
|
||||
NULL, 0, MCA_BASE_VAR_FLAG_SETTABLE,
|
||||
OPAL_INFO_LVL_5, MCA_BASE_VAR_SCOPE_LOCAL,
|
||||
&mca_btl_ugni_component.ugni_get_limit);
|
||||
|
||||
mca_btl_ugni_component.rdma_max_retries = 16;
|
||||
(void) mca_base_component_var_register(&mca_btl_ugni_component.super.btl_version,
|
||||
"rdma_max_retries", NULL, MCA_BASE_VAR_TYPE_INT,
|
||||
@ -204,28 +212,13 @@ btl_ugni_component_register(void)
|
||||
mca_btl_ugni_module.super.btl_max_send_size = 8 * 1024;
|
||||
mca_btl_ugni_module.super.btl_rdma_pipeline_send_length = 8 * 1024;
|
||||
|
||||
mca_btl_ugni_module.super.btl_get_limit = 1 * 1024 * 1024;
|
||||
|
||||
/* determine if there are get alignment restrictions */
|
||||
GNI_GetDeviceType (&device_type);
|
||||
|
||||
if (GNI_DEVICE_GEMINI == device_type) {
|
||||
mca_btl_ugni_module.super.btl_get_alignment = 4;
|
||||
} else {
|
||||
mca_btl_ugni_module.super.btl_get_alignment = 0;
|
||||
}
|
||||
|
||||
/* threshold for put */
|
||||
mca_btl_ugni_module.super.btl_min_rdma_pipeline_size = 8 * 1024;
|
||||
|
||||
mca_btl_ugni_module.super.btl_flags = MCA_BTL_FLAGS_SEND |
|
||||
MCA_BTL_FLAGS_RDMA | MCA_BTL_FLAGS_SEND_INPLACE | MCA_BTL_FLAGS_ATOMIC_OPS |
|
||||
MCA_BTL_FLAGS_ATOMIC_FOPS;
|
||||
mca_btl_ugni_module.super.btl_atomic_flags = MCA_BTL_ATOMIC_SUPPORTS_ADD |
|
||||
MCA_BTL_ATOMIC_SUPPORTS_AND | MCA_BTL_ATOMIC_SUPPORTS_OR | MCA_BTL_ATOMIC_SUPPORTS_XOR |
|
||||
MCA_BTL_ATOMIC_SUPPORTS_CSWAP;
|
||||
MCA_BTL_FLAGS_RDMA | MCA_BTL_FLAGS_SEND_INPLACE;
|
||||
|
||||
mca_btl_ugni_module.super.btl_registration_handle_size = sizeof (mca_btl_base_registration_handle_t);
|
||||
mca_btl_ugni_module.super.btl_seg_size = sizeof (mca_btl_ugni_segment_t);
|
||||
|
||||
mca_btl_ugni_module.super.btl_bandwidth = 40000; /* Mbs */
|
||||
mca_btl_ugni_module.super.btl_latency = 2; /* Microsecs */
|
||||
@ -432,107 +425,89 @@ mca_btl_ugni_progress_datagram (mca_btl_ugni_module_t *ugni_module)
|
||||
return count;
|
||||
}
|
||||
|
||||
#if OPAL_ENABLE_DEBUG
|
||||
static inline void btl_ugni_dump_post_desc (mca_btl_ugni_post_descriptor_t *desc)
|
||||
static inline int
|
||||
mca_btl_ugni_progress_rdma (mca_btl_ugni_module_t *ugni_module, int which_cq)
|
||||
{
|
||||
|
||||
fprintf (stderr, "desc->desc.base.post_id = %" PRIx64 "\n", desc->desc.base.post_id);
|
||||
fprintf (stderr, "desc->desc.base.status = %" PRIx64 "\n", desc->desc.base.status);
|
||||
fprintf (stderr, "desc->desc.base.cq_mode_complete = %hu\n", desc->desc.base.cq_mode_complete);
|
||||
fprintf (stderr, "desc->desc.base.type = %d\n", desc->desc.base.type);
|
||||
fprintf (stderr, "desc->desc.base.cq_mode = %hu\n", desc->desc.base.cq_mode);
|
||||
fprintf (stderr, "desc->desc.base.dlvr_mode = %hu\n", desc->desc.base.dlvr_mode);
|
||||
fprintf (stderr, "desc->desc.base.local_addr = %" PRIx64 "\n", desc->desc.base.local_addr);
|
||||
fprintf (stderr, "desc->desc.base.local_mem_hndl = {%" PRIx64 ", %" PRIx64 "}\n", desc->desc.base.local_mem_hndl.qword1,
|
||||
desc->desc.base.local_mem_hndl.qword2);
|
||||
fprintf (stderr, "desc->desc.base.remote_addr = %" PRIx64 "\n", desc->desc.base.remote_addr);
|
||||
fprintf (stderr, "desc->desc.base.remote_mem_hndl = {%" PRIx64 ", %" PRIx64 "}\n", desc->desc.base.remote_mem_hndl.qword1,
|
||||
desc->desc.base.remote_mem_hndl.qword2);
|
||||
fprintf (stderr, "desc->desc.base.length = %" PRIu64 "\n", desc->desc.base.length);
|
||||
fprintf (stderr, "desc->desc.base.rdma_mode = %hu\n", desc->desc.base.rdma_mode);
|
||||
fprintf (stderr, "desc->desc.base.amo_cmd = %d\n", desc->desc.base.amo_cmd);
|
||||
}
|
||||
#endif
|
||||
|
||||
static inline int mca_btl_ugni_progress_rdma (mca_btl_ugni_module_t *ugni_module, int which_cq)
|
||||
{
|
||||
mca_btl_ugni_post_descriptor_t *post_desc = NULL;
|
||||
opal_common_ugni_post_desc_t *desc;
|
||||
mca_btl_ugni_base_frag_t *frag;
|
||||
gni_cq_entry_t event_data = 0;
|
||||
gni_post_descriptor_t *desc;
|
||||
uint32_t recoverable = 1;
|
||||
gni_return_t grc;
|
||||
gni_return_t rc;
|
||||
gni_cq_handle_t the_cq;
|
||||
|
||||
the_cq = (which_cq == 0) ? ugni_module->rdma_local_cq : ugni_module->rdma_local_irq_cq;
|
||||
|
||||
OPAL_THREAD_LOCK(&ugni_module->device->dev_lock);
|
||||
grc = GNI_CqGetEvent (ugni_module->rdma_local_cq, &event_data);
|
||||
if (GNI_RC_NOT_DONE == grc) {
|
||||
rc = GNI_CqGetEvent (the_cq, &event_data);
|
||||
if (GNI_RC_NOT_DONE == rc) {
|
||||
OPAL_THREAD_UNLOCK(&ugni_module->device->dev_lock);
|
||||
return 0;
|
||||
}
|
||||
|
||||
if (OPAL_UNLIKELY((GNI_RC_SUCCESS != grc && !event_data) || GNI_CQ_OVERRUN(event_data))) {
|
||||
if (OPAL_UNLIKELY((GNI_RC_SUCCESS != rc && !event_data) || GNI_CQ_OVERRUN(event_data))) {
|
||||
/* TODO -- need to handle overrun -- how do we do this without an event?
|
||||
will the event eventually come back? Ask Cray */
|
||||
BTL_ERROR(("unhandled post error! ugni rc = %d %s", grc, gni_err_str[grc]));
|
||||
BTL_ERROR(("unhandled post error! ugni rc = %d %s", rc,gni_err_str[rc]));
|
||||
OPAL_THREAD_UNLOCK(&ugni_module->device->dev_lock);
|
||||
|
||||
return opal_common_rc_ugni_to_opal (grc);
|
||||
return opal_common_rc_ugni_to_opal (rc);
|
||||
}
|
||||
|
||||
grc = GNI_GetCompleted (ugni_module->rdma_local_cq, event_data, &desc);
|
||||
rc = GNI_GetCompleted (the_cq, event_data, (gni_post_descriptor_t **) &desc);
|
||||
OPAL_THREAD_UNLOCK(&ugni_module->device->dev_lock);
|
||||
if (OPAL_UNLIKELY(GNI_RC_SUCCESS != grc && GNI_RC_TRANSACTION_ERROR != grc)) {
|
||||
BTL_ERROR(("Error in GNI_GetComplete %s", gni_err_str[grc]));
|
||||
return opal_common_rc_ugni_to_opal (grc);
|
||||
if (OPAL_UNLIKELY(GNI_RC_SUCCESS != rc && GNI_RC_TRANSACTION_ERROR != rc)) {
|
||||
BTL_ERROR(("Error in GNI_GetComplete %s", gni_err_str[rc]));
|
||||
return opal_common_rc_ugni_to_opal (rc);
|
||||
}
|
||||
|
||||
post_desc = MCA_BTL_UGNI_DESC_TO_PDESC(desc);
|
||||
frag = MCA_BTL_UGNI_DESC_TO_FRAG(desc);
|
||||
|
||||
if (OPAL_UNLIKELY(GNI_RC_SUCCESS != rc || !GNI_CQ_STATUS_OK(event_data))) {
|
||||
char buffer[1024];
|
||||
|
||||
if (OPAL_UNLIKELY(GNI_RC_SUCCESS != grc || !GNI_CQ_STATUS_OK(event_data))) {
|
||||
(void) GNI_CqErrorRecoverable (event_data, &recoverable);
|
||||
GNI_CqErrorStr(event_data,buffer,sizeof(buffer));
|
||||
|
||||
if (OPAL_UNLIKELY(++post_desc->desc.tries >= mca_btl_ugni_component.rdma_max_retries ||
|
||||
if (OPAL_UNLIKELY(++frag->post_desc.tries >= mca_btl_ugni_component.rdma_max_retries ||
|
||||
!recoverable)) {
|
||||
char char_buffer[1024];
|
||||
GNI_CqErrorStr (event_data, char_buffer, 1024);
|
||||
/* give up */
|
||||
BTL_ERROR(("giving up on desciptor %p, recoverable %d: %s", (void *) post_desc,
|
||||
recoverable, char_buffer));
|
||||
#if OPAL_ENABLE_DEBUG
|
||||
btl_ugni_dump_post_desc (post_desc);
|
||||
#endif
|
||||
mca_btl_ugni_post_desc_complete (ugni_module, post_desc, OPAL_ERROR);
|
||||
BTL_ERROR(("giving up on frag %p type %d CQE error %s", (void *) frag, frag->post_desc.base.type, buffer));
|
||||
mca_btl_ugni_frag_complete (frag, OPAL_ERROR);
|
||||
|
||||
return OPAL_ERROR;
|
||||
}
|
||||
|
||||
mca_btl_ugni_repost (ugni_module, post_desc);
|
||||
/* repost transaction */
|
||||
mca_btl_ugni_repost (frag);
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
mca_btl_ugni_post_desc_complete (ugni_module, post_desc, opal_common_rc_ugni_to_opal (grc));
|
||||
BTL_VERBOSE(("RDMA/FMA complete for frag %p", (void *) frag));
|
||||
|
||||
mca_btl_ugni_frag_complete (frag, opal_common_rc_ugni_to_opal (rc));
|
||||
|
||||
return 1;
|
||||
}
|
||||
|
||||
static inline int
|
||||
mca_btl_ugni_post_pending (mca_btl_ugni_module_t *ugni_module)
|
||||
mca_btl_ugni_retry_failed (mca_btl_ugni_module_t *ugni_module)
|
||||
{
|
||||
int count = opal_list_get_size (&ugni_module->pending_descriptors);
|
||||
int count = opal_list_get_size (&ugni_module->failed_frags);
|
||||
int i;
|
||||
|
||||
for (i = 0 ; i < count ; ++i) {
|
||||
OPAL_THREAD_LOCK(&ugni_module->pending_descriptors_lock);
|
||||
mca_btl_ugni_post_descriptor_t *post_desc =
|
||||
(mca_btl_ugni_post_descriptor_t *) opal_list_remove_first (&ugni_module->pending_descriptors);
|
||||
OPAL_THREAD_UNLOCK(&ugni_module->pending_descriptors_lock);
|
||||
|
||||
if (OPAL_SUCCESS != mca_btl_ugni_repost (ugni_module, post_desc)) {
|
||||
OPAL_THREAD_LOCK(&ugni_module->failed_frags_lock);
|
||||
mca_btl_ugni_base_frag_t *frag =
|
||||
(mca_btl_ugni_base_frag_t *) opal_list_remove_first (&ugni_module->failed_frags);
|
||||
OPAL_THREAD_UNLOCK(&ugni_module->failed_frags_lock);
|
||||
if (NULL == frag) {
|
||||
break;
|
||||
}
|
||||
|
||||
mca_btl_ugni_repost (frag);
|
||||
}
|
||||
|
||||
return i;
|
||||
return count;
|
||||
}
|
||||
|
||||
static inline int
|
||||
@ -582,6 +557,7 @@ static int mca_btl_ugni_component_progress (void)
|
||||
for (i = 0 ; i < mca_btl_ugni_component.ugni_num_btls ; ++i) {
|
||||
ugni_module = mca_btl_ugni_component.modules + i;
|
||||
|
||||
mca_btl_ugni_retry_failed (ugni_module);
|
||||
mca_btl_ugni_progress_wait_list (ugni_module);
|
||||
|
||||
count += mca_btl_ugni_progress_datagram (ugni_module);
|
||||
@ -589,8 +565,6 @@ static int mca_btl_ugni_component_progress (void)
|
||||
count += mca_btl_ugni_progress_remote_smsg (ugni_module);
|
||||
count += mca_btl_ugni_progress_rdma (ugni_module, 0);
|
||||
|
||||
/* post pending after progressing rdma */
|
||||
mca_btl_ugni_post_pending (ugni_module);
|
||||
}
|
||||
|
||||
return count;
|
||||
|
Некоторые файлы не были показаны из-за слишком большого количества измененных файлов Показать больше
Загрузка…
x
Ссылка в новой задаче
Block a user